## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 2.81 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 9.55 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 57s


In [13]:
%%time
df['txt_str'] = df.txt_stems.apply(lambda s: ' '.join(map(str, s)))

Wall time: 1.19 s


In [14]:
df.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]",draft wr matt millen probabl
3,sports,[done],[done],done
4,sports,[noo],[noo],noo


## Prepare Text for Model-Building

In [15]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [16]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [17]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [18]:
balancedDF.cat.value_counts()

sports                    25000
video_games               25000
science_and_technology    25000
Name: cat, dtype: int64

In [19]:
balancedDF.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
9949,science_and_technology,"[shit, 99, probably, never, written, lick, cod...","[shit, 99, probabl, never, written, lick, code...",shit 99 probabl never written lick code either...
7800,science_and_technology,[qualcomm],[qualcomm],qualcomm
18759,science_and_technology,"[rule, 4]","[rule, 4]",rule 4
3583,science_and_technology,"[iphones, copied, htc, way, around]","[iphon, copi, htc, way, around]",iphon copi htc way around
22746,science_and_technology,"[biggest, gripe, cant, customize, skip, interv...","[biggest, gripe, cant, custom, skip, interv, p...",biggest gripe cant custom skip interv podcast ...


## NN

In [20]:
# Set up data and labels
X = balancedDF.txt_str
y = balancedDF.cat

### Train Test Split

In [21]:
import numpy as np
from scipy.sparse import csr_matrix

In [22]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 19.4 ms


## 2. Define Keras Model

In [23]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [24]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=1, activation='relu')) # 
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='softmax'))

## 3. Compile Keras Model

In [25]:
from keras.optimizers import RMSprop

In [26]:
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #  optimizer=RMSprop(lr=0.01)

## 4. Fit Keras Model

In [27]:
# %%time
# # fit the keras model on the dataset
# model.fit(X_train, y_train, epochs=50, batch_size=10)

# and now for something completely different...

## 20.4 Training a Multiclass Classifier

In [28]:
# Load libraries
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

In [45]:
# # Set random seed
# np.random.seed(0)

# # Set the number of features we want
number_of_features = 5000

# # Load feature and target data
# # data = reuters.load_data(num_words=number_of_features)
# data = balancedDF.copy()
# X = data.txt_str
# y = data.cat
# # (data_train, target_vector_train), (data_test, target_vector_test) = data
# from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

# # Convert feature data to a one-hot encoded feature matrix
# tokenizer = Tokenizer(num_words=number_of_features)
# features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
# features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# One-hot encode target vector to create a target matrix
# target_train = to_categorical(target_vector_train)
# target_test = to_categorical(target_vector_test)


In [32]:
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)

In [33]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [38]:
data_train = text_data

In [41]:
# features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_train = tokenizer.texts_to_matrix(data_train, mode="binary")

In [42]:
# features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")
features_test = tokenizer.texts_to_matrix(data_test, mode="binary")

In [43]:
%%time

# Start neural network
network = models.Sequential()

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=46, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=3, # Three epochs
                      verbose=0, # No output
                      batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

NameError: name 'target_train' is not defined

In [46]:
# View target matrix
target_train

NameError: name 'target_train' is not defined

In [47]:
target_train.shape

NameError: name 'target_train' is not defined

In [None]:
txt_stems

In [None]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [None]:
speak("modeling complete")

## 20.4 Training a Multiclass Classifier

In [48]:
# Load libraries
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

In [49]:
%%time
# Set random seed
np.random.seed(0)

# Set the number of features we want
number_of_features = 5000

# Load feature and target data
data = reuters.load_data(num_words=number_of_features)
(data_train, target_vector_train), (data_test, target_vector_test) = data

# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)

# Start neural network
network = models.Sequential()

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=46, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=3, # Three epochs
                      verbose=0, # No output
                      batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Wall time: 4.37 s


In [50]:
data_train.shape

(8982,)

In [51]:
data_train

array([list([1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]),
       list([1, 3267, 699, 3434, 2295, 56, 2, 2, 9, 56, 3906, 1073, 81, 5, 1198, 57, 366, 737, 132, 20, 4093, 7, 2, 49, 2295, 2, 1037, 3267, 699, 3434, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2295, 2, 2, 775, 7, 48, 34, 191, 44, 35, 1795, 505, 17, 12]),
       list([1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32, 818, 15, 14, 272, 26, 39, 684, 70, 11, 14, 12, 3886, 18, 180, 183, 187, 70, 11, 14, 102, 32, 11, 29, 53, 44, 704, 15, 14, 19, 758, 15, 53, 959, 47, 1013, 15, 14, 19, 132, 15, 39, 965, 32, 11, 14, 147, 72, 11, 180, 183, 187, 44, 11, 14, 102, 19, 11, 123, 186, 90, 67, 960, 4, 78, 13, 68, 467, 511, 110, 59

In [None]:
features_train.shape

In [None]:
target_train

In [None]:
text_data[:5]

In [None]:
type(text_data)

In [None]:
# Create text
text_data = np.array(text_data)

In [None]:
type(text_data)

In [None]:
text_data[:5]

In [None]:
len(text_data)

In [None]:
# Show feature matrix
bag_of_words

In [None]:
# # Show feature names
# count.get_feature_names()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
target_le = le.fit_transform(y)

In [None]:
target_le

In [53]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [54]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [55]:
target_le = le.fit_transform(y)

In [56]:
target_le

array([0, 0, 0, ..., 2, 2, 2])

## NN

In [57]:
# Set up data and labels
X = bag_of_words #text_data
y = target_le #balancedDF.cat

Still getting: Cast string to float is not supported

### Train Test Split

In [58]:
import numpy as np
from scipy.sparse import csr_matrix

In [59]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 316 ms


In [60]:
tokenizer = Tokenizer(num_words=5000)

In [61]:
features_train = tokenizer.sequences_to_matrix(X_train, mode="binary")

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [64]:
X_train

<56250x41400 sparse matrix of type '<class 'numpy.int64'>'
	with 712244 stored elements in Compressed Sparse Row format>

In [65]:
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=5000, split=',', char_level=False)

In [66]:
testDF = df.copy()
testDF.head(2)

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan


In [67]:
%%time
tokenizer.fit_on_texts(testDF.txt_str)

Wall time: 5.88 s


In [68]:
%%time
features = tokenizer.texts_to_matrix(balancedDF.txt_str, mode='binary')

Wall time: 664 ms


In [69]:
features.shape

(75000, 5000)

In [70]:
y = testDF.cat

In [71]:
# One-hot encode target vector to create a target matrix
targets = to_categorical(target_le, num_classes=3)

In [72]:
targets.shape

(75000, 3)

In [73]:
a = to_categorical([0,1,2,3], num_classes=4)

In [74]:
a

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]], dtype=float32)

## 2. Define Keras Model

In [19]:
# # Importing the Keras libraries and packages
# import keras
# from keras.models import Sequential
# from keras.layers import Dense

In [20]:
# # define the keras model
# model = Sequential()
# model.add(Dense(12, activation='relu')) # , input_dim=X_train.shape[1]
# model.add(Dense(8, activation='relu'))
# model.add(Dense(1, activation='softmax'))

## 3. Compile Keras Model

In [21]:
# from keras.optimizers import RMSprop

In [22]:
# # compile the keras model
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #  optimizer=RMSprop(lr=0.01)

## 4. Fit Keras Model

In [79]:
# %%time
# # fit the keras model on the dataset
# model.fit(features, targets, epochs=50, batch_size=10)

In [80]:
# %%time
# # fit the keras model on the dataset
# model.fit(X_train, y_train, epochs=50, batch_size=10)

In [23]:
X = balancedDF.txt_str
y = balancedDF.cat

In [24]:
%%time
# Train Test Split
from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 30.7 ms


In [25]:
number_of_features = 5000

In [26]:
from keras.preprocessing.text import Tokenizer
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features, split=',', char_level=False)

In [27]:
%%time
tokenizer.fit_on_texts(X)

Wall time: 823 ms


In [28]:
features_train = tokenizer.texts_to_matrix(data_train, mode='binary')

In [29]:
features_test = tokenizer.texts_to_matrix(data_test, mode='binary')

Maybe skip this and use LabelBinarizer instead?

In [None]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()

In [None]:
# le.fit(y)

In [None]:
# target_train_le = le.transform(target_vector_train)
# target_test_le = le.transform(target_vector_test)

In [None]:
# from keras.utils.np_utils import to_categorical
# # One-hot encode target vector to create a target matrix
# target_train = to_categorical(target_train_le, num_classes=3)
# target_test = to_categorical(target_test_le, num_classes=3)

In [30]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(y)
target_train = lb.transform(target_vector_train)
target_test = lb.transform(target_vector_test)

In [31]:
from keras.models import Sequential
# Start neural network
network = Sequential()

In [32]:
from keras import layers
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=500,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=150, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=3, activation="softmax"))

In [33]:
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

In [35]:
# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=5, # Three epochs
                      verbose=1, # Some output
                      batch_size=75, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from keras.models import Sequential
from keras import layers

# Create function returning a compiled network
def create_network(optimizer='rmsprop'):
    # Start neural network
    network = Sequential()
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=500,
                             activation="relu",
                             input_shape=(number_of_features,)))

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=150, activation="relu"))

    # Add fully connected layer with a softmax activation function
    network.add(layers.Dense(units=3, activation="softmax"))
    
    # Compile neural network
    network.compile(loss="categorical_crossentropy", # Cross-entropy
                    optimizer="rmsprop", # Root Mean Square Propagation
                    metrics=["accuracy"]) # Accuracy performance metric
    
    # Return compiled network
    return network

In [None]:
import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
%%time 
# Set random seed
np.random.seed(42)

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=1)

# Create hyperparameter space
epochs = [5, 15, 50]
batches = [5, 25, 100]
optimizers = ["rmsprop", "adam"]

# Create hyperparameter options
hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# Create grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

# Fit grid search
# grid_result = grid.fit(features, target)
grid_result = grid.fit(features_train, # Features
                      target_train, # Target
#                       epochs=150, # Three epochs
                      verbose=1, # Some output
#                       batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data


In [None]:
# View hyperparameters of best neural network
grid_result.best_params_

In [None]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [None]:
speak("modeling complete")