## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 2.84 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 6.95 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 48s


In [13]:
%%time
df['txt_str'] = df.txt_stems.apply(lambda s: ' '.join(map(str, s)))

Wall time: 1.21 s


In [14]:
df.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]",draft wr matt millen probabl
3,sports,[done],[done],done
4,sports,[noo],[noo],noo


## Prepare Text for Model-Building

In [15]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [16]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [17]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [18]:
balancedDF.cat.value_counts()

science_and_technology    25000
sports                    25000
video_games               25000
Name: cat, dtype: int64

In [19]:
balancedDF.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
11609,science_and_technology,"[one, features, implemented, custom, room, yea...","[one, featur, implement, custom, room, year, g...",one featur implement custom room year googl ye...
12671,science_and_technology,"[seems, like, price, sensitive, times, introdu...","[seem, like, price, sensit, time, introduc, ne...",seem like price sensit time introduc new smart...
8514,science_and_technology,[deleted],[delet],delet
24013,science_and_technology,"[know, samsung, hides, well, illuminated, illu...","[know, samsung, hide, well, illumin, illumin, ...",know samsung hide well illumin illumin light s...
19265,science_and_technology,"[gt, lg, launched, nice, incentive, american, ...","[gt, lg, launch, nice, incent, american, carri...",gt lg launch nice incent american carrier sell...


## NN

In [62]:
# Set up data and labels
X = balancedDF.txt_str
y = balancedDF.cat

### Train Test Split

In [20]:
import numpy as np
from scipy.sparse import csr_matrix

In [63]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 17.5 ms


## 2. Define Keras Model

In [23]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [64]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=1, activation='relu')) # 
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='softmax'))

## 3. Compile Keras Model

In [30]:
from keras.optimizers import RMSprop

In [65]:
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #  optimizer=RMSprop(lr=0.01)

## 4. Fit Keras Model

In [81]:
# %%time
# # fit the keras model on the dataset
# model.fit(X_train, y_train, epochs=50, batch_size=10)

# and now for something completely different...

## 20.4 Training a Multiclass Classifier

In [83]:
# Load libraries
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

In [84]:
# Set random seed
np.random.seed(0)

# Set the number of features we want
number_of_features = 5000

# Load feature and target data
# data = reuters.load_data(num_words=number_of_features)
data = balancedDF.copy()
X = data.txt_str
y = data.cat
# (data_train, target_vector_train), (data_test, target_vector_test) = data
from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)


TypeError: '>=' not supported between instances of 'str' and 'int'

In [92]:
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)

In [94]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [95]:
data_train = text_data

In [97]:
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")

TypeError: '>=' not supported between instances of 'str' and 'int'

In [None]:
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

In [10]:
%%time

# Start neural network
network = models.Sequential()

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=46, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=3, # Three epochs
                      verbose=0, # No output
                      batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Wall time: 4.61 s


In [11]:
# View target matrix
target_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
target_train.shape

(8982, 46)

In [None]:
txt_stems

In [42]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [73]:
speak("modeling complete")

1

## 20.4 Training a Multiclass Classifier

In [98]:
# Load libraries
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

In [99]:
%%time
# Set random seed
np.random.seed(0)

# Set the number of features we want
number_of_features = 5000

# Load feature and target data
data = reuters.load_data(num_words=number_of_features)
(data_train, target_vector_train), (data_test, target_vector_test) = data

# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)

# Start neural network
network = models.Sequential()

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=46, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=3, # Three epochs
                      verbose=0, # No output
                      batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Wall time: 4.45 s


In [105]:
data_train.shape

(8982,)

In [106]:
data_train

array([list([1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]),
       list([1, 3267, 699, 3434, 2295, 56, 2, 2, 9, 56, 3906, 1073, 81, 5, 1198, 57, 366, 737, 132, 20, 4093, 7, 2, 49, 2295, 2, 1037, 3267, 699, 3434, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2295, 2, 2, 775, 7, 48, 34, 191, 44, 35, 1795, 505, 17, 12]),
       list([1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32, 818, 15, 14, 272, 26, 39, 684, 70, 11, 14, 12, 3886, 18, 180, 183, 187, 70, 11, 14, 102, 32, 11, 29, 53, 44, 704, 15, 14, 19, 758, 15, 53, 959, 47, 1013, 15, 14, 19, 132, 15, 39, 965, 32, 11, 14, 147, 72, 11, 180, 183, 187, 44, 11, 14, 102, 19, 11, 123, 186, 90, 67, 960, 4, 78, 13, 68, 467, 511, 110, 59

In [104]:
features_train.shape

(8982, 5000)

In [103]:
target_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [109]:
text_data[:5]

['yeah ask bmw',
 'wine contain importantli realiz contain nativ emul kind plu contain passiv beyond daemon launch contain differ process requir basic zero overhead contain cach stay intact share librari share even across differ oss program run one os read share differ os cool trick two use common path get map inod program found common kernel write obvious uniqu',
 'wait day go iphon 6 gt s8',
 'ok googl beep sound fix',
 'yeah work well facebook']

In [110]:
type(text_data)

list

In [111]:
# Create text
text_data = np.array(text_data)

In [112]:
type(text_data)

numpy.ndarray

In [113]:
text_data[:5]

array(['yeah ask bmw',
       'wine contain importantli realiz contain nativ emul kind plu contain passiv beyond daemon launch contain differ process requir basic zero overhead contain cach stay intact share librari share even across differ oss program run one os read share differ os cool trick two use common path get map inod program found common kernel write obvious uniqu',
       'wait day go iphon 6 gt s8', 'ok googl beep sound fix',
       'yeah work well facebook'], dtype='<U3677')

In [114]:
len(text_data)

75000

In [125]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [127]:
# Show feature matrix
bag_of_words

<75000x40346 sparse matrix of type '<class 'numpy.int64'>'
	with 949665 stored elements in Compressed Sparse Row format>

In [129]:
# # Show feature names
# count.get_feature_names()

In [155]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [156]:
target_le = le.fit_transform(y)

In [159]:
target_le

array([0, 0, 0, ..., 2, 2, 2])

## NN

In [160]:
# Set up data and labels
X = bag_of_words #text_data
y = target_le #balancedDF.cat

Still getting: Cast string to float is not supported

### Train Test Split

In [214]:
import numpy as np
from scipy.sparse import csr_matrix

In [161]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 194 ms


In [165]:
tokenizer = Tokenizer(num_words=5000)

In [166]:
features_train = tokenizer.sequences_to_matrix(X_train, mode="binary")

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [169]:
X_train

<56250x40346 sparse matrix of type '<class 'numpy.int64'>'
	with 711117 stored elements in Compressed Sparse Row format>

In [170]:
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=5000, split=',', char_level=False)

In [173]:
testDF = df.copy()
testDF.head(2)

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan


In [174]:
%%time
tokenizer.fit_on_texts(testDF.txt_str)

In [190]:
%%time
features = tokenizer.texts_to_matrix(balancedDF.txt_str, mode='binary')

Wall time: 647 ms


In [191]:
features.shape

(75000, 5000)

In [179]:
y = testDF.cat

In [188]:
# One-hot encode target vector to create a target matrix
targets = to_categorical(target_le, num_classes=3)

In [201]:
targets.shape

(75000, 3)

In [182]:
a = to_categorical([0,1,2,3], num_classes=4)

In [183]:
a

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]], dtype=float32)

## 2. Define Keras Model

In [119]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [192]:
# define the keras model
model = Sequential()
model.add(Dense(12, activation='relu')) # , input_dim=X_train.shape[1]
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='softmax'))

## 3. Compile Keras Model

In [122]:
from keras.optimizers import RMSprop

In [193]:
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #  optimizer=RMSprop(lr=0.01)

## 4. Fit Keras Model

In [209]:
# %%time
# # fit the keras model on the dataset
# model.fit(features, targets, epochs=50, batch_size=10)

In [208]:
# %%time
# # fit the keras model on the dataset
# model.fit(X_train, y_train, epochs=50, batch_size=10)

In [20]:
X = balancedDF.txt_str
y = balancedDF.cat

In [21]:
%%time
# Train Test Split
from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 14.9 ms


In [36]:
number_of_features = 5000

In [23]:
from keras.preprocessing.text import Tokenizer
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features, split=',', char_level=False)

In [24]:
%%time
tokenizer.fit_on_texts(X)

Wall time: 647 ms


In [25]:
features_train = tokenizer.texts_to_matrix(data_train, mode='binary')

In [26]:
features_test = tokenizer.texts_to_matrix(data_test, mode='binary')

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [28]:
le.fit(y)

LabelEncoder()

In [29]:
target_train_le = le.transform(target_vector_train)
target_test_le = le.transform(target_vector_test)

In [31]:
from keras.utils.np_utils import to_categorical
# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_train_le, num_classes=3)
target_test = to_categorical(target_test_le, num_classes=3)

In [33]:
from keras.models import Sequential
# Start neural network
network = Sequential()

In [44]:
from keras import layers
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=500,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=150, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=3, activation="softmax"))

In [45]:
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

In [47]:
# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=150, # Three epochs
                      verbose=1, # Some output
                      batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

In [48]:
from keras.models import Sequential
from keras import layers

# Create function returning a compiled network
def create_network(optimizer='rmsprop'):
    # Start neural network
    network = Sequential()
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=500,
                             activation="relu",
                             input_shape=(number_of_features,)))

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=150, activation="relu"))

    # Add fully connected layer with a softmax activation function
    network.add(layers.Dense(units=3, activation="softmax"))
    
    # Compile neural network
    network.compile(loss="categorical_crossentropy", # Cross-entropy
                    optimizer="rmsprop", # Root Mean Square Propagation
                    metrics=["accuracy"]) # Accuracy performance metric
    
    # Return compiled network
    return network

In [54]:
import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [57]:
%%time 
# Set random seed
np.random.seed(42)

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=1)

# Create hyperparameter space
epochs = [5, 15, 50]
batches = [5, 25, 100]
optimizers = ["rmsprop", "adam"]

# Create hyperparameter options
hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# Create grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

# Fit grid search
# grid_result = grid.fit(features, target)
grid_result = grid.fit(features_train, # Features
                      target_train, # Target
#                       epochs=150, # Three epochs
                      verbose=1, # Some output
#                       batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

In [58]:
# View hyperparameters of best neural network
grid_result.best_params_

NameError: name 'grid_result' is not defined

In [None]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [None]:
speak("modeling complete")