## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 2.52 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


## Preprocess Text

In [5]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [6]:
df = cat_comments_df.copy()

In [7]:
# df.head()

In [8]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [9]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 7.39 s


In [10]:
# df.head()

In [11]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()
df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 59s


In [12]:
%%time
df['txt_str'] = df.txt_stems.apply(lambda s: ' '.join(map(str, s)))

Wall time: 1.29 s


In [13]:
df.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]",draft wr matt millen probabl
3,sports,[done],[done],done
4,sports,[noo],[noo],noo


## Prepare Text for Model-Building

In [14]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [15]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [16]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [17]:
balancedDF.cat.value_counts()

sports                    25000
video_games               25000
science_and_technology    25000
Name: cat, dtype: int64

In [18]:
# text_data, string = [], " "

# for text in balancedDF.txt_stems:
#     text_data.append(string.join(text))

In [19]:
# # Create the bag of words feature matrix
# count = CountVectorizer()
# bag_of_words = count.fit_transform(text_data)

## 20.17 Classifying Text

In [20]:
# Load libraries
import numpy as np
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import models
from keras import layers

In [21]:
# Set the number of features we want
number_of_features = 5000

# Set up data and labels
# X = bag_of_words
X = balancedDF.txt_str
y = balancedDF.cat

# Train Test Split
from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

In [22]:
data_train[:5]

45056     haha fox isnt go like fuck greatest line got b...
541644                             love way move video isnt
5411                 your welcom dont apologis ask question
533831                                              omg lol
154606    big problem build gener mirana want get agh or...
Name: txt_str, dtype: object

In [23]:
# features_train = data_train
# features_test = data_test

In [24]:
from keras.preprocessing.text import Tokenizer
# Convert movie review data to one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
# tokenizer = Tokenizer(num_words=number_of_features, split=',', char_level=False)
tokenizer.fit_on_texts(X)
features_train = tokenizer.texts_to_matrix(data_train, mode="binary")
features_test = tokenizer.texts_to_matrix(data_test, mode="binary")

In [25]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(y)
target_train = lb.transform(target_vector_train)
target_test = lb.transform(target_vector_test)

In [26]:
from keras.models import Sequential
# Start neural network
network = Sequential()

In [27]:
from keras import layers
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=500,
                         activation="relu",
                         input_shape=(number_of_features,)))

# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=150, activation="relu"))

# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=3, activation="softmax"))

In [28]:
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
                optimizer="rmsprop", # Root Mean Square Propagation
                metrics=["accuracy"]) # Accuracy performance metric

In [29]:
# Train neural network
history = network.fit(features_train, # Features
                      target_train, # Target
                      epochs=5, # Three epochs
                      verbose=1, # Some output
                      batch_size=75, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 20.7 Visualize Training History

In [46]:
# import matplotlib.pyplot as plt
# # Get training and test loss histories
# training_loss = history.history["loss"]
# test_loss = history.history["val_loss"]

# # Create count of the number of epochs
# epoch_count = range(1, len(training_loss) + 1)

# # Visualize loss history
# plt.plot(epoch_count, training_loss, "r--")
# plt.plot(epoch_count, test_loss, "b-")
# plt.legend(["Training Loss", "Test Loss"])
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.show();

In [47]:
# # Get training and test accuracy histories
# training_accuracy = history.history["accuracy"]
# test_accuracy = history.history["val_accuracy"]
# plt.plot(epoch_count, training_accuracy, "r--")
# plt.plot(epoch_count, test_accuracy, "b-")

# # Visualize accuracy history
# plt.legend(["Training Accuracy", "Test Accuracy"])
# plt.xlabel("Epoch")
# plt.ylabel("Accuracy Score")
# plt.show();

## 20.13 Tuning Neural Networks

In [55]:
# Load libraries
import numpy as np
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

In [56]:
# Create function returning a compiled network
def create_network(optimizer="rmsprop"):

    # Start neural network
    network = models.Sequential()

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16,
                             activation="relu",
                             input_shape=(number_of_features,)))

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16, activation="relu"))

    # Add fully connected layer with a sigmoid activation function
    network.add(layers.Dense(units=1, activation="sigmoid"))

    # Compile neural network
    network.compile(loss="binary_crossentropy", # Cross-entropy
                    optimizer=optimizer, # Optimizer
                    metrics=["accuracy"]) # Accuracy performance metric

    # Return compiled network
    return network

In [57]:
%%time
# Set random seed
np.random.seed(42)

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=0)

# Create hyperparameter space 
## Round 1: 3, 50, 'adam' were best
## Round 2: 5, 75, 'rmsprop' were best
epochs = [3, 5, 7]      #1[1,  3, 10] #2[ 1,  3,  5]
batches = [50, 75, 100] #1[5, 15, 50] #2[30, 50, 75]
optimizers = ["rmsprop", "adam"]

# Create hyperparameter options
hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# Create grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

# Fit grid search
# grid_result = grid.fit(features, target)
grid_result = grid.fit(features_train, # Features
                      target_train, # Target
#                       epochs=150, # Three epochs
                      verbose=1, # Some output
#                       batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data

Epoch 1/3


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 223, in fit
    return super(KerasClassifier, self).fit(x, y, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1100, in fit
    tmp_logs = self.train_function(iterator)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 828, in __call__
    result = self._call(*args, **kwds)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 871, in _call
    self._initialize(args

Epoch 1/3


ValueError: in user code:

    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:755 train_step
        loss = self.compiled_loss(
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:152 __call__
        losses = call_fn(y_true, y_pred)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:256 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:1608 binary_crossentropy
        K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py:4979 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:173 sigmoid_cross_entropy_with_logits
        raise ValueError("logits and labels must have the same shape (%s vs %s)" %

    ValueError: logits and labels must have the same shape ((50, 1) vs (50, 3))


In [58]:
# View hyperparameters of best neural network
grid_result.best_params_

NameError: name 'grid_result' is not defined

In [36]:
# %%time
# # Get predictions
# predictions = grid_result.predict(features_test)

In [37]:
# %%time
# # Get predictions
# predictions = np.argmax(grid_result.predict(features_test), axis=-1)

### Predictions / Evaluation

In [51]:
# %%time
# # Get predictions
# predictions = network.predict(features_test)

In [48]:
%%time
# Get predictions
predictions = network.predict(features_test)
predictions = np.argmax(predictions, axis=1)
y_test = np.argmax(target_test, axis=1)

Wall time: 1.59 s


In [40]:
# %%time
# # Get predictions
# predictions = np.argmax(network.predict(features_test), axis=-1)

Wall time: 3.03 s


### Evaluate how well our model performed

In [38]:
from sklearn.metrics import classification_report, confusion_matrix

#### Confusion matrix

In [49]:
cm = confusion_matrix(y_test, predictions)

In [50]:
print(cm)

[[4866  659  733]
 [ 590 4675  950]
 [ 827 1273 4177]]


In [52]:
# print(confusion_matrix(target_test, predictions))

#### Precision / Recall / F1 / Suport (Classification report)

In [53]:
cr = classification_report(y_test, predictions)

In [54]:
print(cr)

              precision    recall  f1-score   support

           0       0.77      0.78      0.78      6258
           1       0.71      0.75      0.73      6215
           2       0.71      0.67      0.69      6277

    accuracy                           0.73     18750
   macro avg       0.73      0.73      0.73     18750
weighted avg       0.73      0.73      0.73     18750



In [32]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [33]:
speak("modeling complete")

1