## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 2.96 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 7.3 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()
df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 44s


In [13]:
%%time
df['txt_str'] = df.txt_stems.apply(lambda s: ' '.join(map(str, s)))

Wall time: 1.23 s


In [14]:
df.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]",draft wr matt millen probabl
3,sports,[done],[done],done
4,sports,[noo],[noo],noo


## Prepare Text for Model-Building

In [15]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [16]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [17]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [18]:
balancedDF.cat.value_counts()

science_and_technology    25000
sports                    25000
video_games               25000
Name: cat, dtype: int64

In [19]:
balancedDF.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
9879,science_and_technology,"[thats, exactly, s6, s7, cases, holes, back]","[that, exactli, s6, s7, case, hole, back]",that exactli s6 s7 case hole back
19930,science_and_technology,"[dont, refers, long, press, nonsense]","[dont, refer, long, press, nonsens]",dont refer long press nonsens
24324,science_and_technology,"[atampt, rollout, also, happened, fyi, manuall...","[atampt, rollout, also, happen, fyi, manual, u...",atampt rollout also happen fyi manual updat
11546,science_and_technology,"[hangouts, awful, first, couple, years]","[hangout, aw, first, coupl, year]",hangout aw first coupl year
15622,science_and_technology,"[need, internet, facebook]","[need, internet, facebook]",need internet facebook


## 4. Fit Keras Model

## NN

In [20]:
# Set up data and labels
X = balancedDF.txt_str
y = balancedDF.cat

In [21]:
%%time
# Train Test Split
from sklearn.model_selection import train_test_split
data_train, data_test, target_vector_train, target_vector_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 20.6 ms


In [22]:
number_of_features = 5000

In [23]:
from keras.preprocessing.text import Tokenizer
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features, split=',', char_level=False)

In [24]:
tokenizer.fit_on_texts(X)

In [25]:
features_train = tokenizer.texts_to_matrix(data_train, mode='binary')

In [26]:
features_test = tokenizer.texts_to_matrix(data_test, mode='binary')

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)
target_train_le = le.transform(target_vector_train)
target_test_le = le.transform(target_vector_test)

In [28]:
from keras.utils.np_utils import to_categorical
# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_train_le, num_classes=3)
target_test = to_categorical(target_test_le, num_classes=3)

In [33]:
from keras.models import Sequential
from keras import layers

# Create function returning a compiled network
def create_network(optimizer='rmsprop'):
    # Start neural network
    network = Sequential()
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16,  # 500
                             activation="relu",
                             input_shape=(number_of_features,)))

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16, activation="relu")) # 150

    # Add fully connected layer with a softmax activation function
    network.add(layers.Dense(units=3, activation="softmax"))
    
    # Compile neural network
    network.compile(loss="categorical_crossentropy", # Cross-entropy
                    optimizer=optimizer, # Root Mean Square Propagation
                    metrics=["accuracy"]) # Accuracy performance metric
    
    # Return compiled network
    return network

In [34]:
import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [36]:
# View hyperparameters of best neural network
grid_result.best_params_

{'batch_size': 75, 'epochs': 5, 'optimizer': 'rmsprop'}

In [45]:
%%time 
# Set random seed
np.random.seed(42)

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=1)

# Create hyperparameter space 
## Round 1: 3, 50, 'adam' were best
## Round 2: 5, 75, 'rmsprop' were best
epochs = [3, 5, 7]      #1[1,  3, 10] #2[ 1,  3,  5]
batches = [50, 75, 100] #1[5, 15, 50] #2[30, 50, 75]
optimizers = ["rmsprop", "adam"]

# Create hyperparameter options
hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches)

# Create grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)

# Fit grid search
# grid_result = grid.fit(features, target)
grid_result = grid.fit(features_train, # Features
                      target_train, # Target
#                       epochs=150, # Three epochs
                      verbose=1, # Some output
#                       batch_size=100, # Number of observations per batch
                      validation_data=(features_test, target_test)) # Test data


In [36]:
# View hyperparameters of best neural network
grid_result.best_params_

{'batch_size': 75, 'epochs': 5, 'optimizer': 'rmsprop'}

In [39]:
grid_result.best_score_

0.3879644453525543

### Predictions / Evaluation

In [50]:
%%time
# Get predictions
predictions = grid_result.predict(features_test)

Wall time: 655 ms


In [47]:
%%time
# Get predictions
predictions = np.argmax(grid_result.predict(features_test), axis=-1)

Wall time: 777 ms


### Evaluate how well our model performed

In [48]:
from sklearn.metrics import classification_report,confusion_matrix

#### Confusion matrix

In [51]:
print(confusion_matrix(target_test,predictions))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets

#### Precision / Recall / F1 / Suport (Classification report)

In [52]:
print(classification_report(target_test,predictions))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets