# Scott Breitbach
## 10-May-2021
## DSC550, Week 9

# 9.3 Exercise: Neural Network Classifiers

## Step 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

## Load Data Set

In [1]:
# Load libraries
import numpy as np
import jsonlines
import pandas as pd

# Set random seed
np.random.seed(42)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

In [3]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


## Preprocess Text

In [4]:
# Load libraries
import sys
import unicodedata
import re

from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

In [5]:
# Create a copy of the data set to manipulate
df = cat_comments_df.copy()

In [6]:
# Create a dictionary of punctuation
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))
# Create a dictionary of stopwords
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [7]:
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

In [8]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()
df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 4min 33s


In [9]:
%%time
# Join tokenized stem words into a string
df['txt_str'] = df.txt_stems.apply(lambda s: ' '.join(map(str, s)))

Wall time: 3.03 s


In [10]:
# Take a look at data set
df.head()

Unnamed: 0,cat,txt,txt_stems,txt_str
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,...",bare better gabbert significantli better year ...
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]",fuck duck angel welcom new niner fan
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]",draft wr matt millen probabl
3,sports,[done],[done],done
4,sports,[noo],[noo],noo


## Sample Data Set Into Equal-Sized Groups

In [11]:
# Group data by category
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [12]:
# Sample 25000 rows from each category
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [13]:
# Verify counts of categories
balancedDF.cat.value_counts()

video_games               25000
sports                    25000
science_and_technology    25000
Name: cat, dtype: int64

# Prepare Text for Model-Building

In [14]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Convert Feature Data to a Word-Count Vector

In [15]:
# Combine tokenized lists of words into a list word strings
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [16]:
# Word-count vector as a sparse matrix
count = CountVectorizer(max_features=5000)
bal_sparseWCV = count.fit_transform(text_data)
bal_sparseWCV

<75000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 874543 stored elements in Compressed Sparse Row format>

### Split Training and Testing Data

In [17]:
# Set up data and labels
X = bal_sparseWCV
y = balancedDF.cat

In [18]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Modeling

In [19]:
# Load libraries
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Tune the model

In [22]:
from scipy.stats import uniform

After too many tries with `GridSearchCV` taking several hours and/or freezing up overnight I tried `RandomizedSearchPV` and had much better luck.

In [23]:
%%time
mlp_gs = MLPClassifier(max_iter=100, verbose=True)#, early_stopping=True) max_iter=200, 300
parameter_space = {
    'hidden_layer_sizes': [(30,), (100,)], #[(10, 10, 10), (20,20, 20), (40,), (50, 30), (500, 150)],
    'activation': ['relu', 'tanh'], 
    'solver': ['adam', 'sgd'], 
    'alpha': uniform(loc=0, scale=0.05),
    'learning_rate': ['constant', 'adaptive'], # only used when solver is 'sgd'
    'early_stopping': [True],
}
rand = RandomizedSearchCV(mlp_gs, parameter_space, random_state=42, n_iter=10, verbose=2, n_jobs=-1, cv=5)
rand_result = rand.fit(X_train, y_train) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Iteration 1, loss = 0.73768681
Validation score: 0.740622
Iteration 2, loss = 0.54187629
Validation score: 0.741867
Iteration 3, loss = 0.49838411
Validation score: 0.744889
Iteration 4, loss = 0.47766933
Validation score: 0.746667
Iteration 5, loss = 0.46601224
Validation score: 0.745600
Iteration 6, loss = 0.45828540
Validation score: 0.739378
Iteration 7, loss = 0.45208243
Validation score: 0.740622
Iteration 8, loss = 0.44857375
Validation score: 0.739378
Iteration 9, loss = 0.44477580
Validation score: 0.738133
Iteration 10, loss = 0.44193440
Validation score: 0.738133
Iteration 11, loss = 0.43963454
Validation score: 0.739911
Iteration 12, loss = 0.43758229
Validation score: 0.735822
Iteration 13, loss = 0.43580811
Validation score: 0.736000
Iteration 14, loss = 0.43470887
Validation score: 0.738844
Iteration 15, loss = 0.43317936
Validation score: 0.736000
Validation score did not improve more than tol=0.000100 for 10 

In [24]:
print('Best parameters found:\n', rand_result.best_params_)

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.0011531212520707879, 'early_stopping': True, 'hidden_layer_sizes': (30,), 'learning_rate': 'constant', 'solver': 'adam'}


#### Evaluation

In [25]:
y_true, y_pred = y_test , rand_result.predict(X_test)
from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
                        precision    recall  f1-score   support

science_and_technology       0.81      0.78      0.80      6332
                sports       0.72      0.76      0.74      6236
           video_games       0.70      0.69      0.70      6182

              accuracy                           0.74     18750
             macro avg       0.74      0.74      0.74     18750
          weighted avg       0.75      0.74      0.74     18750



## Train the Model

In [26]:
# Create an instance of the model
bal_mlp = MLPClassifier(max_iter=500, hidden_layer_sizes=(30,), activation='tanh', 
                        solver='adam', alpha=0.001, learning_rate='constant', early_stopping=True)

In [27]:
%%time
# Fit the training data to the model
bal_mlp.fit(X_train, y_train)

Wall time: 27 s


MLPClassifier(activation='tanh', alpha=0.001, early_stopping=True,
              hidden_layer_sizes=(30,), max_iter=500)

## Evaluate the Model

In [28]:
# Load libraries
from sklearn.metrics import classification_report, confusion_matrix
predictions = bal_mlp.predict(X_test)

### Confusion matrix

In [30]:
# Print confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[5008  543  781]
 [ 478 4687 1071]
 [ 670 1167 4345]]


### Precision / Recall / F1 / Support

In [31]:
# Print precision, recall, f1-score, and accuracy
cr = classification_report(y_test, predictions)
print(cr)

                        precision    recall  f1-score   support

science_and_technology       0.81      0.79      0.80      6332
                sports       0.73      0.75      0.74      6236
           video_games       0.70      0.70      0.70      6182

              accuracy                           0.75     18750
             macro avg       0.75      0.75      0.75     18750
          weighted avg       0.75      0.75      0.75     18750

