## Load Data

In [1]:
# pip install scikit-learn-intelex
# conda install scikit-learn-intelex -c intel

Intel(R) Extension for Scikit-learn:  
https://intel.github.io/scikit-learn-intelex/get_started.html  
https://pypi.org/project/scikit-learn-intelex/

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 3.05 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 7.8 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 56s


In [78]:
df.head()

Unnamed: 0,cat,txt,txt_stems
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,..."
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]"
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]"
3,sports,[done],[done]
4,sports,[noo],[noo]


## Prepare Text for Model-Building
### **--COMMENT ALL THIS CODE--**

In [14]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [15]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [16]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [17]:
balancedDF.cat.value_counts()

sports                    25000
video_games               25000
science_and_technology    25000
Name: cat, dtype: int64

### Convert to a word-count vector:

In [18]:
count = CountVectorizer()

In [19]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [20]:
%%time
# Word-count vector as a sparse matrix
bal_sparseWCV = count.fit_transform(text_data)
bal_sparseWCV

Wall time: 1.12 s


<75000x41427 sparse matrix of type '<class 'numpy.int64'>'
	with 961832 stored elements in Compressed Sparse Row format>

In [21]:
bal_sparseWCV.shape

(75000, 41427)

## NN

In [23]:
# Set up data and labels
X = bal_sparseWCV
y = balancedDF.cat

### Train Test Split

In [24]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 18.1 ms


### Train the model

In [25]:
from sklearn.neural_network import MLPClassifier

## First let's optimize some hyperparameters

In [68]:
%%time
mlp_gs = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes': [(30,), (50, 30), (100,)], #[(20,20, 20),(30,),(40,)],
    'activation': ['relu'], #['tanh', 'relu'], 
    'solver': ['adam'],#, 'sgd'], 
    'alpha': [0.0001],#, 0.05],
    'learning_rate': ['constant'],#,'adaptive'], # only used when solver is 'sgd'
}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(X, y) # X is train samples and y is the corresponding labels

Wall time: 11h 28min 5s




GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=100), n_jobs=-1,
             param_grid={'activation': ['relu'], 'alpha': [0.0001],
                         'hidden_layer_sizes': [(30,), (50, 30), (100,)],
                         'learning_rate': ['constant'], 'solver': ['adam']})

In [69]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}


In [70]:
y_true, y_pred = y_test , clf.predict(X_test)
from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
                        precision    recall  f1-score   support

science_and_technology       0.93      0.96      0.95      6248
                sports       0.92      0.96      0.94      6255
           video_games       0.99      0.92      0.95      6247

              accuracy                           0.95     18750
             macro avg       0.95      0.95      0.95     18750
          weighted avg       0.95      0.95      0.95     18750



In [53]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (30,), 'learning_rate': 'constant', 'solver': 'adam'}


In [54]:
y_true, y_pred = y_test , clf.predict(X_test)
from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
                        precision    recall  f1-score   support

science_and_technology       0.89      0.90      0.90      6248
                sports       0.89      0.87      0.88      6255
           video_games       0.86      0.87      0.87      6247

              accuracy                           0.88     18750
             macro avg       0.88      0.88      0.88     18750
          weighted avg       0.88      0.88      0.88     18750



#### Create an instance of the model:

In [56]:
bal_mlp = MLPClassifier(max_iter=100, hidden_layer_sizes=(30,), activation='relu', 
                        solver='adam', alpha=0.0001, learning_rate='constant',)

In [69]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}


In [71]:
bal_mlp = MLPClassifier(max_iter=100, hidden_layer_sizes=(100,), activation='relu', 
                        solver='adam', alpha=0.0001, learning_rate='constant',)

#### Fit the training data to our model

In [72]:
%%time
bal_mlp.fit(X_train,y_train)

Wall time: 2h 26min 40s




MLPClassifier(max_iter=100)

In [42]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [73]:
speak("modeling complete")

1

### Predictions / Evaluation

In [74]:
%%time
# Get predictions
predictions = bal_mlp.predict(X_test)

Wall time: 82.9 ms


### Evaluate how well our model performed

In [60]:
from sklearn.metrics import classification_report,confusion_matrix

#### Confusion matrix

In [61]:
print(confusion_matrix(y_test,predictions))

[[4978  560  710]
 [ 767 4402 1086]
 [1112 1179 3956]]


In [75]:
print(confusion_matrix(y_test,predictions))

[[4592  883  773]
 [ 439 4727 1089]
 [ 673 1478 4096]]


#### Precision / Recall / F1 / Suport (Classification report)

In [62]:
print(classification_report(y_test,predictions))

                        precision    recall  f1-score   support

science_and_technology       0.73      0.80      0.76      6248
                sports       0.72      0.70      0.71      6255
           video_games       0.69      0.63      0.66      6247

              accuracy                           0.71     18750
             macro avg       0.71      0.71      0.71     18750
          weighted avg       0.71      0.71      0.71     18750



In [76]:
print(classification_report(y_test,predictions))

                        precision    recall  f1-score   support

science_and_technology       0.81      0.73      0.77      6248
                sports       0.67      0.76      0.71      6255
           video_games       0.69      0.66      0.67      6247

              accuracy                           0.72     18750
             macro avg       0.72      0.72      0.72     18750
          weighted avg       0.72      0.72      0.72     18750



### Save the model

In [77]:
#//*** Save a model to disk using pickle
import pickle

model = bal_mlp
pkl_filename = 'mlp_bal_spWCV_optimized.pkl'

with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)