## Load Data

In [1]:
# Load libraries
import jsonlines
import pandas as pd

In [2]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 2.6 s


In [3]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [4]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [5]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [6]:
df = cat_comments_df.copy()

In [7]:
df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [8]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [9]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 7.46 s


In [10]:
df.head()

Unnamed: 0,cat,txt
0,sports,"[barely, better, gabbert, significantly, bette..."
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]"
2,sports,"[drafted, wrs, matt, millen, probably]"
3,sports,[done]
4,sports,[noo]


In [11]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 2min 37s


In [12]:
df.head()

Unnamed: 0,cat,txt,txt_stems
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,..."
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]"
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]"
3,sports,[done],[done]
4,sports,[noo],[noo]


## Prepare Text for Model-Building
### **--COMMENT ALL THIS CODE--**

In [36]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Convert to a word-count vector:

In [37]:
count = CountVectorizer()

In [38]:
# text_data, string = [], " "

# for text in df.txt:
#     text_data.append(string.join(text))

In [39]:
text_data, string = [], " "

for text in df.txt_stems:
    text_data.append(string.join(text))

In [40]:
%%time
# Word-count vector as a sparse matrix
sparseWCV = count.fit_transform(text_data)
sparseWCV

Wall time: 8.29 s


<606475x153186 sparse matrix of type '<class 'numpy.int64'>'
	with 7708544 stored elements in Compressed Sparse Row format>

In [41]:
sparseWCV.shape

(606475, 153186)

In [42]:
sparseWCV.size

7708544

### Convert to a part-of-speech tag vector:

In [43]:
nltk.pos_tag(df.txt.iloc[1])[:5]

[('fuck', 'NN'),
 ('ducks', 'NNS'),
 ('angels', 'VBP'),
 ('welcome', 'JJ'),
 ('new', 'JJ')]

In [44]:
testDF = df.copy()

In [45]:
oneHotMulti = MultiLabelBinarizer()

In [46]:
# %%time
# taggedWords = []

# for words in testDF.txt: #[:100]:
#     wordTags = nltk.pos_tag(words)
#     taggedWords.append([tag for word, tag in wordTags])

In [47]:
# posMatrix = oneHotMulti.fit_transform(taggedWords)
# posMatrix

In [48]:
# posMatrix.shape

In [49]:
# len(oneHotMulti.classes_)

### Convert to a tfidf vector:

In [50]:
tfidf = TfidfVectorizer()

In [51]:
%%time
# tfidf vector as a sparse matrix:
sparseTfidf = tfidf.fit_transform(text_data)
sparseTfidf

Wall time: 7.97 s


<606475x153186 sparse matrix of type '<class 'numpy.float64'>'
	with 7708544 stored elements in Compressed Sparse Row format>

In [52]:
len(tfidf.get_feature_names())

153186

In [53]:
sparseTfidf.shape

(606475, 153186)

In [54]:
df.shape

(606475, 3)

In [55]:
df.head()

Unnamed: 0,cat,txt,txt_stems
0,sports,"[barely, better, gabbert, significantly, bette...","[bare, better, gabbert, significantli, better,..."
1,sports,"[fuck, ducks, angels, welcome, new, niners, fans]","[fuck, duck, angel, welcom, new, niner, fan]"
2,sports,"[drafted, wrs, matt, millen, probably]","[draft, wr, matt, millen, probabl]"
3,sports,[done],[done]
4,sports,[noo],[noo]


## Let's try a NN with the sparse word count vector... `sparseWCV`

In [56]:
# Set up data and labels
X = sparseWCV
y = df.cat

### Train Test Split

In [57]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 183 ms


# ^v^v^v^v^v^v^v^v^v^v^v^v
(Let's try it first without scaling)

#### Fit Scaler to training data:

In [9]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# # Fit only to the training data
# scaler.fit(X_train)

StandardScaler()

#### Use Scaler to transform training and test data

In [11]:
# # Now apply the transformations to the data:
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# ^v^v^v^v^v^v^v^v^v^v^v^v

### Train the model

In [60]:
from sklearn.neural_network import MLPClassifier

#### Create an instance of the model:

In [61]:
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

#### Fit the training data to our model

In [62]:
%%time
mlp.fit(X_train,y_train)



Wall time: 12h 52min 35s


MLPClassifier(hidden_layer_sizes=(30, 30, 30))

### Predictions / Evaluation

In [63]:
%%time
# Get predictions
predictions = mlp.predict(X_test)

Wall time: 626 ms


### Evaluate how well our model performed

In [64]:
from sklearn.metrics import classification_report,confusion_matrix

#### Confusion matrix

In [65]:
print(confusion_matrix(y_test,predictions))

[[  3491    229   2550]
 [   186  22975  13323]
 [  1180   6266 101419]]


#### Precision / Recall / F1 / Suport (Classification report)

In [66]:
print(classification_report(y_test,predictions))

                        precision    recall  f1-score   support

science_and_technology       0.72      0.56      0.63      6270
                sports       0.78      0.63      0.70     36484
           video_games       0.86      0.93      0.90    108865

              accuracy                           0.84    151619
             macro avg       0.79      0.71      0.74    151619
          weighted avg       0.84      0.84      0.84    151619



In [67]:
df.cat.value_counts()

video_games               435541
sports                    145823
science_and_technology     25111
Name: cat, dtype: int64

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [78]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [79]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [81]:
balancedDF.cat.value_counts()

science_and_technology    25000
video_games               25000
sports                    25000
Name: cat, dtype: int64

### Convert to a word-count vector:

In [83]:
count = CountVectorizer()

In [84]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [85]:
%%time
# Word-count vector as a sparse matrix
bal_sparseWCV = count.fit_transform(text_data)
bal_sparseWCV

Wall time: 1.24 s


<75000x40978 sparse matrix of type '<class 'numpy.int64'>'
	with 950851 stored elements in Compressed Sparse Row format>

In [86]:
bal_sparseWCV.shape

(75000, 40978)

## NN

In [2]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [87]:
# Set up data and labels
X = bal_sparseWCV
y = balancedDF.cat

### Train Test Split

In [88]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 29.1 ms


### Train the model

In [60]:
from sklearn.neural_network import MLPClassifier

#### Create an instance of the model:

In [89]:
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

#### Fit the training data to our model

In [90]:
%%time
mlp.fit(X_train,y_train)

Wall time: 57min 1s


MLPClassifier(hidden_layer_sizes=(30, 30, 30))

### Predictions / Evaluation

In [91]:
%%time
# Get predictions
predictions = mlp.predict(X_test)

Wall time: 86.5 ms


### Evaluate how well our model performed

In [92]:
from sklearn.metrics import classification_report,confusion_matrix

#### Confusion matrix

In [93]:
print(confusion_matrix(y_test,predictions))

[[4797  547  814]
 [ 651 4441 1201]
 [ 916 1225 4158]]


#### Precision / Recall / F1 / Suport (Classification report)

In [94]:
print(classification_report(y_test,predictions))

                        precision    recall  f1-score   support

science_and_technology       0.75      0.78      0.77      6158
                sports       0.71      0.71      0.71      6293
           video_games       0.67      0.66      0.67      6299

              accuracy                           0.71     18750
             macro avg       0.71      0.71      0.71     18750
          weighted avg       0.71      0.71      0.71     18750



In [95]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [96]:
speak("modeling complete")

1

### Save model?
From [Kurt's post](https://teams.microsoft.com/l/message/19:6c1ce296425d48fb96a31caa1067c9d5@thread.tacv2/1621032970382?tenantId=26feded2-a083-4777-8052-06ad5ef53556&groupId=123ae38b-b423-4fce-94cb-6f6a8f02756f&parentMessageId=1621032970382&teamName=DSC%20550%20Data%20Mining&channelName=DSC%20550%20T301%202215-1%20Spring%202021&createdTime=1621032970382).  

Reference: https://stackabuse.com/scikit-learn-save-and-restore-models/

In [97]:
#//*** Save a model to disk using pickle
import pickle

model = mlp
pkl_filename = 'mlp_bal_spWCV.pkl'

with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [None]:
#//*** Restore the saved model
model = pickle.load(open(pkl_filename, 'rb'))