## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 3.26 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 15 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 3min 11s


In [13]:
# df.head()

## Prepare Text for Model-Building

In [14]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [15]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [16]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [17]:
balancedDF.cat.value_counts()

science_and_technology    25000
video_games               25000
sports                    25000
Name: cat, dtype: int64

### Convert to a word-count vector:

In [18]:
count = CountVectorizer()

In [19]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [20]:
%%time
# Word-count vector as a sparse matrix
bal_sparseWCV = count.fit_transform(text_data)
bal_sparseWCV

Wall time: 1.2 s


<75000x41441 sparse matrix of type '<class 'numpy.int64'>'
	with 955491 stored elements in Compressed Sparse Row format>

In [21]:
bal_sparseWCV.shape

(75000, 41441)

## NN

In [22]:
# Set up data and labels
X = bal_sparseWCV
y = balancedDF.cat

### Train Test Split

In [23]:
import numpy as np
from scipy.sparse import csr_matrix

In [28]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y) # Default is 1/4 --> test

Wall time: 27.3 ms


## 2. Define Keras Model

In [72]:
# Importing the Keras libraries and packages
import keras

### Let's try this bit from ATAwP, Ch 12:

In [74]:
from keras.models import Sequential
from keras.layers import Dense

N_FEATURES = X.shape[1] # 5000
N_CLASSES = 1

def build_network():
    '''returns a compiled neural network'''
    # Build network
    nn = Sequential()
    nn.add(Dense(500, activation='relu', input_shape=(N_FEATURES,)))
    nn.add(Dense(150, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    
    # Compile the keras model
    nn.compile(
        loss='categorical_crossentropy', 
        optimizer='adam', 
        metrics=['accuracy']
    )
    
    return nn

In [75]:
if __name__ == '__main__':
    from sklearn.pipeline import Pipeline
#     from transformer import TextNormalizer
    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    pipeline = Pipeline([
#         ('norm', TextNormalizer()), 
        ('vect', TfidfVectorizer(max_features=N_FEATURES)), 
        ('nn', KerasClassifier(build_fn=build_network, 
                               epochs=200, 
                               batch_size=128))
    ])

In [76]:
cat_group = cat_comments_df.groupby('cat', as_index=False, group_keys=False)
testDF = cat_group.apply(lambda s: s.sample(25000, replace=False))
testDF.cat.value_counts()

science_and_technology    25000
video_games               25000
sports                    25000
Name: cat, dtype: int64

In [80]:
from sklearn.model_selection import cross_val_score
import joblib

def train_model(path, model, saveto=None, cv=12):
    '''Trains model from corpus and fits on data. Returns scores.'''
    X = text_data #testDF.txt
    y = testDF.cat
    
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')#, n_jobs=-1)
    model.fit(X, y)
    
    if saveto:
        model.steps[-1][1].model.save(saveto['keras_model'])
        model.steps.pop(-1)
        joblib.dump(model, saveto['sklearn_pipe'])
        
    return scores

In [78]:
# Convert all the text data into a list of strings, 
# with each tweet as one string in the list

text_data, string = [], " "

for text in testDF.txt:
    text_data.append(string.join(text))

In [81]:
cpath = None
mpath = {
    'keras_model'  : 'keras_nn.h5', 
    'sklearn_pipe' : 'pipeline.pkl'
}
scores = train_model(cpath, pipeline, saveto=mpath, cv=12)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1850, in fit_transform
    X = super().fit_transform(raw_documents)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\feat

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [170]:
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=5000, split=',', char_level=False)

In [174]:
%%time
tokenizer.fit_on_texts(testDF.txt_str)

In [190]:
%%time
features = tokenizer.texts_to_matrix(balancedDF.txt_str, mode='binary')

Wall time: 647 ms


In [155]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [156]:
target_le = le.fit_transform(y)

In [188]:
# One-hot encode target vector to create a target matrix
targets = to_categorical(target_le, num_classes=3)