## Load Data

In [1]:
# Use all processor cores
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load libraries
import jsonlines
import pandas as pd

In [3]:
%%time
# Load JSON data into a list of dictionaries
data = []
with jsonlines.open('categorized-comments.jsonl') as reader:
    for obj in reader.iter(type=dict, skip_invalid=True):
        data.append(obj)

Wall time: 7.18 s


In [4]:
# Convert data to DataFrame
cat_comments_df = pd.DataFrame(data)
cat_comments_df.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [5]:
# Check out the categories
categories = cat_comments_df.cat.unique()
print("The categories are:")
for category in categories:
    print(" -", category)

The categories are:
 - sports
 - science_and_technology
 - video_games


## Preprocess Text

In [6]:
# Load libraries
import re
import sys
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer

# from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = cat_comments_df.copy()

In [8]:
# df.head()

In [9]:
punctuation_dict = dict.fromkeys(i for i in range(sys.maxunicode) 
                            if unicodedata.category(chr(i)).startswith('P'))

stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def cleanText(string):
    '''Processes string and returns cleaned up list of words'''
    
    # Convert to lowercase
    string = string.lower()
    
    # Remove URLs
    string = re.sub(r'http\S+', '', string)
    
    # Remove punctuation
    string = string.translate(punctuation_dict)
    
    # Remove newlines
    string = string.replace("\n", " ")
    
    # Remove stopwords
    string = [word for word in string.split() if word not in stopwords_dict]
    
    return string

In [10]:
%%time
# Clean up the text in the 'txt' column
df.txt = df.txt.apply(lambda string: cleanText(string))

Wall time: 15.2 s


In [11]:
# df.head()

In [12]:
%%time
# Apply PorterStemmer
porter = PorterStemmer()

df['txt_stems'] = df.txt.apply(lambda words: [porter.stem(word) for word in words])

Wall time: 6min 44s


In [13]:
# df.head()

## Prepare Text for Model-Building
### **--COMMENT ALL THIS CODE--**

In [14]:
# Load libraries
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Back up, let's sample to equal sized groups:
https://stackoverflow.com/questions/41345289/getting-a-random-sample-in-python-dataframe-by-category

In [15]:
cat_group = df.groupby('cat', as_index=False, group_keys=False)

In [16]:
balancedDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [17]:
balancedDF.cat.value_counts()

science_and_technology    25000
video_games               25000
sports                    25000
Name: cat, dtype: int64

### Convert to a word-count vector:

In [18]:
count = CountVectorizer()

In [19]:
text_data, string = [], " "

for text in balancedDF.txt_stems:
    text_data.append(string.join(text))

In [20]:
%%time
# Word-count vector as a sparse matrix
bal_sparseWCV = count.fit_transform(text_data)
bal_sparseWCV

Wall time: 2.39 s


<75000x40984 sparse matrix of type '<class 'numpy.int64'>'
	with 958962 stored elements in Compressed Sparse Row format>

In [21]:
bal_sparseWCV.shape

(75000, 40984)

## NN

In [22]:
# Set up data and labels
X = bal_sparseWCV
y = balancedDF.cat

In [39]:
y.shape

(75000,)

In [36]:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('Country', OneHotEncoder(), [0])], remainder='passthrough')

In [38]:
from sklearn.preprocessing import OneHotEncoder
oneHotEnc = OneHotEncoder()

In [42]:
# oneHotEnc.fit_transform(y)

In [43]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb  = MultiLabelBinarizer()

In [63]:
mlb.fit([y])
mlb.classes_

MultiLabelBinarizer()

In [66]:
test_y = mlb.transform([y])
test_y.shape

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(vocabulary=)

In [70]:
test_y = count.fit_transform(y)
test_y.shape

(75000, 3)

In [80]:
test_y

<75000x3 sparse matrix of type '<class 'numpy.int64'>'
	with 75000 stored elements in Compressed Sparse Row format>

In [79]:
count.get_feature_names()

['science_and_technology', 'sports', 'video_games']

##### ~~FINALLY!~~ -- Well dang "wrong order": https://www.tensorflow.org/api_docs/python/tf/sparse/reorder

In [81]:
y_sparse = count.fit_transform(y)

In [272]:
# tf.sparse.reorder(y_sparse)

In [97]:
y_sparse[:, :2].toarray().shape

(75000, 2)

In [109]:
y_sparse[:5, :2].toarray()

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0]], dtype=int64)

In [98]:
y_sparse = y_sparse[:, :2]

In [115]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [117]:
le.fit(y)

LabelEncoder()

In [118]:
y_lEnc = le.transform(y)

In [119]:
y_lEnc.shape

(75000,)

In [120]:
type(y_lEnc)

numpy.ndarray

In [121]:
y_lEnc

array([0, 0, 0, ..., 2, 2, 2])

## try this

In [155]:
testDF = cat_comments_df.copy()

In [156]:
testDF.head()

Unnamed: 0,cat,txt
0,sports,Barely better than Gabbert? He was significant...
1,sports,Fuck the ducks and the Angels! But welcome to ...
2,sports,Should have drafted more WRs.\n\n- Matt Millen...
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,sports,No!! NOO!!!!!


In [157]:
cat_group = testDF.groupby('cat', as_index=False, group_keys=False)

In [158]:
bal_testDF = cat_group.apply(lambda s: s.sample(25000, replace=False))

In [159]:
bal_testDF.cat.value_counts()

science_and_technology    25000
video_games               25000
sports                    25000
Name: cat, dtype: int64

In [164]:
# bal_testDF.shape

(75000, 2)

In [160]:
# Remove URLs
bal_testDF.txt = bal_testDF.txt.apply(lambda text: re.sub(r'http\S+', '', text))

In [161]:
count = CountVectorizer(strip_accents='unicode', stop_words='english')

In [198]:
%%time
test_X = count.fit_transform(bal_testDF.txt)

Wall time: 1.66 s


In [166]:
# test_X.shape

(75000, 44297)

In [169]:
test_y = bal_testDF.cat

In [181]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [182]:
test_y = le.fit_transform(test_y)

In [183]:
test_y.shape

(75000,)

In [186]:
test_y

array([0, 0, 0, ..., 2, 2, 2])

In [250]:
test_X_tf_reordered.shape

TensorShape([75000, 44297])

### Train Test Split

In [214]:
import numpy as np
from scipy.sparse import csr_matrix

In [253]:
%%time
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(test_X, test_y) # Default is 1/4 --> test

Wall time: 27.7 ms


## 2. Define Keras Model

In [100]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

In [101]:
# X.shape

(75000, 40984)

In [102]:
# y_sparse.shape

(75000, 2)

In [119]:
# y_lEnc.shape

(75000,)

In [265]:
X_train.shape

44297

In [267]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu')) # 
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='softmax'))

## 3. Compile Keras Model

In [221]:
from keras.optimizers import RMSprop

In [270]:
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #  optimizer=RMSprop(lr=0.01)

## 4. Fit Keras Model

In [173]:
type(X_train)

scipy.sparse.csr.csr_matrix

In [174]:
type(y_train)

pandas.core.series.Series

In [191]:
X_train.shape

(56250, 44297)

In [190]:
y_train.shape

(56250,)

In [271]:
%%time
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=50, batch_size=10)

InvalidArgumentError: indices[2] = [0,23236] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]

In [231]:
import tensorflow as tf
from tensorflow import sparse
from tensorflow import SparseTensor

In [247]:
# https://stackoverflow.com/questions/40896157/scipy-sparse-csr-matrix-to-tensorflow-sparsetensor-mini-batch-gradient-descent
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [248]:
convert_sparse_matrix_to_sparse_tensor(test_X)

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x21202313640>

In [235]:
test_X_tf = convert_sparse_matrix_to_sparse_tensor(test_X)

In [236]:
test_X_tf

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x21205810e50>

In [237]:
sparse.reorder(test_X_tf)

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x21205810ca0>

In [239]:
test_X_tf_reordered = sparse.reorder(test_X_tf)

In [240]:
test_X_tf_reordered

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x212058101c0>

In [242]:
test_X_tf_reordered.shape

TensorShape([75000, 44297])

In [256]:
test_X_train_tf = convert_sparse_matrix_to_sparse_tensor(X_train)

In [257]:
test_X_train_tf_reordered = sparse.reorder(test_X_train_tf)

In [42]:
from win32com.client import Dispatch
speak = Dispatch("SAPI.SpVoice").Speak

In [73]:
speak("modeling complete")

1