# NLP Toxcicty Machine Learning Model

## Imports

In [147]:
import pandas as pd
import string
import numpy as np

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier

import re

## Data

In [35]:
df_train= pd.read_csv('data/train.csv', index_col = 'id')

In [36]:
df_train.head()

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [37]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159571 entries, 0000997932d777bf to fff46fc426af1f9a
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   159571 non-null  object
 1   toxic          159571 non-null  int64 
 2   severe_toxic   159571 non-null  int64 
 3   obscene        159571 non-null  int64 
 4   threat         159571 non-null  int64 
 5   insult         159571 non-null  int64 
 6   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 9.7+ MB


In [38]:
df_test = pd.read_csv('data/test_labels.csv')

In [39]:
df_test.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [40]:
# Function for improving parts of speech information

### get_wordnet_pos was taken from Lecture 51-nlp_modeling.ipynb 
### link to the lecture: https://github.com/dvdhartsman/NTL-DS-080723/blob/main/4phase/51-nlp_modeling.ipynb


def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [41]:
# Function for handling the transformation of data

### preprocess taken from nlp-sentiment-analysis
### link to the project: https://github.com/dvdhartsman/NLP-Sentiment-Analysis/blob/main/Text_Classification_Final_Notebook.ipynb

def preprocess(comment):
    """
    This is a function that is intended to handle all of the tokenization, lemmatization, and other
    preprocessing for our tweet data. It will make use of objects from other libraries, and will return
    a complete list of tokens that are ready to be vectorized into numerical data.
    """
    
    # Create a list of stopwords to be removed from our tokenized word list
    stops = stopwords.words("english")
    # Add punctuation to the list of stopwords
    stops += string.punctuation
    # Providing a regex pattern for the tokenizer to handle
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # Instantiating a tokenizer
    tokenizer = RegexpTokenizer(pattern)
    # Creating a list of raw tokens
    raw_tokens = tokenizer.tokenize(comment)
    # Using a comprehension to lower case every token
    lower_tokens = [i.lower() for i in raw_tokens]
    # Remove the stopwords from the list of tokens
    stopped_words = [i for i in lower_tokens if i not in stops]
    
    # Adding parts of speech to prepare for Lemmatization
    
    # This is the initial method to get parts of speech
    stopped_words = pos_tag(stopped_words)
    
    # Get_wordnet_pos() is the function to modify the pos definitions/assignments, creates tuples of (<word>, <pos>)
    stopped_words = [(word[0], get_wordnet_pos(word[1])) for word in stopped_words]
    
    lemmatizer = WordNetLemmatizer() 
    
    # This corrects the parts of speech and maximizes the usefulness of the lemmatization!!!!!
    document = [lemmatizer.lemmatize(word[0], word[1]) for word in stopped_words]
    
    # Re-join the list of cleaned tokens
    cleaned_doc = " ".join(document)
    return cleaned_doc

In [42]:
X = df_train.comment_text
y = df_train[['toxic', 'severe_toxic', 'obscene', 'threat','insult','identity_hate']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify=['toxic', 'severe_toxic','identity_hate'])
y_toxic_train = df_train.toxic
y_severe_toxic_train = df_train.severe_toxic
y_obscene_train = df_train.obscene
y_threat_train = df_train.threat
y_insult_train = df_train['identity_hate']

In [43]:
preprocess(X.iloc[1])

"d'aww match background colour i'm seemingly stuck thanks talk january utc"

In [44]:
X_clean = X.apply(preprocess)

In [45]:
count_vec = CountVectorizer(ngram_range=(1, 2), max_features=10000)
tf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

In [46]:
count_vec.fit(X_clean)
tf_vec.fit(X_clean)

In [47]:
X_count = count_vec.transform(X_clean)
X_tfidf = tf_vec.transform(X_clean)

In [48]:
X_count

<159571x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 4444977 stored elements in Compressed Sparse Row format>

In [49]:
X_tfidf

<159571x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 4444977 stored elements in Compressed Sparse Row format>

In [50]:
X_train_cv, X_test_cv, y_train, y_test = train_test_split(X_count,y, random_state = 42)

In [51]:
X_train_tf, X_test_tf, y_train, y_test = train_test_split(X_tfidf,y, random_state = 42)

In [52]:
#X_train.apply(preprocess)

"""id
0000997932d777bf    explanation edits make username hardcore metal...
000103f0d9cfb60f    d'aww match background colour i'm seemingly st...
000113f07ec002fd    hey man i'm really try edit war guy constantly...
0001b41b1c6bb37e    can't make real suggestion improvement wonder ...
0001d958c54c6e35                 sir hero chance remember page that's
                                          ...                        
ffe987279560d7ff    second time ask view completely contradict cov...
ffea4adeee384e90                 ashamed horrible thing put talk page
ffee36eab5c267c9    spitzer umm there actual article prostitution ...
fff125370e4aaaf3    look like actually put speedy first version de...
fff46fc426af1f9a    really think understand come idea bad right aw...
Name: comment_text, Length: 159571, dtype: object"""

"id\n0000997932d777bf    explanation edits make username hardcore metal...\n000103f0d9cfb60f    d'aww match background colour i'm seemingly st...\n000113f07ec002fd    hey man i'm really try edit war guy constantly...\n0001b41b1c6bb37e    can't make real suggestion improvement wonder ...\n0001d958c54c6e35                 sir hero chance remember page that's\n                                          ...                        \nffe987279560d7ff    second time ask view completely contradict cov...\nffea4adeee384e90                 ashamed horrible thing put talk page\nffee36eab5c267c9    spitzer umm there actual article prostitution ...\nfff125370e4aaaf3    look like actually put speedy first version de...\nfff46fc426af1f9a    really think understand come idea bad right aw...\nName: comment_text, Length: 159571, dtype: object"

# Modeling

In [53]:
class MultiOutput():
    # a class to more easily test, interpret and store different classifiers for a multioutput dataset
    model_list = []
    model_df = pd.DataFrame(columns = ['Classifier', 'train_accuracy','train_prec','train_recall','train_f1',
                                      'test_accuracy','test_prec','test_recall','test_f1'])
    
    def __init__(self, name, clf, X_train, X_test, y_train, y_test):
        self.name = name
        self.clf = classifier
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
        # Measuring model
        self.train_results = cross_validate(self.model, self.X_train, self.y_train, scoring=[
            'precision', 'accuracy', 'recall', 'f1', 'neg_log_loss'], n_jobs=4, verbose=1)
        

## Baseline Dummy 

In [54]:
dummy = DummyClassifier(strategy='most_frequent')

In [55]:
dummy_clf = MultiOutputClassifier(dummy).fit(X_train_cv,y_train)

In [56]:
accuracy_score(y_train, dummy_clf.predict(X_train_cv))

0.898343889436655

In [57]:
accuracy_score(y_test, dummy_clf.predict(X_test_cv))

0.8982528263103803

In [58]:
y_train

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c912439967ba8a35,0,0,0,0,0,0
b312f612d3394d5b,0,0,0,0,0,0
813c210bf7f27377,0,0,0,0,0,0
c231999bc75dcd9e,0,0,0,0,0,0
d60a1de68cf593c1,0,0,0,0,0,0
...,...,...,...,...,...,...
811ed72c51830f42,0,0,0,0,0,0
2acc7c7d0386401f,0,0,0,0,0,0
c1f95b89050a9ee4,1,0,0,0,0,0
32e8bdecfe1d66f0,0,0,0,0,0,0


## Logistic Regression

In [59]:
logreg_clf = MultiOutputClassifier(LogisticRegression()).fit(X_train_cv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
accuracy_score(y_train, logreg_clf.predict(X_train_cv))

0.9319590902254382

In [61]:
accuracy_score(y_test, logreg_clf.predict(X_test_cv))

0.9147970821948713

In [62]:
logreg_clf.predict(X_test_cv)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [63]:
logreg_clf_tf = MultiOutputClassifier(LogisticRegression()).fit(X_train_tf, y_train)

In [64]:
accuracy_score(y_train, logreg_clf_tf.predict(X_train_tf))

0.9246227376794398

In [65]:
accuracy_score(y_test, logreg_clf_tf.predict(X_test_tf))

0.9191336826009575

## Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier

rfclf = MultiOutputClassifier(RandomForestClassifier(n_jobs = -1, random_state=42, max_depth=50, verbose = 1)).fit(X_train_cv, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0

In [67]:
accuracy_score(y_test, rfclf.predict(X_test_cv))

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0

0.9049958639360289

In [68]:
rfclf_tf = MultiOutputClassifier(RandomForestClassifier(n_jobs = -1, random_state=42, max_depth=50, verbose = 1)).fit(X_train_tf, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1

In [69]:
accuracy_score(y_test, rfclf_tf.predict(X_test_tf))

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0

0.9052716015341037

## Multinomial Naive Bayes

In [70]:
from sklearn.naive_bayes import MultinomialNB

mnb_cv = MultiOutputClassifier(MultinomialNB()).fit(X_train_cv, y_train)

In [71]:
accuracy_score(y_test, mnb_cv.predict(X_test_cv))

0.9033915724563206

In [72]:
mnb_tf = MultiOutputClassifier(MultinomialNB()).fit(X_train_tf, y_train)

In [73]:
accuracy_score(y_test, mnb_tf.predict(X_test_cv))

0.8979018875491941

In [74]:
mnb_tf.predict(X_test_cv)

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

## Sequential

In [75]:
from keras.models import Sequential
from keras import layers
from keras.layers import Dropout

In [76]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [77]:
def get_model(n_inputs, n_outputs, dropout = None, layer_amnt = 1):    
    model = Sequential()
    if dropout != None:
        model.add(layers.Dropout(0.2, input_shape = (n_inputs,)))
    else:
        model.add(layers.Dense(128, input_dim = n_inputs, activation = 'relu'))
    for i in range(layer_amnt):
        model.add(layers.Dense(128, activation = 'relu'))
    model.add(layers.Dense(n_outputs, activation = 'sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam')
    return model

In [78]:
nn1 = get_model(10000, y_train.shape[1])

In [79]:
X_train_cv_df = pd.DataFrame(X_train_cv.toarray())

In [110]:
X_train_tf_df = pd.DataFrame(X_train_tf.toarray())

In [80]:
X_train_cv_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
nn1.fit(X_train_cv_df, y_train, verbose = 1, epochs = 10, workers = -1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18625e0c9a0>

In [83]:
print(nn1.predict(X_test_cv[1]))

[[4.4640055e-18 4.6865886e-29 2.8922893e-18 8.8298674e-20 7.1970171e-16
  9.4472370e-14]]


In [84]:
yhat_cv = nn1.predict(X_test_cv)

In [85]:
yhat_cv

array([[8.7055992e-07, 5.8218100e-24, 2.1756439e-15, 5.7454475e-14,
        5.7418892e-21, 2.1569868e-09],
       [4.4639885e-18, 4.6865528e-29, 2.8922782e-18, 8.8299010e-20,
        7.1970717e-16, 9.4472363e-14],
       [3.7733099e-07, 8.9163757e-14, 9.0958241e-07, 5.4795515e-11,
        1.1526948e-07, 6.7386784e-11],
       ...,
       [0.0000000e+00, 0.0000000e+00, 2.7088925e-23, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0911852e-16, 0.0000000e+00, 1.6975297e-24, 1.4207748e-18,
        1.4404731e-29, 6.1836338e-13],
       [6.0687304e-09, 4.8048817e-17, 9.7626063e-10, 5.7418605e-12,
        4.0389740e-18, 2.1092930e-12]], dtype=float32)

In [86]:
yhat_cv = yhat_cv.round()

In [87]:
yhat_cv[1]

array([0., 0., 0., 0., 0., 0.], dtype=float32)

In [88]:
accuracy_score(y_test, yhat_cv)

0.901311006943574

In [89]:
yhat_train_cv = nn1.predict(X_train_cv)

In [90]:
yhat_train_cv = yhat_train_cv.round()

In [91]:
accuracy_score(y_train, yhat_train_cv)

0.992471465098013

### Neural Network 300

In [92]:
nn2 = get_model(300, y_train.shape[1])

In [93]:
X_train_cv_300 = X_train_cv_df.iloc[:,0:300]

In [94]:
X_train_cv_300.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
nn2.fit(X_train_cv_300, y_train, verbose = 1, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1861bab83a0>

In [96]:
yhat_train_cv_2 = nn2.predict(X_train_cv_300)

In [97]:
yhat_train_cv_2 = yhat_train_cv_2.round()

In [98]:
accuracy_score(y_train, yhat_train_cv_2)

0.898928792259229

### Neural Network dropout

In [100]:
nndrop = get_model(10000, 6, dropout = True, layer_amnt = 3)

In [101]:
nndrop.fit(X_train_cv_df, y_train, verbose = 1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1860e8d4550>

In [102]:
nndrop_test = nndrop.predict(X_test_cv)

In [103]:
nndrop_test = nndrop_test.round()

In [104]:
accuracy_score(y_test, nndrop_test)

0.9132429248239039

In [105]:
nndrop.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 10000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               1280128   
_________________________________________________________________
dense_7 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_8 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_9 (Dense)              (None, 6)                 774       
Total params: 1,313,926
Trainable params: 1,313,926
Non-trainable params: 0
_________________________________________________________________


### NND 2

In [106]:
nndrop2 = get_model(10000, 6, dropout = True, layer_amnt = 5)

In [107]:
nndrop2.fit(X_train_cv_df, y_train, verbose = 1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1861319b940>

In [108]:
nndrop2_test = nndrop2.predict(X_test_cv)
nndrop2_test = nndrop2_test.round()
accuracy_score(y_test, nndrop2_test)

0.9149725515754644

In [109]:
nndrop2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 10000)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               1280128   
_________________________________________________________________
dense_11 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_12 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_13 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_14 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_15 (Dense)             (None, 6)                

### NN TF-IDF

In [131]:
X_test_tf_df = pd.DataFrame(X_test_tf.toarray())

In [115]:
nntf = get_model(X_train_tf_df.shape[1], y_train.shape[1], layer_amnt=3)

In [126]:
tf.sparse.reorder
nntf.fit(X_train_tf_df, y_train, epochs = 10, verbose=1, shuffle = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x185a00f7850>

In [133]:
nntf_test = nntf.predict(X_test_tf_df)
nntf_test = nntf_test.round()
accuracy_score(y_test, nntf_test)

0.9037425112175068

In [134]:
from sklearn.metrics import hamming_loss
hamming_loss(y_test, nntf_test)

0.02313689118391698

### NNTF Dropout

In [118]:
nntf_drop = get_model(X_train_tf_df.shape[1], y_train.shape[1],dropout = True, layer_amnt=3)

In [125]:
tf.sparse.reorder
nntf_drop.fit(X_train_tf_df, y_train, epochs = 10, verbose = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1859fb39460>

In [135]:
nntf_drop_test = nntf_drop.predict(X_test_tf_df)
nntf_drop_test = nntf_drop_test.round()
accuracy_score(y_test, nntf_drop_test)

0.9100343418644875

In [136]:
hamming_loss(y_test, nntf_drop_test)

0.021804159459888534

### Embedded NN

In [208]:
nntf_drop_em7 = Sequential()

In [209]:
nntf_drop_em7.add(layers.Embedding(10001, 32))
nntf_drop_em7.add(layers.Bidirectional(layers.LSTM(32, activation = 'tanh')))
nntf_drop_em7.add(layers.Dense(128, activation='relu'))
nntf_drop_em7.add(layers.Dense(128, activation='relu'))
nntf_drop_em7.add(layers.Dense(128, activation='relu'))
nntf_drop_em7.add(layers.Dense(6, activation='sigmoid'))
nntf_drop_em7.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [210]:
# Our vectorized labels
#https://stackoverflow.com/questions/48851558/tensorflow-estimator-valueerror-logits-and-labels-must-have-the-same-shape
y_train_rs = np.asarray(y_train).astype('float32').reshape((-1,6))
y_test_rs = np.asarray(y_test).astype('float32').reshape((-1,6))

In [213]:
tf_df = tf.convert_to_tensor(X_train_tf_df)

In [211]:
y_train_rs

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)

In [216]:
nntf_drop_em7.fit(tf_df, y_train, epochs = 10, verbose = True)

Epoch 1/10
   1/3740 [..............................] - ETA: 17:34:26 - loss: 0.0446 - accuracy: 1.0000

KeyboardInterrupt: 

In [189]:
nntf_drop_em4.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          320032    
_________________________________________________________________
dense_4 (Dense)              (None, None, 128)         4224      
_________________________________________________________________
dense_5 (Dense)              (None, None, 128)         16512     
_________________________________________________________________
dense_6 (Dense)              (None, None, 128)         16512     
_________________________________________________________________
dense_7 (Dense)              (None, None, 6)           774       
Total params: 358,054
Trainable params: 358,054
Non-trainable params: 0
_________________________________________________________________
