## GRU Model with Fully Trained Embedding

In [18]:

#Run this notebook with a Python 2.7 kernel.
#The kernel indicator in the top right corner of the Jupyter notebook should read "Python 2".

#This is the best model to run and test. It's been cleaned up and tested with cross validation

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dropout, Dense, Input, LSTM, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model,  model_from_json
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import train_test_split

import sys, os, re, csv, codecs
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


import gensim.models.keyedvectors as word2vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import  h5py
from sklearn.model_selection import KFold


###Make sure to change the path to where you place the csv file####

PATH = "/home/ubuntu/"

# The raw CSV file has already been run through the preprocessing steps, so the
# submitted CSV is ready to be fed to the model as-is.
# os.path.join keeps the file path valid whether or not PATH ends with a separator.
train = pd.read_csv(os.path.join(PATH, "Toxic_PreProc.csv"))


# Splitting the labels from the comments
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Any comment that pandas loaded as NaN (e.g. an empty field) is replaced by an
# empty string first; otherwise astype(str) would turn it into the literal word
# "nan", which the tokenizer would then treat as real text.
sentence_train = train["comment_text"].fillna("").astype(str)


# We are keeping 20000 unique tokens
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(sentence_train)

# Changing the tokens to sequences of integers
list_sent_tok = tokenizer.texts_to_sequences(sentence_train)

# Every sequence is padded or truncated to max_words (100) tokens
max_words = 100
Pad_Train = pad_sequences(list_sent_tok, maxlen=max_words)


def _build_gru_model(seq_len, vocab_size, n_labels):
    """Build and compile a fresh bidirectional-GRU multi-label classifier.

    A new model is created on every call so that each cross-validation fold
    is trained from newly initialised weights.

    Args:
        seq_len: length of the (padded) integer token sequences fed to the model.
        vocab_size: number of rows in the trainable embedding matrix.
        n_labels: number of independent binary labels to predict.

    Returns:
        A compiled Keras ``Model``.
    """
    inp = Input(shape=(seq_len,))

    # Embedding layer trained from scratch together with the rest of the network
    x = Embedding(vocab_size, 128)(inp)

    # Bidirectional GRU that returns the full sequence so it can be max-pooled
    x = Bidirectional(GRU(128, return_sequences=True))(x)

    # Max-pool over the time dimension
    x = GlobalMaxPool1D()(x)

    # Dropout layers to help with overfitting, around a nonlinear (relu) layer
    x = Dropout(0.2)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.2)(x)

    # Output layer. This is a multi-label problem (labels are not mutually
    # exclusive), so sigmoid is used; softmax won't work.
    x = Dense(n_labels, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(optimizer='adam', metrics=['accuracy'], loss='binary_crossentropy')
    return model


# Setting up the 5 fold cross validation on our model
kfold = KFold(n_splits=5, shuffle=True, random_state=5)
cvscores = []

# Training hyper-parameters (identical for every fold)
batch_size = 32
epochs = 3

# One report line per label column, in the same order as the columns of y
confusion_report_templates = [
    "Toxic: {}",
    "\nSevere Toxic: \n{}",
    "\nObscene: \n{}",
    "\nThreat: \n{}",
    "\nInsult: \n{}",
    "\nIdentity Hate: \n{}",
]

# Running the model with cross validation.
# The index arrays get their own names so they do not overwrite the `train`
# DataFrame loaded earlier.
for train_idx, test_idx in kfold.split(Pad_Train, y):

    model = _build_gru_model(max_words, max_features, y.shape[1])
    log = model.fit(Pad_Train[train_idx], y[train_idx], batch_size=batch_size,
                    validation_split=0.1, epochs=epochs)

    # Run the held-out fold and get the predicted probabilities
    y_prob = model.predict(Pad_Train[test_idx], verbose=1)

    # Turn probabilities into binary labels. A vectorised threshold at 0.5
    # gives the same result under Python 2 and Python 3, whereas the builtin
    # round() treats exactly 0.5 differently between the two.
    y_pred = (y_prob >= 0.5).astype(int)
    y_test = y[test_idx]

    # A prediction only counts as correct when all labels of the sample match
    count = int(np.all(y_test == y_pred, axis=1).sum())
    score = float(count) / len(y_pred)

    # .format is used so that a single string is printed; print("...", value)
    # would print a tuple under Python 2.
    print("The number of correct predictions: {}".format(count))
    print("The number of attempted predictions: {}".format(len(y_pred)))
    print("Accuracy: {}".format(score))

    cvscores.append(score * 100)

    # labels=[0, 1] keeps every matrix 2x2 even if a class is absent in a fold
    for col, template in enumerate(confusion_report_templates):
        print(template.format(
            confusion_matrix(y_test[:, col], y_pred[:, col], labels=[0, 1])))

# Get the mean (and spread) of the cross validation scores
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    

Train on 114890 samples, validate on 12766 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
('The number of correct predictions: ', 29380)
('The number of attempted predictions: ', 31915)
('Accuracy: ', 0.9205702647657841)
Toxic: [[28605   308]
 [  921  2081]]

Severe Toxic: 
[[31495   111]
 [  181   128]]

Obscene: 
[[30071   169]
 [  421  1254]]

Threat: 
[[31821     0]
 [   94     0]]

Insult: 
[[30042   334]
 [  568   971]]

Identity Hate: 
[[31600    13]
 [  295     7]]
Train on 114891 samples, validate on 12766 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
('The number of correct predictions: ', 29222)
('The number of attempted predictions: ', 31914)
('Accuracy: ', 0.9156483048192016)
Toxic: [[28394   347]
 [  960  2213]]

Severe Toxic: 
[[31554    16]
 [  309    35]]

Obscene: 
[[29907   249]
 [  385  1373]]

Threat: 
[[31817     0]
 [   97     0]]

Insult: 
[[29984   302]
 [  633   995]]

Identity Hate: 
[[31599    28]
 [  244    43]]
Train on 114891 samples, validate on 12766 samples
Epoch 1/3
E