# Importing Required Libraries

In [1]:
#Keras with Tensorflow backend
import sys, os, re, csv, codecs, numpy as np, pandas
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score



Using TensorFlow backend.


# Setup and File Reading

In [3]:
train = pandas.read_csv('dataset/train.csv')

toxic_classifications = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[toxic_classifications].values
train_sentences_strings = train["comment_text"]
with open("generated_phrases.txt") as f:
    lines = f.readlines()

test_sentences_strings = [x.strip() for x in lines]

print(train_sentences_strings[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


# Tokenization Process

In [4]:
maxfeatures = 30000
tokenizer = Tokenizer(num_words = maxfeatures)
tokenizer.fit_on_texts(list(train_sentences_strings))
tokenized_training = tokenizer.texts_to_sequences(train_sentences_strings)
tokenized_testing = tokenizer.texts_to_sequences(test_sentences_strings)

print(tokenized_training[0])

[27904, 75, 1, 128, 130, 177, 28, 670, 4510, 12133, 1115, 87, 331, 52, 2269, 11510, 51, 6926, 16, 61, 2747, 149, 8, 2933, 35, 116, 1235, 15747, 2859, 5, 46, 60, 243, 1, 370, 32, 1, 39, 29, 144, 74, 3474, 90, 3075, 4630, 2284, 985]


# Padding for Apropriate Dimensions

In [5]:
paddingSize = 300

padded_train = pad_sequences(tokenized_training, maxlen=paddingSize)
padded_test = pad_sequences(tokenized_testing, maxlen=paddingSize)


# Assembling the Model

In [6]:
length = 300

inputLayer = Input(shape = (length, ))

embedding_size = 100

x = Embedding(maxfeatures, embedding_size) (inputLayer)

#begin LSTM layer
lstm_output_size = 100
x = LSTM(lstm_output_size, return_sequences = True, name = 'lstm')(x)

#lstm reshaping with GLOBAl pooling
x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool]) 

#dropout layer
x = Dropout(0.1)(x)

#dense
x = Dense(6,activation = "sigmoid")(x)

lstmModel = Model(inputs = inputLayer, outputs = x)

lstmModel.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

print(lstmModel.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 300)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 300, 100)      3000000     input_1[0][0]                    
____________________________________________________________________________________________________
lstm (LSTM)                      (None, 300, 100)      80400       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 298, 64)       19264       lstm[0][0]                       
___________________________________________________________________________________________

# Testing and Training

Epochs = 3
Batch = 32

Lucas : 1 dense layer, 2 dense layers, sigmoid vs relu
Robert : Bidirectional stuff
Nick : layer dimensions / output size

In [6]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(padded_train, y, train_size=0.9)

filepath="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
roc_auc = RocAucEvaluation(validation_data=(X_test, y_test), interval = 1)
callbacks_list = [roc_auc,checkpoint, early]



In [9]:
batch_size = 32
epochs = 2
lstmModel.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), callbacks=callbacks_list)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.977637

Epoch 00001: val_acc improved from -inf to 0.98263, saving model to weights_base.best.hdf5
Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.983753

Epoch 00002: val_acc improved from 0.98263 to 0.98309, saving model to weights_base.best.hdf5


<keras.callbacks.History at 0x7fa5fb493208>

In [10]:
lstmModel.save("Bidirectional_LSTM_Convolution.hdf5")

In [8]:
# load the network weights
filename = "Bidirectional_LSTM_Convolution.hdf5"
lstmModel.load_weights(filename)
lstmModel.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [23]:
y_pred = lstmModel.predict(padded_test)
generated_text_df = pandas.DataFrame()
generated_text_df["text"] = test_sentences_strings
generated_text_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

KeyError: "['toxic' 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate'] not in index"

Bidirectional LSTM with 2 dense/dropout layers: loss: 0.0444 - acc: 0.9833 - val_loss: 0.0475 - val_acc: 0.9822

Bidirectional LSTM with 1 dense/dropout layer: loss: 0.0535 - acc: 0.9817 - val_loss: 0.0486 - val_acc: 0.9820

Bidirectional LSTM-Convolution: loss: 0.0431 - acc: 0.9837 - val_loss: 0.0451 - val_acc: 0.9831 => ROC-AUC - epoch: 2 - score: 0.983753