# Importing Required Libraries

In [1]:
#Keras with Tensorflow backend
import sys, os, re, csv, codecs, numpy as np, pandas
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score



Using TensorFlow backend.


# Setup and File Reading

In [2]:
# Label columns predicted by the model; their order fixes the column order of `y`.
toxic_classifications = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Training set: raw comment text plus one 0/1 column per label.
train = pandas.read_csv('../dataset/train.csv')
train_sentences_strings = train["comment_text"]
y = train[toxic_classifications].values

# Phrases to classify; this file is read as latin-1 rather than the default encoding.
test = pandas.read_csv('generated_phrases.csv', encoding="latin-1")
test_sentences_strings = test["text"]

# Quick sanity check: show the phrase stored under index label 1.
print(test_sentences_strings.loc[1])

property and everything I have! I wish they were strict facts do you know what stupid Canadians do? Nothing. They speak French, and play in traffic. Are you gay son of a philosophical paradigm which is not perminently banned!


# Tokenization Process

In [3]:
# Vocabulary cap handed to the tokenizer (and reused as the embedding input size).
maxfeatures = 30000

# The word index is learned from the training comments only; the test phrases
# are encoded with that same index.
tokenizer = Tokenizer(num_words=maxfeatures)
tokenizer.fit_on_texts(train_sentences_strings.tolist())

tokenized_training, tokenized_testing = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_sentences_strings, test_sentences_strings)
)

# Show the integer encoding of the first training comment.
print(tokenized_training[0])

[688, 75, 1, 126, 130, 177, 29, 672, 4511, 12052, 1116, 86, 331, 51, 2278, 11448, 50, 6864, 15, 60, 2756, 148, 7, 2937, 34, 117, 1221, 15190, 2825, 4, 45, 59, 244, 1, 365, 31, 1, 38, 27, 143, 73, 3462, 89, 3085, 4583, 2273, 985]


# Padding for Appropriate Dimensions

In [4]:
# Every sequence is padded or truncated to this many tokens so that the
# network receives fixed-width input.
paddingSize = 300

padded_train = pad_sequences(sequences=tokenized_training, maxlen=paddingSize)
padded_test = pad_sequences(sequences=tokenized_testing, maxlen=paddingSize)


# Assembling the Model

In [5]:
# Input width must equal the padded sequence length, so derive it from
# paddingSize instead of repeating the literal 300.
length = paddingSize

inputLayer = Input(shape=(length,))

# Token ids -> dense vectors; the vocabulary size matches the tokenizer cap.
embedding_size = 100
x = Embedding(maxfeatures, embedding_size)(inputLayer)

# LSTM layer; return_sequences=True keeps one output per timestep so the
# convolution below has a sequence to slide over.
lstm_output_size = 100
x = LSTM(lstm_output_size, return_sequences=True, name='lstm')(x)

# 1D convolution over the LSTM outputs.
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

# Global average and max pooling collapse the time axis; the two pooled
# vectors are concatenated into a single feature vector.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Dropout layer
x = Dropout(0.1)(x)

# Output layer: one independent sigmoid unit per label column.
x = Dense(len(toxic_classifications), activation="sigmoid")(x)

lstmModel = Model(inputs=inputLayer, outputs=x)

# Multi-label problem: binary cross-entropy is applied to each sigmoid output.
lstmModel.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
lstmModel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 100)     3000000     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 300, 100)     80400       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 298, 64)      19264       lstm[0][0]                       
__________________________________________________________________________________________________
global_ave

# Testing and Training

Epochs = 3
Batch = 32

Lucas : 1 dense layer, 2 dense layers, sigmoid vs relu
Robert : Bidirectional stuff
Nick : layer dimensions / output size

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        An ``(X_val, y_val)`` pair. ``X_val`` is passed to ``model.predict`` and
        ``y_val`` is compared with the predictions via ``roc_auc_score``.
    interval : int
        The score is computed on every epoch whose zero-based index is a
        multiple of ``interval``.

    Raises
    ------
    ValueError
        If ``validation_data`` cannot be unpacked into exactly two items.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super() so that Callback.__init__
        # actually runs; super(Callback, self) would skip it.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        try:
            self.X_val, self.y_val = validation_data
        except ValueError:
            raise ValueError(
                "validation_data must be an (X_val, y_val) pair"
            )

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the ROC-AUC at the end of a qualifying epoch.

        ``logs`` is accepted for compatibility with the callback signature
        and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; report it one-based.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
# Hold out 10% of the labelled data for validation. The seed is fixed so the
# split, and therefore the validation metrics and the checkpoint that gets
# kept, is the same on every Restart & Run All.
# Note: X_test / y_test are this validation split, not the generated phrases
# loaded into `test`.
X_train, X_test, y_train, y_test = train_test_split(
    padded_train, y, train_size=0.9, random_state=42
)

# Keep only the weights of the epoch with the best validation accuracy.
# NOTE(review): 'val_acc' is the metric key in the Keras version this notebook
# was run with; newer releases may report 'val_accuracy' instead (confirm
# before upgrading).
filepath = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# Stop training once validation accuracy has not improved for 5 epochs.
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)

# Print ROC-AUC on the validation split after every epoch.
roc_auc = RocAucEvaluation(validation_data=(X_test, y_test), interval=1)

callbacks_list = [roc_auc, checkpoint, early]

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 2

# Train on the 90% split; the held-out split drives the validation metrics
# and every callback in callbacks_list.
lstmModel.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks=callbacks_list,
)

In [None]:
# Persist the trained model to disk so later cells can reload it.
lstmModel.save(filepath="Bidirectional_LSTM_Convolution.hdf5")

In [None]:
# Restore the network weights from the file written by lstmModel.save(...),
# then compile again with the same loss, optimizer and metric as at build time.
filename = "Bidirectional_LSTM_Convolution.hdf5"
lstmModel.load_weights(filename)
lstmModel.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Score the padded generated phrases; each row holds one sigmoid output per label.
y_pred = lstmModel.predict(x=padded_test)

In [None]:
# Attach one prediction column per label. Reusing toxic_classifications keeps
# the column order identical to the order of `y` that the model was trained on,
# instead of relying on a second hand-typed copy of the list.
test[toxic_classifications] = y_pred

# Write the phrases together with their predicted label scores.
test.to_csv("generated_text_classification.csv")

Bidirectional LSTM with 2 dense/dropout layers: loss: 0.0444 - acc: 0.9833 - val_loss: 0.0475 - val_acc: 0.9822

Bidirectional LSTM with 1 dense/dropout layer: loss: 0.0535 - acc: 0.9817 - val_loss: 0.0486 - val_acc: 0.9820

Bidirectional LSTM-Convolution: loss: 0.0431 - acc: 0.9837 - val_loss: 0.0451 - val_acc: 0.9831 => ROC-AUC - epoch: 2 - score: 0.983753