# Importing Required Libraries

In [13]:
#Keras with Tensorflow backend
import sys, os, re, csv, codecs, numpy as np, pandas
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers



# Setup and File Reading

In [14]:
train = pandas.read_csv('dataset/train.csv')
test = pandas.read_csv('dataset/test.csv')

toxic_classifications = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[toxic_classifications].values
train_sentences_strings = train["comment_text"]
test_sentences_strings = test["comment_text"]

print(train_sentences_strings[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


# Tokenization Process

In [15]:
maxfeatures = 30000
tokenizer = Tokenizer(num_words = maxfeatures)
tokenizer.fit_on_texts(list(train_sentences_strings))
tokenized_training = tokenizer.texts_to_sequences(train_sentences_strings)
tokenized_testing = tokenizer.texts_to_sequences(test_sentences_strings)

print(tokenized_training[0])

[27904, 75, 1, 128, 130, 177, 28, 670, 4510, 12133, 1115, 87, 331, 52, 2269, 11510, 51, 6926, 16, 61, 2747, 149, 8, 2933, 35, 116, 1235, 15747, 2859, 5, 46, 60, 243, 1, 370, 32, 1, 39, 29, 144, 74, 3474, 90, 3075, 4630, 2284, 985]


# Padding for Apropriate Dimensions

In [19]:
paddingSize = 300

padded_train = pad_sequences(tokenized_training, maxlen=paddingSize)
padded_test = pad_sequences(tokenized_testing, maxlen=paddingSize)


# Assembling the Model

In [29]:
length = 300

inputLayer = Input(shape = (length, ))

embedding_size = 100

x = Embedding(maxfeatures, embedding_size) (inputLayer)

#begin LSTM layer
lstm_output_size = 100
x = LSTM(lstm_output_size, return_sequences = True, name = 'lstm')(x)

#lstm reshaping with GLOBAl pooling
x = GlobalAveragePooling1D(x)

#dropout layer
x = Dropout(0.1)(x)


#dense
x = Dense(50, activation = "relu")(x)

#dropout layer
x = Dropout(0.05)(x)

#dense
x = Dense(6,activation = "sigmoid")(x)

lstmModel = Model(inputs = inputLayer, outputs = x)

lstmModel.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

print(lstmModel.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 300, 100)          3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 300, 100)          80400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_18 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_19 (Dropout)         (None, 50)                0         
__________

# Testing and Training

Epochs = 3
Batch = 32

Lucas : 1 dense layer, 2 dense layers, sigmoid vs relu
Robert : Bidirectional stuff
Nick : layer dimensions / output size

In [30]:
batch_size = 32
epochs = 2
lstmModel.fit(padded_train,y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x20205389978>

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
loss: 0.1032 - acc: 0.9688 - val_loss: 0.0654 - val_acc: 0.9767
Epoch 2/2
loss: 0.0574 - acc: 0.9797 - val_loss: 0.0547 - val_acc: 0.9804

lstm_output_size = 100

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
loss: 0.0934 - acc: 0.9714 - val_loss: 0.0583 - val_acc: 0.9789
Epoch 2/2
loss: 0.0520 - acc: 0.9810 - val_loss: 0.0527 - val_acc: 0.9804

relu dense layer 100

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
loss: 0.0699 - acc: 0.9774 - val_loss: 0.0491 - val_acc: 0.9818
Epoch 2/2
loss: 0.0444 - acc: 0.9833 - val_loss: 0.0475 - val_acc: 0.9822

GlobalMaxPool1D()