In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Load the labelled comment dataset from the working directory and preview
# the first rows to check the columns that were read.
csv_path = 'toxic_comment_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,is_offensive,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \n\nHi Alexik...
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


In [3]:
# Model inputs are the comment texts, coerced to str so every cell is a
# string; targets are the `is_offensive` column.
x, y = df['text'].astype(str), df['is_offensive']

In [4]:
# Hold-out split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split repeatable. The split size is not
# set here, so the library default applies.
train, test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
)

In [5]:
# Word-level tokenizer: lower-cases text, strips the listed punctuation and
# splits on spaces. `num_words=None` puts no cap on the vocabulary size.
tokenizer = Tokenizer(
    num_words=None,
    lower=True,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=' ',
    char_level=False,
)
# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(train)

In [6]:
# Encode each training comment as a list of integer word indices.
x_train = tokenizer.texts_to_sequences(texts=train)

In [7]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Encode the held-out comments with the same (train-fitted) tokenizer.
x_test = tokenizer.texts_to_sequences(texts=test)

In [8]:
# Token count of every encoded training comment, computed once and reused
# for all the statistics below.
seq_lengths = [len(seq) for seq in x_train]

vocab_size = len(word_index)
print('Vocab size: {}'.format(vocab_size))

longest = max(seq_lengths)
print("Longest comment size: {}".format(longest))

average = np.mean(seq_lengths)
print("Average comment size: {}".format(average))

stdev = np.std(seq_lengths)
print("Stdev of comment size: {}".format(stdev))

# Padded/truncated sequence length: mean plus three standard deviations,
# truncated to an int.
max_len = int(average + stdev * 3)
print('Max comment size: {}'.format(max_len))

Vocab size: 181975
Longest comment size: 1403
Average comment size: 60.596108921274364
Stdev of comment size: 95.82628401718543
Max comment size: 348


In [9]:
# Bring every sequence to exactly `max_len` tokens; both padding and
# truncation are applied at the end ('post') of the sequence.
pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
processed_x_train = pad_sequences(x_train, **pad_kwargs)
processed_x_test = pad_sequences(x_test, **pad_kwargs)

In [10]:
# Parse the pre-trained GloVe file into a dict mapping each token to its
# float32 coefficient vector. Each line is "<token> <v1> <v2> ...".
embeddings_index = {}
glove_path = os.path.join('glove.6B/', 'glove.6B.300d.txt')
# `with` guarantees the handle is closed even if parsing raises, and the
# explicit encoding avoids depending on the platform's default codec.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [11]:
embedding_dim = 300
k = 0  # running count of vocabulary words that have a pre-trained vector

# One row per tokenizer index plus one extra row (index 0).
# Rows start as zeros, so words not found in the embedding index stay
# all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    k += 1
    embedding_matrix[i] = embedding_vector

In [12]:
import keras.backend
from keras.models import Sequential, load_model
from keras.layers import CuDNNGRU, Dense, Conv1D, MaxPooling1D
from keras.layers import Dropout, GlobalMaxPooling1D, BatchNormalization, LSTM
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Nadam
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt

In [19]:
# Initate model
model = Sequential()

# Add Embedding layer
model.add(Embedding(vocab_size + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True))

# Add Recurrent layer
#model.add(Bidirectional(CuDNNGRU(300, return_sequences=True)))
model.add(LSTM(60, return_sequences=True, name='lstm_layer'))
model.add(LSTM(30, return_sequences=True, name='lstm_layer2'))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(3))
model.add(GlobalMaxPooling1D())
model.add(BatchNormalization())

# Add fully connected layers
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Summarize the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 348, 300)          54592800  
_________________________________________________________________
lstm_layer (LSTM)            (None, 348, 60)           86640     
_________________________________________________________________
lstm_layer2 (LSTM)           (None, 348, 30)           10920     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 348, 128)          19328     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 116, 128)          0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
__________

In [20]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [21]:
# Train for two epochs on the padded training sequences and keep the
# returned history object.
# NOTE(review): no validation data is passed here, so the history holds
# training metrics only.
model_hist = model.fit(
    processed_x_train,
    y_train,
    epochs=2,
    batch_size=64,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


In [22]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='model-2layerlstm300d-9706.h5')

In [23]:
# Load a previously saved single-LSTM model for comparison with the model
# trained in this notebook.
# The original location is an absolute path on one specific machine; it is
# kept as the default, but can be overridden with the OLD_MODEL_PATH
# environment variable so the notebook can run elsewhere.
old_model_path = os.environ.get(
    'OLD_MODEL_PATH',
    '/media/pratik/C8CC238ECC2375BA/Users/HP/Desktop/IIST/ML models/model-lstm300d-9534.h5',
)
oldmodel = load_model(old_model_path)
oldmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 348, 300)          54592800  
_________________________________________________________________
lstm_layer (LSTM)            (None, 348, 60)           86640     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 348, 128)          38528     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 116, 128)          0         
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 128)               512       
_________________________________________________________________
dense_5 (Dense)              (None, 50)                6450      
__________