# Simple Keras LSTM

#### Based on the open-source code at https://www.kaggle.com/antmarakis/bi-lstm-conv-layer/code

In [3]:
# import libs

import numpy as np
import pandas as pd

from keras.layers import Dense, Input, LSTM, Bidirectional, Conv1D
from keras.layers import Dropout, Embedding
from keras.preprocessing import text, sequence
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.models import Model

In [9]:
# Input locations: the pretrained word-vector text file and the train/test CSVs.
EMBEDDING_FILE = "tmp/glove.840B.300d.txt"
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/clean_train.csv', 'data/clean_test.csv')
)

In [10]:
# Peek at the first rows of the training frame to check which columns were loaded.
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,1,000103f0d9cfb60f,d'aww ! matches background colour I am seeming...,0,0,0,0,0,0
2,2,000113f07ec002fd,"hey man , I am really trying edit war . it is ...",0,0,0,0,0,0
3,3,0001b41b1c6bb37e,""" cannot make real suggestions improvement - w...",0,0,0,0,0,0
4,4,0001d958c54c6e35,", sir , hero . chance remember page that is ?",0,0,0,0,0,0


In [11]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
max_features = 100000  # vocabulary cap passed to the tokenizer; also bounds the embedding rows
maxlen = 150           # length every comment sequence is padded to, and the model input width
embed_size = 300       # number of columns in the embedding matrix (one vector per word)

In [62]:
# data
# Label columns predicted by the model, in the order used for train_y.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Replace missing comments with a single space so the text steps below never see NaN.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: under pandas Copy-on-Write that in-place call only updates a
# temporary object and the NaNs silently stay in the frame.
train['comment_text'] = train['comment_text'].fillna(' ')
test['comment_text'] = test['comment_text'].fillna(' ')

# Targets as a 2-D array (one column per label); inputs as lower-cased text Series.
train_y = train[classes].values
train_x = train['comment_text'].str.lower()
test_x = test['comment_text'].str.lower()

In [64]:
# Vectorize text: learn the vocabulary on the training comments only, then turn
# both splits into fixed-length integer sequences.

tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(train_x.values)


def _to_padded_ids(comments):
    """Map comments to token-id sequences and pad each one to ``maxlen``."""
    token_ids = tokenizer.texts_to_sequences(comments)
    return sequence.pad_sequences(token_ids, maxlen=maxlen)


train_x = _to_padded_ids(train_x)
test_x = _to_padded_ids(test_x)

In [65]:
def _parse_embedding_line(raw_line):
    """Split one line of the embedding file into (word, float32 vector)."""
    token, *numbers = raw_line.rstrip().rsplit(' ')
    return token, np.asarray(numbers, dtype='float32')


# word -> pretrained vector, read line by line from the embedding file.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index.update(_parse_embedding_line(raw_line) for raw_line in embedding_file)

# Row i of the matrix holds the vector for the word whose tokenizer index is i.
# Rows stay all-zero for words without a pretrained vector.
word_index = tokenizer.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for token, token_id in word_index.items():
    if token_id < max_features:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[token_id] = pretrained

In [67]:
# Model setup

# Build Model
# Layout: embedding -> spatial dropout -> bidirectional LSTM -> Conv1D ->
# concatenated global average/max pooling -> 6 sigmoid outputs (one per label).
inp = Input(shape=(maxlen,))

# Size the layer from the matrix it is initialised with. embedding_matrix has
# min(max_features, len(word_index) + 1) rows, so hard-coding max_features here
# produces a weight-shape mismatch whenever the fitted vocabulary is smaller
# than the cap. When the vocabulary reaches the cap the two values are equal.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix], trainable=True)(inp)
x = SpatialDropout1D(0.35)(x)

x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.15, recurrent_dropout=0.15))(x)
x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)

# Summarise the convolved sequence two ways and join the results.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

out = Dense(6, activation='sigmoid')(x)

model = Model(inp, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [68]:
# Training + prediction: fit on the training sequences, score the test
# sequences, then write the scores into the sample submission layout.

batch_size = 32
epochs = 1

model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs, verbose=1)
predictions = model.predict(test_x, batch_size=batch_size, verbose=1)

# Columns of the submission file that receive the predicted values.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
submission[label_columns] = predictions
submission.to_csv('submission.csv', index=False)

Epoch 1/1
 11328/159571 [=>............................] - ETA: 55:43 - loss: 0.0846 - acc: 0.9732

KeyboardInterrupt: 