In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils import plot_model
from sklearn.metrics import classification_report

from keras import metrics as keras_metrics

In [None]:
# Width of each word vector; must agree with the 200-d GloVe file loaded below.
EMBEDDING_DIM = 200
# Every comment is padded/truncated to this many tokens before entering the model.
MAX_SEQ_LENGTH = 200
# Vocabulary cap handed to the Keras Tokenizer (num_words).
MAX_FEATURES = 20_000

In [None]:
def build_embedding_layer(word_index):
    """Build a frozen Keras ``Embedding`` layer initialised from GloVe vectors.

    Reads the 200-d GloVe text file, then fills one matrix row per entry of
    ``word_index`` with the matching pretrained vector. Words that have no
    GloVe vector keep an all-zero row.

    Args:
        word_index: mapping of word -> integer row index (e.g.
            ``Tokenizer.word_index``). Indices must be in
            ``[0, len(word_index)]``.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows of
        ``EMBEDDING_DIM`` columns, ``input_length=MAX_SEQ_LENGTH`` and
        ``trainable=False``.
    """
    glove_path = '../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'
    embeddings_index = {}
    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens, so
    # relying on the platform default encoding can fail or mis-decode.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            # Split from the right so that exactly EMBEDDING_DIM numeric fields
            # are peeled off; whatever remains on the left is the token, even
            # if it contains unusual whitespace characters.
            values = line.rstrip('\n').rsplit(' ', EMBEDDING_DIM)
            if len(values) != EMBEDDING_DIM + 1:
                # Blank or malformed line: nothing usable to store.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # +1 leaves room for index 0 (presumably the padding index, since Keras
    # Tokenizer indices start at 1 -- TODO confirm).
    # NOTE(review): this allocates a row for every word in word_index, even
    # though the tokenizer in this notebook is capped at MAX_FEATURES; rows
    # beyond that cap are presumably never looked up.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(vocab_size,
                     EMBEDDING_DIM,
                     weights=[embedding_matrix],
                     input_length=MAX_SEQ_LENGTH,
                     trainable=False)

In [None]:
# Show which entries are available under the input directory.
print(os.listdir(path='../input'))

In [None]:
# Load the augmented training set and the competition test set, in that order.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/augmented-toxicity/augmented_train.csv',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv',
    )
]

In [None]:
# The six target columns; each becomes one output unit of the model.
LABEL_COLS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[LABEL_COLS].values

# The tokenizer expects strings: a missing comment would arrive as a float NaN
# and crash it, so replace missing values with an empty string and coerce
# everything to str.
train_sentences = train['comment_text'].fillna('').astype(str)
test_sentences = test['comment_text'].fillna('').astype(str)

In [None]:
# Fit the vocabulary on the training comments only, then encode both the
# training and the test comments as sequences of word indices.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_sentences.tolist())
tokenized_train, tokenized_test = [
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, test_sentences)
]

In [None]:
# Bring every encoded comment to exactly MAX_SEQ_LENGTH tokens
# (padding/truncation sides follow the pad_sequences defaults).
pad_options = {'maxlen': MAX_SEQ_LENGTH}
X_train = pad_sequences(tokenized_train, **pad_options)
X_test = pad_sequences(tokenized_test, **pad_options)

In [None]:
# Frozen GloVe embeddings -> two stacked LSTMs that both return the full
# sequence -> max over the time axis -> small dense head with dropout ->
# one sigmoid unit per label.
model = Sequential([
    build_embedding_layer(tokenizer.word_index),
    LSTM(60, return_sequences=True, name='lstm'),
    LSTM(120, return_sequences=True, name='lstm-2'),
    GlobalMaxPool1D(),
    Dropout(0.3),
    Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='sigmoid')
])

# The six outputs are independent sigmoids (multi-label), hence
# binary_crossentropy. Only binary_accuracy is tracked: categorical_accuracy
# compares the argmax of targets and predictions, which is meaningless when a
# row can carry zero or several positive labels.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras_metrics.binary_accuracy])

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the layer graph (with tensor shapes) inline as SVG.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# patience=0: stop at the first epoch whose validation loss fails to improve.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
# NOTE(review): validation_split holds out a fixed slice of X_train/y_train
# without stratification; if augmented_train.csv is ordered (e.g. augmented
# rows grouped together) the validation set may not be representative -- confirm.
history = model.fit(X_train, y_train, epochs=3, batch_size=128, validation_split=0.33, callbacks=[early_stopping])

# Training vs. validation loss per epoch. The second curve comes from the
# validation split above, not from the test set, so it is labelled accordingly.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Validation')
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Per-label sigmoid outputs for every padded test comment.
y_test_predictions = model.predict(x=X_test)

In [None]:
# One column of predicted scores per label, plus the matching test ids.
df = pd.DataFrame(y_test_predictions, columns=LABEL_COLS)
df['id'] = test['id']
df.to_csv('predicted_labels.csv', index=False)

# Keep the summary as the cell's last expression so the notebook actually
# displays it (mid-cell, its result was silently discarded).
df.describe()