# Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Seed NumPy and TensorFlow so that repeated runs are comparable.
SEED = 42
np.random.seed(SEED)

# TensorFlow 2.x exposes the global seeding function as tf.random.set_seed and
# no longer provides tf.set_random_seed; call whichever this install offers.
if hasattr(getattr(tf, 'random', None), 'set_seed'):
    tf.random.set_seed(SEED)
else:
    tf.set_random_seed(SEED)

  from ._conv import register_converters as _register_converters


# Dataset

In [2]:
# Read the training and test CSV files from the ../input directory.
train, test = pd.read_csv('../input/train.csv'), pd.read_csv('../input/test.csv')

# Exploration

In [3]:
# Preview the first eight rows of the training frame (text plus label columns).
train.head(n=8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


In [4]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


# Parameters

In [5]:
# Vocabulary cap handed to the tokenizer (num_words).
NUM_WORDS = 70000

# Length every encoded comment is padded/truncated to.
SEQ_MAX_LEN = 30

# Width of the pretrained word vectors; selects which GloVe file is read.
# Supported values: 50, 100, 200 or 300.
EMBEDDING_DIM = 50

# Data preparation

In [6]:
from sklearn.feature_selection import VarianceThreshold
from keras.preprocessing import sequence, text

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

Using TensorFlow backend.


In [7]:
# Fit the tokenizer on the training comments only, cast to str first.
train_comments = train.comment_text.values.astype(str)

tokenizer = text.Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(list(train_comments))

# word_index maps every distinct token seen during fitting to an integer id;
# it is not truncated to NUM_WORDS (see the count printed below).
word_index = tokenizer.word_index
print('Found %d unique words in training set' % len(word_index))

Found 210337 unique words in training set


In [8]:
# Column groups: the raw text input and the six label columns.
features = ['comment_text']
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Model input: integer-encoded comments brought to a fixed length of SEQ_MAX_LEN.
x = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train.comment_text.values.astype(str)),
    maxlen=SEQ_MAX_LEN,
)

# X keeps the raw text alongside the encoded matrix x; y holds the labels.
X = train[features]
y = train[targets]

In [9]:
# Sanity check: raw text, labels and encoded sequences must share a row count.
print(*[frame.shape for frame in (X, y, x)])

(159571, 1) (159571, 6) (159571, 30)


In [10]:
# Hold out 0.5% of the rows for validation; the raw text, the encoded
# sequences and the labels are split together so their rows stay aligned.
(X_train, X_val,
 x_train, x_val,
 y_train, y_val) = train_test_split(
    X, x, y,
    test_size=0.005,
    random_state=SEED,
)

# Embeddings

* GloVe: https://nlp.stanford.edu/projects/glove/

In [11]:
import os

In [12]:
# Parse the GloVe text file into a {token: vector} dictionary. Each line is
# split on whitespace: the first field is the token, the rest are its
# float32 vector components.
glove_path = '../embeddings/glove.6B/glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'

embeddings_index = {}
with open(glove_path, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.split()
        token, vector = fields[0], fields[1:]
        embeddings_index[token] = np.asarray(vector, dtype='float32')

print('Found %s pretrained word vectors.' % len(embeddings_index))

Found 400000 pretrained word vectors.


In [13]:
# Build the weight matrix for the embedding layer: row i holds the pretrained
# vector of the word whose tokenizer id is i.
# The extra row is presumably for id 0, which the Keras tokenizer does not
# assign to any word -- TODO confirm.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))

for word, i in word_index.items():
    # Words without a pretrained vector keep their all-zero row.
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

print(embedding_matrix.shape)

(210338, 50)


# Model

In [14]:
from keras.layers import Embedding, GRU, Dense, BatchNormalization, Activation, Dropout
from keras.models import Sequential
from keras_tqdm import TQDMNotebookCallback
from keras.optimizers import Adam, RMSprop, Adamax, Adagrad, Nadam
from keras.activations import elu, relu, tanh, sigmoid

In [15]:
# Frozen embedding layer initialised with the pretrained matrix built above.
#
# The former `dropout=0.2` argument has been removed: it is a Keras 1 option
# that Keras 2 discards with a warning, so it never regularised anything here.
# To actually apply dropout to the embeddings, add a
# keras.layers.SpatialDropout1D(0.2) layer right after this one in the model.
embedding_layer = Embedding(
    len(word_index) + 1,          # one row per tokenizer id, plus id 0
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=SEQ_MAX_LEN,
    trainable=False               # keep the pretrained vectors fixed
)

  import sys


In [16]:
# Recurrent classifier: frozen embeddings -> GRU encoder -> dense head.
model = Sequential()
model.add(embedding_layer)
# return_sequences=False: only the final GRU state is passed on.
model.add(GRU(256, recurrent_dropout=0.3, dropout=0.3, return_sequences=False))

# Hidden block: linear -> batch norm -> ELU -> dropout.
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation(elu))
model.add(Dropout(0.35))

# The labels are NOT mutually exclusive: a comment can carry several labels at
# once or none at all. Each label therefore gets its own independent sigmoid
# output trained with binary cross-entropy. A softmax/categorical
# cross-entropy head would force the outputs to sum to 1 and would produce
# zero loss for every comment that has no label set.
model.add(Dense(len(targets), activation='sigmoid'))

# With a binary cross-entropy loss, Keras resolves 'accuracy' to per-label
# binary accuracy.
model.compile(loss='binary_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

In [None]:
# Keras' own console logging is turned off (verbose=0); the TQDM notebook
# callback is attached to report progress instead.
progress_bars = TQDMNotebookCallback(leave_inner=True, leave_outer=True)

# Fit on the training split and evaluate on the held-out split each epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=3,
    batch_size=256,
    verbose=0,
    callbacks=[progress_bars]
)