In [1]:
import numpy as np
import pandas as pd

# Data exploration

In [2]:
# Read the training CSV into a DataFrame; %time reports how long the read takes.
%time train_raw = pd.read_csv('data/train.csv')

CPU times: user 552 ms, sys: 41.2 ms, total: 593 ms
Wall time: 592 ms


In [3]:
# Preview the first rows to check which columns were read.
train_raw.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Show the first record vertically, one field per line.
# (A Series is its own transpose, so the former `.T` had no effect.)
train_raw.iloc[0]

id                                                0000997932d777bf
comment_text     Explanation\nWhy the edits made under my usern...
toxic                                                            0
severe_toxic                                                     0
obscene                                                          0
threat                                                           0
insult                                                           0
identity_hate                                                    0
Name: 0, dtype: object

In [5]:
# Full, untruncated text of the first comment.
train_raw['comment_text'].iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [6]:
# Fraction of comments that carry each label (per-column count of 1s divided by the number of rows).
train_raw[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=0) / float(train_raw.shape[0])

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

In [7]:
# Labels are not mutually exclusive: sum the six indicator columns per row
# and take the maximum to see how many labels a single comment can carry.
train_raw[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1).max()

6

# Prepare training data: text and target

In [8]:
# Names of the six binary label columns to predict.
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Model input: the raw comment strings as a NumPy array.
texts = train_raw['comment_text'].values
# Model output: one row per comment, one column per label.
targets = train_raw.loc[:, target_cols].values

In [9]:
# Check that texts and targets have one row per training example.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(texts.shape)
print(targets.shape)
print(train_raw.shape)

(159571,)
(159571, 6)
(159571, 8)


# Text preprocessing using Keras

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## tokenizer

In [11]:
%%time
# Fit a Keras Tokenizer (created with default arguments) on all training comments.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode every comment as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)
# Mapping word -> integer id built while fitting.
word_index = tokenizer.word_index

CPU times: user 15.7 s, sys: 101 ms, total: 15.8 s
Wall time: 15.8 s


In [12]:
# Compare the tokenizer's encoding of the first comment with the raw text.
# print(...) is used so the cell runs on both Python 2 and Python 3.
print(len(sequences))            # number of encoded comments
print(sequences[0])              # word ids of the first comment
print(len(sequences[0]))         # token count according to the tokenizer
print(texts[0])
print(len(texts[0].split(' ')))  # naive count when splitting on single spaces only

159571
[688, 75, 1, 126, 130, 177, 29, 672, 4511, 12052, 1116, 86, 331, 51, 2278, 11448, 50, 6864, 15, 60, 2756, 148, 7, 2937, 34, 117, 1221, 15188, 2825, 4, 45, 59, 244, 1, 365, 31, 1, 38, 27, 143, 73, 3462, 89, 3085, 4583, 2273, 985]
47
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
42


In [13]:
# Inspect the vocabulary built by the tokenizer.
print(len(word_index))
print(type(word_index))
# list(...) before slicing: on Python 3 dict.keys() is a view and cannot be sliced.
print(list(word_index)[:10])
print(word_index['gavan'])

210554
<type 'dict'>
["dool's", '\xca\x8a', 'bailyite', 'sowell', 'tsukino', '\xca\x84', 'woods', 'spiders', 'gavan', 'dekolb']
90396


## get training set and validation set

In [14]:
# Length of every comment in characters (not tokens), stored as a new column.
# .str.len() is the vectorised equivalent of mapping len over the column.
train_raw['comment_text_len'] = train_raw['comment_text'].str.len()
print(train_raw['comment_text_len'].max())

5895


In [15]:
MAX_SEQUENCE_LENGTH = 200   # passed to pad_sequences as maxlen: length of every padded sequence
VALIDATION_SPLIT = 0.1      # fraction of the samples held out for validation

def get_train_valid(sequences, targets, seed=None):
    """Pad the sequences, shuffle them with their targets and split off a validation set.

    Args:
        sequences: list of integer token-id lists, one per sample.
        targets: array-like of labels aligned row-for-row with ``sequences``.
        seed: optional int. When given, the shuffle uses its own
            ``np.random.RandomState(seed)`` so the split is reproducible.
            When ``None`` (the default) the global NumPy random state is
            used, exactly as before.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val)``. The last
        ``int(VALIDATION_SPLIT * n_samples)`` rows of the shuffled data form
        the validation set; all remaining rows form the training set.
    """
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    targets = np.asarray(targets)

    # Shuffle data and targets with the same permutation so rows stay aligned.
    indices = np.arange(data.shape[0])
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(indices)
    data = data[indices]
    targets = targets[indices]

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    # Split on an explicit index instead of negative slices: with
    # nb_validation_samples == 0, ``[:-0]`` would be empty and ``[-0:]`` would
    # be everything, handing the whole data set to validation.
    split_at = data.shape[0] - nb_validation_samples

    x_train = data[:split_at]
    y_train = targets[:split_at]
    x_val = data[split_at:]
    y_val = targets[split_at:]

    return x_train, y_train, x_val, y_val

In [16]:
# Build the padded train/validation arrays. The shuffle inside is not seeded
# by this call, so the split differs from run to run.
%time x_train, y_train, x_val, y_val = get_train_valid(sequences, targets)

CPU times: user 932 ms, sys: 112 ms, total: 1.04 s
Wall time: 1.05 s


# word embedding

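### convert GloVe to word2vec format

The vectors loaded below are expected in word2vec text format. The conversion itself (for example with gensim's `glove2word2vec` script) is not shown in this notebook.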

### load glove wordvector

In [17]:
from gensim.models import KeyedVectors

def load_glove_model(word_embedding_file):
    """Load pretrained word vectors from a word2vec-format text file.

    Args:
        word_embedding_file: path to the vector file; it is read as text
            (``binary=False``).

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    return KeyedVectors.load_word2vec_format(word_embedding_file, binary=False)

In [18]:
# Load the pretrained vectors (about 1.5 minutes in the recorded run below).
# NOTE(review): the file name suggests 100-d GloVe Twitter vectors already
# converted to word2vec text format; the conversion step is not shown — confirm.
%time glove_vec = load_glove_model('data/glove.twitter.27B/glove.twitter.27B.100d.word2vec.txt')

CPU times: user 1min 29s, sys: 943 ms, total: 1min 30s
Wall time: 1min 30s


### define an embedding layer

In [19]:
from keras.layers import Embedding

def build_embedding_layer(word_index, word2vec, embedding_dim):
    """Create a trainable Keras ``Embedding`` layer initialised from pretrained vectors.

    Args:
        word_index: dict mapping each word to its integer id; ids are used
            directly as row numbers of the embedding matrix.
        word2vec: pretrained vectors, either a gensim ``KeyedVectors`` or a
            model object that exposes its vectors as ``.wv``.
        embedding_dim: dimensionality of the pretrained vectors.

    Returns:
        An ``Embedding`` layer with ``len(word_index) + 1`` rows. Rows of
        words found in ``word2vec`` hold the pretrained vector; all other
        rows start as zeros. The layer is trainable, so every row can be
        updated during training.
    """
    # One row more than the vocabulary so that the largest id is a valid row.
    # NOTE(review): row 0 is presumably the padding id used by pad_sequences
    # (the tokenizer's ids appear to start at 1) — confirm.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Accept both a full model (vectors under ``.wv``) and a bare KeyedVectors.
    # Membership and lookup go through ``in`` / ``[]`` rather than
    # ``.wv.vocab``, which KeyedVectors no longer provides in gensim 4.
    vectors = getattr(word2vec, 'wv', word2vec)

    # If a word has a pretrained vector, use it as the initial weights.
    for word, i in word_index.items():
        if word in vectors:
            embedding_matrix[i] = vectors[word]

    embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH, trainable=True)
    return embedding_layer

In [22]:
# embedding_dim=100 matches the "100d" vector file loaded into glove_vec.
%time embedding_layer = build_embedding_layer(word_index=word_index, word2vec=glove_vec, embedding_dim=100)

CPU times: user 758 ms, sys: 24 ms, total: 782 ms
Wall time: 783 ms


# CNN for NLP
CNNs are not particularly good for most NLP tasks since they lose out on the sequential flow of information. But since the objective here boils down to recognizing 'blocks' of sentiments scattered in text, they work decently well!  
<img src='images/cnn_nlp.png'>

In [23]:
from keras.layers import Input, Conv1D, MaxPool1D, Flatten, Dense
from keras.models import Model

N_TARGET_CLASSES = 6  # width of the final Dense layer: one output per label

def build_model(embedding_layer):
    """Assemble the 1-D CNN classifier on top of the given embedding layer.

    Architecture: embedding -> 2 x (Conv1D(128, 5, relu) -> MaxPool1D(5))
    -> Flatten -> Dense(128, relu) -> Dense(64, relu)
    -> Dense(N_TARGET_CLASSES, sigmoid).

    Args:
        embedding_layer: layer applied to the int32 input of shape
            ``(MAX_SEQUENCE_LENGTH,)``.

    Returns:
        An uncompiled Keras ``Model`` mapping token ids to per-label outputs.
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    features = embedding_layer(token_ids)

    # Two identical convolution + pooling stages.
    for _ in range(2):
        features = Conv1D(128, 5, activation='relu')(features)
        features = MaxPool1D(5)(features)

    # Fully connected head.
    features = Flatten()(features)
    for units in (128, 64):
        features = Dense(units, activation='relu')(features)
    outputs = Dense(N_TARGET_CLASSES, activation='sigmoid')(features)

    return Model(token_ids, outputs)

> Sigmoid (and not softmax) is the more appropriate output activation here, since each sample can belong to multiple classes (a comment could be an insult and obscene at the same time).

# Model training

In [41]:
from sklearn.metrics import roc_auc_score  
import keras.backend as K

In [53]:
%%time
# Build the CNN on top of the pretrained embedding layer and compile it with
# binary cross-entropy loss and the Adagrad optimizer.
# NOTE(review): every label is rare (under 10% positives in the frequencies
# printed earlier), so 'accuracy' will look high even for a model that always
# predicts 0. roc_auc_score is imported above but not used in the visible cells.
# NOTE(review): re-running this cell reuses the same embedding_layer object,
# including any weights it has already learned.
model = build_model(embedding_layer)
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])

CPU times: user 64.3 ms, sys: 8.05 ms, total: 72.4 ms
Wall time: 70.6 ms


In [37]:
import keras
import time

In [38]:
# Fit the model for 2 epochs with batches of 32, evaluating on the validation split.
# NOTE(review): no callbacks are passed, so nothing is written to TensorBoard;
# the call only returns the History object shown below.
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=32, verbose=1)

Train on 143614 samples, validate on 15957 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb8be4f9cd0>