In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import chain
from imblearn.over_sampling import RandomOverSampler

from crossvalidation import multilabel_label_combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix

from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, InputLayer, Embedding, Conv1D, LSTM, Bidirectional, GlobalMaxPool1D, Add, Dropout, Dense
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
dftrain = pd.read_csv("input/train.csv")
dftrain['comment_text'] = dftrain['comment_text'].apply(str) # some values parsed as float
dftest = pd.read_csv("input/test.csv")
dftest['comment_text'] = dftest['comment_text'].apply(str) # some values parsed as float

In [3]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
dftest.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [5]:
def tokenize(text):
    delimeter = "([?\\/.,`~!@#4%^&*()-+\[\]{}<>'\"]*[ \s\n\t\r]+)"
    tokens = re.split(delimeter, text + " ")
    stripped_tokens = map(str.strip, tokens)
    noempty_tokens = filter(bool, stripped_tokens)
    return list(noempty_tokens)

In [6]:
def preprocess_text(text):
    return " ".join(tokenize(text.replace("'ll", " will").replace("n't", " not")))


def preprocess_texts(texts):
    return [preprocess_text(text) for text in texts]

In [7]:
dftrain['preprocessed_text'] = preprocess_texts(dftrain['comment_text'])
dftest['preprocessed_text'] = preprocess_texts(dftest['comment_text'])

In [8]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"Nonsense ? kiss off , geek . what I said is tr..."
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" Please do not vandalize pages , as you did w..."
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """"Points of interest """" I removed the """"poin..."
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,Asking some his nationality is a Racial offenc...
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,The reader here is not going by my say so for ...


In [9]:
def get_train_val_idx():
    def get_label_combination_indices():
        labels = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
        label_combinations = multilabel_label_combinations(labels, 2)
        label_combination_indices = np.zeros([len(dftrain)])
        for i, row in enumerate(label_combinations):
            idx = np.all(labels == row, axis=1)
            label_combination_indices[idx] = i
        return label_combination_indices

    label_combination_indices = get_label_combination_indices()
    train_idx, val_idx, _, _ = train_test_split(np.arange(len(dftrain), dtype=np.int), 
                                                label_combination_indices, 
                                                stratify=label_combination_indices,
                                                random_state=42)
    
    return train_idx, val_idx

train_idx, val_idx = get_train_val_idx()

In [10]:
with open('fasttext-train.txt', 'w', encoding='utf-8') as target:
    for text in list(dftrain['preprocessed_text']) + list(dftest['preprocessed_text']):
        target.write("__label__0__\t{0}\n".format(text))

In [None]:
!fasttext skipgram -input fasttext-train.txt -output fasttext-vector-model

In [11]:
wordset = set(chain(*map(lambda val: val.split(' '), dftrain['preprocessed_text']))) | set(chain(*map(lambda val: val.split(' '), dftest['preprocessed_text'])))

In [12]:
with open('fasttext-words.txt', 'w', encoding='utf-8') as target:
    for word in wordset:
        target.write("{0}\n".format(word))

In [None]:
!fasttext print-word-vectors fasttext-vector-model.bin < fasttext-words.txt > fasttext-word-vectors.txt

In [13]:
vectors = []
word2index = {}
with open('fasttext-word-vectors.txt', 'r', encoding='utf-8') as src:
    for row in filter(lambda row: len(row) > 0, map(lambda line: line.strip().split(' '), src)):
        word = row[0]
        vector = np.fromiter(map(float, row[1:]), dtype=np.float)
        word2index[word] = len(word2index)
        vectors.append(vector)
vectors = np.array(vectors)

In [14]:
def text_to_sequence(text):
    sequence = [word2index[word] for word in text.split(' ') if word in word2index]
    return sequence


def texts_to_sequence(texts):
    return [text_to_sequence(text) for text in texts]

In [15]:
train_sequences = texts_to_sequence(np.array(dftrain['preprocessed_text'])[train_idx])
val_sequences = texts_to_sequence(np.array(dftrain['preprocessed_text'])[val_idx])
test_sequences = texts_to_sequence(np.array(dftest['preprocessed_text']))

In [16]:
MAXLEN = 100
train_X = np.array(pad_sequences(train_sequences, maxlen=MAXLEN))
val_X = np.array(pad_sequences(val_sequences, maxlen=MAXLEN))
test_X = np.array(pad_sequences(test_sequences, maxlen=MAXLEN))
del train_sequences
del val_sequences
del test_sequences

In [17]:
train_y = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])[train_idx]
val_y = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])[val_idx]

# Embedding

In [18]:
def get_embedding(trainable=False):
    embedding = Embedding(len(word2index), 100, weights=[vectors], trainable=trainable)
    return embedding

In [19]:
def confusion(y_true, y_pred):
    false_positive_count = np.logical_and(y_true == 0, y_pred == 1).sum()
    false_negative_count = np.logical_and(y_true == 1, y_pred == 0).sum()
    true_positive_count = np.logical_and(y_true == 1, y_pred == 1).sum()
    true_negative_count = np.logical_and(y_true == 0, y_pred == 0).sum()
    
    false_positive_rate = false_positive_count / (false_positive_count + true_negative_count)
    false_negative_rate = false_negative_count / (false_negative_count + true_positive_count)
    true_negative_rate = true_negative_count / (false_positive_count + true_negative_count)
    true_positive_rate = true_positive_count / (false_negative_count + true_positive_count)
    
    return np.array([
        [true_negative_rate, false_positive_rate],
        [false_negative_rate, true_positive_rate],
    ])

In [20]:
def val_result(model):
    val_prediction = model.predict(val_X, verbose=True)
    losses = []
    for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
        print(label)
        loss = log_loss(val_y[:, i], val_prediction[:, i])
        losses.append(loss)
        print('loss: ', loss)
        print(confusion(val_y[:, i], 1.0 * (val_prediction[:, i] > 0.5)))
    print('Total loss: ', np.array(losses).mean())

# Convolution

In [21]:
def get_model():
    input = Input(shape=(MAXLEN,), dtype='int32')
    embedding = get_embedding() (input)
    
    conv1 = Conv1D(50, 1, activation='relu') (embedding)
    pool1 = GlobalMaxPool1D() (conv1)
    
    conv2 = Conv1D(50, 2, activation='relu') (embedding)
    pool2 = GlobalMaxPool1D() (conv2)
    
    merge = Add() ([pool1, pool2])
    drop1 = Dropout(0.3) (merge)
    fc1 = Dense(100, activation='relu') (drop1)
    drop2 = Dropout(0.3) (fc1)
    fc2 = Dense(6, activation='sigmoid') (drop2)
    
    model = Model(input, fc2)
    
    return model

model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 100)     72162000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 100, 50)      5050        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 99, 50)       10050       embedding_1[0][0]                
__________________________________________________________________________________________________
global_max

In [22]:
model.compile('nadam', 'binary_crossentropy')

In [23]:
model.fit(train_X, train_y, 
          epochs=200,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-conv.h5', save_best_only=True),
              EarlyStopping(patience=10),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200


<keras.callbacks.History at 0x25f7c6d5a58>

In [24]:
model.load_weights('model-conv.h5')

In [25]:
val_result(model)

toxic
loss:  0.107912785592
[[ 0.98965454  0.01034546]
 [ 0.3301601   0.6698399 ]]
severe_toxic
loss:  0.0261312943296
[[  9.99957847e-01   4.21531847e-05]
 [  9.95833333e-01   4.16666667e-03]]
obscene
loss:  0.0616836553112
[[ 0.99387287  0.00612713]
 [ 0.32654659  0.67345341]]
threat
loss:  0.0122763452377
[[ 1.  0.]
 [ 1.  0.]]
insult
loss:  0.0746462835164
[[ 0.98787932  0.01212068]
 [ 0.38674497  0.61325503]]
identity_hate
loss:  0.0256580353838
[[  9.99536998e-01   4.63001936e-04]
 [  8.09756098e-01   1.90243902e-01]]
Total loss:  0.0513847332284


# Custom loss function

In [26]:
def custom_loss(y_true, y_pred):
    eps = 1e-10
    toxic_pos_weight         = 1.0
    toxic_neg_weight         = 1.0
    severe_toxic_pos_weight  = 1.5
    severe_toxic_neg_weight  = 1.0
    obscene_pos_weight       = 1.0
    obscene_neg_weight       = 1.0
    threat_pos_weight        = 1.5
    threat_neg_weight        = 1.0
    insult_pos_weight        = 1.0
    insult_neg_weight        = 1.0
    identity_hate_pos_weight = 1.5
    identity_hate_neg_weight = 1.0
    toxic         =         toxic_pos_weight *      y_true[:, 0]  * K.log(    y_pred[:, 0] + eps) + \
                            toxic_neg_weight * (1 - y_true[:, 0]) * K.log(1 - y_pred[:, 0] + eps)
    severe_toxic  =  severe_toxic_pos_weight *      y_true[:, 1]  * K.log(    y_pred[:, 1] + eps) + \
                     severe_toxic_neg_weight * (1 - y_true[:, 1]) * K.log(1 - y_pred[:, 1] + eps)
    obscene       =       obscene_pos_weight *      y_true[:, 2]  * K.log(    y_pred[:, 2] + eps) + \
                          obscene_neg_weight * (1 - y_true[:, 2]) * K.log(1 - y_pred[:, 2] + eps)
    threat        =        threat_pos_weight *      y_true[:, 3]  * K.log(    y_pred[:, 3] + eps) + \
                           threat_neg_weight * (1 - y_true[:, 3]) * K.log(1 - y_pred[:, 3] + eps)
    insult        =        insult_pos_weight *      y_true[:, 4]  * K.log(    y_pred[:, 4] + eps) + \
                           insult_neg_weight * (1 - y_true[:, 4]) * K.log(1 - y_pred[:, 4] + eps)
    identity_hate = identity_hate_pos_weight *      y_true[:, 5]  * K.log(    y_pred[:, 5] + eps) + \
                    identity_hate_neg_weight * (1 - y_true[:, 5]) * K.log(1 - y_pred[:, 5] + eps)
    return - (toxic + severe_toxic + obscene + threat + insult + identity_hate) / 6.0

In [27]:
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 100)     72162000    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 100, 50)      5050        embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 99, 50)       10050       embedding_2[0][0]                
__________________________________________________________________________________________________
global_max

In [28]:
model.compile('nadam', custom_loss)

In [29]:
model.fit(train_X, train_y, 
          batch_size=32,
          epochs=200,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-conv.h5', save_best_only=True),
              EarlyStopping(patience=10),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200


<keras.callbacks.History at 0x25f7c8a4a90>

In [30]:
model.load_weights('model-conv.h5')

val_result(model)

toxic
loss:  0.106000858547
[[ 0.9826344   0.0173656 ]
 [ 0.26006058  0.73993942]]
severe_toxic
loss:  0.0250515600914
[[  9.99957847e-01   4.21531847e-05]
 [  1.00000000e+00   0.00000000e+00]]
obscene
loss:  0.0574519323598
[[ 0.99030239  0.00969761]
 [ 0.23884103  0.76115897]]
threat
loss:  0.012510347684
[[  9.99916272e-01   8.37275506e-05]
 [  9.86842105e-01   1.31578947e-02]]
insult
loss:  0.0741042633897
[[ 0.98831848  0.01168152]
 [ 0.375       0.625     ]]
identity_hate
loss:  0.0254444991554
[[  9.99831636e-01   1.68364340e-04]
 [  8.39024390e-01   1.60975610e-01]]
Total loss:  0.0500939102045


# RNN

In [31]:
def get_model():
    return Sequential([
        InputLayer(input_shape=(MAXLEN,), dtype='int32'),
        get_embedding(),
        Bidirectional(LSTM(50, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.3),
        Dense(50, activation='relu'),
        Dropout(0.3),
        Dense(6, activation='sigmoid')
    ])

model = get_model()

In [32]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 100)          72162000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          60400     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 100)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_6 (Dropout)          (None, 50)                0         
__________

In [33]:
model.compile('nadam', 'binary_crossentropy')

In [34]:
model.fit(train_X, train_y, 
          batch_size=32,
          epochs=5,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-rnn.h5', save_best_only=True),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x25f7cc084a8>

In [37]:
model.load_weights('model-rnn.h5')

In [38]:
val_result(model)

toxic
loss:  0.096308736771
[[ 0.98425088  0.01574912]
 [ 0.24145392  0.75854608]]
severe_toxic
loss:  0.0236998996393
[[  9.99536315e-01   4.63685031e-04]
 [  9.29166667e-01   7.08333333e-02]]
obscene
loss:  0.0546462138876
[[ 0.98994975  0.01005025]
 [ 0.19655442  0.80344558]]
threat
loss:  0.0119974299074
[[ 1.  0.]
 [ 1.  0.]]
insult
loss:  0.0703548359798
[[ 0.98634228  0.01365772]
 [ 0.32885906  0.67114094]]
identity_hate
loss:  0.0248463182712
[[  9.99536998e-01   4.63001936e-04]
 [  8.48780488e-01   1.51219512e-01]]
Total loss:  0.0469755724094


In [39]:
test_prediction = model.predict(test_X, verbose=True)



In [41]:
submission = pd.read_csv('input/sample_submission.csv')
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = test_prediction
submission.to_csv('output.csv', index=None)

In [42]:
model = get_model()
model.compile('nadam', custom_loss)

In [43]:
model.fit(train_X, train_y, 
          batch_size=32,
          epochs=5,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-rnn.h5', save_best_only=True),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x25f7d0b4da0>

In [47]:
model.load_weights('model-rnn.h5')

In [48]:
val_result(model)

toxic
loss:  0.094682783082
[[ 0.98859228  0.01140772]
 [ 0.26568585  0.73431415]]
severe_toxic
loss:  0.0244011336883
[[  9.99241243e-01   7.58757324e-04]
 [  8.79166667e-01   1.20833333e-01]]
obscene
loss:  0.054559119224
[[ 0.99175703  0.00824297]
 [ 0.22787784  0.77212216]]
threat
loss:  0.0127841209976
[[ 1.  0.]
 [ 1.  0.]]
insult
loss:  0.0683301590677
[[ 0.98585921  0.01414079]
 [ 0.31040268  0.68959732]]
identity_hate
loss:  0.025187609588
[[ 0.99844263  0.00155737]
 [ 0.68292683  0.31707317]]
Total loss:  0.046657487608


In [49]:
test_prediction = model.predict(test_X, verbose=True)



In [50]:
submission = pd.read_csv('input/sample_submission.csv')
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = test_prediction
submission.to_csv('output.csv', index=None)