In [40]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import chain
from imblearn.over_sampling import RandomOverSampler

from crossvalidation import multilabel_label_combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix

from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, InputLayer, Embedding, Conv1D, LSTM, Bidirectional, GlobalMaxPool1D, Add, Dropout, Dense
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
dftrain = pd.read_csv("input/train.csv")
dftrain['comment_text'] = dftrain['comment_text'].apply(str) # some values parsed as float
dftest = pd.read_csv("input/test.csv")
dftest['comment_text'] = dftest['comment_text'].apply(str) # some values parsed as float

In [3]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
dftest.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [5]:
def tokenize(text):
    delimeter = "([?\\/.,`~!@#4%^&*()-+\[\]{}<>'\"]*[ \s\n\t\r]+)"
    tokens = re.split(delimeter, text + " ")
    stripped_tokens = map(str.strip, tokens)
    noempty_tokens = filter(bool, stripped_tokens)
    return list(noempty_tokens)

In [6]:
def preprocess_text(text):
    return " ".join(tokenize(text.replace("'ll", " will").replace("n't", " not")))


def preprocess_texts(texts):
    return [preprocess_text(text) for text in texts]

In [7]:
dftrain['preprocessed_text'] = preprocess_texts(dftrain['comment_text'])
dftest['preprocessed_text'] = preprocess_texts(dftest['comment_text'])

In [8]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"Nonsense ? kiss off , geek . what I said is tr..."
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" Please do not vandalize pages , as you did w..."
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """"Points of interest """" I removed the """"poin..."
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,Asking some his nationality is a Racial offenc...
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,The reader here is not going by my say so for ...


In [9]:
def get_train_val_idx():
    def get_label_combination_indices():
        labels = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
        label_combinations = multilabel_label_combinations(labels, 2)
        label_combination_indices = np.zeros([len(dftrain)])
        for i, row in enumerate(label_combinations):
            idx = np.all(labels == row, axis=1)
            label_combination_indices[idx] = i
        return label_combination_indices

    label_combination_indices = get_label_combination_indices()
    train_idx, val_idx, _, _ = train_test_split(np.arange(len(dftrain), dtype=np.int), 
                                                label_combination_indices, 
                                                stratify=label_combination_indices,
                                                random_state=42)
    
    return train_idx, val_idx

train_idx, val_idx = get_train_val_idx()

In [45]:
with open('fasttext-train.txt', 'w', encoding='utf-8') as target:
    for text in list(dftrain['preprocessed_text']) + list(dftest['preprocessed_text']):
        target.write("__label__0__\t{0}\n".format(text))

In [None]:
!fasttext skipgram -input fasttext-train.txt -output fasttext-vector-model

In [10]:
wordset = set(chain(*map(lambda val: val.split(' '), dftrain['preprocessed_text']))) | set(chain(*map(lambda val: val.split(' '), dftest['preprocessed_text'])))

In [81]:
with open('fasttext-words.txt', 'w', encoding='utf-8') as target:
    for word in wordset:
        target.write("{0}\n".format(word))

In [None]:
!fasttext print-word-vectors fasttext-vector-model.bin < fasttext-words.txt > fasttext-word-vectors.txt

In [11]:
vectors = []
word2index = {}
with open('fasttext-word-vectors.txt', 'r', encoding='utf-8') as src:
    for row in filter(lambda row: len(row) > 0, map(lambda line: line.strip().split(' '), src)):
        word = row[0]
        vector = np.fromiter(map(float, row[1:]), dtype=np.float)
        word2index[word] = len(word2index)
        vectors.append(vector)
vectors = np.array(vectors)

In [12]:
def text_to_sequence(text):
    sequence = [word2index[word] for word in text.split(' ') if word in word2index]
    return sequence


def texts_to_sequence(texts):
    return [text_to_sequence(text) for text in texts]

In [13]:
train_sequences = texts_to_sequence(np.array(dftrain['preprocessed_text'])[train_idx])
val_sequences = texts_to_sequence(np.array(dftrain['preprocessed_text'])[val_idx])
test_sequences = texts_to_sequence(np.array(dftest['preprocessed_text']))

In [14]:
MAXLEN = 100
train_X = np.array(pad_sequences(train_sequences, maxlen=MAXLEN))
val_X = np.array(pad_sequences(val_sequences, maxlen=MAXLEN))
test_X = np.array(pad_sequences(test_sequences, maxlen=MAXLEN))
del train_sequences
del val_sequences
del test_sequences

In [15]:
train_y = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])[train_idx]
val_y = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])[val_idx]

In [16]:
vectors.shape

(721620, 100)

# Embedding

In [17]:
def get_embedding(trainable=False):
    embedding = Embedding(len(word2index), 100, weights=[vectors], trainable=trainable)
    return embedding

In [38]:
def confusion(y_true, y_pred):
    false_positive_count = np.logical_and(y_true == 0, y_pred == 1).sum()
    false_negative_count = np.logical_and(y_true == 1, y_pred == 0).sum()
    true_positive_count = np.logical_and(y_true == 1, y_pred == 1).sum()
    true_negative_count = np.logical_and(y_true == 0, y_pred == 0).sum()
    
    false_positive_rate = false_positive_count / (false_positive_count + true_negative_count)
    false_negative_rate = false_negative_count / (false_negative_count + true_positive_count)
    true_negative_rate = true_negative_count / (false_positive_count + true_negative_count)
    true_positive_rate = true_positive_count / (false_negative_count + true_positive_count)
    
    return np.array([
        [true_negative_rate, false_positive_rate],
        [false_negative_rate, true_positive_rate],
    ])

# Convolution

In [91]:
def get_model():
    input = Input(shape=(MAXLEN,), dtype='int32')
    embedding = get_embedding() (input)
    
    conv1 = Conv1D(50, 1, activation='relu') (embedding)
    pool1 = GlobalMaxPool1D() (conv1)
    
    conv2 = Conv1D(50, 2, activation='relu') (embedding)
    pool2 = GlobalMaxPool1D() (conv2)
    
    merge = Add() ([pool1, pool2])
    drop1 = Dropout(0.3) (merge)
    fc1 = Dense(100, activation='relu') (drop1)
    drop2 = Dropout(0.3) (fc1)
    fc2 = Dense(6, activation='sigmoid') (drop2)
    
    model = Model(input, fc2)
    
    return model

model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 100, 100)     72162000    input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 100, 50)      5050        embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 99, 50)       10050       embedding_4[0][0]                
__________________________________________________________________________________________________
global_max

In [92]:
model.compile('adam', 'binary_crossentropy')

In [93]:
model.fit(train_X, train_y, 
          epochs=200,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-conv.h5', save_best_only=True),
              EarlyStopping(patience=10),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200


<keras.callbacks.History at 0x2337e9afa90>

In [98]:
model.load_weights('model-conv.h5')

def val_result(model):
    val_prediction = model.predict(val_X, verbose=True)
    losses = []
    for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
        print(label)
        loss = log_loss(val_y[:, i], val_prediction[:, i])
        losses.append(loss)
        print('loss: ', loss)
        print(confusion(val_y[:, i], 1.0 * (val_prediction[:, i] > 0.5)))
    print('Total loss: ', np.array(losses).mean())
    
val_result(model)

toxic
loss:  0.103945565169
[[ 0.98794569  0.01205431]
 [ 0.29684119  0.70315881]]
severe_toxic
loss:  0.0248819026766
[[  9.99831387e-01   1.68612739e-04]
 [  9.87500000e-01   1.25000000e-02]]
obscene
loss:  0.0600789274654
[[ 0.99391695  0.00608305]
 [ 0.32498042  0.67501958]]
threat
loss:  0.0126833524881
[[ 1.  0.]
 [ 1.  0.]]
insult
loss:  0.0743594396471
[[ 0.99047034  0.00952966]
 [ 0.43036913  0.56963087]]
identity_hate
loss:  0.0257490880308
[[  9.99957909e-01   4.20910851e-05]
 [  9.36585366e-01   6.34146341e-02]]
Total loss:  0.0502830459129


# Custom loss function

In [99]:
def custom_loss(y_true, y_pred):
    eps = 1e-10
    toxic_pos_weight         = 1.0
    toxic_neg_weight         = 1.0
    severe_toxic_pos_weight  = 1.5
    severe_toxic_neg_weight  = 1.0
    obscene_pos_weight       = 1.0
    obscene_neg_weight       = 1.0
    threat_pos_weight        = 1.5
    threat_neg_weight        = 1.0
    insult_pos_weight        = 1.0
    insult_neg_weight        = 1.0
    identity_hate_pos_weight = 1.5
    identity_hate_neg_weight = 1.0
    toxic         =         toxic_pos_weight *      y_true[:, 0]  * K.log(    y_pred[:, 0] + eps) + \
                            toxic_neg_weight * (1 - y_true[:, 0]) * K.log(1 - y_pred[:, 0] + eps)
    severe_toxic  =  severe_toxic_pos_weight *      y_true[:, 1]  * K.log(    y_pred[:, 1] + eps) + \
                     severe_toxic_neg_weight * (1 - y_true[:, 1]) * K.log(1 - y_pred[:, 1] + eps)
    obscene       =       obscene_pos_weight *      y_true[:, 2]  * K.log(    y_pred[:, 2] + eps) + \
                          obscene_neg_weight * (1 - y_true[:, 2]) * K.log(1 - y_pred[:, 2] + eps)
    threat        =        threat_pos_weight *      y_true[:, 3]  * K.log(    y_pred[:, 3] + eps) + \
                           threat_neg_weight * (1 - y_true[:, 3]) * K.log(1 - y_pred[:, 3] + eps)
    insult        =        insult_pos_weight *      y_true[:, 4]  * K.log(    y_pred[:, 4] + eps) + \
                           insult_neg_weight * (1 - y_true[:, 4]) * K.log(1 - y_pred[:, 4] + eps)
    identity_hate = identity_hate_pos_weight *      y_true[:, 5]  * K.log(    y_pred[:, 5] + eps) + \
                    identity_hate_neg_weight * (1 - y_true[:, 5]) * K.log(1 - y_pred[:, 5] + eps)
    return - (toxic + severe_toxic + obscene + threat + insult + identity_hate) / 6.0

In [100]:
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 100, 100)     72162000    input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 100, 50)      5050        embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_13 (Conv1D)              (None, 99, 50)       10050       embedding_6[0][0]                
__________________________________________________________________________________________________
global_max

In [101]:
model.compile('adam', custom_loss)

In [102]:
model.fit(train_X, train_y, 
          batch_size=32,
          epochs=200,
          validation_data=(val_X, val_y), 
          verbose=True, 
          callbacks=[
              ModelCheckpoint('model-conv.h5', save_best_only=True),
              EarlyStopping(patience=10),
          ])

Train on 71888 samples, validate on 23963 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200


<keras.callbacks.History at 0x2337eb95d68>

In [103]:
model.load_weights('model-conv.h5')

val_result(model)

toxic
loss:  0.105332165325
[[ 0.9878995   0.0121005 ]
 [ 0.31241887  0.68758113]]
severe_toxic
loss:  0.0253344406112
[[ 0.99898832  0.00101168]
 [ 0.8375      0.1625    ]]
obscene
loss:  0.0608212333743
[[ 0.99316759  0.00683241]
 [ 0.33202819  0.66797181]]
threat
loss:  0.0116147358343
[[  9.99832545e-01   1.67455101e-04]
 [  9.34210526e-01   6.57894737e-02]]
insult
loss:  0.0742947426321
[[ 0.99060208  0.00939792]
 [ 0.44043624  0.55956376]]
identity_hate
loss:  0.0248569240586
[[  9.99200269e-01   7.99730617e-04]
 [  7.95121951e-01   2.04878049e-01]]
Total loss:  0.0503757069727
