In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np

def read_vectors():
    with open('vectors.vec', 'r', encoding='utf-8') as src:
        row_count, vector_size = map(int, src.readline().split(' '))
        vectors = np.zeros([row_count, vector_size], dtype=np.float32)
        word2index = {}
        for i in range(row_count):
            if (i % 10000) == 0:
                print("Row {0} of {1}".format(i + 1, row_count))
            row = src.readline()
            parts = row.split(' ')
            if len(parts) != (vector_size + 1):
                continue
            word = parts[0]
            vector = np.fromiter(map(float, parts[1:]), dtype=np.float32)
            word2index[word] = len(word2index)
            vectors[word2index[word]] = vector
    return word2index, vectors

global_word2index, global_vectors = read_vectors()

Row 1 of 2419021
Row 10001 of 2419021
Row 20001 of 2419021
Row 30001 of 2419021
Row 40001 of 2419021
Row 50001 of 2419021
Row 60001 of 2419021
Row 70001 of 2419021
Row 80001 of 2419021
Row 90001 of 2419021
Row 100001 of 2419021
Row 110001 of 2419021
Row 120001 of 2419021
Row 130001 of 2419021
Row 140001 of 2419021
Row 150001 of 2419021
Row 160001 of 2419021
Row 170001 of 2419021
Row 180001 of 2419021
Row 190001 of 2419021
Row 200001 of 2419021
Row 210001 of 2419021
Row 220001 of 2419021
Row 230001 of 2419021
Row 240001 of 2419021
Row 250001 of 2419021
Row 260001 of 2419021
Row 270001 of 2419021
Row 280001 of 2419021
Row 290001 of 2419021
Row 300001 of 2419021
Row 310001 of 2419021
Row 320001 of 2419021
Row 330001 of 2419021
Row 340001 of 2419021
Row 350001 of 2419021
Row 360001 of 2419021
Row 370001 of 2419021
Row 380001 of 2419021
Row 390001 of 2419021
Row 400001 of 2419021
Row 410001 of 2419021
Row 420001 of 2419021
Row 430001 of 2419021
Row 440001 of 2419021
Row 450001 of 2419021
Ro

In [3]:
import pandas as pd

dftrain = pd.read_csv('input/train.csv')
dftest = pd.read_csv('input/test.csv')

In [4]:
import re

def tokenize(text, lowercase=True):
    if lowercase:
        text = text.lower()
    delimeter = "([\\/.,`~@#4%^&*()-+\[\]{}<>'\"]*[ \s\n\t\r]+|[!?.])"
    tokens = re.split(delimeter, text + " ")
    stripped_tokens = map(str.strip, tokens)
    noempty_tokens = filter(bool, stripped_tokens)
    return list(noempty_tokens)
  
text_train = dftrain['comment_text'].apply(tokenize)
text_test = dftest['comment_text'].apply(tokenize)

In [5]:
from itertools import chain

def get_words():
    words_train = set(chain(*text_train))
    words_test = set(chain(*text_test))
    vocabulary_words = set(global_word2index.keys())
    return sorted((words_train | words_test) & vocabulary_words)

words = get_words()

In [6]:
def get_vectors():
    word2index = {
        word: i
        for i, word in enumerate(words)
    }
    vectors = np.zeros([
        len(word2index),
        global_vectors.shape[1]
    ])
    for i, word in enumerate(words):
        vectors[i] = global_vectors[global_word2index[word]]
    return word2index, vectors

word2index, vectors = get_vectors()

In [7]:
import gc

del global_word2index
del global_vectors
gc.collect()

221

In [8]:
MAXLEN = 500

def tokens_to_sequence(tokens):
    result = np.ones([MAXLEN], dtype=np.int32) * len(word2index)
    subsequence = [
        word2index[word]
        for word in tokens
        if word in word2index
    ]
    size = min(MAXLEN, len(subsequence))
    result[:size] = subsequence[:size]
    return result

sequence_train = np.array([tokens_to_sequence(text) for text in text_train])
sequence_test = np.array([tokens_to_sequence(text) for text in text_test])

In [9]:
del text_train
del text_test
gc.collect()

0

In [10]:
def get_embedding_matrix():
    embedding_matrix = np.zeros([vectors.shape[0] + 1, vectors.shape[1]])
    embedding_matrix[:vectors.shape[0], :] = vectors
    return embedding_matrix

embedding_matrix = get_embedding_matrix()

In [11]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
targets_train = np.array(dftrain[targets])

In [12]:
del dftrain
del dftest
gc.collect()

188

In [13]:
from keras.layers import InputLayer, Embedding, Bidirectional, CuDNNGRU, Dropout, Dense, Input, Multiply, Lambda
from keras.optimizers import RMSprop
from keras.models import Sequential, Model
from keras.losses import binary_crossentropy
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [14]:
from tflearn.objectives import roc_auc_score as roc_auc_score_tf
import tensorflow as tf

In [15]:
def label_loss(y_true, y_pred):
    return roc_auc_score_tf(y_pred, y_true)


def loss(y_true, y_pred):
    weight_0 = 1/6
    weight_1 = 1/6
    weight_2 = 1/6
    weight_3 = 1/6
    weight_4 = 1/6
    weight_5 = 1/6
    return  weight_0 * label_loss(y_true[:, 0], y_pred[:, 0]) + \
            weight_1 * label_loss(y_true[:, 1], y_pred[:, 1]) + \
            weight_2 * label_loss(y_true[:, 2], y_pred[:, 2]) + \
            weight_3 * label_loss(y_true[:, 3], y_pred[:, 3]) + \
            weight_4 * label_loss(y_true[:, 4], y_pred[:, 4]) + \
            weight_5 * label_loss(y_true[:, 5], y_pred[:, 5])

In [16]:
def loss_test():
    with tf.Session() as sess:
        y_pred = tf.constant([[0.8, 0.7, 0.8, 0.7, 0.8, 0.7,],
                              [0.7, 0.6, 0.7, 0.6, 0.7, 0.6,],
                              [0.1, 0.0, 0.1, 0.0, 0.1, 0.0,],
                              [0.2, 0.1, 0.2, 0.1, 0.2, 0.1,]])
        y_true = tf.constant([[1.0, 0.0, 1.0, 0.0, 1.0, 0.0,],
                              [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,]])
        print(sess.run(loss(y_pred, y_true)))
        y_pred = tf.constant([[1.0, 0.0, 1.0, 0.0, 1.0, 0.0,],
                              [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,]])
        y_true = tf.constant([[1.0, 0.0, 1.0, 0.0, 1.0, 0.0,],
                              [1.0, 1.0, 1.0, 1.0, 1.0, 1.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,],
                              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,]])
        print(sess.run(loss(y_pred, y_true)))
        
loss_test()

0.008
0.0


In [17]:
def get_embedding():
    return Embedding(embedding_matrix.shape[0],
                     embedding_matrix.shape[1],
                     weights=[embedding_matrix],
                     trainable=False,
                     input_length=MAXLEN)

def get_model():
    K.clear_session()
    model = Sequential([
        InputLayer(input_shape=(MAXLEN,), dtype='int32'),
        get_embedding(),
        Bidirectional(CuDNNGRU(64, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(CuDNNGRU(64, return_sequences=False)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(6, activation='sigmoid')
    ])
    model.compile(optimizer=RMSprop(clipvalue=2, clipnorm=2),
                  loss=loss,
                  metrics=['accuracy'])
    return model

def get_model_trainable_weights(model):
    weights = []
    for layer in model.layers:
        if layer.trainable:
            weights.append(layer.get_weights())
    return weights

In [18]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback


def metric(y_true, y_pred):
    values = []
    for i in range(y_true.shape[1]):
        values.append(roc_auc_score(y_true[:, i], y_pred[:, i]))
    return np.array(values).mean()


class checkpoint(Callback):
    def __init__(self, training_data, validation_data, batch_size, fname):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        self.batch_size = batch_size
        self.fname = fname
        self.best_score = None
        self.best_epoch = None


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x, batch_size=self.batch_size)
        roc = metric(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val, batch_size=self.batch_size)
        roc_val = metric(self.y_val, y_pred_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        
        if (self.best_score is None) or (roc_val > self.best_score):
            self.best_score = roc_val
            self.best_epoch = epoch
            self.model.save(self.fname)

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [19]:
FOLD_COUNT = 10
FOLD_SIZE = len(sequence_train) // FOLD_COUNT
BATCH_SIZE = 256


def fold_mask(fold):
    start_index = FOLD_SIZE * fold
    if fold == FOLD_COUNT - 1:
        end_index = len(sequence_train) - 1
    else:
        end_index = FOLD_SIZE * (fold + 1)
    fold_mask = np.array([True] * len(sequence_train))
    fold_mask[start_index : end_index] = False
    return fold_mask


def fold_train(fold):
    print("Training fold {0}".format(fold))
    train_mask = fold_mask(fold)
    val_mask = np.logical_not(train_mask)
    model = get_model()
    model.fit(sequence_train[train_mask], targets_train[train_mask], 
              validation_data=(sequence_train[val_mask], targets_train[val_mask]),
              batch_size=BATCH_SIZE,
              epochs=60,
              verbose=True,
              callbacks=[
                  checkpoint(
                      (sequence_train[train_mask], targets_train[train_mask]),
                      (sequence_train[val_mask], targets_train[val_mask]),
                      BATCH_SIZE,
                      'model.tmp'
                  ),
                  ModelCheckpoint('model.tmp', save_best_only=True),
                  EarlyStopping(monitor='val_loss', patience=5)
              ])
    model.load_weights('model.tmp')
    return model

In [20]:
import pickle

for i in range(FOLD_COUNT):
    model = fold_train(i)
    trainable_weights = get_model_trainable_weights(model)
    with open('fold-{0}.pkl'.format(i), 'wb') as target:
        pickle.dump(trainable_weights, target)
    del model
    gc.collect()

Training fold 0
Train on 143614 samples, validate on 15957 samples
Epoch 1/60
roc-auc: 0.9838 - roc-auc_val: 0.9827                                                                                                    
Epoch 2/60
roc-auc: 0.9875 - roc-auc_val: 0.9852                                                                                                    
Epoch 3/60
roc-auc: 0.9904 - roc-auc_val: 0.988                                                                                                    
Epoch 4/60
roc-auc: 0.9917 - roc-auc_val: 0.9885                                                                                                    
Epoch 5/60
roc-auc: 0.9927 - roc-auc_val: 0.9887                                                                                                    
Epoch 6/60
roc-auc: 0.9934 - roc-auc_val: 0.9895                                                                                                    
Epoch 7/60
roc-auc: 0.9939 - roc-auc_val

Epoch 9/60
roc-auc: 0.9943 - roc-auc_val: 0.9899                                                                                                    
Training fold 3
Train on 143614 samples, validate on 15957 samples
Epoch 1/60
roc-auc: 0.9827 - roc-auc_val: 0.9781                                                                                                    
Epoch 2/60
roc-auc: 0.9876 - roc-auc_val: 0.9828                                                                                                    
Epoch 3/60
roc-auc: 0.9898 - roc-auc_val: 0.985                                                                                                    
Epoch 4/60
roc-auc: 0.9917 - roc-auc_val: 0.9868                                                                                                    
Epoch 5/60
roc-auc: 0.9925 - roc-auc_val: 0.9864                                                                                                    
Epoch 6/60
roc-auc: 0.9933 - roc-auc_val

Training fold 6
Train on 143614 samples, validate on 15957 samples
Epoch 1/60
roc-auc: 0.9838 - roc-auc_val: 0.9824                                                                                                    
Epoch 2/60
roc-auc: 0.9881 - roc-auc_val: 0.9862                                                                                                    
Epoch 3/60
roc-auc: 0.9905 - roc-auc_val: 0.9881                                                                                                    
Epoch 4/60
roc-auc: 0.9913 - roc-auc_val: 0.9884                                                                                                    
Epoch 5/60
roc-auc: 0.992 - roc-auc_val: 0.988                                                                                                    
Epoch 6/60
roc-auc: 0.993 - roc-auc_val: 0.9894                                                                                                    
Epoch 7/60
roc-auc: 0.9939 - roc-auc_val: 

Epoch 8/60
roc-auc: 0.9943 - roc-auc_val: 0.9865                                                                                                    
Epoch 9/60
roc-auc: 0.9945 - roc-auc_val: 0.9874                                                                                                    
Training fold 9
Train on 143614 samples, validate on 15957 samples
Epoch 1/60
roc-auc: 0.9825 - roc-auc_val: 0.9773                                                                                                    
Epoch 2/60
roc-auc: 0.9871 - roc-auc_val: 0.9826                                                                                                    
Epoch 3/60
roc-auc: 0.99 - roc-auc_val: 0.986                                                                                                    
Epoch 4/60
roc-auc: 0.9918 - roc-auc_val: 0.9878                                                                                                    
Epoch 5/60
roc-auc: 0.9922 - roc-auc_val: 

In [21]:
def read_trained_models_weights():
    models = []
    for i in range(FOLD_COUNT):
        fname = 'fold-{0}.pkl'.format(i)
        with open(fname, 'rb') as src:
            models.append(pickle.load(src))
    return models

model_weights = read_trained_models_weights()

In [None]:
#def build_model_val_predictions():
#    for i, weights in enumerate(model_weights):
#        val_mask = np.logical_not(fold_mask(i))
#        model = get_model()
#        

In [22]:
def build_model():
    K.clear_session()
    fold_models = []
    
    for i in range(FOLD_COUNT):
        fold_model = Sequential([
            InputLayer(input_shape=(MAXLEN, embedding_matrix.shape[1])),
            Bidirectional(CuDNNGRU(64, return_sequences=True)),
            Dropout(0.3),
            Bidirectional(CuDNNGRU(64, return_sequences=False)),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dense(6, activation='sigmoid')
        ])
        weights = model_weights[i]
        trainable_layers = filter(lambda layer: layer.trainable,
                                  fold_model.layers)
        for i, layer in enumerate(trainable_layers):
            layer.set_weights(weights[i])
        fold_models.append(fold_model)
    
    input_layer = Input(shape=(MAXLEN,), dtype='int32')
    embedding = get_embedding() (input_layer)
    fold_outputs = [
        model(embedding)
        for model in fold_models
    ]
    multiply = Multiply() (fold_outputs)
    output = Lambda (lambda X: X ** (1 / FOLD_COUNT)) (multiply)
    model = Model(input_layer, output)
    model.compile(optimizer=RMSprop(clipvalue=1, clipnorm=1),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model 


model = build_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 300)     173121600   input_11[0][0]                   
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 6)            219366      embedding_1[0][0]                
__________________________________________________________________________________________________
sequential_2 (Sequential)       (None, 6)            219366      embedding_1[0][0]                
__________________________________________________________________________________________________
sequential

In [23]:
prediction = model.predict(sequence_test, verbose=True, batch_size=256)



In [24]:
submission = pd.read_csv('input/sample_submission.csv')
len(submission)

153164

In [25]:
prediction.shape

(153164, 6)

In [26]:
for i, target in enumerate(targets):
    print(target)
    print(prediction[:, i].min())
    print(prediction[:, i].max())
    print(prediction[:, i].mean())
    print('')

toxic
0.016473548
0.94471747
0.32081753

severe_toxic
0.0052384604
0.9832689
0.31150168

obscene
0.014428718
0.9641874
0.35117787

threat
0.00039980368
0.9549562
0.22804485

insult
0.0026214288
0.9325024
0.32914492

identity_hate
0.0013246255
0.96608764
0.28602156



In [27]:
submission[targets] = prediction
submission.to_csv('output.csv', index=None)