In [8]:
import numpy as np
np.random.seed(42)
import pandas as pd
import os,sys,time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

In [9]:
# ---- paths and cross-validation configuration ----
DataBaseDir = '../../data/version1'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1/kfold' % DataBaseDir
kfold = 3
strategy = 'bi-gru'

# ---- load data ----
# Each fold directory contributes its valid.csv, tagged with the fold number;
# stacking them yields the full training frame. holdout.csv is read only once,
# from fold 0's directory (presumably it is identical in every fold -- verify).
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = (
        pd.read_csv('%s/valid.csv' % FoldInputDir)
        .reset_index(drop=True)
        .assign(fold=fold)
    )
    if fold == 0:
        HoldoutData = (
            pd.read_csv('%s/holdout.csv' % FoldInputDir)
            .reset_index(drop=True)
        )
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, ignore_index=True)
end = time.time()
print('load data done, train %s holdout %s, time elapsed %s'
      % (len(TrainData), len(HoldoutData), (end - start)))
##### model selection with CV

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data done, train 143591 holdout 15980, time elapsed 0.8826570510864258


In [None]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: the token; returned unchanged.
        *arr: the remaining fields of the record (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D ``float32``
        numpy array built from ``arr`` (empty when no coefficients are given).
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a fixed validation set.

    Args:
        validation_data: a ``(X_val, y_val)`` pair. It is unpacked in
            ``__init__``, so the default empty tuple raises ``ValueError``;
            callers must pass a real pair.
        interval: the score is computed at the end of every epoch whose
            0-based index is a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super(): super(Callback, self)
        # starts the MRO lookup *after* Callback and therefore skips
        # Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print its ROC-AUC.

        Args:
            epoch: 0-based epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Reported epoch number is 1-based for readability.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

# ---- label columns and hyper-parameters ----
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
EmbeddingFile = '../../data/raw/crawl-300d-2M.vec'
max_features = 30000   # vocabulary size kept by the tokenizer / embedding layer
maxlen = 100           # every comment is padded/truncated to this many tokens
embed_size = 300       # dimensionality of the pre-trained vectors
batch_size = 32
epochs = 2

# ---- load the pre-trained word vectors into a {word: float32 vector} dict ----
start = time.time()
EmbeddingInidex = {}
# The context manager closes the file deterministically (the original left
# the handle open), and the explicit encoding avoids depending on the
# platform default.
with open(EmbeddingFile, encoding='utf-8') as embedding_fh:
    for line in embedding_fh:
        word, vector = get_coefs(*line.rstrip().rsplit(' '))
        # Keep only records with exactly embed_size coefficients. This drops
        # the "<vocab size> <dim>" header line of the .vec format (and blank
        # lines), which would otherwise be stored as a bogus short vector.
        if len(vector) != embed_size:
            continue
        EmbeddingInidex[word] = vector
end = time.time()
print('load embedding features done, corpus size %s, time elapsed %s' % (len(EmbeddingInidex), (end - start)))

def get_model(embedding_matrix):
    """Build and compile the bidirectional-GRU multi-label classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    SpatialDropout1D(0.2) -> Bidirectional GRU (80 units, full sequence
    returned) -> concatenated global average and global max pooling ->
    Dense(6, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape ``(vocab_size, vector_dim)``
            holding the initial embedding weights. The embedding layer is
            sized from this array, so the matrix and the layer can never
            disagree.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric) taking integer sequences of length ``maxlen``.
    """
    # Size the layer from the weights actually supplied instead of the
    # max_features / embed_size globals: a matrix with a different number of
    # rows would otherwise fail when the weights are loaded.
    embedding_matrix = np.asarray(embedding_matrix)
    vocab_size, vector_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    # Summarise the sequence two ways and let the output layer use both.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # Six independent sigmoid outputs (multi-label, not softmax).
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

# Cross-validation loop: for each fold, tokenize, build the embedding matrix,
# train a fresh bi-GRU on the other folds, and predict on the holdout set.
for fold in range(kfold):
    # The banner previously printed a literal '%s'; interpolate the fold id.
    print('=============== fold %s ==============\n' % fold)
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': HoldoutData
    }
    # Missing comments (NaN after read_csv) are replaced by '' so the
    # tokenizer only ever receives strings.
    FoldText = {
        name: frame['comment_text'].fillna('').values
        for name, frame in FoldData.items()
    }
    ## tokenize with entire corpus composed by train/valid/holdout
    tokenizer = text.Tokenizer(num_words= max_features)
    # FoldData has no 'test' entry (that lookup raised KeyError); the third
    # split is 'holdout'.
    EntireCorpus = list(FoldText['train']) + list(FoldText['valid']) + list(FoldText['holdout'])
    tokenizer.fit_on_texts(EntireCorpus)
    X_train = tokenizer.texts_to_sequences(FoldText['train'])
    X_valid = tokenizer.texts_to_sequences(FoldText['valid'])
    X_holdout = tokenizer.texts_to_sequences(FoldText['holdout'])
    X_train = sequence.pad_sequences(X_train, maxlen= maxlen)
    X_valid = sequence.pad_sequences(X_valid, maxlen= maxlen)
    X_holdout = sequence.pad_sequences(X_holdout, maxlen= maxlen)
    Y_train = FoldData['train'][targets].values
    Y_valid = FoldData['valid'][targets].values
    Y_holdout = FoldData['holdout'][targets].values
    print('token done.')
    ## embedding with pre-trained embedding library
    word_index = tokenizer.word_index
    # Always allocate max_features rows. Tokenizer indices start at 1, so a
    # matrix with only len(word_index) rows overflows for small vocabularies,
    # and the embedding layer is built for max_features rows anyway.
    embedding_matrix = np.zeros((max_features, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        # The loaded dictionary is named EmbeddingInidex; 'embeddings_index'
        # was never defined.
        embedding_vector = EmbeddingInidex.get(word)
        # Skip unknown words and any vector of the wrong length (e.g. a
        # parsed header line) so a short vector is never broadcast over a row.
        if embedding_vector is not None and len(embedding_vector) == embed_size:
            embedding_matrix[i] = embedding_vector
    print('embedding done.')
    ## construct bi-gru model
    model = get_model(embedding_matrix)
    # 'RocAuc' was undefined; instantiate the callback on this fold's
    # validation split.
    roc_auc = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1)
    hist = model.fit(X_train, Y_train,
                     batch_size= batch_size,
                     epochs= epochs,
                     validation_data= (X_valid, Y_valid),
                     callbacks=[roc_auc], verbose=2)
    print('fitting done.')
    Y_holdout_pred = model.predict(X_holdout, batch_size=1024)
    print('predict done.')

2000000
