In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
import os,sys,time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [5]:
# ---- run settings -----------------------------------------------------------
DataBaseDir = '../../data/version1'
InputDir = '%s/l0/kfold' % DataBaseDir    # per-fold input CSVs are read from here
OutputDir = '%s/l1/kfold' % DataBaseDir   # per-fold outputs are written under here
kfold = 3
strategy = 'bi-gru'

# ---- load data --------------------------------------------------------------
# Every fold directory contributes its valid.csv; concatenated, these frames
# become TrainData, with a 'fold' column recording where each row came from.
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir).reset_index(drop=True)
    if fold == 0:
        # The holdout frame is read only once, from the first fold's directory.
        HoldoutData = pd.read_csv('%s/holdout.csv' % FoldInputDir).reset_index(drop=True)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
end = time.time()
print('load data done, train %s holdout %s, time elapsed %s' % (len(TrainData), len(HoldoutData), (end - start)))
##
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 numpy array.

    Parameters
    ----------
    word : the token, returned unchanged.
    *arr : the vector components (e.g. numeric strings split from a text line).

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 array built from ``arr``.
    """
    coefficients = np.asarray(arr, dtype=np.float32)
    return (word, coefficients)

def LoadEmbeddingVectors(f, limit=5000):
    """Read word vectors from a space-separated text embedding file.

    Every data line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Parameters
    ----------
    f : str
        Path of the embedding file.
    limit : int or None, optional
        Maximum number of vectors to load. Defaults to 5000, the debug cap
        that used to be hard-coded in the body; pass ``None`` to read the
        whole file.

    Returns
    -------
    dict
        Maps each token to a 1-D float32 numpy array.
    """
    EmbeddingDict = {}
    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform default
    # encoding. The ``with`` block closes the file; no manual close() is needed.
    with open(f, 'r', encoding='utf-8') as i_file:
        for line_no, line in enumerate(i_file):
            if limit is not None and len(EmbeddingDict) >= limit:
                break
            fields = line.rstrip().rsplit(' ')
            # fastText-style .vec files open with a "<vocab size> <dimension>"
            # header. Parsed as data it would become a one-component "vector"
            # that silently broadcasts over a whole embedding-matrix row.
            if (line_no == 0 and len(fields) == 2
                    and fields[0].isdigit() and fields[1].isdigit()):
                continue
            if len(fields) < 2:
                # Blank line or a token without any components: nothing to store.
                continue
            EmbeddingDict[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return EmbeddingDict

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a validation set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair; it is unpacked into ``self.X_val`` and
        ``self.y_val``. The empty default cannot be unpacked, so a real pair
        must be supplied.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of this value.
    """
    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback in the
        # MRO and therefore skip Callback.__init__; name this class instead.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation set and print its ROC-AUC."""
        # ``logs`` is accepted for interface compatibility only; a None default
        # avoids sharing one mutable dict across calls.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

# Pre-trained word vectors and the label columns the model predicts.
EmbeddingFile = '../../data/raw/crawl-300d-2M.vec'
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Tokenizer vocabulary cap, padded sequence length and embedding width.
max_features, maxlen, embed_size = 30000, 100, 300
# Training schedule.
batch_size, epochs = 32, 2

# Load the word -> vector lookup and report its size and load time.
start = time.time()
EmbeddingIndex = LoadEmbeddingVectors(EmbeddingFile)
end = time.time()
print('load embedding features done, corpus size %s, time elapsed %s' % (len(EmbeddingIndex), (end - start)))

def get_model(embedding_matrix):
    """Build and compile the bidirectional-GRU multi-label classifier.

    Architecture: embedding (initialised from ``embedding_matrix``) ->
    spatial dropout -> bidirectional GRU returning sequences -> concatenated
    global average and max pooling -> one sigmoid unit per entry in
    ``targets``.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array of shape ``(vocabulary rows, embedding width)`` used as the
        initial embedding weights.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss and the Adam optimizer.
    """
    # Size the layer from the matrix itself so the layer and its initial
    # weights can never disagree (previously the module-level max_features /
    # embed_size had to match the matrix by convention).
    vocab_size, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # One independent sigmoid output per label column instead of a literal 6.
    outp = Dense(len(targets), activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

cv_score = .0
holdout_score = .0
# Wall-clock origin for the whole run. It is deliberately never reset inside
# the loop, so the final summary reports the true total elapsed time.
start = time.time()
pred_cols = ['%s_%s' % (strategy, c) for c in targets]
# Build a new list: list.extend() mutates in place and returns None.
out_cols = pred_cols + targets
for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    fold_start = time.time()
    # 'valid' and 'holdout' receive prediction columns below, so work on copies:
    # this keeps TrainData / HoldoutData untouched across folds and avoids
    # writing into a boolean-mask slice of TrainData.
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold].copy(),
        'holdout': HoldoutData.copy()
    }
    ## tokenize with entire corpus composed by train/valid/holdout
    tokenizer = text.Tokenizer(num_words= max_features)
    EntireCorpus = list(FoldData['train']['comment_text'].values) + list(FoldData['valid']['comment_text'].values) + list(FoldData['holdout']['comment_text'].values)
    tokenizer.fit_on_texts(EntireCorpus)
    X_train = tokenizer.texts_to_sequences(FoldData['train']['comment_text'].values)
    X_valid = tokenizer.texts_to_sequences(FoldData['valid']['comment_text'].values)
    X_holdout = tokenizer.texts_to_sequences(FoldData['holdout']['comment_text'].values)
    X_train = sequence.pad_sequences(X_train, maxlen= maxlen)
    X_valid = sequence.pad_sequences(X_valid, maxlen= maxlen)
    X_holdout = sequence.pad_sequences(X_holdout, maxlen= maxlen)
    Y_train = FoldData['train'][targets].values
    Y_valid = FoldData['valid'][targets].values
    Y_holdout = FoldData['holdout'][targets].values
    print('token done, time elapsed %s.' % (time.time() - fold_start))
    ## embedding with pre-trained embedding library
    word_index = tokenizer.word_index
    # Always allocate max_features rows: Keras word indices start at 1 (0 is
    # reserved), so sizing by len(word_index) is one row short for a small
    # vocabulary, and the row count must equal the Embedding layer's input size.
    embedding_matrix = np.zeros((max_features, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = EmbeddingIndex.get(word)
        # Ignore vectors of the wrong width (e.g. a parsed file header) rather
        # than letting numpy broadcast them across the whole row.
        if embedding_vector is not None and len(embedding_vector) == embed_size:
            embedding_matrix[i] = embedding_vector
    print('embedding done, time elapsed %s.' % (time.time() - fold_start))
    ## construct bi-gru model
    model = get_model(embedding_matrix)
    RocAuc = RocAucEvaluation(validation_data= (X_valid, Y_valid), interval=1)
    hist = model.fit(X_train, Y_train,
                     batch_size= batch_size,
                     epochs= epochs,
                     validation_data= (X_valid, Y_valid),
                     callbacks=[RocAuc], verbose=2)
    print('fitting done, time elapsed %s.' % (time.time() - fold_start))
    ## predict
    valid_pred = model.predict(X_valid, batch_size=1024)
    holdout_pred = model.predict(X_holdout, batch_size=1024)
    # Store predictions on FoldData (FoldInputDir is a path string, not the
    # frames). Column-by-column assignment works across pandas versions.
    for j, col in enumerate(pred_cols):
        FoldData['valid'][col] = valid_pred[:, j]
        FoldData['holdout'][col] = holdout_pred[:, j]
    print('predict done, time elapsed %s.' % (time.time() - fold_start))
    ## evaluate -- roc_auc_score takes (y_true, y_score), in that order
    cv_score += roc_auc_score(Y_valid, valid_pred)
    holdout_score += roc_auc_score(Y_holdout, holdout_pred)
    ## output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    os.makedirs(FoldOutputDir, exist_ok=True)
    for mod in ['valid', 'holdout']:
        FoldData[mod][out_cols].to_csv('%s/%s.csv' % (FoldOutputDir, mod))
    print('output done, time elapsed %s.\n' % (time.time() - fold_start))
cv_score /= kfold
holdout_score /= kfold
end = time.time()
print('\n================')
print('cv score %.5f, holdout score %.5f, time elapsed %s' % (cv_score, holdout_score, (end - start)))
print('================')

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data done, train 143591 holdout 15980, time elapsed 0.926793098449707
load embedding features done, corpus size 5000, time elapsed 0.3693718910217285

token done, time elapsed 19.210576057434082.
embedding done, time elapsed 19.279154777526855.
Train on 95716 samples, validate on 47875 samples
Epoch 1/2


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-81e455501cd8>", line 125, in <module>
    callbacks=[RocAuc], verbose=2)
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1598, in fit
    validation_steps=validation_steps)
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1183, in _fit_loop
    outs = f(ins_batch)
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2273, in __call__
    **self.session_kwargs)
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 778, in run
    run_metadata_ptr)
  File "/Users/yuanpingzhou/miniconda3/lib/python3.6/site-packages/tensorflow/python/client/sess

KeyboardInterrupt: 