In [5]:
import numpy as np
np.random.seed(42)
import pandas as pd
import os,sys,time,datetime

from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Flatten, Dropout
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import K, Activation
from keras.engine import Layer

from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
#warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

# Run configuration: data locations, number of CV folds, and the tag that is
# later used to name this model's prediction columns and output files.
DataBaseDir = '../../data/version4'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
kfold = 10
strategy = 'bi-gru-capsule'

# Read each fold's validation split, tag it with its fold id and stack all of
# them into a single training frame. The test set is read once, from the
# directory of fold 0.
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir)
    valid = valid.reset_index(drop=True)
    if fold == 0:
        TestData = pd.read_csv('%s/test.csv' % FoldInputDir)
        TestData = TestData.reset_index(drop=True)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
end = time.time()
print('load data done, train %s, time elapsed %s' % (len(TrainData), (end - start)))
##
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    ``word`` is returned unchanged; every remaining positional argument is
    treated as one vector component.
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients

def LoadEmbeddingVectors(f):
    """Load a text embedding file into a ``{word: float32 vector}`` dict.

    Each data line is expected to be ``word v1 v2 ... vn`` separated by single
    spaces.

    Args:
        f: Path to the embedding file. It is read as UTF-8.

    Returns:
        dict mapping each word to a 1-D ``numpy`` array of dtype float32.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a coefficient cannot be parsed as a float.
    """
    EmbeddingDict = {}
    # Explicit encoding so the result does not depend on the platform locale.
    with open(f, 'r', encoding='utf-8') as i_file:
        for line_no, line in enumerate(i_file):
            fields = line.rstrip().rsplit(' ')
            # word2vec/fastText ``.vec`` files start with a "<count> <dim>"
            # header. Storing it would create a bogus one-value "vector" that
            # silently broadcasts into a full embedding row if that number
            # ever shows up as a token.
            if line_no == 0 and len(fields) == 2 and all(p.isdigit() for p in fields):
                continue
            # Blank lines (or a bare word with no coefficients) carry no vector.
            if len(fields) < 2:
                continue
            EmbeddingDict[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return EmbeddingDict

class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a fixed validation set.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: The score is computed on epochs whose zero-based index is a
            multiple of ``interval``.

    Raises:
        ValueError: if ``validation_data`` is not a two-item sequence.
    """

    def __init__(self, validation_data=(), interval=1):
        # The original called super(Callback, self).__init__(), which resolves
        # to the class *after* Callback in the MRO and therefore skipped
        # Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        if len(validation_data) != 2:
            raise ValueError('validation_data must be an (X_val, y_val) pair')
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # ``logs`` is unused here; None replaces the shared mutable ``{}`` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

            

# Label columns predicted by the model, and the pre-trained vectors to load.
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
EmbeddingFile = '../../data/raw/crawl-300d-2M.vec'

# Text settings: vocabulary cap handed to the tokenizer / embedding layer,
# padded sequence length and embedding width.
# (smaller debug values: max_features = 3000, maxlen = 10)
max_features = 80000
maxlen = 256
embed_size = 300

# Training schedule.
batch_size = 128
epochs = 5

# Network hyper-parameters. NOTE(review): gru_len, Num_capsule, Dim_capsule
# and rate_drop_dense are not referenced anywhere else in the visible code.
gru_len = 128
Routings = 5
Num_capsule = 40
Dim_capsule = 64
dropout_p = 0.25
rate_drop_dense = 0.25

# Load the pre-trained word vectors once; they are reused by every fold.
start = time.time()
EmbeddingIndex = LoadEmbeddingVectors(EmbeddingFile)
end = time.time()
print('load embedding features done, corpus size %s, time elapsed %s' % (len(EmbeddingIndex), (end - start)))

def squash(x, axis=-1):
    """Rescale ``x`` by its L2 norm along ``axis``.

    Computes ``x / sqrt(sum(x**2) + epsilon)``, i.e. a plain normalisation
    rather than a norm-dependent squashing factor.
    """
    # Earlier variant, kept for reference; it was disabled with the remark
    # "s_squared_norm is really small":
    #   s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    #   scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    #   return scale * x
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())


# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing, built on Keras backend ops.

    The input is projected with a single learned kernel into
    ``num_capsule * dim_capsule`` features per input position, reshaped into
    per-capsule prediction vectors, and combined over ``routings`` iterations.

    Args:
        num_capsule: Number of output capsules.
        dim_capsule: Size of each output capsule vector.
        routings: Number of routing iterations.
        kernel_size: Stored on the instance only.
            NOTE(review): not used by ``build`` or ``call`` in the visible code.
        share_weights: If true, one projection kernel is shared by all input
            positions; otherwise each position gets its own kernel.
        activation: ``'default'`` selects the module-level ``squash``; any other
            value is wrapped in a Keras ``Activation`` layer.
        **kwargs: Forwarded to ``Layer.__init__``.

    NOTE(review): no ``get_config`` is defined, so the constructor arguments are
    presumably not preserved when the layer is serialised — confirm before
    saving/loading models that contain it.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel ``self.W`` from the input shape."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # Leading dimension of 1: the same projection is applied at every
            # input position by the conv1d in call().
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One kernel slice per input position (input_shape[-2]).
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project ``u_vecs`` and run the routing iterations.

        Returns the activated capsule tensor from the last routing iteration.
        NOTE(review): ``outputs`` is only assigned inside the loop, so
        ``routings < 1`` would fail at the ``return``.
        """
        # Project every input position into num_capsule * dim_capsule features.
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        # Shapes are read dynamically so the batch size can stay unknown.
        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Move num_capsule to the last axis before the softmax, then move
            # both tensors back to [None, num_capsule, input_num_capsule].
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Contract axis 2 (input_num_capsule) of the coefficients and the
            # prediction vectors, then apply the activation.
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits: contract the dim_capsule axes of the current
                # outputs and the prediction vectors. The last iteration skips
                # this because b is not read again.
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return ``(None, num_capsule, dim_capsule)`` regardless of ``input_shape``."""
        return (None, self.num_capsule, self.dim_capsule)


def get_model(embedding_matrix):
    """Build and compile the Bi-GRU + dual-capsule classifier.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional GRU (full sequence) -> two parallel ``Capsule`` heads
    (32 x 64 and 64 x 32), each flattened and passed through dropout ->
    concatenation -> sigmoid output with one unit per entry in ``targets``.

    Args:
        embedding_matrix: 2-D array ``(vocabulary rows, embedding width)`` used
            as the fixed weights of the embedding layer.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric). The model summary is printed as a side effect.
    """
    # Size the embedding layer from the matrix itself rather than from the
    # max_features / embed_size globals: the two only agree when the tokenizer
    # vocabulary reaches max_features, and a mismatch makes Keras reject the
    # supplied weights.
    vocab_size, vector_dim = embedding_matrix.shape

    input1 = Input(shape=(maxlen,))
    embed_layer = Embedding(vocab_size,
                            vector_dim,
                            weights=[embedding_matrix],
                            trainable=False)(input1)
    embed_layer = SpatialDropout1D(0.5)(embed_layer)
    # gru_len replaces a literal 128 that duplicated the module-level setting.
    x = Bidirectional(
        GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True)
    )(embed_layer)

    ## capsule 1
    capsule1 = Capsule(num_capsule=32, dim_capsule=64, routings=Routings, share_weights=True)(x)
    capsule1 = Flatten()(capsule1)
    capsule1 = Dropout(dropout_p)(capsule1)
    ## capsule 2
    capsule2 = Capsule(num_capsule=64, dim_capsule=32, routings=Routings, share_weights=True)(x)
    capsule2 = Flatten()(capsule2)
    capsule2 = Dropout(dropout_p)(capsule2)

    conc = concatenate([capsule1, capsule2])

    # One sigmoid unit per label column (previously a hard-coded 6).
    output = Dense(len(targets), activation='sigmoid')(conc)

    model = Model(inputs=input1, outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    model.summary()
    return model

##
## k-fold training: out-of-fold validation predictions, fold-averaged test
## predictions, per-fold CSV output and the final submission file.
cv_score = .0
start = time.time()
pred_cols = ['%s_%s' % (strategy, c) for c in targets]
# TestData[pred_cols] holds the running SUM of the per-fold test predictions;
# it is divided by kfold once the loop has finished.
for c in pred_cols:
    TestData[c] = .0
for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    # 'valid' and 'test' are written to below, so they must be independent
    # copies:
    #   * 'valid' would otherwise be a slice of TrainData (SettingWithCopyWarning).
    #   * 'test' previously aliased TestData itself, so resetting its prediction
    #     columns wiped the running sum every fold and the following `+=`
    #     double-counted the current fold. The final "average" was therefore
    #     only 2/kfold times the last fold's predictions.
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold].copy(),
        'test': TestData.copy()
    }
    for c in pred_cols:
        FoldData['valid'][c] = .0
        FoldData['test'][c] = .0
    ## tokenize with the entire corpus composed of train/valid/test
    tokenizer = text.Tokenizer(num_words=max_features)
    EntireCorpus = list(FoldData['train']['comment_text'].values) + list(FoldData['valid']['comment_text'].values) + list(FoldData['test']['comment_text'].values)
    tokenizer.fit_on_texts(EntireCorpus)
    X_train = tokenizer.texts_to_sequences(FoldData['train']['comment_text'].values)
    X_valid = tokenizer.texts_to_sequences(FoldData['valid']['comment_text'].values)
    X_test = tokenizer.texts_to_sequences(FoldData['test']['comment_text'].values)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    Y_train = FoldData['train'][targets].values
    Y_valid = FoldData['valid'][targets].values
    ## embedding with pre-trained embedding library
    word_index = tokenizer.word_index
    # Tokenizer indices start at 1 (0 is reserved), so a vocabulary of N words
    # needs N + 1 rows. Without the +1 the largest index overflowed the matrix
    # whenever the vocabulary was smaller than max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = EmbeddingIndex.get(word)
        # Words without a pre-trained vector keep an all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    ## construct bi-gru model
    model = get_model(embedding_matrix)
    RocAuc = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1)
    hist = model.fit(X_train, Y_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_data=(X_valid, Y_valid),
                     callbacks=[RocAuc], verbose=2)
    end = time.time()
    print('fitting done, time elapsed %s.' % (end - start))
    ## predict for valid
    pred_valid = model.predict(X_valid, batch_size=1024)
    FoldData['valid'][pred_cols] = pred_valid
    ## predict for test: keep this fold's predictions in the per-fold frame and
    ## add them to the running sum in TestData
    pred_test = model.predict(X_test, batch_size=1024)
    FoldData['test'][pred_cols] = pred_test
    TestData[pred_cols] += pred_test
    ## evaluate
    score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_cols])
    cv_score += score
    ## output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    os.makedirs(FoldOutputDir, exist_ok=True)
    for mod in ['valid', 'test']:
        # test rows have no labels: write id + predictions;
        # valid rows: write predictions + true labels.
        if mod == 'test':
            out_cols = ['id'] + pred_cols
        else:
            out_cols = pred_cols + targets
        FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy), float_format='%.8f', index=False)
    end = time.time()
    print('fold %s, score %.5f, time elapsed %.2fs' % (fold, score, (end - start)))
cv_score /= kfold
# Turn the accumulated sum into the mean prediction over all folds.
TestData[pred_cols] /= kfold
end = time.time()
print('\n================')
print('cv score %.5f,  time elapsed %s' % (cv_score, (end - start)))
print('================')

## submit
sub = TestData[['id']].copy()
sub[targets] = TestData[pred_cols]
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l0/submit' % DataBaseDir
os.makedirs(SubmitDir, exist_ok=True)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data for fold 3 done.
load data for fold 4 done.
load data for fold 5 done.
load data for fold 6 done.
load data for fold 7 done.
load data for fold 8 done.
load data for fold 9 done.
load data done, train 159571, time elapsed 0.9403910636901855
load embedding features done, corpus size 2000000, time elapsed 77.1948184967041



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_8 (InputLayer)             (None, 256)           0                                            
____________________________________________________________________________________________________
embedding_8 (Embedding)          (None, 256, 300)      24000000    input_8[0][0]                    
____________________________________________________________________________________________________
spatial_dropout1d_8 (SpatialDrop (None, 256, 300)      0           embedding_8[0][0]                
____________________________________________________________________________________________________
bidirectional_8 (Bidirectional)  (None, 256, 256)      329472      spatial_dropout1d_8[0][0]        
___________________________________________________________________________________________

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


fold 0, score 0.98771, time elapsed 3546.81s

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)             (None, 256)           0                                            
____________________________________________________________________________________________________
embedding_9 (Embedding)          (None, 256, 300)      24000000    input_9[0][0]                    
____________________________________________________________________________________________________
spatial_dropout1d_9 (SpatialDrop (None, 256, 300)      0           embedding_9[0][0]                
____________________________________________________________________________________________________
bidirectional_9 (Bidirectional)  (None, 256, 256)      329472      spatial_dropout1d_9[0][0]        
_____________________________________________

Train on 143608 samples, validate on 15963 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.979664 

709s - loss: 0.0629 - acc: 0.9779 - val_loss: 0.0529 - val_acc: 0.9788
Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.986104 

713s - loss: 0.0495 - acc: 0.9814 - val_loss: 0.0599 - val_acc: 0.9759
Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.987247 

708s - loss: 0.0469 - acc: 0.9822 - val_loss: 0.0480 - val_acc: 0.9807
Epoch 4/5

 ROC-AUC - epoch: 4 - score: 0.987286 

706s - loss: 0.0458 - acc: 0.9826 - val_loss: 0.0511 - val_acc: 0.9794
Epoch 5/5

 ROC-AUC - epoch: 5 - score: 0.987528 

714s - loss: 0.0456 - acc: 0.9826 - val_loss: 0.0505 - val_acc: 0.9791
fitting done, time elapsed 13968.775046825409.
fold 3, score 0.98753, time elapsed 14046.52s

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_12 (InputLayer)            (None, 256

fold 5, score 0.98534, time elapsed 20789.67s

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_14 (InputLayer)            (None, 256)           0                                            
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 256, 300)      24000000    input_14[0][0]                   
____________________________________________________________________________________________________
spatial_dropout1d_14 (SpatialDro (None, 256, 300)      0           embedding_14[0][0]               
____________________________________________________________________________________________________
bidirectional_14 (Bidirectional) (None, 256, 256)      329472      spatial_dropout1d_14[0][0]       
____________________________________________

Train on 143630 samples, validate on 15941 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.982092 

716s - loss: 0.0635 - acc: 0.9781 - val_loss: 0.0544 - val_acc: 0.9781
Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.987783 

706s - loss: 0.0499 - acc: 0.9814 - val_loss: 0.0478 - val_acc: 0.9809
Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.988783 

700s - loss: 0.0472 - acc: 0.9822 - val_loss: 0.0472 - val_acc: 0.9812
Epoch 4/5

 ROC-AUC - epoch: 4 - score: 0.989523 

701s - loss: 0.0458 - acc: 0.9825 - val_loss: 0.0425 - val_acc: 0.9833
Epoch 5/5

 ROC-AUC - epoch: 5 - score: 0.989851 

702s - loss: 0.0451 - acc: 0.9825 - val_loss: 0.0434 - val_acc: 0.9830
fitting done, time elapsed 31038.91871571541.
fold 8, score 0.98985, time elapsed 31118.20s

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_17 (InputLayer)            (None, 256)

0