In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from keras import backend as K

### Custom Py Script
import Pipeline
import Toxic_Models
import Model_trainer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Pretrained Embedding Model

In [3]:
### Load the pretrained word-vector file through the project's Pipeline helper.
### The resulting object is handed to Pipeline.load_data_2path further down.
emb_model = Pipeline.load_emb_model(
    './emb_model/crawl-300d-2M.vec'
)

# Preload class labels and submission template

In [4]:
### Label columns, used both to select the prediction columns of the
### submission frame and as the header of the out-of-fold CSVs.
list_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

### Submission template; the training cells overwrite its label columns
### with averaged predictions before writing it back out.
submission = pd.read_csv("./input/sample_submission.csv")

# Hyperparameters

In [5]:
### Preprocessing parameters (maxlen / max_features are passed to Pipeline.load_data_2path)
maxlen = 180
max_features = 100000
embed_size = 300

### Model parameters (forwarded to the Toxic_Models builders)
cell_size = 64             ### Cell unit size
cell_type_GRU = True       ### Cell Type: GRU/LSTM
filter_size = 64
kernel_size = 2
stride = 1

### K-fold cross-validation
### The same splitter object is reused by every training cell.
k = 5
kf = KFold(n_splits=k, shuffle=False)

### Training protocol
epochs = 13
batch_size = 128
lr_s = True                ### Use of Learning Schedule

In [6]:
### Load and preprocess the data with the pretrained embedding model.
### X_tr is indexed as X_tr[0] / X_tr[1] by the training cells, i.e. it holds
### two parallel input arrays; emb_matrix is passed to every model builder.
X_tr, Y_tr, X_te, emb_matrix = Pipeline.load_data_2path(
    emb_model,
    max_features=max_features,
    maxlen=maxlen,
)

=== Data is loaded
=== Data is preprocessed
=== Embedding Matrix is loaded


In [None]:
from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [None]:
import os

### Make the Graphviz `dot` executable discoverable for the model-plotting cells.
### NOTE: this is a machine-specific Windows install location; adjust it for
### other environments.
_graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'

### Only append when the directory is not already present, so that re-running
### this cell does not keep growing PATH with duplicate entries. A substring
### check is used (rather than splitting on os.pathsep) because the drive
### letter's ':' would be split apart on platforms where os.pathsep is ':'.
_current_path = os.environ.get("PATH", "")
if _graphviz_bin not in _current_path:
    os.environ["PATH"] = _current_path + os.pathsep + _graphviz_bin

In [None]:
### Build one RNN+CNN model instance with the configured hyperparameters;
### the next cell draws its architecture.
mdl = Toxic_Models.get_model_rnn_cnn(
    emb_matrix,
    cell_size=cell_size,
    maxlen=maxlen,
    cell_type_GRU=cell_type_GRU,
    filter_size=filter_size,
    kernel_size=kernel_size,
    stride=stride,
)

In [None]:
### Write the architecture diagram to a PNG file ...
plot_model(mdl, to_file='model_rnn_cnn.png')

### ... and render the same graph inline; the SVG object is the cell's output.
SVG(
    model_to_dot(mdl).create(prog='dot', format='svg')
)

# Model

In [None]:
### K-fold training of the 'rnn' model.
### Writes submission_rnn.csv (test predictions averaged over the k folds)
### and oofs_rnn.csv (per-fold validation outputs stacked in fold order).
model_name = 'rnn'

### ================================================================== ###
oofs = []
res = np.zeros_like(submission[list_classes])

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### A fresh model is built for every fold.
    mdl = Toxic_Models.get_model_rnn(emb_matrix, cell_size=cell_size, maxlen=maxlen, cell_type_GRU=cell_type_GRU)
    ### NOTE(review): `pred` is presumably the test-set prediction and `oof` the
    ### prediction on this fold's validation rows -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length (the sample count is not a multiple of k), and wrapping such a
### ragged list in np.array() first raises ValueError on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'rnncnn' model.
### Writes submission_rnncnn.csv (test predictions averaged over the k folds)
### and oofs_rnncnn.csv (per-fold validation outputs stacked in fold order).
model_name = 'rnncnn'

### ================================================================== ###
oofs = []
res = np.zeros_like(submission[list_classes])

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### A fresh model is built for every fold.
    mdl = Toxic_Models.get_model_rnn_cnn(emb_matrix, cell_size=cell_size, maxlen=maxlen, cell_type_GRU=cell_type_GRU,
                                         filter_size=filter_size, kernel_size=kernel_size, stride=stride)
    ### NOTE(review): `pred` is presumably the test-set prediction and `oof` the
    ### prediction on this fold's validation rows -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length (the sample count is not a multiple of k), and wrapping such a
### ragged list in np.array() first raises ValueError on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'rnncaps' model.
### Writes submission_rnncaps.csv (test predictions averaged over the k folds)
### and oofs_rnncaps.csv (per-fold validation outputs stacked in fold order).
###
### This cell uses the arrays returned by Pipeline.load_data_2path
### (X_tr, Y_tr, X_te, emb_matrix) and the lr_s flag, like the other training
### cells, so that it runs on a fresh kernel.
model_name = 'rnncaps'

oofs = []
res = np.zeros_like(submission[list_classes])
for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### Slice both input arrays and the labels for this fold.
    cv_x_tr = [X_tr[0][train_index], X_tr[1][train_index]]
    cv_x_val = [X_tr[0][val_index], X_tr[1][val_index]]
    cv_y_tr = Y_tr[train_index]
    cv_y_te = Y_tr[val_index]

    ### This model uses its own cell size (80) and batch size (128).
    mdl = Toxic_Models.get_model_rnn_caps(emb_matrix, cell_size=80, maxlen=maxlen, cell_type_GRU=cell_type_GRU)
    ### NOTE(review): the two returned values are presumably (test predictions,
    ### validation-fold predictions) -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=cv_x_tr, X_val=cv_x_val,
        y_tra=cv_y_tr, y_val=cv_y_te, x_test=X_te,
        model_name=model_name, batch_size=128, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length, and wrapping a ragged list in np.array() first raises ValueError
### on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [7]:
### K-fold training of the '2rnn' model.
### Writes submission_2rnn.csv (test predictions averaged over the k folds)
### and oofs_2rnn.csv (per-fold validation outputs stacked in fold order).
model_name = '2rnn'

### ================================================================== ###
oofs = []
res = np.zeros_like(submission[list_classes])

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### A fresh model is built for every fold.
    mdl = Toxic_Models.get_model_2rnn(emb_matrix, cell_size=cell_size, maxlen=maxlen, cell_type_GRU=cell_type_GRU)
    ### NOTE(review): `pred` is presumably the test-set prediction and `oof` the
    ### prediction on this fold's validation rows -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length (the sample count is not a multiple of k), and wrapping such a
### ragged list in np.array() first raises ValueError on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

Train on 127656 samples, validate on 31915 samples
Epoch 1/13
 - 75s - loss: 0.0605 - binary_crossentropy: 0.0605 - acc: 0.9790 - val_loss: 0.0454 - val_binary_crossentropy: 0.0454 - val_acc: 0.9825

 ROC-AUC - epoch: 1 - score: 0.982217 


Epoch 00001: val_loss improved from inf to 0.04540, saving model to best_model.hdf5
Epoch 2/13
 - 70s - loss: 0.0434 - binary_crossentropy: 0.0434 - acc: 0.9833 - val_loss: 0.0419 - val_binary_crossentropy: 0.0419 - val_acc: 0.9836

 ROC-AUC - epoch: 2 - score: 0.989128 


Epoch 00002: val_loss improved from 0.04540 to 0.04191, saving model to best_model.hdf5
Epoch 3/13
 - 70s - loss: 0.0400 - binary_crossentropy: 0.0400 - acc: 0.9843 - val_loss: 0.0396 - val_binary_crossentropy: 0.0396 - val_acc: 0.9842

 ROC-AUC - epoch: 3 - score: 0.990106 


Epoch 00003: val_loss improved from 0.04191 to 0.03957, saving model to best_model.hdf5
Epoch 4/13
 - 72s - loss: 0.0380 - binary_crossentropy: 0.0380 - acc: 0.9851 - val_loss: 0.0403 - val_binary_crossentro


 ROC-AUC - epoch: 4 - score: 0.988066 


Epoch 00004: val_loss improved from 0.03938 to 0.03918, saving model to best_model.hdf5
Epoch 5/13
 - 69s - loss: 0.0374 - binary_crossentropy: 0.0374 - acc: 0.9852 - val_loss: 0.0382 - val_binary_crossentropy: 0.0382 - val_acc: 0.9852

 ROC-AUC - epoch: 5 - score: 0.988472 


Epoch 00005: val_loss improved from 0.03918 to 0.03821, saving model to best_model.hdf5
Epoch 6/13
 - 69s - loss: 0.0365 - binary_crossentropy: 0.0365 - acc: 0.9855 - val_loss: 0.0380 - val_binary_crossentropy: 0.0380 - val_acc: 0.9850

 ROC-AUC - epoch: 6 - score: 0.988636 


Epoch 00006: val_loss improved from 0.03821 to 0.03799, saving model to best_model.hdf5
Epoch 7/13
 - 72s - loss: 0.0358 - binary_crossentropy: 0.0358 - acc: 0.9857 - val_loss: 0.0378 - val_binary_crossentropy: 0.0378 - val_acc: 0.9851

 ROC-AUC - epoch: 7 - score: 0.988753 


Epoch 00007: val_loss improved from 0.03799 to 0.03781, saving model to best_model.hdf5
Epoch 8/13
 - 74s - loss: 0.0415 - b

KeyboardInterrupt: 

In [None]:
### K-fold training of the '2rnncnn' model.
### Writes submission_2rnncnn.csv (test predictions averaged over the k folds)
### and oofs_2rnncnn.csv (per-fold validation outputs stacked in fold order).
model_name = '2rnncnn'

### ================================================================== ###
oofs = []
res = np.zeros_like(submission[list_classes])

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### A fresh model is built for every fold.
    mdl = Toxic_Models.get_model_2rnn_cnn(emb_matrix, cell_size=cell_size, maxlen=maxlen, cell_type_GRU=cell_type_GRU,
                                          filter_size=filter_size, kernel_size=kernel_size, stride=stride)
    ### NOTE(review): `pred` is presumably the test-set prediction and `oof` the
    ### prediction on this fold's validation rows -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length (the sample count is not a multiple of k), and wrapping such a
### ragged list in np.array() first raises ValueError on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'rnn2cnndr' model.
### Writes submission_rnn2cnndr.csv (test predictions averaged over the k folds)
### and oofs_rnn2cnndr.csv (per-fold validation outputs stacked in fold order).
###
### This cell uses the arrays returned by Pipeline.load_data_2path
### (X_tr, Y_tr, X_te, emb_matrix) and the lr_s flag, like the other training
### cells, so that it runs on a fresh kernel.
model_name = 'rnn2cnndr'

oofs = []
res = np.zeros_like(submission[list_classes])
for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### Slice both input arrays and the labels for this fold.
    cv_x_tr = [X_tr[0][train_index], X_tr[1][train_index]]
    cv_x_val = [X_tr[0][val_index], X_tr[1][val_index]]
    cv_y_tr = Y_tr[train_index]
    cv_y_te = Y_tr[val_index]

    ### This model uses its own cell size (80) and batch size (256).
    mdl = Toxic_Models.get_model_rnn2_cnn_directlink(emb_matrix, cell_size=80, maxlen=maxlen, cell_type_GRU=cell_type_GRU)
    ### NOTE(review): the two returned values are presumably (test predictions,
    ### validation-fold predictions) -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=cv_x_tr, X_val=cv_x_val,
        y_tra=cv_y_tr, y_val=cv_y_te, x_test=X_te,
        model_name=model_name, batch_size=256, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length, and wrapping a ragged list in np.array() first raises ValueError
### on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'rnn2cnn32dr' model (kernel_size=3, stride=2 variant).
### Writes submission_rnn2cnn32dr.csv (test predictions averaged over the k
### folds) and oofs_rnn2cnn32dr.csv (per-fold validation outputs stacked in
### fold order).
###
### This cell uses the arrays returned by Pipeline.load_data_2path
### (X_tr, Y_tr, X_te, emb_matrix) and the lr_s flag, like the other training
### cells, so that it runs on a fresh kernel.
model_name = 'rnn2cnn32dr'

oofs = []
res = np.zeros_like(submission[list_classes])
for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### Slice both input arrays and the labels for this fold.
    cv_x_tr = [X_tr[0][train_index], X_tr[1][train_index]]
    cv_x_val = [X_tr[0][val_index], X_tr[1][val_index]]
    cv_y_tr = Y_tr[train_index]
    cv_y_te = Y_tr[val_index]

    ### This model uses its own cell size (80), convolution settings and batch size (256).
    mdl = Toxic_Models.get_model_rnn2_cnn_directlink(emb_matrix, cell_size=80, maxlen=maxlen, cell_type_GRU=cell_type_GRU,
                                                     kernel_size=3, stride=2)
    ### NOTE(review): the two returned values are presumably (test predictions,
    ### validation-fold predictions) -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=cv_x_tr, X_val=cv_x_val,
        y_tra=cv_y_tr, y_val=cv_y_te, x_test=X_te,
        model_name=model_name, batch_size=256, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length, and wrapping a ragged list in np.array() first raises ValueError
### on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'dualrnn2cnndr' model.
### Writes submission_dualrnn2cnndr.csv (test predictions averaged over the k
### folds) and oofs_dualrnn2cnndr.csv (per-fold validation outputs stacked in
### fold order).
###
### This cell uses the arrays returned by Pipeline.load_data_2path
### (X_tr, Y_tr, X_te, emb_matrix) and the lr_s flag, like the other training
### cells, so that it runs on a fresh kernel.
model_name = 'dualrnn2cnndr'

oofs = []
res = np.zeros_like(submission[list_classes])
for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### Slice both input arrays and the labels for this fold.
    cv_x_tr = [X_tr[0][train_index], X_tr[1][train_index]]
    cv_x_val = [X_tr[0][val_index], X_tr[1][val_index]]
    cv_y_tr = Y_tr[train_index]
    cv_y_te = Y_tr[val_index]

    ### This model uses its own cell size (80) and batch size (256), and
    ### always requests the GRU cell type rather than following cell_type_GRU.
    mdl = Toxic_Models.get_model_dual_rnn2_cnn_directlink(emb_matrix, cell_size=80, maxlen=maxlen, cell_type_GRU=True)
    ### NOTE(review): the two returned values are presumably (test predictions,
    ### validation-fold predictions) -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=cv_x_tr, X_val=cv_x_val,
        y_tra=cv_y_tr, y_val=cv_y_te, x_test=X_te,
        model_name=model_name, batch_size=256, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length, and wrapping a ragged list in np.array() first raises ValueError
### on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
### K-fold training of the 'dualrnn2cnn32dr' model.
### Writes submission_dualrnn2cnn32dr.csv (test predictions averaged over the
### k folds) and oofs_dualrnn2cnn32dr.csv (per-fold validation outputs stacked
### in fold order).
###
### This cell uses the arrays returned by Pipeline.load_data_2path
### (X_tr, Y_tr, X_te, emb_matrix) and the lr_s flag, like the other training
### cells, so that it runs on a fresh kernel.
model_name = 'dualrnn2cnn32dr'

oofs = []
res = np.zeros_like(submission[list_classes])
for train_index, val_index in kf.split(X_tr[0], Y_tr):
    ### Slice both input arrays and the labels for this fold.
    cv_x_tr = [X_tr[0][train_index], X_tr[1][train_index]]
    cv_x_val = [X_tr[0][val_index], X_tr[1][val_index]]
    cv_y_tr = Y_tr[train_index]
    cv_y_te = Y_tr[val_index]

    ### NOTE(review): this builder call is identical to the one in the
    ### 'dualrnn2cnndr' cell. Judging by the '32' in the model name and by the
    ### 'rnn2cnn32dr' cell, kernel_size=3, stride=2 may have been intended here
    ### -- confirm that the builder accepts those arguments before adding them.
    mdl = Toxic_Models.get_model_dual_rnn2_cnn_directlink(emb_matrix, cell_size=80, maxlen=maxlen, cell_type_GRU=True)
    ### NOTE(review): the two returned values are presumably (test predictions,
    ### validation-fold predictions) -- confirm in Model_trainer.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=cv_x_tr, X_val=cv_x_val,
        y_tra=cv_y_tr, y_val=cv_y_te, x_test=X_te,
        model_name=model_name, batch_size=256, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    ### Drop the Keras session state before the next fold builds a new model.
    K.clear_session()

### Average the accumulated predictions over the folds.
res = res / k
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index=False)

### Concatenate the per-fold arrays directly. The folds need not be the same
### length, and wrapping a ragged list in np.array() first raises ValueError
### on NumPy >= 1.24.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

# END