In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import KFold
from keras import backend as K

### Custom Py Script from src folder
from src import Pipeline, Toxic_Models, Model_trainer

# Load Pretrained Embedding Model

In [None]:
# Pretrained word vectors used to build the embedding matrix.
# GloVe is the active choice; to use FastText instead, swap in the commented call.
# emb_model = Pipeline.load_emb_model('./emb_model/crawl-300d-2M.vec')   # FastText
emb_model = Pipeline.load_emb_model(
    './emb_model/glove.840B.300d.txt'                                      # GloVe
)

# Hyperparameters

In [None]:
### Target label columns (also the prediction columns written to the submission)
list_classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

### Preprocessing settings
max_features = 100000   # forwarded to Pipeline.load_data_2path(max_features=...)
maxlen = 180            # forwarded to the data loader and to every model builder
embed_size = 300        # presumably the embedding dimensionality (300d vector files) -- not read by the cells below

### Model settings
cell_type_GRU = True    # recurrent cell type flag (GRU/LSTM); presumably True selects GRU
cell_size = 64          # recurrent cell unit size
filter_size = 64        # used by the *_cnn model builders only
kernel_size = 2         # used by the *_cnn model builders only
stride = 1              # used by the *_cnn model builders only

### Training protocol
batch_size = 128
epochs = 13
lr_s = True             # passed as lr_schedule=...: use a learning-rate schedule

### K-fold cross-validation (unshuffled splitter shared by every model cell)
k = 5
kf = KFold(shuffle=False, n_splits=k)

# Load data

In [None]:
# Submission template: its class columns are overwritten with averaged predictions
# by each model cell below.
submission = pd.read_csv("./input/sample_submission.csv")

# X_tr is indexed as X_tr[0] / X_tr[1] by the training cells, i.e. it holds two
# parallel inputs; emb_matrix is handed to every model builder.
X_tr, Y_tr, X_te, emb_matrix = Pipeline.load_data_2path(
    emb_model,
    max_features=max_features,
    maxlen=maxlen,
)

# Model

In [None]:
model_name = 'rnn'

### ================================================================== ###
# K-fold cross-validation: build a fresh model per fold, accumulate the values
# returned as `pred` (averaged over folds afterwards) and collect the values
# returned as `oof` -- presumably the out-of-fold predictions on the
# validation split; confirm against Model_trainer.model_train_cv.
oofs = []
# Explicit float accumulator with the same shape as submission[list_classes].
# zeros_like() would inherit the CSV's dtype, and an integer template would
# make the in-place `res += pred` fail.
res = np.zeros((len(submission), len(list_classes)), dtype=float)

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    mdl = Toxic_Models.get_model_rnn(emb_matrix, cell_size=cell_size, maxlen=maxlen,
                                     cell_type_GRU=cell_type_GRU)
    # X_tr holds two parallel inputs; both are sliced with the same fold indices.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()   # reset Keras state before the next fold's model is built
    time.sleep(20)      # presumably lets the backend release memory between folds -- TODO confirm

# Average the accumulated test predictions over the k folds.
res = res/k


### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

# Fold sizes differ whenever the number of training rows is not divisible by k,
# so np.array(oofs) would build a ragged array (ValueError on NumPy >= 1.24).
# Concatenate the per-fold arrays directly instead. kf is created with
# shuffle=False, so the validation folds are consecutive and fold order
# follows the training-row order.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
model_name = 'rnncnn'

### ================================================================== ###
# K-fold cross-validation: build a fresh model per fold, accumulate the values
# returned as `pred` (averaged over folds afterwards) and collect the values
# returned as `oof` -- presumably the out-of-fold predictions on the
# validation split; confirm against Model_trainer.model_train_cv.
oofs = []
# Explicit float accumulator with the same shape as submission[list_classes].
# zeros_like() would inherit the CSV's dtype, and an integer template would
# make the in-place `res += pred` fail.
res = np.zeros((len(submission), len(list_classes)), dtype=float)

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    mdl = Toxic_Models.get_model_rnn_cnn(emb_matrix, cell_size=cell_size, maxlen=maxlen,
                                         cell_type_GRU=cell_type_GRU,
                                         filter_size=filter_size, kernel_size=kernel_size,
                                         stride=stride)
    # X_tr holds two parallel inputs; both are sliced with the same fold indices.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()   # reset Keras state before the next fold's model is built
    time.sleep(20)      # presumably lets the backend release memory between folds -- TODO confirm

# Average the accumulated test predictions over the k folds.
res = res/k


### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

# Fold sizes differ whenever the number of training rows is not divisible by k,
# so np.array(oofs) would build a ragged array (ValueError on NumPy >= 1.24).
# Concatenate the per-fold arrays directly instead. kf is created with
# shuffle=False, so the validation folds are consecutive and fold order
# follows the training-row order.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
model_name = 'rnn_caps'

### ================================================================== ###
# K-fold cross-validation: build a fresh model per fold, accumulate the values
# returned as `pred` (averaged over folds afterwards) and collect the values
# returned as `oof` -- presumably the out-of-fold predictions on the
# validation split; confirm against Model_trainer.model_train_cv.
oofs = []
# Explicit float accumulator with the same shape as submission[list_classes].
# zeros_like() would inherit the CSV's dtype, and an integer template would
# make the in-place `res += pred` fail.
res = np.zeros((len(submission), len(list_classes)), dtype=float)

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    mdl = Toxic_Models.get_model_rnn_caps(emb_matrix, cell_size=cell_size, maxlen=maxlen,
                                          cell_type_GRU=cell_type_GRU)
    # X_tr holds two parallel inputs; both are sliced with the same fold indices.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()   # reset Keras state before the next fold's model is built
    time.sleep(20)      # presumably lets the backend release memory between folds -- TODO confirm

# Average the accumulated test predictions over the k folds.
res = res/k


### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

# Fold sizes differ whenever the number of training rows is not divisible by k,
# so np.array(oofs) would build a ragged array (ValueError on NumPy >= 1.24).
# Concatenate the per-fold arrays directly instead. kf is created with
# shuffle=False, so the validation folds are consecutive and fold order
# follows the training-row order.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
model_name = '2rnn'

### ================================================================== ###
# K-fold cross-validation: build a fresh model per fold, accumulate the values
# returned as `pred` (averaged over folds afterwards) and collect the values
# returned as `oof` -- presumably the out-of-fold predictions on the
# validation split; confirm against Model_trainer.model_train_cv.
oofs = []
# Explicit float accumulator with the same shape as submission[list_classes].
# zeros_like() would inherit the CSV's dtype, and an integer template would
# make the in-place `res += pred` fail.
res = np.zeros((len(submission), len(list_classes)), dtype=float)

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    mdl = Toxic_Models.get_model_2rnn(emb_matrix, cell_size=cell_size, maxlen=maxlen,
                                      cell_type_GRU=cell_type_GRU)
    # X_tr holds two parallel inputs; both are sliced with the same fold indices.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()   # reset Keras state before the next fold's model is built
    time.sleep(20)      # presumably lets the backend release memory between folds -- TODO confirm

# Average the accumulated test predictions over the k folds.
res = res/k


### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

# Fold sizes differ whenever the number of training rows is not divisible by k,
# so np.array(oofs) would build a ragged array (ValueError on NumPy >= 1.24).
# Concatenate the per-fold arrays directly instead. kf is created with
# shuffle=False, so the validation folds are consecutive and fold order
# follows the training-row order.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

In [None]:
model_name = '2rnncnn'

### ================================================================== ###
# K-fold cross-validation: build a fresh model per fold, accumulate the values
# returned as `pred` (averaged over folds afterwards) and collect the values
# returned as `oof` -- presumably the out-of-fold predictions on the
# validation split; confirm against Model_trainer.model_train_cv.
oofs = []
# Explicit float accumulator with the same shape as submission[list_classes].
# zeros_like() would inherit the CSV's dtype, and an integer template would
# make the in-place `res += pred` fail.
res = np.zeros((len(submission), len(list_classes)), dtype=float)

for train_index, val_index in kf.split(X_tr[0], Y_tr):
    mdl = Toxic_Models.get_model_2rnn_cnn(emb_matrix, cell_size=cell_size, maxlen=maxlen,
                                          cell_type_GRU=cell_type_GRU,
                                          filter_size=filter_size, kernel_size=kernel_size,
                                          stride=stride)
    # X_tr holds two parallel inputs; both are sliced with the same fold indices.
    pred, oof = Model_trainer.model_train_cv(
        mdl,
        X_tra=[X_tr[0][train_index], X_tr[1][train_index]],
        X_val=[X_tr[0][val_index], X_tr[1][val_index]],
        y_tra=Y_tr[train_index], y_val=Y_tr[val_index], x_test=X_te,
        model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()   # reset Keras state before the next fold's model is built
    time.sleep(20)      # presumably lets the backend release memory between folds -- TODO confirm

# Average the accumulated test predictions over the k folds.
res = res/k

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

# Fold sizes differ whenever the number of training rows is not divisible by k,
# so np.array(oofs) would build a ragged array (ValueError on NumPy >= 1.24).
# Concatenate the per-fold arrays directly instead. kf is created with
# shuffle=False, so the validation folds are consecutive and fold order
# follows the training-row order.
np_oofs = np.concatenate(oofs, axis=0)
pd_oofs = pd.DataFrame(np_oofs, columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

# END