In [1]:
from utils import *

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from keras.preprocessing import text, sequence
from sklearn.model_selection import KFold
import re, os
print(os.listdir("../input"))

Using TensorFlow backend.


['sample_submission.csv', 'test.csv', 'train.csv']


In [13]:
# Path
dir_emb_pre_trained = "../emb_pre_trained"
submission = pd.read_csv('../input/sample_submission.csv')

In [4]:
# func for preprocessing
def load_data_2path(emb_model,
             filepath_train = "../input/train.csv", 
             filepath_test = "../input/test.csv", 
             embed_size = 300,
             max_features = 100000,
             maxlen = 100
            ):

    DOC_Column = "comment_text"
    list_classes = ["target"]

    ###load data    
    print("Data is loading ...", end='')
    train = pd.read_csv(filepath_train)
    test = pd.read_csv(filepath_test)
    print("\r === Data is loaded")

    list_sentences_train = train[DOC_Column].fillna('UNK').values
    list_sentences_test = test[DOC_Column].fillna('UNK').values
    y = train[list_classes].values
    
    print("Data is preprocessing ...", end='')
    preprocessed_train = list_sentences_train.tolist()
    preprocessed_test = list_sentences_test.tolist()
    
    tokenizer = text.Tokenizer(num_words =max_features)
    tokenizer.fit_on_texts(preprocessed_train + preprocessed_test)

    list_tokenized_train = tokenizer.texts_to_sequences(preprocessed_train)
    list_tokenized_test = tokenizer.texts_to_sequences(preprocessed_test)

    X_t_pre = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen, truncating='pre')
#     X_t_post = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen, truncating='post')
    
    X_te_pre = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen, truncating='pre')
#     X_te_post = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen, truncating='post')
    
    print("\r === Data is preprocessed")
    
#     X_t = [X_t_pre, X_t_post]
#     X_te = [X_te_pre, X_te_post]

    X_t = X_t_pre
    X_te = X_te_pre
    
    print("Embedding Matrix is Computing ...", end='')
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(0.001, 0.4, (nb_words, embed_size))

    for word, i in word_index.items():
        if i >= max_features: continue
        try:
            embedding_vector = emb_model.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        except: 
            pass
    print("\r === Embedding Matrix is Computed")

    return X_t, y, X_te, embedding_matrix, tokenizer

In [5]:
emb_model = load_emb_model(dir_emb_pre_trained+'/crawl-300d-2M.vec')        # FastText Embeddings
# emb_model = load_emb_model(dir_emb_pre_trained+'/glove.twitter.27B.100d.txt')    # Glove Embeddings

In [19]:
filepath_train = "../input/train.csv" 
filepath_test = "../input/test.csv"

### preprocessing parameter
embed_size = 128
max_features = 150000
maxlen = 180

### classes names
list_classes = 'target'




### model parameter
cell_size = 64                   ### Cell unit size
cell_type_GRU = True             ### Cell Type: GRU/LSTM
filter_size = 64
kernel_size = 2
stride = 1 

# ### K-fold cross-validation
k= 5
kf = KFold(n_splits=k, shuffle=True, random_state=1991)

### training protocol
epochs= 8
batch_size = 1024
lr_s = False                        ### Use of Learning Schedule

In [20]:
X_tr, Y_tr, X_te, emb_matrix, tknzr = load_data_2path(emb_model, max_features = max_features, maxlen = maxlen, embed_size=embed_size)

 === Data is loaded
 === Data is preprocessed
 === Embedding Matrix is Computed


In [21]:
#===============keras ==============
from keras.models import Model
from keras.layers import Dense, Embedding, Input, concatenate, Flatten, add
from keras.layers import CuDNNLSTM, CuDNNGRU, Bidirectional
from keras.layers import Dropout, SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import MaxPooling1D, Activation
from keras import initializers, regularizers, constraints

In [22]:
prob_dropout = 0.2
emb_train = True

In [None]:
inp_pre = Input(shape=(maxlen, ), name='input_pre')
#     inp_post = Input(shape=(maxlen, ), name='input_post')


##pre
x1 = Embedding(max_features, embed_size, weights=[emb_matrix], trainable = emb_train)(inp_pre)
x1 = SpatialDropout1D(rate = prob_dropout)(x1)

if cell_type_GRU:
    x1 = Bidirectional(CuDNNGRU(cell_size, return_sequences=True))(x1)
else :
    x1 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(x1)

avg_pool1 = GlobalAveragePooling1D()(x1)
max_pool1 = GlobalMaxPooling1D()(x1)

##post
#     x2 = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(inp_post)
#     x2 = SpatialDropout1D(prob_dropout)(x2)
    
#     if cell_type_GRU:
#         x2 = Bidirectional(CuDNNGRU(cell_size, return_sequences=True))(x2)
#     else :
#         x2 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(x2)
    
#     avg_pool2 = GlobalAveragePooling1D()(x2)
#     max_pool2 = GlobalMaxPooling1D()(x2)

conc = concatenate([avg_pool1, max_pool1])
outp = Dense(1, activation="sigmoid")(conc)

model = Model(inputs=inp_pre, outputs=outp)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['mse','binary_crossentropy', 'accuracy'])

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint
            
def schedule(ind):
    a = [0.001, 0.0008, 0.0006, 0.0004, 0.0002, 0.0001, 0.00005, 0.003, 0.0005, 0.0001, 0.00005,
         0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005]
    return a[ind]
        
def model_train_cv(model, X_tra, X_val, y_tra, y_val, x_test, model_name, batch_size = 1024, epochs = 2, lr_schedule=True):
    file_path = "best_model.hdf5"
    
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
    lr_s = LearningRateScheduler(schedule)
    
    if lr_schedule:
        hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                             callbacks = [lr_s, check_point], verbose=1)
    else:
        print('== no learing schedule')
        hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                             callbacks = [check_point], verbose=1)
        
    model.load_weights(file_path)
    oof = model.predict(X_val, batch_size=batch_size, verbose=1)
    pred = model.predict(x_test, batch_size=batch_size, verbose=1)
    
    return pred, oof

In [None]:
model_name = 'rnn'

### ================================================================== ###
oofs = []
res = np.zeros_like(submission["prediction"])

for train_index, val_index in kf.split(X_tr, Y_tr):
#     mdl = Toxic_Models.get_model_rnn(emb_matrix, cell_size=cell_size, maxlen=maxlen, cell_type_GRU=cell_type_GRU)
    pred, oof = model_train_cv(model, X_tra = X_tr[train_index], X_val = X_tr[val_index],
                                             y_tra=  Y_tr[train_index], y_val= Y_tr[val_index], x_test=X_te, 
                                             model_name=model_name, batch_size=batch_size, epochs=epochs, lr_schedule=lr_s)
    res += pred
    oofs.append(oof)
    K.clear_session()
    time.sleep(20)
    
res = res/k
    

### Collect result & Report
submission[list_classes] = res
submission.to_csv("submission_{}.csv".format(model_name), index = False)

np_oofs = np.array(oofs)
pd_oofs = pd.DataFrame(np.concatenate(np_oofs), columns=list_classes)
pd_oofs.to_csv("oofs_{}.csv".format(model_name), index=False)

== no learing schedule
Train on 1443899 samples, validate on 360975 samples
Epoch 1/8

Epoch 00001: val_loss improved from inf to 0.23759, saving model to best_model.hdf5
Epoch 2/8