In [6]:
%%writefile Pipeline.py 
import pandas as pd
import numpy as np
import codecs
import re
import string
import os

#===============keras ==============
from keras.preprocessing import text, sequence

def get_coefs(word, *arr):
    """Turn one parsed embedding line into a ``(word, vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, converted to a float32 numpy array.

    Returns:
        Tuple ``(word, np.ndarray)`` with dtype ``float32``.
    """
    return word, np.asarray(arr, dtype='float32')


def load_emb_model(embedding_path):
    """Read a space-separated text embedding file into a dict.

    Every line is stripped, split on single spaces and handed to
    ``get_coefs``; the first field becomes the key and the rest the vector.
    If a token occurs more than once, the last occurrence wins.

    Args:
        embedding_path: path of the UTF-8 encoded embedding file.

    Returns:
        dict mapping token -> float32 numpy vector.
    """
    # The context manager closes the handle deterministically, including when
    # a line fails to parse; previously the file was left open inside a
    # generator and only released whenever the interpreter collected it.
    with codecs.open(embedding_path, "r", "utf-8") as emb_file:
        return dict(get_coefs(*line.strip().split(" ")) for line in emb_file)

def load_data_2path(emb_model,
             filepath_train = "./input/train.csv", 
             filepath_test = "./input/test.csv", 
             embed_size = 300,
             max_features = 100000,
             maxlen = 180
            ):
    """Load the train/test CSVs and build two-path model inputs plus embeddings.

    The ``comment_text`` column of both files is tokenized with one Keras
    ``Tokenizer`` fitted on train and test texts together. Each tokenized
    corpus is padded twice to ``maxlen``: once with ``truncating='pre'`` and
    once with ``truncating='post'``.

    Args:
        emb_model: mapping token -> vector, queried with ``.get(word)``.
        filepath_train: CSV holding ``comment_text`` and the six label columns.
        filepath_test: CSV holding ``comment_text``.
        embed_size: number of columns of the embedding matrix.
        max_features: ``num_words`` of the tokenizer and upper bound on the
            number of embedding rows.
        maxlen: length every sequence is padded / truncated to.

    Returns:
        ``(X_t, y, X_te, embedding_matrix)`` where ``X_t`` and ``X_te`` are
        ``[pre_truncated, post_truncated]`` lists of padded arrays, ``y`` is
        the label array of the train file, and ``embedding_matrix`` has shape
        ``(min(max_features, len(word_index) + 1), embed_size)``.
    """
    DOC_Column = "comment_text"
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    ###load data
    train = pd.read_csv(filepath_train)
    test = pd.read_csv(filepath_test)
    print("=== Data is loaded")

    # Missing comments are replaced by the literal token 'UNK'.
    preprocessed_train = train[DOC_Column].fillna('UNK').values.tolist()
    preprocessed_test = test[DOC_Column].fillna('UNK').values.tolist()
    y = train[list_classes].values

    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(preprocessed_train + preprocessed_test)

    list_tokenized_train = tokenizer.texts_to_sequences(preprocessed_train)
    list_tokenized_test = tokenizer.texts_to_sequences(preprocessed_test)

    # Two views of every document: one keeps the tail of long texts
    # (truncating='pre'), the other keeps the head (truncating='post').
    X_t_pre = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen, truncating='pre')
    X_t_post = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen, truncating='post')

    X_te_pre = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen, truncating='pre')
    X_te_post = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen, truncating='post')
    print("=== Data is preprocessed")

    X_t = [X_t_pre, X_t_post]
    X_te = [X_te_pre, X_te_post]

    word_index = tokenizer.word_index
    # Tokenizer indices start at 1 (0 is the padding id), so a vocabulary of
    # N words needs N + 1 rows. Sizing the matrix with N made the row of the
    # highest-index word unreachable whenever N < max_features; the resulting
    # IndexError used to be hidden by a bare ``except``.
    nb_words = min(max_features, len(word_index) + 1)
    # Rows without a pretrained vector keep this random initialisation.
    embedding_matrix = np.random.normal(0.001, 0.4, (nb_words, embed_size))

    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = emb_model.get(word)
        if embedding_vector is None:
            continue
        # Best effort, but explicit: skip vectors that do not have exactly
        # embed_size components (e.g. a header line parsed as a token)
        # instead of swallowing every possible exception.
        if np.shape(embedding_vector) != (embed_size,):
            continue
        embedding_matrix[i] = embedding_vector
    print("=== Embedding Matrix is loaded")

    return X_t, y, X_te, embedding_matrix

Overwriting Pipeline.py


In [7]:
%%writefile Model_trainer.py

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback, LearningRateScheduler, ModelCheckpoint
            
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) started the MRO lookup *after* Callback and
        # therefore skipped Callback.__init__; name this class so the base
        # callback is actually initialised.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC on the held-out data.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; not used here. ``None``
                replaces the former shared mutable ``{}`` default.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

def schedule(ind):
    """Return the learning rate for epoch index ``ind`` from a fixed table.

    Args:
        ind: zero-based epoch index.

    Returns:
        The table entry at ``ind``; indices past the end of the table return
        the last entry instead of raising ``IndexError``.
    """
    # NOTE(review): the jump back up to 0.003 at index 7 looks like a
    # deliberate restart rather than a typo -- confirm with the author.
    a = [0.001, 0.0008, 0.0006, 0.0004, 0.0002, 0.0001, 0.00005, 0.003, 0.0005, 0.0001, 0.00005,
         0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005, 0.00005]
    # Clamp so training for more epochs than the table has entries keeps the
    # final rate rather than crashing in the middle of a run.
    return a[min(ind, len(a) - 1)]
        
def model_train_cv(model, X_tra, X_val, y_tra, y_val, x_test, model_name, batch_size = 32, epochs = 2, lr_schedule=True):
    """Fit ``model`` on one split and predict with the best checkpoint.

    The model is trained with a ROC-AUC reporting callback and a checkpoint
    that keeps the weights with the lowest ``val_loss``; when ``lr_schedule``
    is true a ``LearningRateScheduler`` driven by ``schedule`` is added too.
    After training, the checkpoint is loaded back before predicting.

    Args:
        model: compiled model exposing ``fit``, ``load_weights``, ``predict``.
        X_tra, y_tra: training inputs and labels.
        X_val, y_val: validation inputs and labels.
        x_test: inputs to predict after training.
        model_name: accepted but not used by this function.
        batch_size: batch size for fitting and predicting.
        epochs: number of training epochs.
        lr_schedule: whether to attach the learning-rate scheduler.

    Returns:
        ``(pred, oof)``: predictions for ``x_test`` and for ``X_val``.
    """
    file_path = "best_model.hdf5"
    validation = (X_val, y_val)

    # Callbacks are always constructed in this order, as before.
    auc_reporter = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    best_saver = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
    lr_callback = LearningRateScheduler(schedule)

    if lr_schedule:
        active_callbacks = [auc_reporter, lr_callback, best_saver]
    else:
        print('== no learing schedule')
        active_callbacks = [auc_reporter, best_saver]

    model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=validation,
              callbacks = active_callbacks, verbose=2)

    # Predict with the best weights seen during training, not the last ones.
    model.load_weights(file_path)
    oof = model.predict(X_val, batch_size=batch_size, verbose=1)
    pred = model.predict(x_test, batch_size=batch_size, verbose=1)

    return pred, oof

Overwriting Model_trainer.py


In [8]:
%%writefile Toxic_Models.py
#===============keras ==============
from keras.models import Model
from keras.layers import Dense, Embedding, Input, concatenate, Flatten, add
from keras.layers import CuDNNLSTM, CuDNNGRU, Bidirectional, Conv1D
from keras.layers import Dropout, SpatialDropout1D, BatchNormalization, GlobalAveragePooling1D, GlobalMaxPooling1D, PReLU
from keras.optimizers import Adam, RMSprop
from keras.layers import MaxPooling1D
from keras.layers import K, Activation
from keras.engine import Layer
    
def get_model_rnn(
                  embedding_matrix, cell_size = 80, cell_type_GRU = True,
                  maxlen = 180, max_features = 100000, embed_size = 300,
                  prob_dropout = 0.2, emb_train = False
                 ):
    """Build the two-input model with one bidirectional RNN per input.

    Each input goes through its own Embedding -> SpatialDropout1D ->
    Bidirectional RNN stack, followed by global average and global max
    pooling. The four pooled tensors are concatenated and fed to a
    6-unit sigmoid layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: CuDNNGRU when truthy, CuDNNLSTM otherwise.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    rnn_layer = CuDNNGRU if cell_type_GRU else CuDNNLSTM

    def encode(tokens):
        # One independent branch; layers are created in the original order.
        seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        seq = SpatialDropout1D(prob_dropout)(seq)
        seq = Bidirectional(rnn_layer(cell_size, return_sequences=True))(seq)
        pooled_avg = GlobalAveragePooling1D()(seq)
        pooled_max = GlobalMaxPooling1D()(seq)
        return [pooled_avg, pooled_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_features = encode(inp_pre)
    post_features = encode(inp_post)

    ##merge
    merged = concatenate(pre_features + post_features)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

def get_model_rnn_cnn(
                      embedding_matrix, cell_size = 80, cell_type_GRU = True,
                      maxlen = 180, max_features = 100000, embed_size = 300,
                      prob_dropout = 0.2, emb_train = False,
                      filter_size=128, kernel_size = 2, stride = 1
                      ):
    """Build the two-input model: bidirectional RNN followed by a Conv1D.

    Each input goes through Embedding -> SpatialDropout1D -> Bidirectional
    RNN -> Conv1D, then global average and global max pooling. The four
    pooled tensors are concatenated and fed to a 6-unit sigmoid layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: CuDNNGRU when truthy, CuDNNLSTM otherwise.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.
        filter_size: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        stride: Conv1D stride.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    rnn_layer = CuDNNGRU if cell_type_GRU else CuDNNLSTM

    def encode(tokens):
        # One independent branch; layers are created in the original order.
        seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        seq = SpatialDropout1D(prob_dropout)(seq)
        seq = Bidirectional(rnn_layer(cell_size, return_sequences=True))(seq)
        seq = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(seq)
        pooled_avg = GlobalAveragePooling1D()(seq)
        pooled_max = GlobalMaxPooling1D()(seq)
        return [pooled_avg, pooled_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_features = encode(inp_pre)
    post_features = encode(inp_post)

    ##merge
    merged = concatenate(pre_features + post_features)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

def get_model_2rnn(
                  embedding_matrix, cell_size = 80, cell_type_GRU = True,
                  maxlen = 180, max_features = 100000, embed_size = 300,
                  prob_dropout = 0.2, emb_train = False
                 ):
    """Build the two-input model with two stacked bidirectional RNNs.

    Each input goes through Embedding -> SpatialDropout1D -> Bidirectional
    CuDNNLSTM -> a second Bidirectional RNN, then global average and global
    max pooling. The four pooled tensors are concatenated and fed to a
    6-unit sigmoid layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: selects the *second* recurrent layer only: CuDNNGRU
            when truthy, CuDNNLSTM otherwise. The first is always CuDNNLSTM.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    second_rnn = CuDNNGRU if cell_type_GRU else CuDNNLSTM

    def encode(tokens):
        # One independent branch; layers are created in the original order.
        seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        seq = SpatialDropout1D(prob_dropout)(seq)
        seq = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(seq)
        seq = Bidirectional(second_rnn(cell_size, return_sequences=True))(seq)
        pooled_avg = GlobalAveragePooling1D()(seq)
        pooled_max = GlobalMaxPooling1D()(seq)
        return [pooled_avg, pooled_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_features = encode(inp_pre)
    post_features = encode(inp_post)

    ##merge
    merged = concatenate(pre_features + post_features)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

def get_model_2rnn_cnn(
                       embedding_matrix, cell_size = 80, cell_type_GRU = True,
                       maxlen = 180, max_features = 100000, embed_size = 300,
                       prob_dropout = 0.2, emb_train = False,
                       filter_size=128, kernel_size = 2, stride = 1
                      ):
    """Build the two-input model: two stacked bidirectional RNNs plus Conv1D.

    Each input goes through Embedding -> SpatialDropout1D -> Bidirectional
    CuDNNLSTM -> a second Bidirectional RNN -> Conv1D, then global average
    and global max pooling. The four pooled tensors are concatenated and fed
    to a 6-unit sigmoid layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: selects the *second* recurrent layer only: CuDNNGRU
            when truthy, CuDNNLSTM otherwise. The first is always CuDNNLSTM.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.
        filter_size: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        stride: Conv1D stride.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    second_rnn = CuDNNGRU if cell_type_GRU else CuDNNLSTM

    def encode(tokens):
        # One independent branch; layers are created in the original order.
        seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        seq = SpatialDropout1D(prob_dropout)(seq)
        seq = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(seq)
        seq = Bidirectional(second_rnn(cell_size, return_sequences=True))(seq)
        seq = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(seq)
        pooled_avg = GlobalAveragePooling1D()(seq)
        pooled_max = GlobalMaxPooling1D()(seq)
        return [pooled_avg, pooled_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_features = encode(inp_pre)
    post_features = encode(inp_post)

    ##merge
    merged = concatenate(pre_features + post_features)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

def get_model_2rnn_cnn_sp(
                          embedding_matrix, cell_size = 80, cell_type_GRU = True,
                          maxlen = 180, max_features = 100000, embed_size = 300,
                          prob_dropout = 0.2, emb_train = False,
                          filter_size=128, kernel_size = 2, stride = 1
                         ):
    """Build the two-input model that pools both stacked RNN levels.

    Each input goes through Embedding -> SpatialDropout1D -> Bidirectional
    CuDNNLSTM -> a second Bidirectional RNN. The outputs of *both* recurrent
    levels each get their own Conv1D followed by global average and global
    max pooling. All eight pooled tensors are concatenated (second-level
    features of both inputs first, then first-level features) and fed to a
    6-unit sigmoid layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: selects the *second* recurrent layer only: CuDNNGRU
            when truthy, CuDNNLSTM otherwise. The first is always CuDNNLSTM.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.
        filter_size: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        stride: Conv1D stride.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    second_rnn = CuDNNGRU if cell_type_GRU else CuDNNLSTM

    def encode(tokens):
        # Returns (second-level pools, first-level pools) for one input.
        # Layers are created in the original order: first-level conv/pools
        # come before the second-level ones.
        seq = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        seq = SpatialDropout1D(prob_dropout)(seq)

        level1 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(seq)
        level2 = Bidirectional(second_rnn(cell_size, return_sequences=True))(level1)

        level1 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(level1)
        level1_avg = GlobalAveragePooling1D()(level1)
        level1_max = GlobalMaxPooling1D()(level1)

        level2 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(level2)
        level2_avg = GlobalAveragePooling1D()(level2)
        level2_max = GlobalMaxPooling1D()(level2)

        return [level2_avg, level2_max], [level1_avg, level1_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_top, pre_skip = encode(inp_pre)
    post_top, post_skip = encode(inp_post)

    ##merge
    merged = concatenate(pre_top + post_top + pre_skip + post_skip)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

def get_model_dual_2rnn_cnn_sp(
                               embedding_matrix, cell_size = 80, cell_type_GRU = True,
                               maxlen = 180, max_features = 100000, embed_size = 300,
                               prob_dropout = 0.2, emb_train = False,
                               filter_size=128, kernel_size = 2, stride = 1
                              ):
    """Build the two-input model with parallel GRU and LSTM stacks.

    For each input, one Embedding output is passed through two separate
    SpatialDropout1D layers feeding a two-level Bidirectional CuDNNGRU stack
    and a two-level Bidirectional CuDNNLSTM stack. Every level of every stack
    gets its own Conv1D plus global average and global max pooling. The
    sixteen pooled tensors are concatenated and fed to a 6-unit sigmoid
    layer.

    Args:
        embedding_matrix: initial weights of both Embedding layers.
        cell_size: units of each recurrent layer.
        cell_type_GRU: accepted but not used; both cell types are always built.
        maxlen: length of both input sequences.
        max_features: input dimension of the Embedding layers.
        embed_size: output dimension of the Embedding layers.
        prob_dropout: rate of the SpatialDropout1D layers.
        emb_train: whether the Embedding layers are trainable.
        filter_size: number of Conv1D filters.
        kernel_size: Conv1D kernel size.
        stride: Conv1D stride.

    Returns:
        Model compiled with binary cross-entropy loss and the adam optimizer.
    """
    def encode(tokens):
        # Returns the eight pooled tensors of one input, ordered as
        # second-level (GRU, LSTM) then first-level (GRU, LSTM).
        # Layers are created in the original order.
        embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable = emb_train)(tokens)
        gru_in = SpatialDropout1D(prob_dropout)(embedded)
        lstm_in = SpatialDropout1D(prob_dropout)(embedded)

        gru_l1 = Bidirectional(CuDNNGRU(cell_size, return_sequences=True))(gru_in)
        gru_l2 = Bidirectional(CuDNNGRU(cell_size, return_sequences=True))(gru_l1)
        lstm_l1 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(lstm_in)
        lstm_l2 = Bidirectional(CuDNNLSTM(cell_size, return_sequences=True))(lstm_l1)

        gru_l1 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(gru_l1)
        lstm_l1 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(lstm_l1)
        gru_l1_avg = GlobalAveragePooling1D()(gru_l1)
        gru_l1_max = GlobalMaxPooling1D()(gru_l1)
        lstm_l1_avg = GlobalAveragePooling1D()(lstm_l1)
        lstm_l1_max = GlobalMaxPooling1D()(lstm_l1)

        gru_l2 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(gru_l2)
        lstm_l2 = Conv1D(filter_size, kernel_size = kernel_size, strides=stride, padding = "valid", kernel_initializer = "he_uniform")(lstm_l2)
        gru_l2_avg = GlobalAveragePooling1D()(gru_l2)
        gru_l2_max = GlobalMaxPooling1D()(gru_l2)
        lstm_l2_avg = GlobalAveragePooling1D()(lstm_l2)
        lstm_l2_max = GlobalMaxPooling1D()(lstm_l2)

        return [gru_l2_avg, gru_l2_max, lstm_l2_avg, lstm_l2_max,
                gru_l1_avg, gru_l1_max, lstm_l1_avg, lstm_l1_max]

    inp_pre = Input(shape=(maxlen, ), name='input_pre')
    inp_post = Input(shape=(maxlen, ), name='input_post')

    pre_features = encode(inp_pre)
    post_features = encode(inp_post)

    ##merge
    merged = concatenate(pre_features + post_features)
    outp = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[inp_pre, inp_post], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_crossentropy', 'accuracy'])

    return model

Overwriting Toxic_Models.py


# End of Code