In [1]:
import gc
import os

import keras
import keras_models
import numpy as np
import pandas as pd
import utils
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import SGD, Adadelta, Adam, Nadam, RMSprop
from keras.preprocessing import sequence, text
from nltk.corpus import stopwords
from tqdm import tqdm

# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# NOTE(review): these are set after keras has been imported above; confirm the
# backend has not already initialised CUDA by the time they take effect.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

Using TensorFlow backend.


In [2]:
# --- Experiment configuration -------------------------------------------------
# Resampling scheme: kfold_run selects K-fold (n_folds) over bagging (n_bags).
n_folds = 5
n_bags = 1
split_size = 0.1  # forwarded as split_size to the bagging runner

# Text / embedding dimensions.
max_features = 300000
nb_words = max_features
sequence_length = 1024
embedding_dim = 300
bidirectional = False
embedding_filename = 'Glove_300dim_embeddingBasic300k'

# NOTE(review): this cell used to assign '1024len_Glove300k_CharLevel1_' to
# run_prefix and then immediately overwrite it; only the value below ever took
# effect, so it is the single assignment kept here.
run_prefix = 'FastText_'
src = '/home/w/Projects/Toxic/data/'
model_name = 'LSTMattentionChar'
optimizer = 'Nadam'
data_type = 'BasicClean'
batch_size = 256

# 0/1 switches read by the cells below.
kfold_run = 0
importance = 0
stratify = 0
save_models = 0
load_models = 0
save_oof = 0
prepare_submission = 1


# --- Derive the run name from the configuration -------------------------------
is_recurrent_model = 'LSTM' in model_name or 'GRU' in model_name
if bidirectional and is_recurrent_model:
    run_prefix = 'Bidirectional{}'.format(run_prefix)

# Pick the name template and the resample count that matches the scheme.
if kfold_run:
    name_template, n_resamples = '{}{}fold_BS{}_{}', n_folds
else:
    name_template, n_resamples = '{}{}bag_BS{}_{}', n_bags
general_run_name = name_template.format(
    run_prefix, n_resamples, batch_size, optimizer)

# Optional suffixes, appended in this fixed order when their switch is on.
optional_suffixes = [
    (len(data_type) > 0, '_{}'.format(data_type)),
    (importance, '_ImportanceTrain'),
    (stratify and kfold_run, '_Stratified'),
]
general_run_name += ''.join(
    suffix for enabled, suffix in optional_suffixes if enabled)

run_name = '{}{}'.format(model_name, general_run_name)
print('Run name: {}'.format(run_name))


# Both callbacks monitor validation loss: the learning rate is halved after 8
# epochs without improvement (floor 1e-5) and training stops after 18.
early_stopping = EarlyStopping(monitor='val_loss', patience=18, verbose=1)
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, verbose=1,
                                  patience=8, min_lr=1e-5)
model_callbacks = [early_stopping, lr_on_plateau]

Run name: LSTMattentionCharFastText_1bag_BS256_Nadam_BasicClean


In [3]:
# Load the train/test frames for the configured cleaning mode.
train, test = utils.load_data(src, mode=data_type)
print(train.shape, test.shape)

# The six binary target columns, in the order the model outputs them.
list_classes = ["toxic", "severe_toxic",
                "obscene", "threat", "insult", "identity_hate"]

# Missing comments become a placeholder string so every entry is a str.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# Character-level vocabulary. It is fitted on the same NaN-free text that is
# converted to sequences below; fitting on the raw columns would hand a float
# NaN to the tokenizer whenever a comment is missing.
tokenizer = text.Tokenizer(num_words=max_features, char_level=True)
tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

word_index = tokenizer.word_index
# +1 because tokenizer indices start at 1 (index 0 is left for padding).
nb_words = min(max_features, len(word_index)) + 1

# Pad/truncate every sequence to `sequence_length` ids.
X_train = sequence.pad_sequences(
    list_tokenized_train, maxlen=sequence_length)  # [:1000]
y_train = train[list_classes].values  # [:1000]
X_test = sequence.pad_sequences(
    list_tokenized_test, maxlen=sequence_length)  # [:1000]
print(X_train.shape, y_train.shape, X_test.shape)

# Release the frames and intermediate token lists; only the arrays are needed.
del train, test, list_tokenized_train, list_tokenized_test
gc.collect()

Load data with basic cleaning.
(95851, 8) (226998, 2)
(95851, 1024) (95851, 6) (226998, 1024)


39

In [None]:
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate
from keras.layers import (GRU, LSTM, Bidirectional, CuDNNGRU, CuDNNLSTM, Dense,
                          Dropout, Embedding, Flatten, Input, Lambda, Reshape,
                          concatenate)
from keras.layers.advanced_activations import PReLU
from keras.layers.convolutional import (AveragePooling1D, Conv1D, MaxPooling1D,
                                        ZeroPadding1D)
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model
from keras_attention import Attention
from keras_attention_context import AttentionWithContext



def CharacterlevelCNN(conv_layers = 2,
                dilation_rates = (0, 2, 4, 8, 16),
                embed_size = 256):
    """Build and compile a multi-branch dilated 1-D CNN over character ids.

    One convolutional branch is created per entry of ``dilation_rates``; every
    branch starts from the same dropped-out embedding, stacks ``conv_layers``
    convolutions (16, 32, 64, ... filters) and is global-max-pooled. The pooled
    branches are concatenated and fed to a small dense head with 6 sigmoid
    outputs.

    Reads the module-level ``tokenizer`` (for the vocabulary size) and
    ``optimizer`` (passed to ``compile``).

    Args:
        conv_layers: number of stacked convolutions in each branch.
        dilation_rates: iterable of dilation rates, one branch each. A rate of
            0 selects a kernel-size-1 (per-character) convolution instead of a
            dilated kernel-size-3 one. Layer names are derived from the rate,
            so rates must be distinct.
        embed_size: output dimension of the character embedding.

    Returns:
        A compiled ``keras.models.Model`` (binary cross-entropy loss,
        accuracy metric).
    """
    inp = Input(shape=(None, ))
    x = Embedding(input_dim = len(tokenizer.word_counts)+1,
                  output_dim = embed_size)(inp)
    prefilt_x = Dropout(0.25)(x)
    out_conv = []
    # Each dilation rate gives a branch with a different receptive field, so
    # the model sees both contiguous n-grams and skip-grams of characters.
    for dilation_rate in dilation_rates:
        x = prefilt_x
        # Honour the conv_layers argument (it used to be ignored in favour of
        # a hard-coded depth of 2, which is still the default).
        for i in range(conv_layers):
            if dilation_rate>0:
                x = Conv1D(16*2**(i),
                           kernel_size = 3,
                           dilation_rate = dilation_rate,
                           activation = 'relu',
                           name = 'ngram_{}_cnn_{}'.format(dilation_rate, i)
                           )(x)
            else:
                x = Conv1D(16*2**(i),
                           kernel_size = 1,
                           activation = 'relu',
                           name = 'word_fcl_{}'.format(i))(x)
        out_conv.append(Dropout(0.5)(GlobalMaxPool1D()(x)))
    # Keras' concatenate needs at least two tensors; a single branch is used
    # directly.
    if len(out_conv) == 1:
        x = out_conv[0]
    else:
        x = concatenate(out_conv, axis = -1)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


def LSTMattentionChar(params):
    """Build and compile an attention-pooled CuDNN-LSTM classifier.

    Args:
        params: dict providing 'nb_words', 'embedding_dim', 'sequence_length',
            'bidirectional', 'lstm_units' and 'optimizer'.

    Returns:
        A compiled ``keras.models.Model`` mapping a fixed-length id sequence
        to 6 sigmoid outputs (binary cross-entropy loss, accuracy metric).
    """
    # Trainable embedding learned from scratch for the input ids.
    embedding = Embedding(params['nb_words'],
                          params['embedding_dim'],
                          input_length=params['sequence_length'],
                          trainable=True)

    token_ids = Input(shape=(params['sequence_length'], ))
    embedded = embedding(token_ids)

    # One recurrent layer returning the full sequence, optionally wrapped so
    # it runs in both directions.
    use_bidirectional = params['bidirectional']
    recurrent = CuDNNLSTM(params['lstm_units'], return_sequences=True)
    if use_bidirectional:
        recurrent = Bidirectional(recurrent)
    hidden_states = recurrent(embedded)

    # Collapse the time axis with attention (GlobalAveragePooling1D is the
    # alternative pooling that was tried here), then a small dense head.
    pooled = AttentionWithContext()(hidden_states)
    pooled = Dropout(0.1)(pooled)
    dense_out = Dense(256, activation='relu')(pooled)
    dense_out = Dropout(0.1)(dense_out)
    predictions = Dense(6, activation='sigmoid')(dense_out)

    model = Model(inputs=token_ids, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer=params['optimizer'],
                  metrics=['accuracy'])
    return model


# Replace the optimizer *name* chosen in the configuration cell with a
# configured Keras optimizer instance. Builders are lambdas so only the
# selected optimizer is constructed; names without an entry (and values that
# are already optimizer objects) are left untouched.
optimizer_builders = {
    'Adam': lambda: Adam(lr=1e-4, decay=1e-3),
    'Nadam': lambda: Nadam(lr=1e-4, schedule_decay=1e-3),
    'SGD': lambda: SGD(lr=1e-3, momentum=0.9,
                       decay=1e-4, nesterov=True),
}
if isinstance(optimizer, str) and optimizer in optimizer_builders:
    optimizer = optimizer_builders[optimizer]()

In [None]:
# Hyper-parameters handed to the model-building function.
model_parameters = dict(
    lstm_units=256,
    bidirectional=bidirectional,
    # Fitted vocabulary size, +1 because tokenizer indices start at 1.
    nb_words=len(tokenizer.word_counts) + 1,
    embedding_dim=embedding_dim,
    sequence_length=sequence_length,
    optimizer=optimizer,
    # Width of the padded input; not read by LSTMattentionChar itself.
    num_columns=X_train.shape[1],
)

# Settings for the training pipeline in utils.
pipeline_parameters = dict(
    model_name=LSTMattentionChar,  # the builder function, not its name
    predict_test=True,
    number_epochs=1000,  # upper bound; model_callbacks holds an EarlyStopping
    batch_size=batch_size,
    seed=1337,
    shuffle=True,
    verbose=True,
    run_save_name=run_name,
    load_keras_model=load_models,
    save_model=save_models,
    save_history=True,
    save_statistics=True,
    output_statistics=True,
    src_dir=os.getcwd(),
)

# Train with K-fold cross-validation or with bagging, depending on kfold_run.
if kfold_run:
    # The padded arrays are passed whole, exactly as in the bagging branch.
    # `features` is not defined anywhere in this notebook, so indexing the
    # arrays with it raised NameError as soon as kfold_run was enabled.
    oof_train, oof_test = utils.run_parametrized_kfold(X_train, y_train,
                                                       X_test,
                                                       pipeline_parameters,
                                                       model_parameters,
                                                       model_callbacks=model_callbacks,
                                                       n_folds=n_folds,
                                                       importance_training=importance,
                                                       save_oof=save_oof)
    print(oof_train.shape, oof_test.shape)
else:
    oof_valid, oof_test = utils.run_parametrized_bagging(X_train, y_train,
                                                         X_test=X_test,
                                                         pipeline_parameters=pipeline_parameters,
                                                         model_parameters=model_parameters,
                                                         model_callbacks=model_callbacks,
                                                         n_bags=n_bags,
                                                         split_size=split_size,
                                                         importance_training=importance)
    print(oof_valid.shape, oof_test.shape)


# Average the test predictions over axis 0 and write the submission file.
# NOTE(review): assumes axis 0 of oof_test indexes bags/folds - confirm
# against utils before relying on this for the K-fold branch.
if prepare_submission:
    submission = utils.output_submission(
        oof_test.mean(axis=0), run_name, save=True)

Running parametrized bagging
Running: LSTMattentionCharFastText_1bag_BS256_Nadam_BasicClean
Training on bag: 1 

Saving CSV logs for model from current bag/fold: LSTMattentionCharFastText_1bag_BS256_Nadam_BasicClean, bag number 1 

Splitting data - validation split size: 0.1, split seed: 1337
Train on 86265 samples, validate on 9586 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
19456/86265 [=====>........................] - ETA: 1:33 - loss: 0.0921 - acc: 0.9737