In [10]:
import numpy as np 
import pandas as pd 
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from subprocess import check_output

import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss

## Load Input Data

In [2]:
# read files
def readInputFiles(train_file_path, test_file_path, random_state=None):
    """Load the train and test CSV files and shuffle the training rows.

    Args:
        train_file_path: path of the training CSV.
        test_file_path: path of the test CSV.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        (train, test): the training frame with its rows in shuffled order
        (original index labels are kept) and the test frame in file order.
    """
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    # frac=1 returns every row exactly once, i.e. a pure shuffle.
    # NOTE(review): training later uses Keras' validation_split, which is
    # documented to hold out the trailing rows, so this order decides the
    # validation set - pass a seed when runs need to be comparable.
    train = train.sample(frac=1, random_state=random_state)
    return train, test
    
# Load both CSVs; readInputFiles returns the training rows in shuffled order.
train, test = readInputFiles('../dataset/train_new.csv', '../dataset/test_new.csv')

## Preprocessing Data

In [3]:
def preProcessData(max_features=20000, maxlen=100):
    """Turn the raw comments into fixed-length index sequences plus labels.

    Reads the module-level ``train`` and ``test`` DataFrames.

    Args:
        max_features: value passed to the Keras ``Tokenizer`` as
            ``num_words`` (vocabulary size cap).
        maxlen: length every tokenized comment is padded or truncated to.

    Returns:
        (max_features, maxlen, X_train, X_test, y) where ``X_train`` and
        ``X_test`` are the outputs of ``pad_sequences`` and ``y`` holds the
        six label columns of ``train``.
    """
    # the six target columns
    list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y = train[list_classes].values

    # missing comments are replaced by the placeholder token "CVxTz"
    train_comments = train["comment_text"].fillna("CVxTz").values
    test_comments = test["comment_text"].fillna("CVxTz").values

    # the vocabulary is fitted on the training comments only
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_comments))

    # convert from strings to lists of word indices
    train_sequences = tokenizer.texts_to_sequences(train_comments)
    test_sequences = tokenizer.texts_to_sequences(test_comments)

    # force every sequence to exactly `maxlen` entries; no padding/truncating
    # arguments are given, so the Keras defaults apply (documented as 'pre')
    X_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
    X_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

    return max_features, maxlen, X_train, X_test, y
    
# Build the padded index matrices and the label matrix; get_model and
# createModel below read these names as globals.
max_features, maxlen, X_train, X_test, y = preProcessData()

In [4]:
# configure a model
def get_model():
    """Build and compile the bidirectional-LSTM comment classifier.

    Reads the module-level ``maxlen`` (input sequence length) and
    ``max_features`` (embedding vocabulary size).

    Returns:
        The compiled Keras ``Model``; its last layer has six sigmoid units.
    """
    embedding_dim = 128

    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embedding_dim)(tokens)

    # sequence encoder followed by max-pooling over the time axis
    encoded = Bidirectional(LSTM(50, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.1)(pooled)

    # dense head: one hidden layer, then one sigmoid unit per class
    hidden = Dense(50, activation="relu")(pooled)
    hidden = Dropout(0.1)(hidden)
    probabilities = Dense(6, activation="sigmoid")(hidden)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
    return classifier

# Parenthesised form prints the same text on Python 2 and Python 3.
print("done")

done


In [5]:
# create a model

def createModel(file_path, batch_size=32, epochs=2, validation_split=0.1, patience=20):
    """Train a fresh model and return it with its best checkpoint loaded.

    Reads the module-level ``X_train`` and ``y`` and builds the network
    with ``get_model()``.

    Args:
        file_path: where ``ModelCheckpoint`` stores the weights of the epoch
            with the lowest ``val_loss``; the same file is loaded back
            before returning.
        batch_size: mini-batch size passed to ``fit``.
        epochs: number of training epochs. The default is kept small;
            raise it for longer local runs.
        validation_split: fraction of the training data ``fit`` holds out
            for validation.
        patience: ``EarlyStopping`` patience on ``val_loss``. With the
            defaults (20 vs. 2 epochs) early stopping cannot trigger.

    Returns:
        The trained Keras model carrying the checkpointed weights.
    """
    model = get_model()

    # keep only the weights of the best epoch (lowest validation loss)
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", mode="min", patience=patience)

    model.fit(X_train, y,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=validation_split,
              callbacks=[checkpoint, early])

    # the final epoch is not necessarily the best one: restore the checkpoint
    model.load_weights(file_path)

    return model
# Parenthesised form prints the same text on Python 2 and Python 3.
print("done")

# file_path: file where model gets stored
model = createModel("weights_base.best.hdf5")

done
Train on 114890 samples, validate on 12766 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.04880, saving model to weights_base.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.04880 to 0.04709, saving model to weights_base.best.hdf5


In [9]:
# Save the trained model to its own file, separate from the
# weights_base.best.hdf5 checkpoint written during training.
model.save("bidirectional_lstm.h5")

In [6]:
# get predictions for the padded test comments; saveResults below reads
# one column per label (the model's last layer has six sigmoid units)
predictions = model.predict(X_test)

In [8]:
# Label columns, in the same order preProcessData uses to build y,
# and therefore the order of the prediction columns.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# create the submission file and store the results
def saveResults(file_path='submission.csv'):
    """Write the test ids and per-class predictions to a CSV file.

    Reads the module-level ``test`` (for its ``id`` column),
    ``predictions`` (indexed as ``predictions[:, i]``) and ``list_classes``
    (column names, in prediction-column order).

    Args:
        file_path: destination CSV; defaults to 'submission.csv' in the
            current working directory.
    """
    submission = pd.DataFrame({'id': test['id']})
    # column i of the prediction matrix belongs to the i-th class name
    for idx, col in enumerate(list_classes):
        submission[col] = predictions[:, idx]
    submission.to_csv(file_path, index=False)
    
# Write submission.csv (ids plus one column per label) to the working directory.
saveResults()

In [11]:
def get_scores(test, preds, fallback_preds_filename):
    """Score predictions against the labelled test set.

    Uses the module-level ``list_classes`` to pick the label columns.

    Args:
        test: DataFrame containing the true label columns. Pass ``None`` to
            load '../dataset/test_new.csv' instead.
        preds: predictions handed unchanged to the scoring helpers. Pass
            ``None`` to read them from ``fallback_preds_filename``.
        fallback_preds_filename: CSV with one column per entry of
            ``list_classes``; only read when ``preds`` is ``None``.

    Returns:
        (loss, auc): the results of ``calc_log_loss`` and
        ``calc_auc_score`` on (y_true, y_pred).

    Raises:
        ValueError: if ``preds`` and ``fallback_preds_filename`` are both
            ``None``.
    """
    # A function parameter is always bound, so a NameError check can never
    # detect a missing argument; absence is signalled with None instead.
    if test is None:
        true = pd.read_csv('../dataset/test_new.csv')
    else:
        true = test

    if preds is None:
        if fallback_preds_filename is None:
            raise ValueError("get_scores needs either preds or fallback_preds_filename")
        y_pred = pd.read_csv(fallback_preds_filename)[list_classes].values
    else:
        y_pred = preds

    y_true = true[list_classes].values

    loss = calc_log_loss(y_true, y_pred)
    auc = calc_auc_score(y_true, y_pred)
    return loss, auc

In [13]:
# Reload the labelled test set and score the in-memory predictions.
true = pd.read_csv('../dataset/test_new.csv')
pred = predictions

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_true = true[list_classes].values

# Spot-check the first test row: true labels, then predicted values.
# Parenthesised prints give the same output on Python 2 and Python 3.
print(y_true[0])
print(predictions[0])
loss_, aucs = get_scores(true, pred, fallback_preds_filename=None)

[0 0 0 0 0 0]
[0.18822935 0.00020282 0.00796122 0.0015497  0.01676907 0.00307044]


In [14]:
# Format the message first: print("label", value) prints a tuple repr on
# Python 2, whereas a single formatted string prints the same on 2 and 3.
print("Log loss = {}".format(loss_))
print("AUC Score = {}".format(aucs))

('Log loss = ', 0.04911418352106683)
('AUC Score = ', 0.9761352698716607)
