In [None]:
import numpy as np
import pandas as pd
import re, csv, codecs

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers

import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss

## Read train and test data

In [None]:
# read files
def readInputFiles(train_file_path, test_file_path, random_state=None):
    """Load the train and test CSV files and shuffle the training rows.

    Parameters
    ----------
    train_file_path : str
        Path of the training CSV.
    test_file_path : str
        Path of the test CSV.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps
        the unseeded shuffle; pass an integer to get the same row order on
        every run.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The shuffled training frame (original index labels are kept) and
        the test frame in file order.
    """
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    # frac=1 draws every row exactly once, i.e. a full shuffle of the frame.
    train = train.sample(frac=1, random_state=random_state)
    return train, test
    
# Load both frames; the paths are relative to the notebook's working directory.
train, test = readInputFiles(
    '../dataset/train_new.csv',
    '../dataset/test_new.csv',
)

## Preprocessing Data

In [18]:
# Label columns to predict. Their order fixes the column order of the target
# matrix and of the per-class prediction columns written later on.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

def preProcessData(max_features=20000, maxlen=200):
    """Tokenize the comment text and build fixed-length integer sequences.

    Reads the notebook-level ``train`` / ``test`` frames and ``list_classes``.

    Parameters
    ----------
    max_features : int, optional
        Forwarded to ``Tokenizer(num_words=...)``. Defaults to 20000.
    maxlen : int, optional
        Forwarded to ``pad_sequences(maxlen=...)``. Defaults to 200.

    Returns
    -------
    tuple
        ``(max_features, maxlen, X_train, X_test, y)`` where ``X_train`` and
        ``X_test`` are the padded sequences for the train and test comments
        and ``y`` is ``train[list_classes].values``.
    """
    y = train[list_classes].values

    # read_csv parses empty cells as NaN (a float). Replace them with empty
    # strings so the tokenizer only ever receives text.
    list_sentences_train = train["comment_text"].fillna("").values
    list_sentences_test = test["comment_text"].fillna("").values

    tokenizer = Tokenizer(num_words=max_features)

    # The vocabulary is fitted on the training comments only; the test
    # comments are encoded with that same vocabulary.
    tokenizer.fit_on_texts(list(list_sentences_train))

    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

    X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

    return max_features, maxlen, X_train, X_test, y

# Run the preprocessing once and keep its results as notebook-level names,
# which the model-building and prediction cells read.
(max_features, maxlen,
 X_train, X_test, y) = preProcessData()

In [19]:
def get_model(embed_size=128, lstm_units=60, dense_units=50, dropout_rate=0.1):
    """Build and compile the LSTM multi-label classifier.

    Reads the notebook-level ``maxlen``, ``max_features`` and ``list_classes``.

    Parameters
    ----------
    embed_size : int, optional
        Output dimension of the embedding layer. Defaults to 128.
    lstm_units : int, optional
        Number of units in the LSTM layer. Defaults to 60.
    dense_units : int, optional
        Width of the hidden ReLU dense layer. Defaults to 50.
    dropout_rate : float, optional
        Rate used by both dropout layers. Defaults to 0.1.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer
        and the accuracy metric.
    """
    # One sigmoid output per label column; derived from list_classes so the
    # output width cannot drift from the label list.
    n_classes = len(list_classes)

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps the per-timestep outputs that the global
    # max pooling layer below reduces over.
    x = LSTM(lstm_units, return_sequences=True, name='lstm_layer')(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(n_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [20]:
def createModel(epochs=2, batch_size=32):
    """Build a fresh model with ``get_model()`` and fit it on ``X_train`` / ``y``.

    Parameters
    ----------
    epochs : int, optional
        Number of training epochs. Defaults to 2.
    batch_size : int, optional
        Mini-batch size used by ``fit``. Defaults to 32.

    Returns
    -------
    The fitted model returned by ``get_model()``.
    """
    model = get_model()
    # validation_split=0.1 holds out 10% of the supplied samples for
    # validation (per the Keras docs, taken from the end of the arrays).
    model.fit(X_train, y,
              epochs=epochs,
              batch_size=batch_size,
              validation_split=0.1)
    return model

## Fit the model if it doesn't already exist

In [27]:
file_path = "lstm_model_new_4.h5"
print("start fitting...")

# Train and save only when no saved model file is present; otherwise reuse
# the saved model so a re-run of the notebook skips training.
if not os.path.isfile(file_path):
    print ("Model doesn't exist already, training model and saving at path ", file_path)
    model = createModel()
    model.save(file_path)
else:
    print ("Model already exists. Loading from path ", file_path)
    model = load_model(file_path)

start fitting...
("Model doesn't exist already, training model and saving at path ", 'lstm_model_new_4.h5')
Train on 114890 samples, validate on 12766 samples
Epoch 1/2
Epoch 2/2


In [28]:
print("start predicting...")
# Predict on the padded test sequences; y_pred has one column per output unit.
y_pred = model.predict(
    X_test,
    batch_size=1024,
)
print("done")

start predicting...
done


In [29]:
# Start from the test ids, then append one prediction column per label,
# taking column idx of y_pred for the idx-th entry of list_classes.
submission = pd.DataFrame({'id': test['id']})
for idx, col in enumerate(list_classes):
    submission[col] = y_pred[:, idx]

submission.to_csv('submission.csv', index=False)
print("done")

done


In [30]:
def get_scores(test=None, preds=None, fallback_preds_filename=None):
    """Compute log loss and AUC of predictions against the labelled test set.

    Parameters
    ----------
    test : pandas.DataFrame, optional
        Frame containing the ``list_classes`` label columns. When ``None``,
        ``'../dataset/test_new.csv'`` is read instead.
    preds : array-like, optional
        Predictions handed to the scoring helpers as-is. When ``None``, they
        are loaded from ``fallback_preds_filename``.
    fallback_preds_filename : str, optional
        CSV with the ``list_classes`` columns, used only when ``preds`` is
        ``None``.

    Returns
    -------
    tuple
        ``(loss, auc)`` as returned by ``calc_log_loss`` and
        ``calc_auc_score``.

    Raises
    ------
    ValueError
        If both ``preds`` and ``fallback_preds_filename`` are ``None``.
    """
    # Function parameters are always bound, so a NameError can never signal a
    # missing argument; None is the "not provided" sentinel instead.
    if test is None:
        true = pd.read_csv('../dataset/test_new.csv')
    else:
        true = test

    if preds is None:
        if fallback_preds_filename is None:
            raise ValueError(
                "preds is None and no fallback_preds_filename was given")
        pred = pd.read_csv(fallback_preds_filename)
        y_pred = pred[list_classes].values
    else:
        y_pred = preds

    y_true = true[list_classes].values

    loss = calc_log_loss(y_true, y_pred)
    auc = calc_auc_score(y_true, y_pred)
    return loss, auc

In [31]:
# Score the in-memory predictions against the labelled test file.
pred = y_pred
true = pd.read_csv('../dataset/test_new.csv')

loss_, aucs = get_scores(
    true,
    pred,
    fallback_preds_filename=None,
)

In [32]:
# Report the two metrics returned by get_scores. The call form is left as-is:
# on a Python 2 kernel this prints a tuple, on Python 3 the two values.
print ("Log loss = ", loss_)
print ("AUC Score = ", aucs)

('Log loss = ', 0.048055253077555114)
('AUC Score = ', 0.9791128556297686)
