In [25]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers

import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss

## Read train and test data

In [2]:
# Load the train and test splits; both paths are relative to the working directory.
train, test = map(pd.read_csv, ('../dataset/train_new.csv', '../dataset/test_new.csv'))

## Get the training class labels

In [3]:
# Label columns, kept in one place so later cells can reuse the same ordering.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per training comment, one column per class above.
y = train.loc[:, list_classes].values

In [4]:
# y.shape
# this is the y value: 
# rows ---> comment text, 
# columns --> "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
# [[0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [1 1 1 0 1 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]]

## Get the list of sentences in training and testing

In [5]:
# list_sentences_train
# 0    Explanation\nWhy the edits made under my usern...
# 1    D'aww! He matches this background colour I'm s...
# 2    Hey man, I'm really not trying to edit war. It...
# 3    "\nMore\nI can't make any real suggestions on ...
# 4    You, sir, are my hero. Any chance you remember...
# 5    "\n\nCongratulations from me as well, use the ...
# 6         COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
# 7    Your vandalism to the Matt Shirvington article...
# 8    Sorry if the word 'nonsense' was offensive to ...
# 9    alignment on this subject and which are contra...


In [6]:
# Raw comment text of each split; these Series are what the tokenizer consumes.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

### Create a word tokenizer 

The Tokenizer vectorizes a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector in which the coefficient for each token can be binary, based on word count, or based on tf-idf.

In [7]:
# Vocabulary cap handed to the Tokenizer as num_words: only the most common
# words (by frequency) are kept when texts are converted to sequences.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)

# The word index is fitted on the train and test comments together.
tokenizer.fit_on_texts(list_sentences_train.tolist() + list_sentences_test.tolist())

# Turn every comment into its list of integer word indexes.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [8]:
# pad_sequences brings every sequence to exactly `maxlen` entries: shorter
# sequences are padded and longer ones are truncated. With the defaults used
# here both padding and truncation happen at the FRONT of the sequence.
maxlen = 200
X_t, X_te = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

embed_size = 128
inp = Input(shape=(maxlen,))

# The network is a straight pipeline, so the layers are listed in order and
# then applied one after another to the input tensor.
layer_stack = [
    # Embedding: turns positive integers (word indexes) into dense vectors of
    # size embed_size, e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]].
    # It can only be used as the first layer of the model.
    Embedding(max_features, embed_size),
    # Sequence classification with an LSTM; return_sequences=True keeps the
    # output of every timestep so the pooling layer can reduce over them.
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    # Dropout randomly sets a fraction (rate 0.1) of its input units to 0 at
    # each update during training, which helps prevent overfitting.
    Dropout(0.1),
    # Hidden layer: 50 units with the rectifier (relu) activation.
    Dense(50, activation="relu"),
    Dropout(0.1),
    # Output layer: 6 sigmoid units, matching the 6 label columns.
    Dense(6, activation="sigmoid"),
]
x = inp
for layer in layer_stack:
    x = layer(x)

# The model takes input arrays of shape (*, 200) and outputs arrays of shape (*, 6).
model = Model(inputs=inp, outputs=x)

# Configure the model for training.
# binary_crossentropy: logarithmic loss, applied to each of the 6 outputs.
# adam: gradient-based optimizer with adaptive per-parameter learning rates.
# Alternative configuration (not used): loss='mean_squared_error',
# optimizer='sgd', metrics=['mae', 'acc'].
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("done")

done


In [31]:
print("start fitting...")
file_path = "lstm_model.h5"
if os.path.isfile(file_path):
    # A previously saved model is reused instead of retraining.
    # The path is formatted into the message: passing it as a second argument
    # inside the parentheses makes Python 2 print a tuple instead of a sentence.
    print("Model already exists. Loading from path {}".format(file_path))
    model = load_model(file_path)
else:
    print("Model doesn't exist already, training model and saving at path {}".format(file_path))
    # 10% of the training rows are held out for validation during fitting.
    model.fit(X_t, y, epochs=2, batch_size=32, validation_split=0.1)
    model.save(file_path)

print("done")

start fitting...
('Model already exists. Loading from path ', 'lstm_model.h5')
done


In [10]:
print("start preditcting...")
# Score the padded test sequences; y_pred holds one row per test comment and
# one column per output unit of the model.
y_pred = model.predict(x=X_te, batch_size=1024)
print("done")

start preditcting...
done


In [11]:
# Assemble the submission frame: the test ids followed by one score column per
# class, where column i of y_pred belongs to list_classes[i].
submission = test[['id']].copy()
for col, class_scores in zip(list_classes, y_pred.T):
    submission[col] = class_scores
submission.to_csv('submission.csv', index=False)
print("done")

done


In [12]:
# Dump the model configuration as a JSON string and show it.
r = model.to_json()
print(r)

{"class_name": "Model", "keras_version": "2.1.6", "config": {"layers": [{"class_name": "InputLayer", "config": {"dtype": "float32", "batch_input_shape": [null, 200], "name": "input_1", "sparse": false}, "inbound_nodes": [], "name": "input_1"}, {"class_name": "Embedding", "config": {"embeddings_initializer": {"class_name": "RandomUniform", "config": {"maxval": 0.05, "seed": null, "minval": -0.05}}, "name": "embedding_1", "dtype": "float32", "output_dim": 128, "trainable": true, "embeddings_regularizer": null, "input_dim": 20000, "mask_zero": false, "embeddings_constraint": null, "batch_input_shape": [null, null], "activity_regularizer": null, "input_length": null}, "inbound_nodes": [[["input_1", 0, 0, {}]]], "name": "embedding_1"}, {"class_name": "LSTM", "config": {"recurrent_activation": "hard_sigmoid", "trainable": true, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"seed": null, "gain": 1.0}}, "use_bias": true, "bias_regularizer": null, "return_state": false, "unro

In [20]:
# Persist the current model to the HDF5 file that the fitting cell looks for.
model.save(filepath="lstm_model.h5")

In [26]:
# Read the saved model back from disk into a separate variable.
new_model = load_model(filepath='lstm_model.h5')

In [15]:
def get_scores(test, preds, fallback_preds_filename=None):
    """Compute the log loss and AUC score of predictions against true labels.

    Parameters
    ----------
    test : pandas.DataFrame or None
        Frame holding the true labels in the ``list_classes`` columns. When
        ``None``, the labels are read from '../dataset/test_new.csv'.
    preds : array-like or None
        Predicted scores, passed straight to the scoring helpers. When
        ``None``, they are read from ``fallback_preds_filename``.
    fallback_preds_filename : str, optional
        CSV file with the ``list_classes`` columns, used only when ``preds``
        is ``None``.

    Returns
    -------
    tuple
        ``(loss, auc)`` as returned by ``calc_log_loss`` and
        ``calc_auc_score``.

    Raises
    ------
    ValueError
        If ``preds`` is ``None`` and no fallback file name is given.
    """
    # Function parameters are always bound, so the former `except NameError`
    # fallbacks could never run; an explicit None check makes them reachable.
    true = pd.read_csv('../dataset/test_new.csv') if test is None else test

    if preds is None:
        if fallback_preds_filename is None:
            raise ValueError(
                "preds is None and no fallback_preds_filename was provided")
        y_pred = pd.read_csv(fallback_preds_filename)[list_classes].values
    else:
        y_pred = preds

    y_true = true[list_classes].values

    loss = calc_log_loss(y_true, y_pred)
    auc = calc_auc_score(y_true, y_pred)
    return loss, auc

In [16]:
# Ground-truth labels for the test split and the predictions to score.
true = pd.read_csv('../dataset/test_new.csv')
pred = y_pred

# Same label columns, in the same order, as used to build the training targets.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_true = true[list_classes].values

# Show the first true/predicted row as a sanity check. print() with a single
# argument behaves the same on Python 2 and Python 3, unlike the print statement.
print(y_true[0])
print(y_pred[0])
loss_, aucs = get_scores(true, pred, fallback_preds_filename=None)

[0 0 0 0 0 0]
[0.17043376 0.00030354 0.00837982 0.00228762 0.02350723 0.00728446]


In [19]:
# Format the values into the message: passing two arguments inside the
# parentheses makes Python 2 print a tuple such as ('Log loss = ', 0.04...).
print("Log loss = {}".format(loss_))
print("AUC Score = {}".format(aucs))

('Log loss = ', 0.04767628315445003)
('AUC Score = ', 0.9788112586949559)


In [46]:
from sklearn.metrics import confusion_matrix

# Index of the highest-scoring class for every test row.
# NOTE(review): the model emits six sigmoid scores per row (multi-label), so
# argmax returns a class index even when every score is low -- confirm this is
# what the downstream use expects.
y_classes = y_pred.argmax(axis=1)
# print() with a single argument works on both Python 2 and Python 3.
print(y_classes)

[0 0 0 ... 0 0 0]
