In [9]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

In [5]:
# read files
def readInputFiles(train_file_path, test_file_path):
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    train = train.sample(frac=1)
    return train, test
    
train, test = readInputFiles('./dataset/train_new.csv', './dataset/test_new.csv')

In [15]:
EMBEDDING_FILE = './embeddings/fasttext/crawl-300d-2M.vec'

In [8]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def preProcessData():
    max_features = 20000
    maxlen = 200
    
    y = train[list_classes].values
    list_sentences_train = train["comment_text"].fillna("CVxTz").values
    list_sentences_test = test["comment_text"].fillna("CVxTz").values
   
    tokenizer = Tokenizer(num_words=max_features)

    tokenizer.fit_on_texts(list(list_sentences_train))
    
    list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    
    X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
    
    return max_features, maxlen, X_train, X_test, y

max_features, maxlen, X_train, X_test, y = preProcessData()

NameError: global name 'pad_sequences' is not defined

In [16]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

In [17]:
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [18]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


In [19]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [21]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
def createModel():
    model = get_model()
    batch_size = 32
    epochs=2
    model.fit(X_train, y, epochs=epochs, batch_size=batch_size, validation_split=0.1)
    return model

## Fit model if doesn't exist already 

In [None]:
print("start fitting...")
file_path = "pooledgru_fasttext_model.h5"
if os.path.isfile(file_path):
    print ("Model already exists. Loading from path ", file_path)
    model = load_model(file_path)
else:
    print ("Model doesn't exist already, training model and saving at path ", file_path)
    model = createModel()
    model.save(file_path)

In [54]:
print("start predicting...")
y_pred = model.predict(X_test, batch_size=1024)
print ("done")

start prediccting...
done


In [11]:
submission = pd.DataFrame.from_dict({'id': test['id']})
for idx, col in enumerate(list_classes):
    submission[col] = y_pred[:,idx]
submission.to_csv('submission_pooled.csv', index=False)

done


## Trying Puneet's ROC function

In [28]:
from score import calc_auc_score, calc_log_loss

In [69]:
def get_scores(test, preds, fallback_preds_filename):
    try: 
        true = test
    except NameError:
        true = pd.read_csv('../dataset/test_new.csv')
    try: 
        y_pred = preds
    except NameError:
        pred = pd.read_csv(fallback_preds_filename)
        y_pred = pred[list_classes].values

    y_true = true[list_classes].values

    loss = calc_log_loss(y_true, y_pred)
    auc = calc_auc_score(y_true, y_pred)
    return loss, auc

In [74]:
true = pd.read_csv('../dataset/test_new.csv')
pred = y_pred

loss_, aucs = get_scores(true, pred, fallback_preds_filename=None)

In [75]:
print ("Log loss = ", loss_)
print ("AUC Score = ", aucs)

('Log loss = ', 0.6953239047674854)
('AUC Score = ', 0.5181614688451134)


## Demo

In [40]:
X_test

[[13474,
  19,
  6,
  2360,
  229,
  1032,
  75,
  7588,
  24,
  14,
  30,
  7561,
  7,
  311,
  52,
  540,
  2,
  81,
  5,
  289,
  2,
  19381,
  66,
  437,
  1063,
  9972,
  52,
  8,
  5,
  1,
  2133,
  3,
  30,
  7561,
  7,
  69,
  5,
  626,
  62,
  287,
  9,
  82,
  52,
  931,
  353,
  4,
  66,
  6073,
  1878,
  79,
  5390,
  2229,
  3879,
  3358,
  47,
  6,
  459,
  153,
  5,
  8678],
 [1412,
  94,
  12,
  20,
  350,
  15,
  29,
  318,
  12,
  2968,
  1,
  1065,
  3633,
  24,
  1347,
  374,
  170,
  37,
  7,
  72,
  89,
  30,
  619,
  22,
  6,
  18,
  54,
  223,
  25,
  1013,
  15,
  29,
  768,
  17,
  30,
  619,
  45,
  267,
  35,
  69,
  127,
  6],
 [4070, 46, 4499, 78, 39, 7, 33, 783, 1897, 297, 38, 311],
 [5101,
  465,
  73,
  1772,
  9,
  13,
  23,
  8,
  10,
  1,
  5101,
  465,
  68,
  3,
  107,
  1717,
  21,
  5544,
  190,
  41,
  19,
  945,
  7515,
  820,
  7120,
  3,
  9,
  18,
  238,
  2,
  33,
  21,
  5544,
  355,
  4,
  879,
  95,
  441,
  10,
  1,
  23,
  7,
  67,
  1

In [None]:
y_pred = model.predict(x_test, batch_size=1024)

In [109]:
zgg = tokenizer.texts_to_sequences(["They're not stupid"])
aaa = sequence.pad_sequences(zgg, maxlen=100)

In [110]:
np.set_printoptions(suppress=True)
model.predict(aaa)

array([[0.91784865, 0.00616852, 0.20719856, 0.00008358, 0.5931242 ,
        0.00994436]], dtype=float32)