In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

In [15]:
EMBEDDING_FILE = './embeddings/fasttext/crawl-300d-2M.vec'

In [11]:
train = pd.read_csv('./dataset/train_new.csv')
test = pd.read_csv('./dataset/test_new.csv')

X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values
y_test = test[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [38]:
type(test["comment_text"]

numpy.ndarray

In [12]:
max_features = 30000
maxlen = 100
embed_size = 300

In [13]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [16]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

In [17]:
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [18]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


In [19]:
def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [21]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
model = get_model()


batch_size = 32
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

#hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

In [None]:
print("start fitting...")
file_path = "pooledgru_fasttext_model.h5"
if os.path.isfile(file_path):
    print ("Model already exists. Loading from path ", file_path)
    model = load_model(file_path)
else:
    print ("Model doesn't exist already, training model and saving at path ", file_path)
    model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)
    model.save(file_path)

print ("done")

In [None]:
y_pred = model.predict(x_test, batch_size=1024)

In [27]:
roc_auc_score(y_test, y_pred)

0.9869043067409287

## Trying Pyuneet;s ROC fucntiuo

In [28]:
from score import calc_auc_score, calc_log_loss

In [31]:
calc_auc_score(y_test, y_pred)

0.9869043067409287

## Demo

In [40]:
X_test

[[13474,
  19,
  6,
  2360,
  229,
  1032,
  75,
  7588,
  24,
  14,
  30,
  7561,
  7,
  311,
  52,
  540,
  2,
  81,
  5,
  289,
  2,
  19381,
  66,
  437,
  1063,
  9972,
  52,
  8,
  5,
  1,
  2133,
  3,
  30,
  7561,
  7,
  69,
  5,
  626,
  62,
  287,
  9,
  82,
  52,
  931,
  353,
  4,
  66,
  6073,
  1878,
  79,
  5390,
  2229,
  3879,
  3358,
  47,
  6,
  459,
  153,
  5,
  8678],
 [1412,
  94,
  12,
  20,
  350,
  15,
  29,
  318,
  12,
  2968,
  1,
  1065,
  3633,
  24,
  1347,
  374,
  170,
  37,
  7,
  72,
  89,
  30,
  619,
  22,
  6,
  18,
  54,
  223,
  25,
  1013,
  15,
  29,
  768,
  17,
  30,
  619,
  45,
  267,
  35,
  69,
  127,
  6],
 [4070, 46, 4499, 78, 39, 7, 33, 783, 1897, 297, 38, 311],
 [5101,
  465,
  73,
  1772,
  9,
  13,
  23,
  8,
  10,
  1,
  5101,
  465,
  68,
  3,
  107,
  1717,
  21,
  5544,
  190,
  41,
  19,
  945,
  7515,
  820,
  7120,
  3,
  9,
  18,
  238,
  2,
  33,
  21,
  5544,
  355,
  4,
  879,
  95,
  441,
  10,
  1,
  23,
  7,
  67,
  1

In [None]:
y_pred = model.predict(x_test, batch_size=1024)

In [109]:
zgg = tokenizer.texts_to_sequences(["They're not stupid"])
aaa = sequence.pad_sequences(zgg, maxlen=100)

In [110]:
np.set_printoptions(suppress=True)
model.predict(aaa)

array([[0.91784865, 0.00616852, 0.20719856, 0.00008358, 0.5931242 ,
        0.00994436]], dtype=float32)