In [45]:
import configparser
import pandas as pd
import re
import numpy as np
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [33]:
config = configparser.ConfigParser()
config.read('../config.cfg')
train = pd.read_csv(config['FILES']['TRAIN'])
test = pd.read_csv(config['FILES']['TEST'])

In [90]:
def my_sentences(df):
    corpus = open(config['FILES']['GLOVE_DIR'] + 'custom_corpus', 'a')
    for comment in df['comment_text']:
        comment = ' '.join(re.split('\n',comment.lower()))
        comment = re.sub(r'[^a-zA-Z0-9\s:]', '', comment)
        corpus.write(comment)


In [91]:
my_sentences(train)
my_sentences(test)

Create Glove vector with custom_glove.sh

In [96]:
EMBEDDING_FILE=config['FILES']['GLOVE_DIR'] + 'vectors.txt'
list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [98]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)


In [108]:
def get_coefs(word,*arr):
    return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))
all_embs = np.stack([val if len(val)==50 else np.append(val,[0]) for val in embeddings_index.values() ])
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(-0.020803546066841937, 0.5339872121371699)

In [109]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [110]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [111]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc57abb0080>

In [112]:
test_label = pd.read_csv(config['FILES']['LABEL'])
test_label_strip = test_label[test_label.toxic != -1]
yt = test_label_strip[list_classes].values

In [113]:
model.evaluate(X_te[test_label_strip.index], yt, batch_size=1024)



[0.06681015081218045, 0.9725426753757036]

In [114]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model2 = Model(inputs=inp, outputs=x)
model2.compile(loss='hinge', optimizer='adam', metrics=['accuracy'])

In [115]:
model2.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc575f32b70>

In [116]:
model2.evaluate(X_te[test_label_strip.index], yt, batch_size=1024)



[0.9622319583361563, 0.9976085531375815]