https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051

In [1]:
import gc
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from tqdm import tqdm

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling3D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
# Notebook-wide settings, read by the cells below.
create_embedding = False   # True: rebuild the embedding matrix from word2vec; False: load the pickled one
embedding_dim = 300        # number of columns of the embedding matrix
sequence_length = 196      # maxlen used when padding the token sequences
max_features = 200000      # num_words cap handed to the Keras tokenizer

# Load the pickled train/test DataFrames.
# NOTE(review): unpickling executes arbitrary code — only load files you trust.
train, test = (
    pd.read_pickle(path)
    for path in ("../data/train_spacy_clean.pkl", "../data/test_spacy_clean.pkl")
)

# Second view of every comment: the same space-separated tokens in reverse order.
train['comment_reversed'] = train.comment_text.map(
    lambda comment: ' '.join(reversed(comment.split(' '))))
test['comment_reversed'] = test.comment_text.map(
    lambda comment: ' '.join(reversed(comment.split(' '))))

In [3]:
# Target columns; y_train keeps this column order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# A single tokenizer is fitted on all four text columns (train/test, forward/reversed),
# streamed in the order train, train-reversed, test, test-reversed.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(
    comment
    for column in (train.comment_text, train.comment_reversed,
                   test.comment_text, test.comment_reversed)
    for comment in column.values
)

word_index = tokenizer.word_index
nb_words = 1 + min(max_features, len(word_index))

# Labels first, then tokenise + pad each column in one step so no intermediate
# token lists stay alive.
y_train = train[list_classes].values

X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train.comment_text.values), maxlen=sequence_length)
X_train2 = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train.comment_reversed.values), maxlen=sequence_length)

X_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test.comment_text.values), maxlen=sequence_length)
X_test2 = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test.comment_reversed.values), maxlen=sequence_length)

# The raw frames are no longer needed once the padded arrays exist.
del train, test
gc.collect()

29

In [4]:
# Build (or load) the embedding matrix: row i holds the vector for the token
# whose tokenizer index is i; rows without a word2vec match stay all-zero.
if create_embedding:
    # NOTE(review): absolute, machine-specific path — the other cells read from ../data/...; confirm and align.
    embedding_file = '/home/w/Projects/Toxic/data/embeddings/GoogleNews-vectors-negative300.bin.gz'
    word2vec = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    print('Found %s word vectors of word2vec' % len(word2vec.vocab))

    embedding_matrix = np.zeros((nb_words, embedding_dim))
    for word, i in tqdm(word_index.items()):
        # tokenizer.word_index is not truncated to num_words, so i can exceed the
        # last row of the matrix; skip those indices instead of raising IndexError.
        if i >= nb_words:
            continue
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
    # Rows whose components sum to zero are counted as tokens without a vector.
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
else:
    # Reuse a previously pickled matrix.
    # NOTE(review): unpickling executes arbitrary code — only load files you trust.
    embedding_matrix = pd.read_pickle('../data/embeddings/GoogleNews_300dim_embedding.pkl')

In [8]:
# Project-local module providing the model builder (its source is not part of this notebook).
import keras_models_quora


# Training budget; early stopping normally ends the run well before `epochs`.
epochs = 100
batch_size = 128


# Stop after 6 epochs without val_loss improvement; multiply the learning rate
# by 0.7 after 4 stagnant epochs, never going below 1e-6.
model_callbacks = [EarlyStopping(monitor='val_loss', patience=6, verbose=1, mode='min'),
                   ReduceLROnPlateau(monitor='val_loss', factor=0.7, verbose=1,
                                     patience=4, min_lr=1e-6)]


# maxlen is tied to sequence_length so it always matches the padded inputs.
model = keras_models_quora.decomposable_attention('../data/embeddings/GoogleNews_300dim_embedding.pkl',
                                                  maxlen=sequence_length)
# Two inputs per sample: the forward comment and its reversed counterpart.
model.fit([X_train, X_train2], y_train, batch_size=batch_size, epochs=epochs,
          validation_split=0.1, callbacks=model_callbacks)

# The model was fitted on two inputs, so prediction must receive both test arrays.
y_test = model.predict([X_test, X_test2])

Train on 86265 samples, validate on 9586 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
10624/86265 [==>...........................] - ETA: 1:11 - loss: 0.0488 - binary_crossentropy: 0.0488 - acc: 0.9817

KeyboardInterrupt: 