In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
# Matplotlib defaults for every figure in this notebook: white axes background, 12x9 figures.
plt.rcParams.update({
    'axes.facecolor': 'white',
    'figure.figsize': (12, 9),
})
# Seaborn theme is applied last, so any rc key it also manages takes seaborn's value.
sns.set(
    context='paper',
    style='darkgrid',
    font_scale=1.2,
    rc={'figure.facecolor': 'white'},
)

In [14]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, LeakyReLU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [15]:
# Fixed sequence length: used as the Input shape and as `maxlen` for pad_sequences.
maxlen = 100
# Vocabulary cap: passed to the Tokenizer as `num_words` and to Embedding as its input size.
max_features = 24000  # TODO

In [16]:
# Load the train/test CSVs (paths are relative to the notebook's working directory).
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")
# Shuffle the training rows once. Keras' `validation_split` holds out the *last* fraction of
# the arrays, so this shuffle decides which rows become validation data. A fixed random_state
# keeps that hold-out set, and therefore every val_loss compared later, reproducible across
# kernel restarts.
train = train.sample(frac=1, random_state=42)

In [17]:
# The six binary label columns predicted by the models below.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Raw comment strings; missing comments are replaced by the placeholder token "CVxTz".
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# Target matrix: one column per entry of list_classes, rows in the same order as `train`.
y = train[list_classes].values

In [18]:
# The vocabulary is fitted on the training comments only; `num_words` caps it at max_features.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training set: comments -> integer id sequences -> fixed-width matrix of `maxlen` columns.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test set goes through the same fitted tokenizer and the same padding length.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [19]:
# Single checkpoint file shared by every fit call that receives `callbacks`.
file_path = "weights_base.best.hdf5"

# Stop when val_loss has not improved for 20 epochs (the fit calls in this notebook use epochs=2).
early = EarlyStopping(monitor='val_loss', mode='min', patience=20)

# Write the model to file_path only when val_loss improves.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

callbacks = [checkpoint, early]

In [None]:
def get_model():
    """Build and compile the baseline classifier.

    Layer stack, in order: Input(maxlen) -> Embedding(max_features, 128) ->
    Bidirectional(LSTM(50, return_sequences=True)) -> GlobalMaxPool1D -> Dropout(0.1) ->
    Dense(50, relu) -> Dropout(0.1) -> Dense(6, sigmoid).

    Reads the module-level ``maxlen`` and ``max_features``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the 'nadam'
        optimizer and the 'accuracy' metric.
    """
    tokens = Input(shape=(maxlen, ))

    # Layers are instantiated in the order they are applied.
    hidden_stack = (
        Embedding(max_features, output_dim=128),
        Bidirectional(LSTM(50, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(50, activation='relu'),  # Leaky relu?
        Dropout(0.1),
    )
    features = tokens
    for layer in hidden_stack:
        features = layer(features)

    # One sigmoid unit per label column.
    outputs = Dense(6, activation='sigmoid')(features)

    baseline = Model(inputs=tokens, outputs=outputs)
    baseline.compile(
        loss='binary_crossentropy',
        optimizer='nadam',  # adam
        metrics=['accuracy'],
    )
    return baseline

In [25]:
# Train the baseline; the last 10% of X_t / y is held out for validation, and the shared
# callbacks checkpoint the model to file_path whenever val_loss improves.
model = get_model()
history = model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=callbacks,
)

Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2


In [20]:
# Restore the weights stored in the checkpoint file before predicting.
# NOTE(review): file_path is written by the ModelCheckpoint in `callbacks`, which every fit
# call in this notebook receives — confirm the file still belongs to `model`'s architecture
# when this cell is run.
model.load_weights(file_path)
# Predictions for the padded test matrix; presumably one column per entry of list_classes
# (the output layer built in get_model has 6 units).
y_test = model.predict(X_te)

In [22]:
# Start from the sample submission file so the output keeps its other columns unchanged.
sample_submission = pd.read_csv("input/sample_submission.csv")
# Overwrite the six label columns with the predictions.
# Assumes y_test rows are in the same order as the sample file's rows — TODO confirm.
sample_submission[list_classes] = y_test
# NOTE(review): the "output/" directory must already exist for this write to succeed — confirm.
sample_submission.to_csv("output/keras_baseline.csv", index=False)

# Hyperopt

In [20]:
def get_model_with_params(p):
    """Build and compile the classifier described by a hyper-parameter dict.

    Args:
        p: mapping with the keys
            'embedding_size' - embedding output dimension,
            'cell_type'      - 'lstm' selects LSTM; any other value selects GRU,
            'units'          - recurrent units per direction,
            'dropout_r'      - used for both `dropout` and `recurrent_dropout` of the cell,
            'dropout_1'      - dropout rate after global max pooling,
            'dense_1'        - width of the hidden Dense layer,
            'dropout_2'      - dropout rate after the LeakyReLU,
            'opt_algo'       - optimizer identifier handed to `compile`.

    Reads the module-level ``maxlen`` and ``max_features``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss and the 'accuracy' metric.
    """
    # The search space draws sizes with hp.choice over np.arange, which yields numpy integers.
    # Cast every layer size to a plain int (previously only 'units' was cast) so all layers
    # receive the same type regardless of where `p` came from.
    embedding_size = int(p['embedding_size'])
    units = int(p['units'])
    dense_units = int(p['dense_1'])

    # Same selection rule as before: anything that is not 'lstm' builds a GRU.
    rnn_cls = LSTM if p['cell_type'] == 'lstm' else GRU
    cell = rnn_cls(units, return_sequences=True, dropout=p['dropout_r'], recurrent_dropout=p['dropout_r'])

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, output_dim=embedding_size)(inp)
    x = Bidirectional(cell)(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(p['dropout_1'])(x)
    # Linear Dense followed by a separate LeakyReLU activation layer.
    x = Dense(dense_units)(x)
    x = LeakyReLU()(x)
    x = Dropout(p['dropout_2'])(x)
    # One sigmoid unit per label column.
    x = Dense(6, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=p['opt_algo'], metrics=['accuracy'])
    return model

In [None]:
%%time

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def score(p):
    """Hyperopt objective: train one model for the sampled parameters.

    Args:
        p: hyper-parameter dict; forwarded to ``get_model_with_params`` and also supplies
            'batch_size' and 'epochs' for ``fit``.

    Returns:
        dict with 'loss' set to the lowest validation loss seen over the epochs of this
        run, and 'status' set to ``STATUS_OK``.
    """
    print("Training with params:", p)
    model = get_model_with_params(p)
    # 'batch_size' is drawn with hp.choice over np.arange, so it arrives as a numpy integer;
    # cast it to a plain int, matching how the layer sizes are handled.
    h = model.fit(X_t, y, batch_size=int(p['batch_size']), epochs=p['epochs'], validation_split=0.1, callbacks=callbacks)
    # Best epoch rather than the last one (h.history['val_loss'][-1]). Named differently from
    # the enclosing function to avoid shadowing `score`, and coerced to a builtin float.
    best_val_loss = float(min(h.history['val_loss']))
    print("\tScore {0}\n".format(best_val_loss))
    return {'loss': best_val_loss, 'status': STATUS_OK}

def optimize():
    """Run a 16-evaluation TPE search over the model hyper-parameters.

    Returns:
        (best, trials) where ``best`` is the raw dict returned by ``fmin`` and ``trials``
        is the ``Trials`` object holding every evaluation. For parameters declared with
        ``hp.choice``, ``best`` holds the *index* of the chosen option, not the option
        itself; the decoded values are printed as "Best (decoded)".
    """
    # Local import so this cell does not depend on editing the import line above it.
    from hyperopt import space_eval

    trials = Trials()
    space = {
        'batch_size' : hp.choice('batch_size', np.arange(12, 25, dtype=int)),
        'dropout_1': hp.quniform('dropout_1', 0.00, 0.15, 0.025),
        'dropout_2': hp.quniform('dropout_2', 0.025, 0.2, 0.025),
        'dropout_r': 0, # hp.quniform('dropout_r', 0.00, 0.15, 0.025),
        'dense_1': hp.choice('dense_1', np.arange(40, 65, dtype=int)),
        'cell_type': hp.choice('cell_type', ['lstm', 'gru']),
        'embedding_size': hp.choice('embedding_size', np.arange(64, 81, dtype=int)), # [64, 96, 128]
        'units': hp.choice('units', np.arange(32, 48, dtype=int)),
        'opt_algo': hp.choice('opt_algo', ['rmsprop', 'nadam', 'adam']),
        'epochs': 2,
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=16)

    print("Best:", best)
    # Map the hp.choice indices in `best` back to actual parameter values, so the printed
    # result can be copied straight into a params dict for get_model_with_params.
    print("Best (decoded):", space_eval(space, best))
    return best, trials

b, t = optimize()

In [24]:
# Hand-picked hyper-parameters, using the same keys that get_model_with_params reads.
p = dict(
    cell_type='lstm',
    units=24,
    embedding_size=128,
    dense_1=60,
    dropout_1=0.25,
    dropout_2=0.25,
    dropout_r=0.0,
    opt_algo='adam',
    batch_size=20,
    epochs=2,
)
manual_model = get_model_with_params(p)
manual_model.summary()
# Train with the shared callbacks, so an improving val_loss is checkpointed to file_path.
h = manual_model.fit(
    X_t,
    y,
    batch_size=p['batch_size'],
    epochs=p['epochs'],
    validation_split=0.1,
    callbacks=callbacks,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 100, 64)           1536000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100, 48)           17088     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 48)                0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 48)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 60)                2940      
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 60)                0         
__________

In [13]:
# Load the model saved in the checkpoint file; no custom objects are registered.
# NOTE(review): file_path is shared by every fit call that receives `callbacks` (baseline,
# hyperopt trials, manual model), so which model is loaded here depends on what ran last
# and what was checkpointed — confirm before trusting the predictions.
final_model = load_model(file_path, custom_objects={ })
# Print the architecture so the loaded model can be identified by its layer sizes.
final_model.summary()
# Predictions for the padded test matrix; this rebinds y_test.
y_test = final_model.predict(X_te)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 128)          3072000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 100, 100)          53700     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 100)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 51)                5151      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 51)                0         
__________

In [14]:
# Start from the sample submission file so the output keeps its other columns unchanged.
sample_submission = pd.read_csv("input/sample_submission.csv")
# Overwrite the six label columns with the current y_test.
# Assumes y_test rows are in the same order as the sample file's rows — TODO confirm.
sample_submission[list_classes] = y_test
# NOTE(review): the number in the file name is presumably a validation loss — confirm.
# The "output/" directory must already exist for this write to succeed.
sample_submission.to_csv("output/keras_tuned_0.0457.csv", index=False)

In [None]:
# Plots
# my_plots = ['loss', 'acc']
# for plot in my_plots:
#     plt.plot(history.history[plot])
#     plt.plot(history.history['val_' + plot])
#     plt.title('model ' + plot)
#     plt.ylabel(plot)
#     plt.xlabel('epoch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()