In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams["figure.figsize"] = (12, 9)
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [2]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, LeakyReLU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras import regularizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Vocabulary size: used as the tokenizer's num_words and as the Embedding layers' input size.
max_features = 24000  # TODO
# Fixed sequence length: used as pad_sequences' maxlen and as the models' Input shape.
maxlen = 100

In [4]:
# Load the competition data; paths are relative to the notebook's working directory.
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")

# Shuffle the training rows once. The seed is fixed so that the shuffle -- and therefore
# the slice of rows that `validation_split` later holds out -- is the same on every
# Restart & Run All, which keeps validation losses comparable between runs.
SHUFFLE_SEED = 42
train = train.sample(frac=1, random_state=SHUFFLE_SEED)

In [17]:
test.shape

(153164, 2)

In [18]:
train.shape

(159571, 8)

In [5]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
136211,d8a64df4f787e25c,"""similar statement was made about methane. I r...",0,0,0,0,0,0
158053,e8009916c2ed8733,"""\n\n The difference between a first serializa...",0,0,0,0,0,0
44702,77728ec3cd3557bf,"""\n\nThat was already discussed above. Just re...",0,0,0,0,0,0
65845,b015b87aa218d5e4,Concerning Peter Cushing\nThere is a local son...,0,0,0,0,0,0
104842,30ecfea816cf7821,"Okay, bin Laden. You just keep concentrating o...",1,0,0,0,0,0


In [6]:
# Target columns; their order defines the column order of `y` and of the model outputs.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Raw comment text for each split; missing comments are replaced by the placeholder "CVxTz".
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("CVxTz").values for frame in (train, test)
)

# Label matrix: one row per training comment, one column per entry of list_classes.
y = train[list_classes].values

In [7]:
# Build the vocabulary from the training comments only; num_words caps its size.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training split: words -> integer ids, then pad/truncate every comment to maxlen ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test split: same tokenizer, same sequence length.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
# Checkpoint file used by every fit call in this notebook that passes `callbacks`.
file_path = "weights_base.best.hdf5"

callbacks = [
    # Save to file_path whenever the validation loss improves.
    ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
    # Stop once the validation loss has not improved for 20 epochs.
    # NOTE(review): every fit call in this notebook trains for 2 epochs, so this never fires.
    EarlyStopping(monitor='val_loss', mode='min', patience=20),
]
checkpoint, early = callbacks

In [9]:
# Share of negatives (0) and positives (1) for every target column.
for label in list_classes:
    class_share = train[label].value_counts(normalize=True)
    print(class_share)

0    0.904156
1    0.095844
Name: toxic, dtype: float64
0    0.990004
1    0.009996
Name: severe_toxic, dtype: float64
0    0.947052
1    0.052948
Name: obscene, dtype: float64
0    0.997004
1    0.002996
Name: threat, dtype: float64
0    0.950636
1    0.049364
Name: insult, dtype: float64
0    0.991195
1    0.008805
Name: identity_hate, dtype: float64


In [10]:
def get_model():
    """Build and compile the baseline bidirectional-LSTM classifier.

    Architecture: Input(maxlen) -> Embedding(max_features, 128) -> BiLSTM(50 per
    direction, full sequence) -> global max pooling -> Dropout(0.1) ->
    Dense(50, relu, L2 0.001) -> Dropout(0.1) -> Dense(6, sigmoid).

    Returns:
        The model, compiled with binary cross-entropy, the 'adam' optimizer and
        an accuracy metric.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, output_dim=128)(token_ids)
    encoded = Bidirectional(LSTM(50, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    pooled = Dropout(0.1)(pooled)
    hidden = Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.001))(pooled)
    hidden = Dropout(0.1)(hidden)
    # One independent sigmoid per label: a comment may carry several labels at once.
    probabilities = Dense(6, activation='sigmoid')(hidden)

    model = Model(inputs=token_ids, outputs=probabilities)
    # 'nadam' was noted in the original as an alternative optimizer.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [11]:
# Train the baseline; `validation_split` holds out 10% of the rows for validation and the
# shared callbacks checkpoint the best validation loss.
model = get_model()
history = model.fit(
    X_t,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=callbacks,
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


In [None]:
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
86240/86265 [============================>.] - ETA: 0s - loss: 0.0829 - acc: 0.9765Epoch 00001: val_loss did not improve
86265/86265 [==============================] - 652s 8ms/step - loss: 0.0829 - acc: 0.9765 - val_loss: 0.0556 - val_acc: 0.9812
Epoch 2/2
86240/86265 [============================>.] - ETA: 0s - loss: 0.0505 - acc: 0.9823Epoch 00002: val_loss did not improve
86265/86265 [==============================] - 703s 8ms/step - loss: 0.0504 - acc: 0.9823 - val_loss: 0.0549 - val_acc: 0.9817

In [12]:
# Restore the weights stored at file_path (the file the ModelCheckpoint callback writes to).
# NOTE(review): the same file is reused by every later fit in this notebook; if cells run out
# of order the stored weights may belong to a different architecture -- confirm before loading.
model.load_weights(file_path)
# Predicted probabilities for the test set: one column per class in list_classes.
y_test = model.predict(X_te)

In [13]:
import os

# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs("output", exist_ok=True)

# Reuse the sample submission as a template (it supplies the id column and column layout)
# and overwrite the six class columns with the predicted probabilities.
sample_submission = pd.read_csv("input/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("output/keras_baseline.csv", index=False)

In [15]:
sample_submission.shape

(153164, 7)

In [16]:
X_te.shape

(153164, 100)

# Hyperparameter search with Hyperopt

In [10]:
def get_model_with_params(p):
    """Build and compile a bidirectional recurrent classifier from a hyper-parameter dict.

    Args:
        p: mapping with the keys
            'embedding_size' -- embedding output dimension,
            'cell_type'      -- 'lstm' selects an LSTM; any other value selects a GRU,
            'units'          -- recurrent units per direction,
            'dropout_1'      -- dropout rate applied after global max pooling,
            'dense_1'        -- width of the hidden dense layer,
            'dropout_2'      -- dropout rate applied after the hidden dense layer,
            'opt_algo'       -- optimizer handed to ``compile``.
            Any other key (e.g. 'dropout_r', 'batch_size', 'epochs') is ignored here.

    Returns:
        The compiled model: binary cross-entropy loss, accuracy metric, six sigmoid outputs.
    """
    # The search space draws the sizes from np.arange, so they arrive as NumPy integers.
    # Normalise all of them to plain ints, the way 'units' already was.
    embedding_size = int(p['embedding_size'])
    recurrent_units = int(p['units'])
    dense_units = int(p['dense_1'])

    # Anything other than 'lstm' falls back to GRU, as in the original if/else.
    recurrent_layer = LSTM if p['cell_type'] == 'lstm' else GRU

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, output_dim=embedding_size)(inp)
    # p['dropout_r'] is not wired in: the recurrent layer is built without dropout arguments.
    x = Bidirectional(recurrent_layer(recurrent_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(p['dropout_1'])(x)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(p['dropout_2'])(x)
    x = Dense(6, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=p['opt_algo'], metrics=['accuracy'])
    return model

In [12]:
%%time

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def score(p):
    """Hyperopt objective: train one candidate model and report its best validation loss.

    Args:
        p: hyper-parameter dict; forwarded to ``get_model_with_params`` and also read
           for 'batch_size' and 'epochs'.

    Returns:
        A dict with 'loss' (the minimum validation loss over the epochs) and
        'status' (STATUS_OK), which is the shape ``fmin`` is given to minimise.
    """
    print("Training with params:", p)
    candidate = get_model_with_params(p)
    fit_history = candidate.fit(
        X_t,
        y,
        batch_size=p['batch_size'],
        epochs=p['epochs'],
        validation_split=0.1,
        callbacks=callbacks,
    )
    # Best epoch rather than last epoch.
    best_val_loss = min(fit_history.history['val_loss'])
    print("\tScore {0}\n".format(best_val_loss))
    return {'loss': best_val_loss, 'status': STATUS_OK}

def optimize(max_evals=24):
    """Run a TPE search over the model hyper-parameters, minimising ``score``.

    Args:
        max_evals: number of configurations to train and evaluate (default 24).

    Returns:
        (best, trials) where ``best`` is exactly what ``fmin`` returned and ``trials``
        is the ``Trials`` object holding every evaluation. For parameters defined with
        ``hp.choice``, ``best`` holds the *index* of the chosen option, not the option
        itself; the decoded values are what gets printed.
    """
    from hyperopt import space_eval  # only needed here, to decode fmin's result

    trials = Trials()
    space = {
        # Plain Python ints so the sampled sizes need no NumPy-scalar handling downstream.
        'batch_size': hp.choice('batch_size', list(range(16, 33))),
        'dropout_1': hp.quniform('dropout_1', 0.00, 0.15, 0.025),
        'dropout_2': hp.quniform('dropout_2', 0.025, 0.2, 0.025),
        'dropout_r': 0,  # hp.quniform('dropout_r', 0.00, 0.15, 0.025),
        'dense_1': hp.choice('dense_1', list(range(44, 65))),
        'cell_type': hp.choice('cell_type', ['lstm', 'gru']),
        'embedding_size': hp.choice('embedding_size', list(range(64, 129))),  # [64, 96, 128]
        'units': hp.choice('units', list(range(40, 65))),
        'opt_algo': hp.choice('opt_algo', ['adam']),  # 'rmsprop', 'nadam',
        'epochs': 2,
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    # fmin reports hp.choice parameters as option indices; space_eval maps them back to
    # the actual values so the printed configuration can be used directly.
    print("Best:", space_eval(space, best))
    return best, trials

b, t = optimize()

Training with params: {'batch_size': 20, 'cell_type': 'lstm', 'dense_1': 52, 'dropout_1': 0.0, 'dropout_2': 0.15000000000000002, 'dropout_r': 0, 'embedding_size': 121, 'epochs': 2, 'opt_algo': 'adam', 'units': 44}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.051197376069030846

Training with params: {'batch_size': 16, 'cell_type': 'lstm', 'dense_1': 46, 'dropout_1': 0.15000000000000002, 'dropout_2': 0.17500000000000002, 'dropout_r': 0, 'embedding_size': 66, 'epochs': 2, 'opt_algo': 'adam', 'units': 51}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.05123978088192693

Training with params: {'batch_size': 21, 'cell_type': 'gru', 'dense_1': 64, 'dropout_1': 0.05, 'dropout_2': 0.025, 'dropout_r': 0, 'embedding_size': 83, 'epochs': 2, 'opt_algo': 'adam', 'units': 43}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.04893448421454191

Training with params: {'batch_size': 21, 'cell_type': 'gru', 'dens

Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.04949153492787972

Training with params: {'batch_size': 18, 'cell_type': 'gru', 'dense_1': 53, 'dropout_1': 0.025, 'dropout_2': 0.1, 'dropout_r': 0, 'embedding_size': 75, 'epochs': 2, 'opt_algo': 'adam', 'units': 52}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.049074465629109665

Training with params: {'batch_size': 31, 'cell_type': 'lstm', 'dense_1': 55, 'dropout_1': 0.025, 'dropout_2': 0.1, 'dropout_r': 0, 'embedding_size': 116, 'epochs': 2, 'opt_algo': 'adam', 'units': 59}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.04971459474542928

Training with params: {'batch_size': 16, 'cell_type': 'lstm', 'dense_1': 64, 'dropout_1': 0.125, 'dropout_2': 0.125, 'dropout_r': 0, 'embedding_size': 80, 'epochs': 2, 'opt_algo': 'adam', 'units': 48}
Train on 86265 samples, validate on 9586 samples
Epoch 1/2
Epoch 2/2
	Score 0.050394979134184906

Training wi

In [11]:
# Hand-picked configuration, trained outside the hyperopt search.
p = {
    'cell_type': 'gru',
    'opt_algo': 'adam',
    'units': 50,
    'batch_size': 32,
    'embedding_size': 80,
    'dense_1': 64,
    'dropout_1': 0.05,
    'dropout_2': 0.05,
    'dropout_r': 0.0,
    'epochs': 2,
}

manual_model = get_model_with_params(p)
manual_model.summary()

# Same data, validation split and shared callbacks as the other training runs.
h = manual_model.fit(
    X_t,
    y,
    batch_size=p['batch_size'],
    epochs=p['epochs'],
    validation_split=0.1,
    callbacks=callbacks,
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 80)           1920000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          39300     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_______________________________________________________

In [10]:
from crossvalidation import multilabel_cross_validation, multilabel_label_combinations
from multilabel_classifier import MultilabelClassifier
from transform_pipeline import TransformPipeline

#from nltk.tokenize import wordpunct_tokenize
#from nltk.stem.snowball import EnglishStemmer
#from nltk.stem import WordNetLemmatizer
#from functools import lru_cache

#from textblob import TextBlob

#from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.base import TransformerMixin

from visualizations import topn_features, confusion_matrix

In [52]:
# class ModelTransformer(TransformerMixin):

#     def __init__(self, model):
#         self.model = model

#     def fit(self, *args, **kwargs):
#         self.model.fit(*args, **kwargs)
#         return self

#     def transform(self, X, **transform_params):
#         return DataFrame(self.model.predict(X))
    

class KerasAnswerExtractor(TransformerMixin):
    """Expose one output column of a multi-label model as a single-label predictor.

    Args:
        model: object whose ``predict(X)`` returns a 2-D array with one column per class.
        column_name: name of the class whose column ``predict`` should return.
        classes: optional sequence of class names giving the column order of the model
            output. When omitted, the notebook-level ``list_classes`` is looked up at
            predict time, which is what the original implementation always did.
    """

    def __init__(self, model, column_name, classes=None):
        self.model = model
        self.column_name = column_name
        self.classes = classes

    def fit(self, *args, **kwargs):
        """Do nothing and return self; the wrapped model is not trained here."""
        return self

    def transform(self, X, **transform_params):
        """Return X unchanged."""
        return X

    def predict(self, X, *args, **kwargs):
        """Return the wrapped model's predictions for ``column_name`` as an (n, 1) array.

        Raises:
            ValueError: if ``column_name`` is not among the class names.
        """
        class_names = list_classes if self.classes is None else self.classes
        column = list(class_names).index(self.column_name)
        prediction = self.model.predict(X)[:, column]
        # reshape(-1, 1) also works for zero rows, where reshape(len(x), -1) raises.
        return prediction.reshape(-1, 1)
    

In [37]:
# Confusion matrix for the 'toxic' column of the hand-tuned model, on the training inputs.
# NOTE(review): the (estimator, X, y, do_fit=...) call shape does not match
# sklearn.metrics.confusion_matrix, so this presumably relies on the `confusion_matrix`
# imported from the project's `visualizations` module -- confirm that import cell ran last.
class_name = 'toxic'
class_y = train[class_name].values
confusion_matrix(KerasAnswerExtractor(manual_model, class_name),
                 X_t,
                 class_y.reshape(len(class_y), -1), 
                 do_fit=False)

Unnamed: 0,predicted negative,predicted positive
negative,0.986284,0.013716
positive,0.137722,0.862278


In [15]:
# `confusion_matrix` is imported twice in this notebook (from sklearn.metrics and from the
# project's `visualizations` module). Bind the sklearn one under its own name so this cell
# does not depend on which import cell happened to run last.
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

final_model = load_model(file_path, custom_objects={ })
final_model.summary()

# NOTE: X_t is the whole training set, so these figures include rows the model was fitted on.
y_train = final_model.predict(X_t)
for class_index, class_name in enumerate(list_classes):
    # Keep the predictions as an (n, 1) column, as before.
    pred = y_train[:, [class_index]]
    class_y = train[class_name].values
    # labels=[0, 1] keeps the matrix 2x2 and log_loss defined even if a class never occurs.
    cm = sk_confusion_matrix(class_y, (pred > 0.5).astype(int), labels=[0, 1])
    print(class_name, "log loss:", log_loss(class_y, pred, labels=[0, 1]))
    # Row-normalise: each row shows how the true negatives / true positives were classified.
    print(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_22 (Embedding)     (None, 100, 78)           1872000   
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 100, 128)          54912     
_________________________________________________________________
global_max_pooling1d_22 (Glo (None, 128)               0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 60)                7740      
_________________________________________________________________
dropout_44 (Dropout)         (None, 60)                0         
__________

In [None]:
# for class_name in list_classes:
#     clf = KerasAnswerExtractor(manual_model, class_name)
#     pred = clf.predict(X_t)
#     class_y = train[class_name].values
#     cm = confusion_matrix(class_y.reshape(len(class_y), -1), (pred > 0.5).astype(int))
#     print(class_name, "log loss:", log_loss(class_y.reshape(len(class_y), -1), pred))
#     print(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])

In [None]:
toxic log loss: 0.07620896772834636
[[0.98964371 0.01035629]
 [0.20082278 0.79917722]]
severe_toxic log loss: 0.024454393614495473
[[0.99581603 0.00418397]
 [0.53678756 0.46321244]]
obscene log loss: 0.04788283939460444
[[0.98856098 0.01143902]
 [0.13114112 0.86885888]]
threat log loss: 0.0137781501289922
[[9.99989534e-01 1.04661629e-05]
 [9.96721311e-01 3.27868852e-03]]
insult log loss: 0.06003329518399972
[[0.98591441 0.01408559]
 [0.20986359 0.79013641]]
identity_hate log loss: 0.02840215769807751
[[9.99905300e-01 9.46999590e-05]
 [9.87714988e-01 1.22850123e-02]]


toxic log loss: 0.07290494285026734
[[0.98549888 0.01450112]
 [0.13467576 0.86532424]]
severe_toxic log loss: 0.023505014466099847
[[0.99798706 0.00201294]
 [0.68082902 0.31917098]]
obscene log loss: 0.04956906263386151
[[0.98674263 0.01325737]
 [0.1233118  0.8766882 ]]
threat log loss: 0.013953790726962609
[[1. 0.]
 [1. 0.]]
insult log loss: 0.06100324271304353
[[0.98399315 0.01600685]
 [0.20587618 0.79412382]]
identity_hate log loss: 0.028804451678192445
[[1. 0.]
 [1. 0.]]

In [48]:
(pred > 0.5).astype(int)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [50]:
log_loss(class_y.reshape(len(class_y), -1), pred)

0.9008559668824437

In [13]:
# Reload the full model saved at file_path, print its architecture, and predict the
# per-class probabilities for the padded test sequences.
final_model = load_model(file_path, custom_objects={ })
final_model.summary()
y_test = final_model.predict(X_te)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 128)          3072000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 100, 100)          53700     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 100)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 51)                5151      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 51)                0         
__________

In [14]:
import os

# to_csv does not create missing directories, so make sure the target folder exists first.
os.makedirs("output", exist_ok=True)

# Reuse the sample submission as a template and overwrite the six class columns with the
# predicted probabilities.
sample_submission = pd.read_csv("input/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("output/keras_tuned_0.0457.csv", index=False)

In [None]:
# Plots
# my_plots = ['loss', 'acc']
# for plot in my_plots:
#     plt.plot(history.history[plot])
#     plt.plot(history.history['val_' + plot])
#     plt.title('model ' + plot)
#     plt.ylabel(plot)
#     plt.xlabel('epoch')
#     plt.legend(['train', 'test'], loc='upper left')
#     plt.show()