## Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

## Load features
To load the features you first have to create them by running the `feature_engineering` notebook. Beware: it takes about 2-3 hours to run, so save your features!

In [2]:
# Load the pre-computed training features (see the note above: they are produced
# by the feature_engineering notebook).
# NOTE(review): unpickling can execute arbitrary code -- only load pickles you generated yourself.
train = pd.read_pickle('../features/train.pkl')

In [None]:
# Load the pre-computed test features; only needed for the submission section.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles you generated yourself.
test = pd.read_pickle('../features/test.pkl')

## Consts
Always use a constant SEED; otherwise the experiment is not reproducible, and in that case why are we doing it?

In [3]:
# Single seed value reused for every random number generator seeded in this notebook.
SEED = 42

# Seed NumPy's global RNG and TensorFlow so repeated runs start from the same state.
np.random.seed(SEED)
tf.set_random_seed(SEED)

## Feature selection

In [4]:
from sklearn.feature_selection import VarianceThreshold
from keras.preprocessing import sequence, text

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

Using TensorFlow backend.


In [5]:
# Name of the label column in the training frame.
target = 'is_duplicate'

# Pre-computed feature columns fed to the dense branch of the model.
# One name per line; the order here fixes the column order of X.
features = [
    'cosin_sim',
    'word_share',
    'q1_char_num',
    'q1_word_num',
    'q2_char_num',
    'q2_word_num',
    'start_with_same_world',
    'rfidf_share',
    'char_difference',
    'word_difference',
    'seq_simhash_distance',
    'shingle_simhash_distance',
    'avg_word_len_q1',
    'avg_word_len_q2',
    'avg_word_difference',
    'unigrams_common_count',
    'bigrams_common_count',
    'unigrams_common_ratio',
    'bigrams_common_ratio',
    'word2vec_q1_mean',
    'word2vec_q2_mean',
]

# Labels and feature matrix, both taken from the training frame.
y = train[target]
X = train[features]

In [6]:
# Every question is padded or truncated to this many token ids.
max_len = 40

# Raw question texts as string arrays (astype(str) also stringifies missing values).
train_q1_texts = train.question1.values.astype(str)
train_q2_texts = train.question2.values.astype(str)

# One shared vocabulary, fitted on both question columns of the training set.
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(train_q1_texts) + list(train_q2_texts))

# Texts -> integer id sequences -> fixed-length id matrices.
x1 = sequence.pad_sequences(tokenizer.texts_to_sequences(train_q1_texts), maxlen=max_len)
x2 = sequence.pad_sequences(tokenizer.texts_to_sequences(train_q2_texts), maxlen=max_len)

# word -> id mapping; its size determines the embedding input dimension later on.
word_index = tokenizer.word_index

In [7]:
# Make sure all three model inputs are plain NumPy arrays.
X, x1, x2 = (np.array(block) for block in (X, x1, x2))

In [8]:
# Sanity check: the sequence matrix and the feature matrix must have the same number of rows.
print(x1.shape, X.shape)

(404290, 40) (404290, 21)


In [9]:
# Peek at the padded id matrix for question1 (NumPy abbreviates the repr of large arrays).
x1

array([[    0,     0,     0, ...,   383,     8,    35],
       [    0,     0,     0, ..., 13893,     5,  4572],
       [    0,     0,     0, ...,   146,     6,  2775],
       ..., 
       [    0,     0,     0, ...,     3,    49,  4401],
       [    0,     0,     0, ...,    32,    82,   234],
       [    0,     0,     0, ...,   155,    29,  4549]], dtype=int32)

## Oversampling
Oversampling leads to the local validation score not matching the public LB score on Kaggle. Models with oversampling usually perform a bit better, but because the scores do not match, it is better not to use it if possible.

The idea for oversampling came from Kaggle (https://www.kaggle.com/davidthaler/quora-question-pairs/how-many-1-s-are-in-the-public-lb): the training and test sets do not have the same distribution of duplicate questions. The train set has around 37% duplicates while the private test set has 16.5%, but the problem is that we only see 35% of the private test set. Final results are calculated on the remaining 65%. If the distribution of the visible 35% does not match the other 65%, then oversampling, while increasing the public LB score now, would overfit to that score and yield poor results in the end.

In [None]:
def oversample(X, y, rate=0.165):
    """Replicate negative rows until positives make up about ``rate`` of the data.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to resample. It is indexed with the boolean masks ``y == 1`` and
        ``y == 0``, so ``y`` must be aligned with it.
    y : pandas.Series or numpy.ndarray
        Binary labels (1 = positive, 0 = negative).
    rate : float, default 0.165
        Target share of positive rows in the result, ``0 < rate <= 1``.

    Returns
    -------
    X : pandas.DataFrame
        All positive rows followed by the (possibly replicated) negative rows.
    y : list of float
        ``1.0`` for each positive row followed by ``0.0`` for each negative row.

    Raises
    ------
    ValueError
        If ``rate`` is outside ``(0, 1]``.

    Notes
    -----
    The result is grouped by class (positives first). Shuffle it before using
    any split that takes rows by position.
    """
    if not 0 < rate <= 1:
        raise ValueError('rate must be in (0, 1], got %r' % (rate,))

    pos_train = X[y == 1]
    neg_train = X[y == 0]
    n_pos, n_neg = len(pos_train), len(neg_train)

    # Solve n_pos / (n_pos + n_neg_target) == rate for the number of negatives.
    n_neg_target = int(round(n_pos * (1.0 - rate) / rate))

    # Negatives are only ever added: if the positive share is already at or
    # below `rate` (or there are no negatives to copy) the data is left as is.
    if n_neg and n_neg_target > n_neg:
        full_copies, remainder = divmod(n_neg_target, n_neg)
        pieces = [neg_train] * full_copies
        if remainder:
            pieces.append(neg_train.iloc[:remainder])
        neg_train = pd.concat(pieces)

    # Report the achieved positive share (skipped for empty input).
    total = len(pos_train) + len(neg_train)
    if total:
        print(len(pos_train) / float(total))

    X = pd.concat([pos_train, neg_train])
    y = [1.0] * len(pos_train) + [0.0] * len(neg_train)

    return X, y

# Keep the original labels: each oversample() call below must be driven by the
# same, un-resampled label vector.
# NOTE(review): this cell overwrites X, x1, x2 and y, so it is not idempotent --
# re-run the cells that build them before running it a second time.
y_untouched = y

# oversample() relies on pandas boolean indexing and pd.concat, so wrap the arrays.
x1 = pd.DataFrame(x1)
x2 = pd.DataFrame(x2)
X = pd.DataFrame(X)

# The same labels drive all three calls, so the three outputs stay row-aligned
# and every call returns the same label list.
X, y = oversample(X, y_untouched)
x1, y = oversample(x1, y_untouched)
x2, y = oversample(x2, y_untouched)

X = np.array(X)
x1 = np.array(x1)
x2 = np.array(x2)
# oversample() returns labels as a plain list; an array is needed to reorder them below.
y = np.array(y)

# oversample() returns all positives followed by all negatives. Keras'
# validation_split takes the *last* fraction of the rows, which would then be
# negatives only, so shuffle every array with one shared, seeded permutation.
shuffle_idx = np.random.RandomState(SEED).permutation(len(y))
X, x1, x2, y = X[shuffle_idx], x1[shuffle_idx], x2[shuffle_idx], y[shuffle_idx]

print(len(X), len(x1), len(x2), len(y))

## Normalization
Normalization helps, but only if X is normalized; normalizing x1 and x2 prevents the model from converging past a val_logloss of 0.42 -> bad. So far it seems that StandardScaler applied only to X does the trick (note that the code below currently uses MinMaxScaler).

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [11]:
# Fit a min-max scaler on the feature matrix and replace X with its scaled version.
# scaler_X is kept around so the same scaling can be applied to the test features.
scaler_X = MinMaxScaler().fit(X)
X = scaler_X.transform(X)

In [None]:
# Optional experiment: min-max scale the token-id matrices as well.
# NOTE(review): the section text reports that normalizing x1/x2 keeps the model
# from converging, and the matching test-time transforms in the prediction loop
# are commented out -- consider skipping this cell on a normal run.
scaler_x1 = MinMaxScaler().fit(x1)
x1 = scaler_x1.transform(x1)

scaler_x2 = MinMaxScaler().fit(x2)
x2 = scaler_x2.transform(x2)

## Model

In [40]:
from sklearn.metrics import log_loss

from keras.layers.advanced_activations import PReLU
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

#from keras.layers.merge import Concatenate
from keras.layers import Merge

from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, BatchNormalization, TimeDistributed
from keras_tqdm import TQDMNotebookCallback

from keras.optimizers import Adam, RMSprop, Adamax, Adagrad, Nadam
from keras.activations import elu

import keras
from keras import backend as K
# Select the 'tf' image dimension ordering in the Keras backend.
# NOTE(review): no image layers are visible in this notebook, so this call looks
# unnecessary -- confirm before removing.
K.set_image_dim_ordering('tf')

In [41]:
def _build_question_encoder(n_steps):
    """Build one question branch: an Embedding followed by two stacked GRUs.

    The first GRU returns the full sequence so the second one can consume it;
    the second GRU returns only its final output.
    """
    encoder = Sequential()
    encoder.add(Embedding(len(word_index) + 1, 64, input_length=max_len, input_shape=(n_steps,)))
    encoder.add(GRU(64, recurrent_dropout=0.2, dropout=0.2, return_sequences=True))
    encoder.add(GRU(64, recurrent_dropout=0.2, dropout=0.2, return_sequences=False))
    return encoder


# Two identically shaped (but separately trained) encoders, one per question.
model_q1 = _build_question_encoder(x1.shape[1])
model_q2 = _build_question_encoder(x2.shape[1])

# Dense branch for the pre-computed features: three widening ELU layers, each
# followed by batch normalization, with dropout after the widest one.
feature_layers = [
    Dense(128, input_shape=(X.shape[1],), activation=elu),
    BatchNormalization(),
    Dense(256, activation=elu),
    BatchNormalization(),
    Dense(512, activation=elu),
    Dropout(0.2),
    BatchNormalization(),
]
model_features = Sequential()
for layer in feature_layers:
    model_features.add(layer)

In [42]:
# Head of the network: concatenate the outputs of the two question encoders and
# the feature branch, then classify with a small dense stack.
merged_model = Sequential()
merged_model.add(Merge([model_q1, model_q2, model_features], mode = 'concat'))
#merged_model.add(Concatenate([model_q1, model_q2, model_features]))
merged_model.add(BatchNormalization())

merged_model.add(Dense(128, activation=elu))
merged_model.add(BatchNormalization())

merged_model.add(Dense(256, activation=elu))
merged_model.add(BatchNormalization())

merged_model.add(Dense(512, activation=elu))
# The dropout belongs to the merged head (Dense(512) -> Dropout -> BatchNormalization,
# mirroring the feature branch); it must not be appended to model_features here.
merged_model.add(Dropout(0.2))
merged_model.add(BatchNormalization())

# Single sigmoid unit: probability that the two questions are duplicates.
merged_model.add(Dense(1, activation='sigmoid'))

  from ipykernel import kernelapp as app


In [43]:
# Binary classification: cross-entropy loss, Adamax with its default settings,
# and accuracy reported next to the loss.
optimizer = Adamax()
merged_model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [37]:
#merged_model.load_weights('MergeNet.h5')

In [38]:
# keras_logger = keras.callbacks.TensorBoard(log_dir='../notebooks/tensor_logs/mergnet5',
#                                            histogram_freq=1, write_graph=True, write_images=True)

# keras_logger.set_model(merged_model)

In [None]:
# Progress is reported by the tqdm notebook callback, hence verbose=0 below.
# With TensorBoard logging enabled the list would be
# [TQDMNotebookCallback(), keras_logger].
training_callbacks = [TQDMNotebookCallback()]

# Inputs are passed in the same order in which the branches were merged:
# question1 ids, question2 ids, pre-computed features.
merged_model.fit(
    [x1, x2, X],
    y,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    verbose=0,
    callbacks=training_callbacks,
)

In [None]:
#merged_model.save_weights('MergeNet.h5')

In [None]:
# scores = model.evaluate(X, y, verbose=0, batch_size=4096 * 8)

# print("Model validation accuracy: %.2f" % (scores[1]*100))
# print("Model validation loss: %.4f" % (scores[0]))

## Generate submission

Run the version with the chunker if you don't have more than 24GB of RAM.

In [None]:
import math

def chunker(collection, chunk_size=300000):
    """Yield consecutive slices of ``collection`` with at most ``chunk_size`` items.

    Parameters
    ----------
    collection : sliceable sequence
        Anything that supports ``len()`` and ``collection[a:b]`` -- e.g. a list,
        a NumPy array, or a pandas Series / DataFrame (sliced by row position).
    chunk_size : int, default 300000
        Maximum number of items per chunk; must be positive.

    Yields
    ------
    Slices of ``collection`` in order. The last one may be shorter; an empty
    collection yields nothing.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (raised when iteration starts).
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive, got %r' % (chunk_size,))

    # Stepping with range() avoids the float ceil computation and works for any
    # object that has a length, not only those exposing `.shape`.
    for start in range(0, len(collection), chunk_size):
        yield collection[start:start + chunk_size]

In [None]:
# Scaled pre-computed test features. They are built here because the loop below
# reads X_test, while the notebook's separate X_test cell only comes after this
# one -- on a fresh kernel the loop would otherwise fail with a NameError.
X_test = scaler_X.transform(test[features])

# Predict chunk by chunk so only one chunk of tokenized questions is held in
# memory at a time. The three chunkers use the same chunk size, so zip() keeps
# the question texts and the feature rows aligned.
preds = []
for q1, q2, X_test_row in zip(
    chunker(test.question1), chunker(test.question2), chunker(X_test)
):
    print('%d / %d' % (len(preds), len(X_test)))

    # Same preprocessing as for training: the fitted tokenizer, then padding to max_len.
    x1_test_row = tokenizer.texts_to_sequences(q1.values.astype(str))
    x1_test_row = sequence.pad_sequences(x1_test_row, maxlen=max_len)

    x2_test_row = tokenizer.texts_to_sequences(q2.values.astype(str))
    x2_test_row = sequence.pad_sequences(x2_test_row, maxlen=max_len)

    # Only needed if the token-id matrices were min-max scaled for training.
    #x1_test_row = scaler_x1.transform(x1_test_row)
    #x2_test_row = scaler_x2.transform(x2_test_row)

    batch_preds = merged_model.predict([x1_test_row, x2_test_row, X_test_row], batch_size=128 * 32)
    preds.extend(batch_preds)

In [None]:
# Sanity check: there should be one prediction per test row.
print(len(preds))

In [None]:
# Take the same feature columns as used for training and apply the scaler that
# was fitted on the training features.
# NOTE(review): the chunked prediction loop earlier in the notebook also needs
# these scaled features, so on a top-to-bottom run they must be available
# before that loop.
X_test = scaler_X.transform(test[features])

In [None]:
# Two-column table: running row number as test_id, predicted probability as is_duplicate.
n_predictions = len(preds)
submission_table = np.c_[range(n_predictions), preds]

# comments='' keeps NumPy from prefixing the header line with '# '.
np.savetxt(
    '../submissions/submission.csv',
    submission_table,
    fmt='%d,%f',
    delimiter=',',
    header='test_id,is_duplicate',
    comments='',
)