## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load features

In [2]:
train = pd.read_pickle('../features/train.pkl')

In [3]:
test = pd.read_pickle('../features/test.pkl')

## Feature selection

In [4]:
from sklearn.feature_selection import VarianceThreshold
from keras.preprocessing import sequence, text

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

Using TensorFlow backend.


In [5]:
features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
           'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
           'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
           'bigrams_common_ratio', 'word2vec_q1_mean', 'word2vec_q2_mean']

target = 'is_duplicate'

X = train[features]
y = train[target]

X_test = test[features]

In [6]:
tokenizer = text.Tokenizer(num_words=10000)

max_len = 40
tokenizer.fit_on_texts(list(train.question1.values.astype(str)) + list(train.question2.values.astype(str)))

x1 = tokenizer.texts_to_sequences(train.question1.values.astype(str))
x1 = sequence.pad_sequences(x1, maxlen=max_len)

x2 = tokenizer.texts_to_sequences(train.question2.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len)

word_index = tokenizer.word_index

In [7]:
print(x1.shape, X.shape)

(404290, 40) (404290, 21)


In [None]:
# Run this only if you have > 24GB of RAM!

# x1_test = tokenizer.texts_to_sequences(test.question1.values.astype(str))
# x1_test = sequence.pad_sequences(x1_test, maxlen=max_len)

# x2_test = tokenizer.texts_to_sequences(test.question2.values.astype(str))
# x2_test = sequence.pad_sequences(x2_test, maxlen=max_len)

## Oversampling

In [None]:
def oversample(X, y, rate=0.165):
    pos_train = X[y == 1]
    neg_train = X[y == 0]

    # Now we oversample the negative class
    # There is likely a much more elegant way to do this...
    p = 0.165
    scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
    while scale > 1:
        neg_train = pd.concat([neg_train, neg_train])
        scale -=1
    neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
    print(len(pos_train) / (len(pos_train) + len(neg_train)))

    X = pd.concat([pos_train, neg_train])
    y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

    return X, y

y_untouched = y

x1 = pd.DataFrame(x1) 
x2 = pd.DataFrame(x2) 
X = pd.DataFrame(X)

X, y = oversample(X, y_untouched)
x1, y = oversample(x1, y_untouched)
x2, y = oversample(x2, y_untouched)

X = np.array(X)
x1 = np.array(x1)
x2 = np.array(x2)

print(len(X), len(x1), len(x2), len(y))

## Normalization

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler_X = StandardScaler()

X = scaler_X.fit_transform(X)
X_test = scaler_X.transform(X_test)

In [10]:
scaler_x1 = StandardScaler()
x1 = scaler_x1.fit_transform(x1)

scaler_x2 = StandardScaler()
x2 = scaler_x2.fit_transform(x2)



## Model

In [11]:
from sklearn.metrics import log_loss

import tensorflow as tf

from keras.layers.advanced_activations import PReLU
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU

#from keras.layers.merge import Concatenate
from keras.layers import Merge

from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, BatchNormalization
from keras_tqdm import TQDMNotebookCallback

import keras
from keras import backend as K
K.set_image_dim_ordering('tf')

In [12]:
model_q1 = Sequential()
model_q1.add(Embedding(len(word_index) + 1, 64, input_length=max_len, input_shape=(x1.shape[1],)))
model_q1.add(GRU(64, recurrent_dropout=0.2, dropout=0.2, return_sequences=True))
model_q1.add(GRU(64, recurrent_dropout=0.2, dropout=0.2))

model_q2 = Sequential()
model_q2.add(Embedding(len(word_index) + 1, 64, input_length=max_len, input_shape=(x1.shape[1],)))
model_q2.add(GRU(64, recurrent_dropout=0.2, dropout=0.2, return_sequences=True,))
model_q2.add(GRU(64, recurrent_dropout=0.2, dropout=0.2))

model_features = Sequential()
model_features.add(Dense(64, input_shape=(X.shape[1],), activation='relu'))
model_features.add(BatchNormalization())

model_features.add(Dense(128, activation='relu'))
model_features.add(BatchNormalization())

model_features.add(Dense(256, activation='relu'))
model_features.add(BatchNormalization())

In [13]:
merged_model = Sequential()
merged_model.add(Merge([model_q1, model_q2, model_features], mode = 'concat'))
#merged_model.add(Concatenate([model_q1, model_q2, model_features]))
merged_model.add(BatchNormalization())

merged_model.add(Dense(16, activation='relu'))
merged_model.add(BatchNormalization())

merged_model.add(Dense(32, activation='relu'))
merged_model.add(BatchNormalization())

merged_model.add(Dense(1, activation='sigmoid'))

  from ipykernel import kernelapp as app


In [14]:
merged_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [15]:
#merged_model.load_weights('MergeNet.h5')

In [16]:
keras_logger = keras.callbacks.TensorBoard(log_dir='../notebooks/tensor_logs/GRU_FC_merge',
                                           histogram_freq=1, write_graph=True, write_images=True)

In [None]:
merged_model.fit([x1, x2, X],
          y,
          batch_size=16 * 16,
          epochs=7,
          verbose=0,
          validation_split=0.1,
          #callbacks=[TQDMNotebookCallback(), keras_logger])
          callbacks=[TQDMNotebookCallback()])

          363776/|/[loss: 0.412, acc: 0.779] 100%|| 363776/363861 [09:13<00:00, 664.96it/s]

          363776/|/[loss: 0.409, acc: 0.782] 100%|| 363776/363861 [09:18<00:00, 692.22it/s]

          363776/|/[loss: 0.405, acc: 0.784] 100%|| 363776/363861 [09:23<00:00, 696.34it/s]

In [None]:
#merged_model.save_weights('MergeNet.h5')

In [None]:
# scores = model.evaluate(X, y, verbose=0, batch_size=4096 * 8)

# print("Model validation accuracy: %.2f" % (scores[1]*100))
# print("Model validation loss: %.4f" % (scores[0]))

## Generate submission

Run the version with the chunker if you don't have more then 24GB of RAM.

In [18]:
import math

def chunker(collection, chunk_size=300000):
    chunk_num = math.ceil(collection.shape[0] / float(chunk_size))
    for i in range(chunk_num):
        yield collection[chunk_size*i : chunk_size*(i+1)]

In [19]:
preds = []
for q1, q2, X_test_row in zip(
    chunker(test.question1), chunker(test.question2), chunker(X_test)
):
    print('%d / %d' % (len(preds), len(X_test)))
    x1_test_row = tokenizer.texts_to_sequences(q1.values.astype(str))
    x1_test_row = sequence.pad_sequences(x1_test_row, maxlen=max_len)

    x2_test_row = tokenizer.texts_to_sequences(q2.values.astype(str))
    x2_test_row = sequence.pad_sequences(x2_test_row, maxlen=max_len)
    
    x1_test_row = scaler_x1.transform(x1_test_row)
    x2_test_row = scaler_x2.transform(x2_test_row)

    batch_preds = merged_model.predict([x1_test_row, x2_test_row, X_test_row], batch_size=128 * 32)
    preds.extend(batch_preds)

0 / 2345796




300000 / 2345796
600000 / 2345796
900000 / 2345796
1200000 / 2345796
1500000 / 2345796
1800000 / 2345796
2100000 / 2345796


In [20]:
# Run this only if you have > 24GB of RAM!

# x1_test = tokenizer.texts_to_sequences(test.question1.values.astype(str))
# x1_test = sequence.pad_sequences(x1_test, maxlen=max_len)

# x2_test = tokenizer.texts_to_sequences(test.question2.values.astype(str))
# x2_test = sequence.pad_sequences(x2_test, maxlen=max_len)

# preds = model.predict([x1_test, x2_test, X_test], batch_size=64 * 64)
# print(preds.shape)

In [21]:
print(len(preds))

2345796


In [22]:
np.savetxt(
    '../submissions/submission.csv', np.c_[range(len(preds)), preds],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)