## Libraries

In [1]:
import pandas as pd
import numpy as np

## Load features

In [2]:
train = pd.read_pickle('../features/train.pkl')

In [3]:
test = pd.read_pickle('../features/test.pkl')

## Feature selection

In [4]:
from sklearn.feature_selection import VarianceThreshold

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [5]:
features = ['cosin_sim', 'word_share', 'q1_char_num', 'q1_word_num', 'q2_char_num', 'q2_word_num',
            'start_with_same_world', 'rfidf_share', 'char_difference', 'word_difference',
           'seq_simhash_distance', 'shingle_simhash_distance', 'avg_word_len_q1', 'avg_word_len_q2',
           'avg_word_difference', 'unigrams_common_count', 'bigrams_common_count', 'unigrams_common_ratio',
           'bigrams_common_ratio', 'word2vec_q1_mean', 'word2vec_q2_mean']

target = 'is_duplicate'

X, y = train[features], train[target]
X_test = test[features]

## Oversampling

In [6]:
pos_train = X[y == 1]
neg_train = X[y == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

X = pd.concat([pos_train, neg_train])
y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

del pos_train, neg_train

0.19124366100096607


## Normalization

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler().fit(X)

X = scaler.transform(X)
X_test = scaler.transform(X_test)

## Model

In [9]:
from sklearn.metrics import log_loss

import tensorflow as tf

from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense, BatchNormalization
from keras_tqdm import TQDMNotebookCallback
import keras

from keras import backend as K
K.set_image_dim_ordering('tf')

Using TensorFlow backend.


In [41]:
# model = Sequential()
# model.add(Dense(256, input_dim=X.shape[1]))
# model.add(BatchNormalization())
# model.add(Activation('relu'))

# model.add(Dense(256))
# model.add(BatchNormalization())
# model.add(Activation('relu'))

# model.add(Dense(512))
# model.add(BatchNormalization())
# model.add(Activation('relu'))

# model.add(Dense(512))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
# model.add(Activation('relu'))

# model.add(Dense(1, activation='sigmoid'))

model = Sequential()
model.add(Dense(124, input_dim=X.shape[1], activation='relu'))
model.add(Dense(124, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [42]:
keras_logger = keras.callbacks.TensorBoard(log_dir='../notebooks/tensor_logs/relu_6w_2o', histogram_freq=1,  
          write_graph=True, write_images=True)

In [None]:
model.fit(X,
          y,
          batch_size=4096 * 8,
          epochs=300,
          verbose=0,
          validation_split=0.1,
          #validation_data = (X_vald_scaled, y_vald),
          callbacks=[TQDMNotebookCallback(), keras_logger])

In [39]:
model.save_weights('relu_6w_2o.h5')

In [None]:
scores = model.evaluate(X_vald_scaled, y_vald, verbose=0, batch_size=4096 * 8)

print("Model validation accuracy: %.2f" % (scores[1]*100))
print("Model validation loss: %.4f" % (scores[0]))

## Generate submission

In [20]:
preds = model.predict(X_test, batch_size=4096 * 16)
print(preds.shape)

(2345796, 1)


In [21]:
np.savetxt(
    '../submissions/submission.csv', np.c_[range(len(preds)), preds],
    delimiter=',', header='test_id,is_duplicate', comments='', fmt='%d,%f'
)