In [16]:
from __future__ import print_function, division

import gensim
import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
from keras.layers import Input, Embedding, Dense, Conv1D, GlobalAveragePooling1D, Flatten, concatenate, Lambda, BatchNormalization, Dropout
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

In [2]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("../models/word2vec.bin", binary=True)

In [3]:
def model_conv1D(emb_matrix, input_len=60, magic_feature_num=5, distance_feature_num=20):
    emb_layer = Embedding(input_dim=emb_matrix.shape[0], output_dim=emb_matrix.shape[1], input_length=input_len, 
                          trainable=False, embeddings_initializer=keras.initializers.Constant(emb_matrix))
    input_a = Input(shape=(input_len, ))
    input_b = Input(shape=(input_len, ))
    emb_a = emb_layer(input_a)
    emb_b = emb_layer(input_b)
    sizes = [(128, 1), (128, 2), (128, 3), (128, 4), (32, 5), (32, 6)]
    global_as = []
    global_bs = []
    for filter_size, kernel_size in sizes:
        conv_a = Conv1D(filters=filter_size, kernel_size=kernel_size, padding="SAME", activation="relu")(emb_a)
        global_a = GlobalAveragePooling1D()(conv_a)
        conv_b = Conv1D(filters=filter_size, kernel_size=kernel_size, padding="SAME", activation="relu")(emb_b)
        global_b = GlobalAveragePooling1D()(conv_b)
        global_as.append(global_a)
        global_bs.append(global_b)
    merge_a = concatenate(global_as)
    merge_b = concatenate(global_bs)
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(4 * 128 + 2 * 32, ))([merge_a, merge_b])
    mul = Lambda(lambda x: [0] * x[1], output_shape=(4 * 128 + 2 * 32, ))([merge_a, merge_b])
    
    magic_input = Input(shape=(magic_feature_num, ))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(64, activation="relu")(magic_dense)
    
    distance_input = Input(shape=(distance_feature_num, ))
    distance_dense = BatchNormalization()(distance_input)
    distance_dense = Dense(128, activation="relu")(distance_input)
    
    merge = concatenate([diff, mul, magic_dense, distance_dense])
    
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation="relu")(x)
    
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation="sigmoid")(x)
    
    model = Model([input_a, input_b, magic_input, distance_input], outputs=pred)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
    return model

In [4]:
embedding_matrix = np.array([word2vec_model[w] for w in word2vec_model.wv.index2word])
print(embedding_matrix.shape)

(23550, 128)


In [5]:
train_data = pd.read_csv("../datasets/train.csv")
train_data = train_data.fillna("")
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
train_token1 = train_data.question1.apply(lambda x: x.lower().split())
train_token2 = train_data.question2.apply(lambda x: x.lower().split())

In [7]:
test_token1 = np.load("../datasets/test_words1.npy")
test_token2 = np.load("../datasets/test_words2.npy")

In [8]:
MAX_SEQUENCE_LEN = 60

In [9]:
word2index_dict = {w: i for i, w in enumerate(word2vec_model.wv.index2word, 1)}

def word2index(sequence):
    return [word2index_dict.get(w, 0) for w in sequence]

In [10]:
train_token_code1 = np.array([word2index(s) for s in train_token1])
train_token_code2 = np.array([word2index(s) for s in train_token2])
test_token_code1 = np.array([word2index(s) for s in test_token1])
test_token_code2 = np.array([word2index(s) for s in test_token2])

In [11]:
train_token_code1 = pad_sequences(train_token_code1, maxlen=MAX_SEQUENCE_LEN)
train_token_code2 = pad_sequences(train_token_code2, maxlen=MAX_SEQUENCE_LEN)
test_token_code1 = pad_sequences(test_token_code1, maxlen=MAX_SEQUENCE_LEN)
test_token_code2 = pad_sequences(test_token_code2, maxlen=MAX_SEQUENCE_LEN)

In [12]:
train_magic_v1 = pd.read_csv("../datasets/train_magic_feature_v1.csv")
train_magic_v2 = pd.read_csv("../datasets/train_magic_feature_v2.csv")
test_magic_v1 = pd.read_csv("../datasets/test_magic_feature_v1.csv")
test_magic_v2 = pd.read_csv("../datasets/test_magic_feature_v2.csv")
columns_v1 = ['q1_freq', 'q2_freq']
columns_v2 = ['q1_q2_intersect']
train_magic = pd.concat([train_magic_v1[columns_v1], train_magic_v2[columns_v2]], axis=1)
test_magic = pd.concat([test_magic_v1[columns_v1], test_magic_v2[columns_v2]], axis=1)
print("magic train:", train_magic.shape)
print("magic test:", test_magic.shape)

magic train: (404290, 3)
magic test: (2345796, 3)


In [13]:
train_distance = pd.read_csv("../datasets/train_featured.csv")
test_distance = pd.read_csv("../datasets/test_featured_split.csv")
test_distance = test_distance.drop('diff_len', axis=1)
print("train shape:", train_distance.shape)
print("test shape:", test_distance.shape)

train shape: (404290, 18)
test shape: (2345796, 18)


In [17]:
distance_scaler = StandardScaler()
train_distance_scaled = distance_scaler.fit_transform(train_distance)
test_distance_scaled = distance_scaler.transform(test_distance)

magic_scaler = StandardScaler()
train_magic_scaled = magic_scaler.fit_transform(train_magic)
test_magic_scaled = magic_scaler.transform(test_magic)

In [29]:
model = model_conv1D(embedding_matrix, input_len=MAX_SEQUENCE_LEN, magic_feature_num=3, distance_feature_num=18)

In [30]:
y = train_data.is_duplicate.values
early_stopping = EarlyStopping(patience=5, verbose=1)
model_checkpoint = ModelCheckpoint("../models/cnn-1d-achive-0.1-v2.model", verbose=1, save_best_only=True)
model.fit([train_token_code1, train_token_code2, train_magic_scaled, train_distance_scaled], y, batch_size=128, 
          epochs=30, validation_split=0.1, callbacks=[early_stopping, model_checkpoint])
# , class_weight={0: 1.309028344, 1: 0.472001959}

Train on 363861 samples, validate on 40429 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.27421, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 2/30

Epoch 00002: val_loss improved from 0.27421 to 0.25323, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 3/30

Epoch 00003: val_loss did not improve from 0.25323
Epoch 4/30

Epoch 00004: val_loss improved from 0.25323 to 0.25269, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 5/30

Epoch 00005: val_loss improved from 0.25269 to 0.24903, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 6/30

Epoch 00006: val_loss improved from 0.24903 to 0.24516, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 7/30

Epoch 00007: val_loss improved from 0.24516 to 0.24363, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 8/30

Epoch 00008: val_loss improved from 0.24363 to 0.24204, saving model to ../models/cnn-1d-achive-0.1-v2.model
Epoch 9/30

Epoch 00009: val_loss did not i

<keras.callbacks.History at 0x7f86c7c26850>

In [31]:
model.load_weights("../models/cnn-1d-achive-0.1-v2.model")

In [32]:
submission = model.predict([test_token_code1, test_token_code2, test_magic_scaled, test_distance_scaled], batch_size=512, verbose=1)



In [33]:
df_submission = pd.DataFrame({'is_duplicate': submission.reshape(-1, ), 'test_id': range(len(submission))})
df_submission.head()

Unnamed: 0,is_duplicate,test_id
0,0.009247,0
1,0.381514,1
2,0.376577,2
3,0.001449,3
4,0.306953,4


In [34]:
submission.reshape(-1, ).sum()

419114.5

In [35]:
train_pred = model.predict([train_token_code1, train_token_code2, train_magic, train_distance], batch_size=512, verbose=1).reshape(-1, )



In [36]:
train_pred

array([9.9996662e-01, 2.0979383e-23, 4.5031977e-22, ..., 3.0439164e-28,
       4.6619011e-33, 9.0071702e-01], dtype=float32)

In [37]:
sum((train_pred >= 0.5) == train_data.is_duplicate.values) / train_data.shape[0]

0.6450295579905513

In [38]:
(submission.reshape(-1, ) >= 0.5).sum()

314384

In [39]:
df_submission.to_csv("../predictions/cnn-1d-v2.csv.gz", index=False, compression="gzip")