In [53]:
# --- configuration ---
# Data directory. Note: the name `data` is rebound to a DataFrame by the
# later read_csv cell, so this string is only visible until then.
data = './data/'

# Preprocessing / model hyper-parameters.
EMBEDDING_DIM = 50          # size of each word-embedding vector
MAX_NB_WORDS = 20000        # vocabulary cap passed to the tokenizer
MAX_SEQUENCE_LENGTH = 80    # every question is padded/truncated to this length
VALIDATION_SPLIT = 0.3      # fraction of rows held out from training

In [54]:
import pandas as pd

In [55]:
# Load the training pairs; the literal string 'NULL' is parsed as missing.
data = pd.read_csv(
    './data/train.csv',
    na_values=['NULL'],
)

In [56]:
# Discard every row that has at least one missing field, then preview the result.
data = data.dropna(axis=0, how='any')
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [57]:
from keras.layers import Input, Conv1D, MaxPool1D, Dense, Activation, Flatten, concatenate, Subtract, Multiply, Embedding

In [58]:
# One integer-id input per question; both have the same fixed padded length.
qus1, qus2 = (
    Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    for _ in range(2)
)

In [59]:
# A single Embedding layer is applied to both inputs, so the two questions
# share the same word vectors.
embedding = Embedding(
    input_dim=MAX_NB_WORDS + 1,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
)
x1, x2 = (embedding(question) for question in (qus1, qus2))

In [60]:
# Shared 1-D convolution: 32 filters over windows of 4 tokens, tanh activation.
conv = Conv1D(filters=32, kernel_size=4, activation='tanh')
c1, c2 = (conv(embedded) for embedded in (x1, x2))

In [61]:
# Shared max-pooling over windows of 4 convolution outputs.
maxp = MaxPool1D(pool_size=4)
m1, m2 = (maxp(features) for features in (c1, c2))

In [62]:
# Flatten each pooled feature map into one vector per question.
fla = Flatten()
f1, f2 = (fla(pooled) for pooled in (m1, m2))

In [63]:
# Combine the two question vectors through their element-wise difference and
# product, then join both comparisons along the last axis.
sub = Subtract()([f1, f2])
mul = Multiply()([f1, f2])
merge = concatenate([sub, mul], axis=-1)

In [64]:
# Hidden fully-connected layer on top of the merged comparison features.
hidden_layer = Dense(20, activation='relu')
den1 = hidden_layer(merge)

In [65]:
# Single sigmoid unit: the model's output for the is_duplicate label.
output_layer = Dense(1, activation='sigmoid')
den2 = output_layer(den1)

In [66]:
from keras.models import Model

In [67]:
# Two-input, single-output model tying the layers above together.
model = Model(
    [qus1, qus2],
    den2,
)

In [68]:
# Raw question-1 strings as an array, with a shape check.
data_q1 = data.loc[:, 'question1'].values
data_q1.shape

(404288,)

In [69]:
# Raw question-2 strings as an array, with a shape check.
data_q2 = data.loc[:, 'question2'].values
data_q2.shape

(404288,)

In [70]:
# Target column (is_duplicate) as an array, with a shape check.
label = data.loc[:, 'is_duplicate'].values
label.shape

(404288,)

In [71]:
# Summarise columns, dtypes and non-null counts of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 21.6+ MB


In [73]:
# Binary classification set-up: cross-entropy loss, plain SGD, accuracy reported.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [74]:
# Peek at the first ten labels.
label[:10]

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0])

In [75]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [82]:
# Word-level tokenizer with its vocabulary capped via MAX_NB_WORDS.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
)

In [84]:
# NOTE(review): this list conversion is disabled, so on a fresh kernel
# data_q1 / data_q2 stay whatever `data[...].values` returned in the cells
# above. Later cells must not assume they are Python lists.
#data_q1, data_q2 = [x.tolist() for x in [data_q1, data_q2]]

In [85]:
# Number of question-1 entries.
len(data_q1)

404288

In [86]:
# Peek at the first ten raw question-1 strings.
data_q1[:10]

['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use シ instead of し?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']

In [87]:
# Build the vocabulary from both question columns.
# list(...) forces real sequence concatenation: data_q1 / data_q2 come from
# `DataFrame[...].values` (NumPy object arrays), for which `+` would join the
# strings element-wise ("q1 text" + "q2 text") instead of appending one
# corpus to the other. This also works unchanged if they are already lists.
tokenizer.fit_on_texts(list(data_q1) + list(data_q2))

In [88]:
# Map every question to its list of integer word ids.
seq1, seq2 = (
    tokenizer.texts_to_sequences(questions)
    for questions in (data_q1, data_q2)
)

In [96]:
# Bring every question-1 sequence to exactly MAX_SEQUENCE_LENGTH ids.
seq1_pad = pad_sequences(
    seq1,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [97]:
# Bring every question-2 sequence to exactly MAX_SEQUENCE_LENGTH ids.
seq2_pad = pad_sequences(
    seq2,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [98]:
# Inspect the first padded question-1 row (the zeros are padding).
seq1_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          2,    3,    1, 1222,   57, 1222, 2581,    7,  576,    8,  763,
        383,    8,   35], dtype=int32)

In [100]:
# Inspect the first padded question-2 row (the zeros are padding).
seq2_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    2,    3,    1, 1222,   57, 1222, 2581,    7,  576,
          8,  763,  383], dtype=int32)

In [102]:
import numpy as np

In [103]:
# Show the padded matrix shape, then create one row index per sample
# (these indices are shuffled in the next cell).
print(seq1_pad.shape)
indices = np.arange(len(seq1_pad))

(404288, 80)


In [105]:
# Shuffle the row indices in place with a seeded, local RNG so that the
# train/held-out split is the same on every Restart & Run All (the global
# NumPy random state is left untouched).
np.random.RandomState(42).shuffle(indices)

In [107]:
# Apply the same row permutation to both question matrices so pairs stay aligned.
seq1_pad_suf = np.take(seq1_pad, indices, axis=0)
seq2_pad_suf = np.take(seq2_pad, indices, axis=0)

In [110]:
# Permute the labels with the same indices used for the question matrices.
label_suf = np.take(label, indices, axis=0)

In [112]:
# Number of rows to hold out: VALIDATION_SPLIT of the data, truncated to an int.
nb_validation_samples = int(VALIDATION_SPLIT * len(seq2_pad_suf))

In [113]:
# Display the held-out row count.
nb_validation_samples

121286

In [114]:
# Display the configured hold-out fraction.
VALIDATION_SPLIT

0.3

In [118]:
# Hold out the last nb_validation_samples shuffled rows.
# Slice on an explicit boundary index rather than a negative offset: with
# nb_validation_samples == 0, `[:-0]` would be empty and `[-0:]` would be the
# whole array, silently swapping the two sets.
split_at = seq1_pad_suf.shape[0] - nb_validation_samples
seq1_train, seq1_test = seq1_pad_suf[:split_at], seq1_pad_suf[split_at:]
seq2_train, seq2_test = seq2_pad_suf[:split_at], seq2_pad_suf[split_at:]
y_train, y_test = label_suf[:split_at], label_suf[split_at:]

In [119]:
# Sanity check: shape of the training question-1 matrix.
seq1_train.shape

(283002, 80)

In [120]:
# Sanity check: shape of the held-out question-1 matrix.
seq1_test.shape

(121286, 80)

In [121]:
# Sanity check: number of training labels.
y_train.shape

(283002,)

In [122]:
# Sanity check: number of held-out labels.
y_test.shape

(121286,)

In [None]:
# Train for two epochs and evaluate on the held-out split after each epoch,
# so the rows set aside above are actually used to monitor generalisation.
# The returned History is kept for later inspection of the loss/accuracy curves.
history = model.fit(
    [seq1_train, seq2_train],
    y_train,
    epochs=2,
    validation_data=([seq1_test, seq2_test], y_test),
)

Epoch 1/2