In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras import metrics, models, optimizers
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers.merge import Concatenate
from keras.layers import (
    TimeDistributed,
    Lambda, Convolution1D,
    GlobalMaxPooling1D,
    merge,
    add,
    SpatialDropout1D,
    Reshape,
    Flatten,
    Input,
    RepeatVector,
    InputLayer
)
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the training set; later cells read its 'question1', 'question2'
# and 'is_duplicate' columns.
data = pd.read_csv(filepath_or_buffer='/data/train.csv')

In [3]:
# Target vector: the 'is_duplicate' column as a NumPy array.
y_labels = data.loc[:, 'is_duplicate'].values

In [4]:
# Keras tokenizer with its vocabulary size argument (num_words) set to 200000;
# it is fitted on both question columns in the next cell.
tk = text.Tokenizer(num_words=200000)

In [5]:
# Fixed sequence length used when padding the encoded questions.
max_len = 40

# Build the vocabulary from both question columns. Both columns are cast to
# str so that a missing value (NaN, which pandas stores as a float) cannot
# reach the tokenizer, which expects text.
tk.fit_on_texts(
    list(data['question1'].values.astype(str))
    + list(data['question2'].values.astype(str))
)

Convert each question to a sequence of word indices and pad it to a fixed length of 40.

In [6]:
# Encode question1 as word-index sequences, then bring every sequence to
# length max_len. The column is cast to str (as is done for question2) so a
# NaN entry is tokenized as text instead of raising inside the tokenizer.
x1 = tk.texts_to_sequences(data['question1'].values.astype(str))
x1 = sequence.pad_sequences(x1, maxlen=max_len)

In [7]:
# question2: cast to str, encode as word-index sequences, then bring every
# sequence to length max_len in a single expression.
x2 = sequence.pad_sequences(
    tk.texts_to_sequences(data['question2'].values.astype(str)),
    maxlen=max_len,
)

In [8]:
# Word -> integer index mapping held by the fitted tokenizer; used below to
# place each word's pretrained vector in the matching embedding-matrix row.
word_index = tk.word_index

Load the pretrained 300-dimensional GloVe word embeddings from the GloVe text file.

In [9]:
# Parse the GloVe text file into a {word: vector} dictionary. Each line is
# split on whitespace: the first field is the word, the remaining fields are
# the vector components.
embeddings_index = {}
# `with` closes the file even if a line fails to parse. The encoding is given
# explicitly so the read does not depend on the platform default
# (NOTE(review): assumes the GloVe file is UTF-8 text - confirm).
with open('/glove/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

400000it [00:36, 10990.32it/s]


In [10]:
# Report how many word vectors were loaded from the GloVe file.
print('Found %s word vectors.' % (len(embeddings_index),))

Found 400000 word vectors.


In [11]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# The extra row (+1) presumably accounts for Keras word indices starting at 1,
# leaving row 0 for padding - TODO confirm.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    vec = embeddings_index.get(word)
    if vec is None:
        # No pretrained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = vec

100%|██████████| 95596/95596 [00:00<00:00, 339280.82it/s]


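Construct the neural network. Branches 1 and 2 pass frozen GloVe embeddings through a time-distributed dense layer; branches 3 and 4 pass a separate embedding layer through a Long Short-Term Memory (LSTM) layer. Each branch is summed over the sequence, and the concatenated result feeds three dense blocks followed by a sigmoid output.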

In [13]:
def _summed_dense_branch():
    """Build one GloVe branch and return ``(input_tensor, output_tensor)``.

    The branch looks up frozen pretrained embeddings (``embedding_matrix``),
    applies the same 300-unit ReLU dense layer at every sequence position,
    and sums the result over the sequence axis to get one 300-d vector.
    """
    branch_input = Input(shape=(max_len,), dtype='int32')
    x = Embedding(len(word_index) + 1,
                  300,
                  weights=[embedding_matrix],
                  trainable=False
                  )(branch_input)
    x = TimeDistributed(Dense(300, activation='relu'))(x)
    # axis=1 is the sequence position, so this collapses (max_len, 300) -> (300,).
    x = Lambda(lambda t: K.sum(t, axis=1), output_shape=(300,))(x)
    return branch_input, x


def _summed_lstm_branch():
    """Build one LSTM branch and return ``(input_tensor, output_tensor)``.

    No ``weights`` or ``trainable=False`` are passed to this embedding, so the
    Keras defaults apply rather than the pretrained GloVe matrix. The LSTM
    returns its full output sequence, which is summed over the sequence axis.
    """
    branch_input = Input(shape=(max_len,), dtype='int32')
    x = Embedding(len(word_index) + 1,
                  300,
                  # Tied to max_len instead of a literal 40 so it cannot drift
                  # from the Input shape above.
                  input_length=max_len,
                  )(branch_input)
    x = Dropout(0.3)(x)
    x = LSTM(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)(x)
    x = Lambda(lambda t: K.sum(t, axis=1), output_shape=(300,))(x)
    return branch_input, x


def _dense_block(x):
    """Apply Dense(300) -> PReLU -> Dropout(0.3) -> BatchNormalization to ``x``."""
    x = Dense(300)(x)
    x = PReLU()(x)
    x = Dropout(0.3)(x)
    return BatchNormalization()(x)


# Branches are created in the same order as before (1, 2, 3, 4) so that
# auto-generated layer names and weight-initialisation order are unchanged.
# The training cell feeds [x1, x2, x1, x2], i.e. inputs 1 and 3 receive
# question1 and inputs 2 and 4 receive question2.
input1, model1 = _summed_dense_branch()
input2, model2 = _summed_dense_branch()
input3, model3 = _summed_lstm_branch()
input4, model4 = _summed_lstm_branch()

# Concatenate the four 300-d branch vectors into a single feature vector.
merged_model = Concatenate()([model1, model2, model3, model4])
merged_model = BatchNormalization()(merged_model)

# Three identical fully connected blocks.
merged_model = _dense_block(merged_model)
merged_model = _dense_block(merged_model)
merged_model = _dense_block(merged_model)

# Single sigmoid unit: the output is a value in (0, 1) trained against
# the binary is_duplicate label.
merged_model = Dense(1)(merged_model)
merged_model = Activation('sigmoid')(merged_model)

In [None]:
# Wrap the four input tensors and the sigmoid output tensor into a Model.
model = models.Model(
    inputs=[input1, input2, input3, input4],
    outputs=merged_model,
)

# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_

In [None]:
# Binary classification: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Save to weight.h5 only when the monitored validation accuracy improves.
# NOTE(review): newer Keras releases name this metric 'val_accuracy' rather
# than 'val_acc' - confirm against the installed version.
checkpoint = ModelCheckpoint('weight.h5', monitor='val_acc', save_best_only=True, verbose=2)

# `epochs` is the Keras 2 argument name; the Keras 1 spelling `nb_epoch` is
# deprecated and only triggers a warning (or an error in later releases).
# Inputs 1 and 3 both receive x1 (question1); inputs 2 and 4 receive x2.
# validation_split=0.1 holds out 10% of the samples for validation.
model.fit([x1, x2, x1, x2], y=y_labels, batch_size=384, epochs=15,
          verbose=1, validation_split=0.1, shuffle=True,
          callbacks=[checkpoint])

  import sys


Train on 363861 samples, validate on 40429 samples
Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.76641, saving model to weight.h5
Epoch 2/15
Epoch 00002: val_acc improved from 0.76641 to 0.78070, saving model to weight.h5
Epoch 3/15
Epoch 00003: val_acc improved from 0.78070 to 0.79500, saving model to weight.h5
Epoch 4/15
Epoch 00004: val_acc did not improve
Epoch 5/15
Epoch 00005: val_acc improved from 0.79500 to 0.80000, saving model to weight.h5
Epoch 6/15
Epoch 00006: val_acc improved from 0.80000 to 0.81071, saving model to weight.h5
Epoch 7/15
Epoch 00007: val_acc improved from 0.81071 to 0.81634, saving model to weight.h5
Epoch 8/15
Epoch 00008: val_acc improved from 0.81634 to 0.81884, saving model to weight.h5
Epoch 9/15
Epoch 00009: val_acc did not improve
Epoch 10/15
Epoch 00010: val_acc did not improve
Epoch 11/15
Epoch 00011: val_acc improved from 0.81884 to 0.81889, saving model to weight.h5
Epoch 12/15
Epoch 00012: val_acc did not improve
Epoch 13/15
Epoch 000

The run above reports a training loss of 0.1688 and a validation loss of 0.5807. The large gap between the two suggests that the model is overfitting. The model could likely be improved by adding a convolution layer and adjusting the dropout rates.