### Imports

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import warnings
import networkx as nx
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter('ignore', SettingWithCopyWarning)

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Input 
from keras.layers import Lambda 
from keras.layers import LSTM, Bidirectional
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adadelta

Using TensorFlow backend.


### Read Data

In [2]:
# Load the question pairs, keep only the two question texts and the duplicate
# label, and discard every row that has a missing value in any of them.
df = (
    pd.read_csv('train.csv')
    .loc[:, ['question1', 'question2', 'is_duplicate']]
    .dropna()
)

### Train/Validation Split

In [3]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

### Preprocessing Helper

In [4]:
def preprocess(x):
    """Normalise one question string before tokenisation.

    The value is converted to ``str`` and lower-cased, a fixed list of English
    contractions is expanded, the ``%``, ``₹``, ``$`` and ``€`` symbols are
    spelled out as words, and all whitespace runs are collapsed to single
    spaces with no leading or trailing space.

    Args:
        x: the raw question; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned, lower-case string (possibly empty).
    """
    # Applied strictly in this order: specific contractions ("won't", "can't",
    # "it's", "he's", ...) must be rewritten before the generic "n't" and "'s"
    # rules get a chance to match the same characters.
    substitutions = (
        ("won't", "will not"),
        ("cannot", "can not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("what's", "what is"),
        ("it's", "it is"),
        ("'ve", " have"),
        ("i'm", "i am"),
        ("'re", " are"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("'s", " own"),
        ("%", " percent "),
        ("₹", " rupee "),
        ("$", " dollar "),
        ("€", " euro "),
        ("'ll", " will"),
    )

    text = str(x).lower()
    for old, new in substitutions:
        text = text.replace(old, new)

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so a single join both trims and
    # collapses.
    return ' '.join(text.split())

### Train Data Preprocessing

In [5]:
# Normalise both question columns of the training split, then drop every pair
# in which either question is empty after preprocessing.
df_train = df_train.assign(
    question1=df_train['question1'].apply(preprocess),
    question2=df_train['question2'].apply(preprocess),
)
df_train = df_train.loc[df_train['question1'].ne('') & df_train['question2'].ne('')]

### Validation Data Preprocessing

In [6]:
# Apply the same text normalisation to the validation split. Unlike the
# training split, pairs that end up empty are not removed here.
df_val = df_val.assign(
    question1=df_val['question1'].apply(preprocess),
    question2=df_val['question2'].apply(preprocess),
)

### Pre-trained Embedding

In [7]:
import gensim

# Load the pre-trained word2vec vectors from the binary file named below.
# NOTE(review): the name `model` is rebound to the Keras network further down
# (`model = get_model(embedding_matrix)`), so in a live kernel this cell must
# be re-run before re-running the embedding-matrix cell.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
# Number of token ids per question: passed as `maxlen` when the encoded
# questions are padded and used as the length of both model inputs.
MAX_LENGTH = 50

### Tokenizer

In [9]:
def get_tokenizer(df):
    """Fit a Keras ``Tokenizer`` on every question contained in ``df``.

    The ``question1`` and ``question2`` columns are cast to ``str`` and
    concatenated (all of ``question1`` first) to form the fitting corpus.

    Args:
        df: frame with ``question1`` and ``question2`` columns.

    Returns:
        The fitted ``Tokenizer``.
    """
    corpus = []
    for column in ('question1', 'question2'):
        corpus.extend(df[column].astype(str).tolist())

    fitted = Tokenizer()
    fitted.fit_on_texts(corpus)
    return fitted

# The vocabulary is fitted on the training split only.
tokenizer = get_tokenizer(df_train)
# One more than the number of known words: Keras word indices start at 1 and
# index 0 is the value used for padding.
vocab_size = len(tokenizer.word_index) + 1

### Prepare input and output

In [10]:
def prepare_X_y(df, max_length, tokenizer):
    """Convert a question-pair frame into padded id matrices and labels.

    Args:
        df: frame with ``question1``, ``question2`` and ``is_duplicate``
            columns.
        max_length: ``maxlen`` handed to ``pad_sequences`` for both questions.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.

    Returns:
        ``(X_1, X_2, y)``: the padded id matrix of ``question1``, the padded
        id matrix of ``question2`` and the ``is_duplicate`` values.
    """
    def encode(column):
        # text -> list of word ids -> fixed-width row, padded at the end
        sequences = tokenizer.texts_to_sequences(df[column].astype(str).tolist())
        return pad_sequences(sequences, maxlen=max_length, padding='post')

    return encode('question1'), encode('question2'), df.is_duplicate.values

In [11]:
# Encode both splits with the tokenizer that was fitted on the training data.
X_train_1, X_train_2, y_train = prepare_X_y(
    df_train,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
X_val_1, X_val_2, y_val = prepare_X_y(
    df_val,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

### Embedding Matrix

In [12]:
def get_embedding_matrix(tokenizer, vector_model):
    """Build the embedding weight matrix for the tokenizer's vocabulary.

    Row ``i`` receives the pre-trained vector of the word whose tokenizer
    index is ``i``. Row 0 and the rows of words that ``vector_model`` does not
    know stay all-zero.

    Args:
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> index).
        vector_model: word-vector lookup supporting ``word in vector_model``,
            ``vector_model[word]`` and a ``vector_size`` attribute, e.g. a
            gensim ``KeyedVectors`` instance.

    Returns:
        ``numpy.ndarray`` of shape
        ``(len(tokenizer.word_index) + 1, vector_model.vector_size)``.
    """
    word_index = tokenizer.word_index

    # Size the matrix from the arguments themselves rather than from the
    # notebook-level `vocab_size` and a hard-coded 300, so the function stays
    # correct for any tokenizer / vector model passed in.
    n_rows = len(word_index) + 1
    embedding_matrix = np.zeros((n_rows, vector_model.vector_size))

    for word, i in word_index.items():
        # `in` and `[]` work on gensim 3.x and 4.x alike; the `.vocab`
        # attribute used previously was removed in gensim 4.0.
        if word in vector_model:
            embedding_matrix[i] = vector_model[word]

    return embedding_matrix

# At this point `model` is still the word2vec KeyedVectors loaded earlier;
# the name is only rebound to the Keras network in a later cell.
embedding_matrix = get_embedding_matrix(tokenizer, model)

### Siamese Model with 1D CNN and Bidirectional LSTM

In [13]:
def get_model(embedding_matrix):
    """Build and compile the Siamese CNN + bidirectional-LSTM model.

    Both questions pass through the same frozen embedding, the same Conv1D
    layer, the same max-pooling layer and the same bidirectional LSTM. The
    model output is ``exp(-sum(|a - b|))`` over the two encodings, i.e. a
    similarity in (0, 1] that is trained with binary cross-entropy.

    Args:
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_dim)``
            used as the (non-trainable) embedding weights.

    Returns:
        The compiled Keras ``Model`` taking two int32 inputs of length
        ``MAX_LENGTH``. The layer summary is printed as a side effect.
    """
    # Read both dimensions from the matrix instead of hard-coding a width of
    # 300, so the layer always matches the weights it is given.
    embedding_matrix = np.asarray(embedding_matrix)
    vocab_rows, embedding_dim = embedding_matrix.shape

    input_1 = Input(shape=(MAX_LENGTH,), dtype='int32')
    input_2 = Input(shape=(MAX_LENGTH,), dtype='int32')

    embedding_layer = Embedding(vocab_rows,
                                embedding_dim, weights=[embedding_matrix],
                                input_length=MAX_LENGTH,
                                trainable=False)

    encoded_1 = embedding_layer(input_1)
    encoded_2 = embedding_layer(input_2)

    # One instance of each layer, applied to both branches, so the two
    # questions are encoded with identical weights.
    shared_conv = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
    max_pool = MaxPooling1D(pool_size=2)
    shared_lstm = Bidirectional(LSTM(256))

    output_1 = shared_conv(encoded_1)
    output_1 = max_pool(output_1)
    output_1 = shared_lstm(output_1)

    output_2 = shared_conv(encoded_2)
    output_2 = max_pool(output_2)
    output_2 = shared_lstm(output_2)

    # exp(-L1 distance) between the two encodings, one value per pair.
    dist = Lambda(function=lambda x: K.exp(-K.sum(K.abs(x[0]-x[1]), axis=1, keepdims=True)),
                             output_shape=lambda x: (x[0][0], 1))([output_1, output_2])

    # Pack it all up into a model
    model = Model([input_1, input_2], [dist])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    return model

# NOTE: this rebinds `model` from the word2vec vectors to the Keras network.
model = get_model(embedding_matrix)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      23759100    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 50, 128)      115328      embedding_1[0][0]                
          

In [14]:
# Train for 8 epochs on the training pairs; the validation pairs are passed
# as validation_data so their metrics are reported alongside.
model.fit(
    [X_train_1, X_train_2],
    y_train,
    validation_data=([X_val_1, X_val_2], y_val),
    epochs=8,
    batch_size=256,
)

Train on 291086 samples, validate on 72772 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f81229b5c88>

### Saving Model and Tokenizer

In [16]:
import os
import pickle

# Create the output directory up front: if it is missing, the open() below
# raises FileNotFoundError and the cell fails after the long training run.
os.makedirs('models', exist_ok=True)

# The fitted tokenizer is stored so the same word -> id mapping can be
# reapplied later.
with open('models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Weights and architecture are written to separate files.
model.save_weights('models/model_weights.h5')

# NOTE(review): the architecture contains a Lambda layer built from an inline
# Python lambda - confirm the JSON can be reloaded in the target environment.
with open('models/model_architecture.json', 'w') as f:
    f.write(model.to_json())