In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!cp '/content/drive/My Drive/train.csv.zip' '/content/train.csv.zip'
!cp '/content/drive/My Drive/GoogleNews-vectors-negative300.bin' '/content/GoogleNews-vectors-negative300.bin'

In [0]:
!unzip '/content/train.csv.zip'

Archive:  /content/train.csv.zip
  inflating: train.csv               


## Importing Libraries


In [0]:
import nltk
import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import re, nltk, gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense,Input,\
BatchNormalization,Bidirectional,concatenate,Dropout,Conv1D,\
MaxPooling1D,Flatten,add,Lambda
import tensorflow.keras.backend as K

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading Data

In [0]:
def load_data(dataset):
  train=pd.read_csv(dataset)
  train.dropna(axis=0, inplace=True)
  return train

data=load_data('train.csv')
data=data[:100000]
#Creating two list one for left and another for the right question
def list_data(train):
  q1 = pd.Series(train.question1.tolist()).astype(str)
  q2 = pd.Series(train.question2.tolist()).astype(str)
  return q1,q2

q1,q2=list_data(data)


In [0]:
#Checking for the output counts (Check for data imbalance)
data['is_duplicate'].value_counts()

0    62746
1    37254
Name: is_duplicate, dtype: int64

## Preparing the text data

### Data cleaning

In [0]:
def text_clean(corpus):
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']"," ",word)
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            qs_list.append(p1)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    return cleaned_corpus

all_corpus = q1.append(q2)
all_corpus = text_clean(all_corpus)


  


In [0]:
#The data is in format like all q1 are the in the starting 
#rows of all_corpus
#then once q1 gets finished, q2 starts. So again 
#separating q1 and q2 and merging them into a data frame.
def clean_data(all_corpus,q1,q2,train):
  q1 = all_corpus[0:q1.shape[0]]
  q2 = all_corpus[q2.shape[0]::]
  data_out = pd.DataFrame({'q1': q1, 'q2': q2})
  data_out.index=list(range(0,len(data_out)))
  data_out['output']=train['is_duplicate']
  return data_out
data_new=clean_data(all_corpus,q1,q2,data)


### Creating word to index

In [0]:
#creating word to index using keras tokenizer
def word_to_index(all_corpus):
  lines = []
  for key in all_corpus:
    lines.append(key)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return(tokenizer.word_index)
word2index=word_to_index(all_corpus)
index2word = dict((v,k) for k,v in word2index.items())


### Implementing word2vec embedding on text data


In [0]:
# Loading pre-trained word vectors
def load_embedding(EMBEDDING_FILE,embedding_dim):
  word2vec_model = gensim.models.KeyedVectors.\
  load_word2vec_format(EMBEDDING_FILE, binary = True)
  w2v = dict(zip(word2vec_model.wv.index2word,\
                 word2vec_model.wv.syn0))
  
# This will be the embedding matrix
  embeddings = 1 * np.random.randn(len(word2index) \
                                   + 1, embedding_dim)  
  embeddings[0] = 0  # So that the padding will be ignored


  # Build the embedding matrix
  for word, index in word2index.items():
      if word in word2vec_model.vocab:
          embeddings[index] = word2vec_model.word_vec(word)
  return embeddings
embedding_dim=300
EMBEDDING_FILE = '/content/GoogleNews-vectors-negative300.bin'
embeddings=load_embedding(EMBEDDING_FILE,embedding_dim)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


### Max length 


In [0]:
def max_length(all_corpus):
  lines=[]
  max_len=-1
  for key in all_corpus:
    for d in key:
      if len(d.split())>max_len:
        max_len=len(d.split())
  return max_len

max_len=max_length(all_corpus)

## Creating training data 


In [0]:
#If len is not equal to max_len then doing post padding
max_len=50
def create_train_data(dataset,max_length,column):
  X1=list()
  for idx in range(len(dataset)):
    for words in (data_new.iloc[idx][[column]].values):
      numeric_seq = [word2index[word] for word \
                     in words.split() if word in word2index]
      in_seq=numeric_seq
      in_seq=pad_sequences([in_seq],maxlen=max_length,\
                           padding='post')[0]
    X1.append(in_seq)
  return X1


q1=np.array(create_train_data(data_new,max_len,'q1'))
q2=np.array(create_train_data(data_new,max_len,'q2'))

## Train Test Split


In [0]:
def split_train_test(q1,q2,data):
  X = np.stack((q1, q2), axis=1)
  X_train, X_test, y_train, y_test = X[:-10], \
  X[-10:],list(data['is_duplicate'])[:-10],list(data['is_duplicate'])[-10:]
  train_q1 = X_train[:,0]
  train_q2 = X_train[:,1]
  test_q1 = X_test[:,0]
  test_q2 = X_test[:,1]
  return train_q1,train_q2,test_q1,test_q2,\
  y_train,y_test,X_train,X_test
train_q1,train_q2,test_q1,test_q2,y_train,\
y_test,X_train,X_test=split_train_test(q1,q2,data)
y_train=np.array(y_train)
y_test=np.array(y_test)

## Cosine distance

In [0]:
#Cosine distance
def cosine_distance(output):
  x,y=output[0],output[1]
  x = K.l2_normalize(x, axis=-1)
  y = K.l2_normalize(y, axis=-1)
  return -K.mean(x * y, axis=-1, keepdims=True)
  

## Contrastive Loss

In [0]:
#Triplet loss
def contrastive_loss(y_true, y_pred):
    margin = 1
    return K.mean(y_true * K.square(y_pred) + \
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

## First Model Stacked Birectional GRU with cosine distance


In [0]:
input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

embedding_layer = Embedding(len(embeddings), 
                 embedding_dim, 
                 weights=[embeddings], 
                 input_length=max_len, 
                 trainable=False)

embedded_q1 = embedding_layer(input_q1)
embedded_q2 = embedding_layer(input_q2)

lstm_first = Bidirectional(GRU(64, return_sequences=True))

lstm_q1_1 = lstm_first(embedded_q1)
lstm_q2_1 = lstm_first(embedded_q2)

dropout_layer = Dropout(0.2)

lstm_q1_1 = dropout_layer(lstm_q1_1)
lstm_q2_1 = dropout_layer(lstm_q2_1)

lstm_second=Bidirectional(GRU(64,return_sequences=False))

lstm_q1_2 = lstm_second(embedded_q1)
lstm_q2_2 = lstm_second(embedded_q2)

dropout_layer = Dropout(0.2)

lstm_q1_2 = dropout_layer(lstm_q1_2)
lstm_q2_2 = dropout_layer(lstm_q2_2)

lstm_out_1=add([lstm_q1_1,lstm_q1_2])
lstm_out_2=add([lstm_q2_1,lstm_q2_2])

bn_one = BatchNormalization()

bn_q1 = bn_one(lstm_out_1)
bn_q2 = bn_one(lstm_out_2)

output = Lambda(cosine_distance, name='cosine_distance')\
([bn_q1, bn_q2])

model = Model(inputs=[input_q1,input_q2], outputs=output)
model.summary()
model.compile(loss=contrastive_loss, optimizer='adam',\
              metrics=['accuracy'])

callback = [ModelCheckpoint('question_pairs_weights_type1.h5',\
                            monitor='loss', save_best_only=True,mode='min')]

history = model.fit([train_q1,train_q2],
                    np.expand_dims(y_train,axis=-1),
                    epochs=1,
                    batch_size=10,
                    callbacks=callback)

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 50, 300)      13730400    input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional (None, 50, 128)      140544      embedding_8[0][0]          

## Result

#### Model Prediction on positive example in Test Data


In [0]:
data_new[-3:-2]

Unnamed: 0,q1,q2,output
99997,who would win black panther or batman,who would win in a fight between black panther...,1


In [0]:
y_pred=model.predict([test_q1[-3:-2],test_q2[-3:-2]])
pred_data=data_new[-3:-2]
print("Model Prediction on positive example in Test Data",\
      [np.argmax(i) for i in y_pred])

Model Prediction on positive example in Test Data [4]


#### Model Prediction on negative example in Test Data

In [0]:
data_new[-10:-9]

Unnamed: 0,q1,q2,output
99990,when is a root canal treatment necessary,what is a root canal treatment,0


In [0]:
y_pred=model.predict([test_q1[-5:-4],test_q2[-5:-4]])
pred_data=data_new[-5:-4]
print("Model Prediction on negative example in Test Data",\
      [np.argmax(i) for i in y_pred])

Model Prediction on negative example in Test Data [9]


## Second model CNN Siamese Network

In [0]:
max_len=50
embedding_dim=300

input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

embedding_layer = Embedding(len(embeddings), 
                 embedding_dim, 
                 weights=[embeddings], 
                 input_length=max_len, 
                 trainable=False)

embedded_q1 = embedding_layer(input_q1)
embedded_q2 = embedding_layer(input_q2)

conv_first = Conv1D(filters=64,kernel_size=3,activation='relu')

conv_q1_1 = conv_first(embedded_q1)
conv_q2_1 = conv_first(embedded_q2)

max_pool_first=MaxPooling1D(pool_size=2)

max_q1_1=max_pool_first(conv_q1_1)
max_q2_1=max_pool_first(conv_q2_1)

dropout_layer = Dropout(0.2)

max_q1_1 = dropout_layer(max_q1_1)
max_q2_1 = dropout_layer(max_q2_1)

flat = Flatten()
max_q1_1=flat(max_q1_1)
max_q2_1=flat(max_q2_1)

conv_second = Conv1D(filters=64,kernel_size=2,activation='relu')

conv_q1_2 = conv_second(embedded_q1)
conv_q2_2 = conv_second(embedded_q2)

max_pool_second=MaxPooling1D(pool_size=2)

max_q1_2=max_pool_second(conv_q1_2)
max_q2_2=max_pool_second(conv_q2_2)

dropout_layer = Dropout(0.2)

max_q1_2 = dropout_layer(max_q1_2)
max_q2_2 = dropout_layer(max_q2_2)


flat = Flatten()
max_q1_2=flat(max_q1_2)
max_q2_2=flat(max_q2_2)


conv_out_1=add([max_q1_1,max_q1_2])
conv_out_2=add([max_q2_1,max_q2_2])

bn_one = BatchNormalization()

bn_q1 = bn_one(conv_out_1)
bn_q2 = bn_one(conv_out_2)

concat_input = concatenate([bn_q1,bn_q2])
check_duplicate = Dense(1, activation='sigmoid')(concat_input)

model = Model(inputs=[input_q1,input_q2], outputs=check_duplicate)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam',\
              metrics=['accuracy'])

callbacks = [ModelCheckpoint('question_pairs_weights_type2.h5',\
                             monitor='val_acc', save_best_only=True)]


steps=len(train_q1)//batch_size
history = model.fit([train_q1,train_q2],
                    np.expand_dims(y_train,axis=-1),
                    epochs=5,
                    batch_size=10,
                    callbacks=callbacks)


Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 50, 300)      13730400    input_19[0][0]                   
                                                                 input_20[0][0]                   
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 48, 64)       57664       embedding_9[0][0]          



Epoch 2/5




Epoch 3/5




Epoch 4/5




Epoch 5/5




