In [16]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from tensorflow.keras import layers
import re
from sklearn.model_selection import train_test_split
import nltk
import random as rnd
import tensorflow.keras.backend as K
import numpy as np
from tensorflow.keras.models import Model

In [2]:
df = pd.read_csv("./questions.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
df.dropna(inplace=True)

In [6]:
N_train = 300000
N_test  = 10*1024
df_train = df[:N_train]
df_test  = df[N_train:N_train+N_test]
print("Train set:", len(df_train), "Test set:", len(df_test))
del(df) # remove to free memory

Train set: 300000 Test set: 10240


In [7]:
data_train = df_train[df_train['is_duplicate'] == 1]

In [8]:
tokenzer = Tokenizer(oov_token="oov")
tokenzer.fit_on_texts(data_train["question1"])
tokenzer.fit_on_texts(data_train["question2"])
Q1_tokened = tokenzer.texts_to_sequences(data_train["question1"])
Q2_tokened = tokenzer.texts_to_sequences(data_train["question2"])

In [9]:
vocab_size = len(tokenzer.word_index)
vocab_size

28139

In [10]:
Q1_train = pad_sequences(Q1_tokened, maxlen=100, padding='post', truncating='post')
Q2_train = pad_sequences(Q2_tokened, maxlen=100, padding='post', truncating='post')

In [11]:
Data = tf.data.Dataset.from_tensor_slices((Q1_train,Q2_train))

In [12]:
valid_size = int(0.2*len(Data))
valid = Data.take(valid_size).batch(128,drop_remainder=True)
train = Data.skip(valid_size).shuffle(1000).batch(128,drop_remainder=True)

# Define model

In [43]:
def initialize_base_network(vocab_size = 41699,d_model = 128):
    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, 128))
    model.add(layers.LSTM(128,activation='relu'))
    model.add(layers.Dense(128))
    model.add(layers.Lambda(lambda x: K.l2_normalize(x,axis=1)))
    return model

# Define Loss

In [44]:
#loss Function
from tensorflow.keras.losses import Loss
class TripletLoss(Loss):
    def __init__(self,margin):
        
        super().__init__()
        
        self.margin = margin
        
    def call(self,v1,v2):
        scores = tf.matmul(v1, tf.transpose(v2))  

        batch_size = len(scores)

        positive = tf.linalg.diag_part(scores)  

        #closest negative
        negative_without_positive = scores - 2.0 * tf.eye(batch_size)

        closest_negative = tf.math.reduce_max(negative_without_positive,axis=1)

        #mean negative
        negative_zero_on_duplicate = scores * (1.0 - tf.eye(batch_size))

        mean_negative = tf.math.reduce_sum(negative_zero_on_duplicate, axis=1) / (batch_size-1)

        #calculate total loss
        triplet_loss1 = tf.math.maximum(closest_negative - positive + 0.25 ,0)

        triplet_loss2 = tf.math.maximum(mean_negative - positive + 0.25 ,0)

        triplet_loss = tf.math.reduce_mean(triplet_loss1 + triplet_loss2)

        return triplet_loss

In [45]:
model = initialize_base_network()



# Train

In [46]:
def training_function(model,data_train,data_valid,epochs=3):
    optimizer = tf.keras.optimizers.Adam()
    loss_object = TripletLoss(0.25)
    epochs_train_losses = []
    epochs_val_losses = []
    for epoch in range(epochs):
        train_losses = []
        valid_losses = []
        #training
        for step,(q1,q2) in enumerate(data_train):
            with tf.GradientTape() as tape:
                v1 = model(q1)
                v2 = model(q2)
                loss_value = loss_object(v1, v2)
                
            train_losses.append(loss_value)
            gradients = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
            if step % 100 == 0:
                print(f"Step = {step} , Train loss = {np.mean(train_losses)}")
            
        #Validation
        for q1, q2 in data_valid:
            v1 = model(q1)
            v2 = model(q2)
            loss_value = loss_object(v1, v2)
            valid_losses.append(loss_value)
        
        losses_train_mean = np.mean(train_losses)
        losses_val_mean = np.mean(valid_losses)
        epochs_val_losses.append(losses_val_mean)
        epochs_train_losses.append(losses_train_mean)
        
        print('\n Epoch %s: Train loss: %.4f  Validation Loss: %.4f' % (epoch, float(losses_train_mean), float(losses_val_mean)))

In [47]:
training_function(model,train,valid)

Step = 0 , Train loss = 0.5000073909759521
Step = 100 , Train loss = 0.5000094771385193
Step = 200 , Train loss = 0.500007152557373
Step = 300 , Train loss = 0.5000061392784119
Step = 400 , Train loss = 0.5000055432319641
Step = 500 , Train loss = 0.5000051856040955
Step = 600 , Train loss = 0.5000049471855164

 Epoch 0: Train loss: 0.5000  Validation Loss: 0.5000
Step = 0 , Train loss = 0.5000374913215637
Step = 100 , Train loss = 0.5007869005203247
Step = 200 , Train loss = 0.5004115104675293
Step = 300 , Train loss = nan
Step = 400 , Train loss = nan
Step = 500 , Train loss = nan
Step = 600 , Train loss = nan

 Epoch 1: Train loss: nan  Validation Loss: nan
Step = 0 , Train loss = nan
Step = 100 , Train loss = nan
Step = 200 , Train loss = nan
Step = 300 , Train loss = nan
Step = 400 , Train loss = nan
Step = 500 , Train loss = nan
Step = 600 , Train loss = nan

 Epoch 2: Train loss: nan  Validation Loss: nan
