In [1]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('/kaggle/input/question-pairs-dataset/questions.csv')
data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404346,404346,789792,789793,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404347,404347,789794,789795,Do you believe there is life after death?,Is it true that there is life after death?,1
404348,404348,789796,789797,What is one coin?,What's this coin?,0
404349,404349,789798,789799,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [4]:
train_len = 300000
test_len = 10*1024
train_data = data[:train_len]
test_data = data[train_len : train_len + test_len]
print("Train set", len(train_data), "Test set", len(test_data))
del(data)

Train set 300000 Test set 10240


In [5]:
#selected duplicate questions to train model
duplicate_index = (train_data['is_duplicate'] == 1).to_numpy()
duplicate_index = [i for i, x in enumerate(duplicate_index) if x]
print('Number of duplicate questions: ', len(duplicate_index))
duplicate_index[:10]

Number of duplicate questions:  111486


[5, 7, 11, 12, 13, 15, 16, 18, 20, 29]

In [6]:
q1_train_data = np.array(train_data['question1'][duplicate_index])
q2_train_data = np.array(train_data['question2'][duplicate_index])

q1_test_data = np.array(test_data['question1'])
q2_test_data = np.array(test_data['question2'])
y_test = np.array(test_data['is_duplicate'])

In [32]:
from sklearn.model_selection import train_test_split
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer
tokenizer = Tokenizer(oov_token="<OOV>") 
combined_train_words = q1_train_data + q2_train_data

# Fit tokenizer on the training data
tokenizer.fit_on_texts(combined_train_words)
vocab_size = len(tokenizer.word_index) + 1 
print('The length of the vocabulary is:', vocab_size)

vocab = defaultdict(lambda: tokenizer.word_index.get("<OOV>", 0), tokenizer.word_index)

# Tokenize and convert questions to sequences
Q1_train_sequences = tokenizer.texts_to_sequences(q1_train_data)
Q2_train_sequences = tokenizer.texts_to_sequences(q2_train_data)
Q1_test_sequences = tokenizer.texts_to_sequences(q1_test_data)
Q2_test_sequences = tokenizer.texts_to_sequences(q2_test_data)

# Pad sequences to ensure they are of equal length
Q1_train_padded = pad_sequences(Q1_train_sequences, padding='post')
Q2_train_padded = pad_sequences(Q2_train_sequences, padding='post')
Q1_test_padded = pad_sequences(Q1_test_sequences, padding='post')
Q2_test_padded = pad_sequences(Q2_test_sequences, padding='post')

# Split the training data into training and validation sets
train_Q1, val_Q1, train_Q2, val_Q2 = train_test_split(Q1_train_padded, Q2_train_padded, test_size=0.2, random_state=42)

# Print lengths and information
print('Train set has reduced to:', len(train_Q1))
print('Test set length:', len(Q1_test_padded))
print('Number of duplicate questions:', len(Q1_train_padded))
print("The length of the training set is:", len(train_Q1))
print("The length of the validation set is:", len(val_Q1))

The length of the vocabulary is: 28311
Train set has reduced to: 89188
Test set length: 10240
Number of duplicate questions: 111486
The length of the training set is: 89188
The length of the validation set is: 22298


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class QuestionDuplicateDataset(Dataset):
    def __init__(self, q1, q2):
        self.q1 = q1
        self.q2 = q2
    
    def __len__(self):
        return len(self.q1)
    
    def __getitem__(self, idx):
        return self.q1[idx], self.q2[idx]

train_dataset = QuestionDuplicateDataset(train_Q1, train_Q2)
val_dataset = QuestionDuplicateDataset(val_Q1, val_Q2)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [9]:
#define model
class Siamese(nn.Module):
    def __init__(self, vocab_size=41699, d_model = 128):
        super(Siamese, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.lstm = nn.LSTM(d_model, d_model, batch_first=True)
        self.d_model = d_model
    
    def forward(self, q1, q2):
        q1_processed = self.process(q1)
        q2_processed = self.process(q2)

        return q1_processed, q2_processed
    
    def process(self, x):
        #embedding
        x = self.embedding(x)
        #x shape: [batch_size, seq_len, d_model]
        
        #LSTM with hidden state and cell state
        x, (hn, cn) = self.lstm(x)
        #x shape: [batch_size, seq_len, d_model]
        
        #mean
        x = torch.mean(x, dim=1)
        #x shape: [batch_size, d_model]
    
        #normalize
        x = F.normalize(x, p=2, dim=1)
        #x shape: [batch_size, d_model]
        return x

model = Siamese()
print(model)

Siamese(
  (embedding): Embedding(41699, 128)
  (lstm): LSTM(128, 128, batch_first=True)
)


\begin{align}
 \mathcal{Loss_{1}(A,P,N)} &=\max \left( -cos(A,P)  + mean_{neg} +\alpha, 0\right) \\
 \mathcal{Loss_{2}(A,P,N)} &=\max \left( -cos(A,P)  + closest_{neg} +\alpha, 0\right) \\
\mathcal{Loss(A,P,N)} &= mean(Loss_1 + Loss_2) \\
\end{align}

In [10]:
#define loss function
def TripletLossFn(v1, v2, margin=0.25):
    scores = F.cosine_similarity(v1.unsqueeze(1), v2.unsqueeze(0), dim=-1)
    #shape: [batch_size, batch_size]
    batch_size = len(scores)
    
    positive = torch.diagonal(scores)
    #shape: [batch_size]
    eye = torch.eye(batch_size, device=scores.device)
    #identity matrix shape: [batch_size, batch_size]
    negative_zero_on_duplicate = scores * (1.0 - eye)
    #shape: [batch_size, batch_size]
    mean_negative = torch.sum(negative_zero_on_duplicate, dim=1) / (batch_size - 1)
    #shape: [batch_size]
    closest_negative = torch.max(negative_zero_on_duplicate, dim=1).values
    #shape: [batch_size]
    triplet_loss1 = torch.clamp(margin - positive + closest_negative, min=0.0)
    #shape: [batch_size]
    triplet_loss2 = torch.clamp(margin - positive + mean_negative, min=0.0)
    #shape: [batch_size]
    triplet_loss = torch.sum(triplet_loss1 + triplet_loss2)
    #scalar
    return triplet_loss

In [25]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs=10, learning_rate=0.01, margin=0.25):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(epochs):
        model.train()
        
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for batch in progress_bar:
            q1, q2 = batch
            q1, q2 = q1.to(device), q2.to(device)
            
            optimizer.zero_grad()

            v1, v2 = model(q1, q2)

            loss = TripletLossFn(v1, v2, margin=margin)

            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            progress_bar.set_postfix(loss=running_loss / len(train_loader))
        
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader)}")
        
        # Validation
        validate_model(model, val_loader)

def validate_model(model, val_loader):
    model.eval()  
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            q1, q2 = batch
            q1, q2 = q1.to(device), q2.to(device)

            v1, v2 = model(q1, q2)

            loss = TripletLossFn(v1, v2)
            val_loss += loss.item()

    print(f"Validation Loss: {val_loss/len(val_loader)}")

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = Siamese().to(device)

cuda


In [27]:
#train model
train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, margin=0.25)

Epoch 1/10: 100%|██████████| 697/697 [00:09<00:00, 73.34it/s, loss=9.32]


Epoch [1/10], Loss: 9.32205470973142
Validation Loss: 6.02158684355872


Epoch 2/10: 100%|██████████| 697/697 [00:09<00:00, 73.89it/s, loss=4.96] 


Epoch [2/10], Loss: 4.961131506020917
Validation Loss: 4.861126567295619


Epoch 3/10: 100%|██████████| 697/697 [00:09<00:00, 73.38it/s, loss=3.98] 


Epoch [3/10], Loss: 3.9785283008641117
Validation Loss: 4.454091246298381


Epoch 4/10: 100%|██████████| 697/697 [00:09<00:00, 73.66it/s, loss=3.52] 


Epoch [4/10], Loss: 3.5248368255890252
Validation Loss: 4.304513124397823


Epoch 5/10: 100%|██████████| 697/697 [00:09<00:00, 73.74it/s, loss=3.22] 


Epoch [5/10], Loss: 3.217541006066365
Validation Loss: 4.13596147945949


Epoch 6/10: 100%|██████████| 697/697 [00:09<00:00, 73.76it/s, loss=3]    


Epoch [6/10], Loss: 2.9974023468012105
Validation Loss: 3.849488261767796


Epoch 7/10: 100%|██████████| 697/697 [00:09<00:00, 73.93it/s, loss=2.83] 


Epoch [7/10], Loss: 2.83347126033419
Validation Loss: 3.8568617675134114


Epoch 8/10: 100%|██████████| 697/697 [00:09<00:00, 73.68it/s, loss=2.66] 


Epoch [8/10], Loss: 2.6576051110858727
Validation Loss: 3.7236525538989476


Epoch 9/10: 100%|██████████| 697/697 [00:09<00:00, 73.58it/s, loss=2.6]  


Epoch [9/10], Loss: 2.5971395254477194
Validation Loss: 3.6846274985585894


Epoch 10/10: 100%|██████████| 697/697 [00:09<00:00, 73.77it/s, loss=2.55] 


Epoch [10/10], Loss: 2.550795706777354
Validation Loss: 3.7308007631983076


In [28]:
#test model
def classify(test_Q1, test_Q2, y, threshold, model, vocab, batch_size=64):
    model.eval() 
    accuracy = 0
    total = len(test_Q1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i in range(0, total, batch_size):
        q1_batch = torch.tensor(test_Q1[i:i + batch_size]).to(device)
        q2_batch = torch.tensor(test_Q2[i:i + batch_size]).to(device)
        y_batch = torch.tensor(y[i:i + batch_size]).to(device)

        with torch.no_grad():
            v1, v2 = model(q1_batch, q2_batch)

        for j in range(len(q1_batch)):
            #cosine similarity
            d = torch.dot(v1[j], v2[j]) / (torch.norm(v1[j]) * torch.norm(v2[j]))
            # Check if the cosine similarity is greater than the threshold
            res = d.item() > threshold
            # Increment accuracy if prediction matches actual label
            accuracy += (y_batch[j].item() == res)

    accuracy = accuracy / total
    return accuracy

In [29]:
accuracy = classify(Q1_test_padded, Q2_test_padded, y_test, 0.7, model, vocab, batch_size = 512)
print("Accuracy", accuracy)

Accuracy 0.701171875


In [51]:
#Check with my own example
def predict(question1, question2, threshold, model, verbose=False):
    q1_sequence = tokenizer.texts_to_sequences([question1])
    q2_sequence = tokenizer.texts_to_sequences([question2])

    q1_padded = pad_sequences(q1_sequence, padding='post')
    q2_padded = pad_sequences(q2_sequence, padding='post')

    device = next(model.parameters()).device 
    q1_tensor = torch.tensor(q1_padded, dtype=torch.long).to(device)
    q2_tensor = torch.tensor(q2_padded, dtype=torch.long).to(device)

    with torch.no_grad():
        v1, v2 = model(q1_tensor, q2_tensor)

    d = np.dot(v1[0].cpu().numpy(), v2[0].cpu().numpy().T)

    result = "Duplicate" if d > threshold else "Non-duplicate"

    if verbose:
        print("d   =", d)
        #print("result =", result)

    return result

In [52]:
question1 = "When will I see you?"
question2 = "When can I see you again?"

predict(question1, question2, 0.7, model, True)

d   = 0.80519843


'Duplicate'