In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [3]:
!pip install -q sentence-transformers

In [4]:
# Required Libraries
import torch
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from sklearn.metrics import f1_score
import pandas as pd
import zipfile

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Unpack the zipped training data into the writable working directory.
with zipfile.ZipFile('/kaggle/input/quora-question-pairs/train.csv.zip', mode='r') as archive:
    archive.extractall(path='/kaggle/working')

# Read the extracted CSV and discard every row that has a missing value in any column.
df = pd.read_csv('/kaggle/working/train.csv').dropna()

In [6]:
# Create examples for bi-encoder training: one InputExample per row, holding the
# two question texts and the duplicate flag converted to a float label.
#
# The three needed columns are iterated in lockstep with zip() instead of using
# DataFrame.iterrows(), which constructs a full Series object for every row and
# is therefore much slower while producing exactly the same values here.
examples = [
    InputExample(texts=[question1, question2], label=float(is_duplicate))
    for question1, question2, is_duplicate in zip(
        df['question1'], df['question2'], df['is_duplicate']
    )
]

In [7]:
# Sequential (unshuffled) split: the leading 80% of the examples are used for
# training and the trailing 20% are held out for validation.
train_size = int(0.8 * len(examples))
train_examples, val_examples = examples[:train_size], examples[train_size:]

In [8]:
# Minimal Dataset wrapper so a sequence of examples can be handed to a DataLoader
class QuoraDataset(Dataset):
    """Map-style dataset that exposes an indexable collection of examples."""

    def __init__(self, examples):
        """Store a reference to ``examples`` (no copy is made)."""
        self.examples = examples

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        return self.examples[idx]

    def __len__(self):
        """Return the number of examples held by the dataset."""
        return len(self.examples)

In [9]:
# Wrap both splits in QuoraDataset, then batch them 32 examples at a time.
# Only the training loader shuffles; the validation loader keeps the stored order.
train_dataset = QuoraDataset(train_examples)
val_dataset = QuoraDataset(val_examples)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [10]:
# -------- OFF-THE-SHELF MODEL EVALUATION --------

# Baseline: the pre-trained checkpoint used as-is, with no fine-tuning applied
off_the_shelf_model = SentenceTransformer(
    model_name_or_path='distilbert-base-nli-stsb-mean-tokens'
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Separate the validation pairs into two parallel lists:
# val_q1 holds the first text of every pair, val_q2 the second.
val_q1 = []
val_q2 = []
for pair in val_examples:
    val_q1.append(pair.texts[0])
    val_q2.append(pair.texts[1])

In [12]:
# Embed each side of the validation pairs with the off-the-shelf model,
# asking encode() to return tensors. The sides are encoded in order: q1, then q2.
val_q1_embeddings, val_q2_embeddings = (
    off_the_shelf_model.encode(questions, convert_to_tensor=True)
    for questions in (val_q1, val_q2)
)

Batches:   0%|          | 0/2527 [00:00<?, ?it/s]

Batches:   0%|          | 0/2527 [00:00<?, ?it/s]

In [13]:
# Pairwise cosine similarity between matching q1/q2 embeddings.
# dim=1 is the function's default and is spelled out here for clarity
# (assumes both tensors are laid out as (n_pairs, embedding_dim) — TODO confirm).
cosine_scores_off_the_shelf = torch.nn.functional.cosine_similarity(
    val_q1_embeddings,
    val_q2_embeddings,
    dim=1,
)

In [14]:
# A pair is predicted to be a duplicate when its cosine similarity is strictly
# greater than this cut-off.
threshold = 0.7

# Ground-truth labels of the validation pairs, converted to ints
y_true = [int(pair.label) for pair in val_examples]

# Threshold the similarity scores and convert the result into a plain list of 0/1 ints
is_above_threshold = cosine_scores_off_the_shelf > threshold
y_pred_off_the_shelf = is_above_threshold.int().tolist()

In [15]:
# F1 of the thresholded off-the-shelf predictions against the validation labels
f1_off_the_shelf = f1_score(y_true=y_true, y_pred=y_pred_off_the_shelf)

In [16]:
# Report the baseline score, rounded to four decimal places
off_the_shelf_report = f"F1-Score for Off-the-Shelf Model: {f1_off_the_shelf:.4f}"
print(off_the_shelf_report)

F1-Score for Off-the-Shelf Model: 0.6811


In [17]:
# -------- FINE-TUNING THE MODEL --------

# Start fine-tuning from a freshly loaded copy of the pre-trained checkpoint
fine_tuned_model = SentenceTransformer(
    model_name_or_path='distilbert-base-nli-stsb-mean-tokens'
)

# Contrastive loss bound to the model that is going to be trained
train_loss = losses.ContrastiveLoss(fine_tuned_model)




In [19]:
# Training settings, gathered in one place so they are easy to tune.
fit_settings = dict(
    epochs=2,  # Increase epochs for better results
    warmup_steps=100,
    output_path="./output",
)

# Fine-tune on the single (dataloader, loss) training objective
fine_tuned_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    **fit_settings,
)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,0.0169
1000,0.0149
1500,0.0141
2000,0.0136
2500,0.013
3000,0.0129
3500,0.0126
4000,0.0123
4500,0.0122
5000,0.0122


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [20]:
# Re-embed both sides of the validation pairs, this time with the fine-tuned
# model. The sides are encoded in order: q1, then q2.
val_q1_embeddings_fine_tuned, val_q2_embeddings_fine_tuned = (
    fine_tuned_model.encode(questions, convert_to_tensor=True)
    for questions in (val_q1, val_q2)
)

Batches:   0%|          | 0/2527 [00:00<?, ?it/s]

Batches:   0%|          | 0/2527 [00:00<?, ?it/s]

In [21]:
# Pairwise cosine similarity between matching q1/q2 embeddings of the fine-tuned
# model (dim=1 is the function's default, written out explicitly).
cosine_scores_fine_tuned = torch.nn.functional.cosine_similarity(
    val_q1_embeddings_fine_tuned,
    val_q2_embeddings_fine_tuned,
    dim=1,
)

# Apply the same cut-off as before: strictly greater than `threshold` means duplicate
is_above_threshold_fine_tuned = cosine_scores_fine_tuned > threshold
y_pred_fine_tuned = is_above_threshold_fine_tuned.int().tolist()

# F1 of the fine-tuned predictions against the same validation labels
f1_fine_tuned = f1_score(y_true=y_true, y_pred=y_pred_fine_tuned)

In [22]:
# Report the fine-tuned score, then both scores together for comparison.
report_lines = [
    f"F1-Score for Fine-Tuned Model: {f1_fine_tuned:.4f}",
    # -------- COMPARISON --------
    f"\nOff-the-Shelf Model F1-Score: {f1_off_the_shelf:.4f}",
    f"Fine-Tuned Model F1-Score: {f1_fine_tuned:.4f}",
]
for line in report_lines:
    print(line)

F1-Score for Fine-Tuned Model: 0.8344

Off-the-Shelf Model F1-Score: 0.6811
Fine-Tuned Model F1-Score: 0.8344
