### **1. Data pre-processing**

In [1]:
import pandas as pd
import random

# Load the example dataset (replace the path with your own dataset when fine-tuning).
# A forward slash is used because "\m" is not a valid string escape and a
# backslash-separated path only resolves on Windows.
data = pd.read_csv("data/merged_dataset.csv")

# Keep only the rows where both questions are present, then renumber the index from 0.
data = data.dropna(subset=["q1", "q2"]).reset_index(drop=True)

In [2]:
# Attach a placeholder binary label (0 or 1) to every sample. The labels are
# drawn at random and are unrelated to the question text.
# A dedicated, seeded generator makes the labels identical on every run without
# touching the state of the global `random` module.
label_rng = random.Random(42)
data["labels"] = [label_rng.choice([0, 1]) for _ in range(len(data))]

In [3]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,q1,q2,labels
0,Can I make a new phone out of this old Motorol...,My brother blocked me on WhatsApp so I want to...,0
1,Is there a log or something on an Android devi...,How do I use the MyCo app for earning?,0
2,"Which mobile is best with 256 ROM, 8 or 12 RAM...",What can you tell me about the Realme 10 Pro P...,0
3,What are the features of FMWhatsApp?,What Tecno phone supports VR videos?,1
4,How do handsfree devices enhance the mobile ex...,How can I tell if someone is listening to my m...,1


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (fixed random_state for a repeatable
# split) and renumber both partitions from 0.
train_data, test_data = (
    split.reset_index(drop=True)
    for split in train_test_split(data, test_size=0.2, random_state=42)
)

### **2. Model Fine-Tuning**

In [6]:
# Import the CrossEncoder class from the sentence_transformers library
from sentence_transformers import CrossEncoder

# Load the pre-trained 'quora-distilroberta-base' cross-encoder (trained on the Quora dataset).
# device=None lets the library choose the device itself (a GPU when one is usable,
# otherwise the CPU), so the notebook also runs on machines without CUDA.
model = CrossEncoder('cross-encoder/quora-distilroberta-base', device=None)

In [7]:
# Score one example question pair with the pre-trained model.
# predict() receives a list of (sentence_a, sentence_b) tuples; here the list
# holds a single pair.
scores = model.predict(
    [
        (
            'What are the exams to get a job in forensic',
            'Which exams should i attend to get a job in forensic',
        ),
    ]
)

# Show the score(s) returned for the pair
print(scores)

[0.9724152]


#### **Training data**

In [24]:
# Import necessary modules
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers import InputExample, losses, LoggingHandler
from torch.utils.data import DataLoader
import math
import logging

In [25]:
def _to_input_examples(frame):
    """Convert a question-pair DataFrame into a list of InputExample objects.

    Args:
        frame: DataFrame with "q1", "q2" and "labels" columns.

    Returns:
        One InputExample per row, with texts=[q1, q2] and the row's label
        cast to int. Rows are visited in their stored order, independent of
        the DataFrame's index values.
    """
    return [
        InputExample(texts=[q1, q2], label=int(label))
        for q1, q2, label in zip(frame["q1"], frame["q2"], frame["labels"])
    ]


# Training pairs come from train_data; the held-out pairs from test_data are
# stored as valid_examples.
train_examples = _to_input_examples(train_data)
valid_examples = _to_input_examples(test_data)


In [10]:
# Batch the examples 16 at a time; only the training batches are shuffled.
train_dataloader = DataLoader(dataset=train_examples, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_examples, batch_size=16, shuffle=False)

In [11]:
# Build a CosineSimilarityLoss around the model.
# NOTE(review): CosineSimilarityLoss is documented for SentenceTransformer
# (bi-encoder) models, while `model` here is a CrossEncoder. `train_loss` is also
# not passed to the model.fit(...) call visible in this notebook — confirm
# whether this cell is still needed.
train_loss = losses.CosineSimilarityLoss(model)

In [13]:
# Binary-classification evaluator built from the held-out (q1, q2) pairs and
# their labels.
evaluator = CEBinaryClassificationEvaluator(
    list(zip(*(test_data[column].to_list() for column in ("q1", "q2")))),
    test_data["labels"].to_list(),
    name="Test-Quora",
)

In [14]:
# Training configuration
output_path = "model_checkpoints"
num_epochs = 10
# Warm-up length: 10% of the number of batches in train_dataloader, rounded up
warm_up_steps = math.ceil(0.1 * len(train_dataloader))

In [22]:

# Send INFO-level records through LoggingHandler, prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[LoggingHandler()],
)

# Record the training-set size and the warm-up length
logging.info(f"Train samples: {len(train_examples)}")
logging.info(f"Warmup-steps: {warm_up_steps}")


2024-04-10 13:53:11 - Train samples: 16076
2024-04-10 13:53:11 - Warmup-steps: 101


In [26]:
# Fine-tuning call, currently disabled: every line below is commented out, so
# running this cell does nothing. Uncomment the call to train with the
# dataloader, evaluator, warm-up steps and output path defined in earlier cells.
# model.fit(train_dataloader=train_dataloader,
#           epochs=num_epochs,
#           evaluator=evaluator,
#           evaluation_steps=100,
#           warmup_steps=warm_up_steps,
#           output_path=output_path,
#           show_progress_bar=True,
#           use_amp=True)