## Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, util
from sentence_transformers import models, losses
from sentence_transformers.readers import STSDataReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

### Reading the Pilot Data using pandas

The dataset has both the sentences seperated using a delimiter. The sentences are split and placed in two different columns.
The columns of the data include:
- `SourceID` - indicates the source dataset from which this sentence pair is taken
- `SubsetID` - indicates the subset inside the source dataset
- `PairID` - represents a unique identifier for each pair of sentences 
- `Sentence_1` - represents first sentence
- `Sentence_2` - represents seconds sentence
- `Score` - semantic relatedness score in the range of `0` and `1`

In [3]:
DATA_DIR = "Semantic_Relatedness_SemEval2024/Pilot_data"
pilotdata = pd.read_csv("data.csv")
pilotdata[["Sentence_1", "Sentence_2"]] = pilotdata["Text"].str.split("\n", expand = True)
pilotdata.drop(["Text"], inplace = True, axis = 1)
print(len(pilotdata))
pilotdata.head(10)

5500


Unnamed: 0,Index,SourceID,SubsetID,PairID,Score,Sentence_1,Sentence_2
0,0,Formality,Formality_pp,Formality_pp_222,1.0,"It that happens, just pull the plug.","if that ever happens, just pull the plug."
1,1,STS,STS,STS_237,1.0,A black dog running through water.,A black dog is running through some water.
2,2,ParaNMT,ParaNMT_pp,ParaNMT_pp_204,1.0,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.
3,3,Formality,Formality_pp,Formality_pp_119,1.0,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ..."
4,4,Formality,Formality_pp,Formality_pp_174,1.0,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed."
5,5,STS,STS,STS_211,1.0,Actor Gazzara dead at 81,Actor Ben Gazzara dies at 81
6,6,Formality,Formality_pp,Formality_pp_277,1.0,"No, I really didn't want New York to win.",No i didn't want New york to win
7,7,Formality,Formality_pp,Formality_pp_167,1.0,I hae no problems with them.,lol i have no problems with them.
8,8,Formality,Formality_pp,Formality_pp_123,1.0,Your parents do not have to like your boyfrien...,"your parents dont have to like your bf, you do."
9,9,Formality,Formality_pp,Formality_pp_194,1.0,"I think Taylor is really cute, but I hate his ...",I think Taylor is SUPER cute...but I hate his ...


### Splitting the data into train, validation and test sets

In [4]:
train_file = "str_train.csv"
test_file = "str_test.csv"
val_file = "str_val.csv"
DATA_DIR = "STR_dataset"
train_size = int(len(pilotdata)*0.9)
val_size = int(len(pilotdata)*0.05)
test_size = len(pilotdata) - train_size - val_size
train_data = pilotdata.loc[:train_size]
test_data = pilotdata.loc[train_size + val_size:]
val_data = pilotdata.loc[train_size:train_size + val_size]
print(len(train_data))
print(len(test_data))
print(len(val_data))

if os.path.exists(os.path.join(DATA_DIR,"str_train.csv")) and os.path.exists(os.path.join(DATA_DIR,"str_val.csv")) and os.path.exists(os.path.join(DATA_DIR,"str_test.csv")):
    os.remove(os.path.join(DATA_DIR,"str_train.csv"))
    os.remove(os.path.join(DATA_DIR,"str_val.csv"))
    os.remove(os.path.join(DATA_DIR,"str_test.csv"))

# preparing the dataset
for idx, row in train_data.iterrows():
    with open(os.path.join(DATA_DIR,train_file), "a") as file:
        line = row["Sentence_1"] + "\t" + row["Sentence_2"] + "\t" + str(row["Score"])
        file.write(line+"\n")
for idx, row in test_data.iterrows():
    with open(os.path.join(DATA_DIR,test_file), "a") as file:
        line = row["Sentence_1"] + "\t" + row["Sentence_2"] + "\t" + str(row["Score"])
        file.write(line+"\n")  
for idx, row in val_data.iterrows():
    with open(os.path.join(DATA_DIR,val_file), "a") as file:
        line = row["Sentence_1"] + "\t" + row["Sentence_2"] + "\t" + str(row["Score"])
        file.write(line+"\n") 

4951
275
276


### Creation of lists for storing the set of first sentences and second sentences in each pair to be fed to the transformers for generating embeddings

In [5]:
def collate_fn(batch):
    # Splitting the batch of InputExample objects into separate lists
    sentences1 = [example.texts[0] for example in batch]
    sentences2 = [example.texts[1] for example in batch]
    scores = [example.label for example in batch]
    return sentences1, sentences2, scores

### Finetuning the `distiluse-base-multilingual-cased-v1` model
The given multilingual model is finetuned over the given `pilot` sentence tokens for 10 epochs and evaluated on cosine similarity

In [7]:
train_batch_size = 16
model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
str_reader = STSDataReader('STR_dataset', normalize_scores=True)
traindataset = SentencesDataset(examples = str_reader.get_examples(train_file), model = model)
trainloader = DataLoader(traindataset, batch_size=train_batch_size,collate_fn=collate_fn,shuffle=True)
train_loss = losses.CosineSimilarityLoss(model = model)
valdataset = SentencesDataset(examples = str_reader.get_examples(val_file), model = model)
valloader = DataLoader(valdataset, batch_size=train_batch_size, shuffle=False, collate_fn=collate_fn)

sentences1 = []
sentences2 = []
scores = []

for batch in valloader:
#     print(batch)
    batch_sentences1, batch_sentences2, batch_scores = batch
    sentences1.extend(batch_sentences1)
    sentences2.extend(batch_sentences2)
    scores.extend(batch_scores)

evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

num_epochs = 10
model.fit(train_objectives=[(trainloader, train_loss)],
         evaluator = evaluator,
         epochs = num_epochs,
         evaluation_steps= 10000,
         warmup_steps = 100,
         output_path="model_path")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

In [8]:
with open('STR_dataset/str_train.csv', 'r') as f:
    for idx, line in enumerate(f):
        parts = line.strip().split("\t")
        try:
            float(parts[0])
        except ValueError:
            print(f"Error on line {idx + 1}: {line.strip()}")

Error on line 1: It that happens, just pull the plug.	if that ever happens, just pull the plug.	1.0
Error on line 2: A black dog running through water.	A black dog is running through some water.	1.0
Error on line 3: I've been searchingthe entire abbey for you.	I'm looking for you all over the abbey.	1.0
Error on line 4: If he is good looking and has a good personality, he might be straight - but is more likely bisexual.	If he's good looking, and a good personality, he MIGHT be straight, but more likely bi.	1.0
Error on line 5: She does not hate you, she is just annoyed with you.	She doesn't hate you, she is just annoyed.	1.0
Error on line 6: Actor Gazzara dead at 81	Actor Ben Gazzara dies at 81	1.0
Error on line 7: No, I really didn't want New York to win.	No i didn't want New york to win	1.0
Error on line 8: I hae no problems with them.	lol i have no problems with them.	1.0
Error on line 9: Your parents do not have to like your boyfriend, you do.	your parents dont have to like your bf

## Performing Inference on the test dataset

In [9]:
testdataset = SentencesDataset(examples=str_reader.get_examples('str_test.csv'), model=model)
testloader = DataLoader(testdataset, shuffle=False, batch_size=train_batch_size, collate_fn=collate_fn)

sentences1 = []
sentences2 = []
scores = []

for batch in testloader:
    batch_sentences1, batch_sentences2, batch_scores = batch
    sentences1.extend(batch_sentences1)
    sentences2.extend(batch_sentences2)
    scores.extend(batch_scores)

test_evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

model.evaluate(test_evaluator)


0.07363612074724793

### Evaluation on a sample sentence pair to obtain the relatedness score

In [10]:
model_path = 'model_path'
# model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
model = SentenceTransformer(model_path)
sentence1 = "It that happens, just pull the plug."
sentence2 = "if that just happens, just pull the plug."

embedding1 = model.encode(sentence1)
embedding2 = model.encode(sentence2)

similarity = 1 - cosine(embedding1, embedding2)  
print("Similarity:", similarity)

Similarity: 0.2571742534637451
