In [1]:
import re
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot as plt
# Use matplotlib's bundled 'ggplot' style sheet for any figures drawn later in the notebook.
plt.style.use('ggplot')

# Data Import

The training data has a real-valued semantic textual relatedness score (between 0 and 1) for each pair of English-language sentences.

The data is structured as a CSV file with the following fields:
- PairID: a unique identifier for the sentence pair
- Text: two sentences separated by a newline ('\n') character
- Score: the semantic textual relatedness score for the two sentences

Below we will show you how to load and re-format the provided data file.

In [2]:
# Read the English Track A training CSV directly from the SemEval 2024
# semantic-relatedness GitHub repository and preview the first rows.
df_str_rel = pd.read_csv(
    'https://raw.githubusercontent.com/semantic-textual-relatedness/'
    'Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_train.csv'
)
df_str_rel.head()

Unnamed: 0,PairID,Text,Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0


In [3]:
# Keep the relatedness scores column as its own Series.
y = df_str_rel.loc[:, 'Score']

In [4]:
# Add "Split_Text": each Text value cut at the newline into a list of its sentences.
df_str_rel['Split_Text'] = df_str_rel['Text'].map(lambda pair_text: pair_text.split("\n"))
df_str_rel.head()

Unnamed: 0,PairID,Text,Score,Split_Text
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that..."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d..."
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,..."
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi..."


In [5]:
# Pull the first and second sentence of every pair out of the Split_Text lists.
X1 = df_str_rel['Split_Text'].map(lambda parts: parts[0])
X2 = df_str_rel['Split_Text'].map(lambda parts: parts[1])

type(X1)

pandas.core.series.Series

In [6]:
# Convert both sentence Series to plain NumPy arrays.
# np.asarray (rather than `.values`) keeps this cell re-runnable: once X1/X2 are
# already ndarrays, `.values` would raise AttributeError, whereas np.asarray
# simply hands the existing array back.
X1 = np.asarray(X1)
X2 = np.asarray(X2)
type(X1)

numpy.ndarray

In [7]:
# Attach both sentence arrays as new columns with a single assign() call.
df_str_rel = df_str_rel.assign(sentence1=X1, sentence2=X2)
df_str_rel

Unnamed: 0,PairID,Text,Score,Split_Text,sentence1,sentence2
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...","It that happens, just pull the plug.","if that ever happens, just pull the plug."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",A black dog running through water.,A black dog is running through some water.
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...","She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed."
...,...,...,...,...,...,...
5495,ENG-train-5495,A young boy pounding on an anvil.\nWoman sits ...,0.0,"[A young boy pounding on an anvil., Woman sits...",A young boy pounding on an anvil.,Woman sits on the curb talking on a cellphone.
5496,ENG-train-5496,I love how he recognized his wife tempered his...,0.0,[I love how he recognized his wife tempered hi...,I love how he recognized his wife tempered his...,"Torpedo Ink is Viktor's Band of Brothers, the ..."
5497,ENG-train-5497,I actually read a chapter or two beyond that p...,0.0,[I actually read a chapter or two beyond that ...,I actually read a chapter or two beyond that p...,Lets say she's a blend of two types of beings.
5498,ENG-train-5498,A boy gives being in the snow two thumbs up.\n...,0.0,"[A boy gives being in the snow two thumbs up.,...",A boy gives being in the snow two thumbs up.,A satisfied cat is perched beside a crystal lamp.


In [8]:
# Drop the raw and intermediate text columns now that sentence1/sentence2 exist.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are already gone.
df_str_rel = df_str_rel.drop(columns=['Text', 'Split_Text'], errors='ignore')
df_str_rel

Unnamed: 0,PairID,Score,sentence1,sentence2
0,ENG-train-0000,1.0,"It that happens, just pull the plug.","if that ever happens, just pull the plug."
1,ENG-train-0001,1.0,A black dog running through water.,A black dog is running through some water.
2,ENG-train-0002,1.0,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.
3,ENG-train-0003,1.0,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ..."
4,ENG-train-0004,1.0,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed."
...,...,...,...,...
5495,ENG-train-5495,0.0,A young boy pounding on an anvil.,Woman sits on the curb talking on a cellphone.
5496,ENG-train-5496,0.0,I love how he recognized his wife tempered his...,"Torpedo Ink is Viktor's Band of Brothers, the ..."
5497,ENG-train-5497,0.0,I actually read a chapter or two beyond that p...,Lets say she's a blend of two types of beings.
5498,ENG-train-5498,0.0,A boy gives being in the snow two thumbs up.,A satisfied cat is perched beside a crystal lamp.


In [9]:
# Keep only the four working columns, with Score moved to the end.
df_str_rel = df_str_rel.loc[:, ['PairID', 'sentence1', 'sentence2', 'Score']]
df_str_rel

Unnamed: 0,PairID,sentence1,sentence2,Score
0,ENG-train-0000,"It that happens, just pull the plug.","if that ever happens, just pull the plug.",1.0
1,ENG-train-0001,A black dog running through water.,A black dog is running through some water.,1.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.,I'm looking for you all over the abbey.,1.0
3,ENG-train-0003,If he is good looking and has a good personali...,"If he's good looking, and a good personality, ...",1.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...","She doesn't hate you, she is just annoyed.",1.0
...,...,...,...,...
5495,ENG-train-5495,A young boy pounding on an anvil.,Woman sits on the curb talking on a cellphone.,0.0
5496,ENG-train-5496,I love how he recognized his wife tempered his...,"Torpedo Ink is Viktor's Band of Brothers, the ...",0.0
5497,ENG-train-5497,I actually read a chapter or two beyond that p...,Lets say she's a blend of two types of beings.,0.0
5498,ENG-train-5498,A boy gives being in the snow two thumbs up.,A satisfied cat is perched beside a crystal lamp.,0.0


# Preprocessing is complete at this point.

In [10]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

In [11]:
from sentence_transformers import InputExample, SentenceTransformer, losses

# Load the all-mpnet-base-v2 sentence-embedding model that is fine-tuned further down.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [12]:
# Array whose rows are (sentence1, sentence2, Score).
ds = df_str_rel[['sentence1', 'sentence2', 'Score']].to_numpy()

# Wrap every row in an InputExample: the two sentences are the texts, the score is the label.
train_examples = [
    InputExample(texts=[first, second], label=score)
    for first, second, score in ds
]


In [13]:
from torch.utils.data import DataLoader

# Cosine-similarity loss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)
# Shuffled mini-batches of 16 training examples.
train_dataloader = DataLoader(dataset=train_examples, batch_size=16, shuffle=True)

In [14]:
import torch

# Seed PyTorch's global RNG right before training so the shuffled batch order
# (and any other torch-driven randomness during fit) is the same on every run.
# NOTE(review): some GPU kernels can still be nondeterministic, so tiny
# run-to-run differences may remain on CUDA.
torch.manual_seed(42)

# One epoch over the training pairs with 100 warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/344 [00:00<?, ?it/s]

# Training completed here

In [15]:
# Read the English Track A dev CSV from the same GitHub repository and display it.
test_ds = pd.read_csv(
    "https://raw.githubusercontent.com/semantic-textual-relatedness/"
    "Semantic_Relatedness_SemEval2024/main/Track%20A/eng/eng_dev.csv"
)
test_ds

Unnamed: 0,PairID,Text
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...
2,ENG-dev-0002,and from your post i think you are to young to...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...
4,ENG-dev-0004,I am still confused about how I feel about thi...
...,...,...
245,ENG-dev-0245,thats just how they are :( its a shame lol\nIt...
246,ENG-dev-0246,I feel sorry for the books that I will read af...
247,ENG-dev-0247,Uwe Seeler -LRB- born 5 November 1936 in Hambu...
248,ENG-dev-0248,Waco is a city in and the county seat of McLen...


In [16]:
# Same split as for the training data: cut each Text value at the newline.
test_ds['Split_Text'] = test_ds['Text'].map(lambda pair_text: pair_text.split("\n"))
test_ds.head()

Unnamed: 0,PairID,Text,Split_Text
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...,"[The story is gripping and interesting., It's ..."
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,[The majority of Southeast Alaska 's area is p...
2,ENG-dev-0002,and from your post i think you are to young to...,[and from your post i think you are to young t...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,[The film 's success also made Dreamworks Anim...
4,ENG-dev-0004,I am still confused about how I feel about thi...,[I am still confused about how I feel about th...


In [17]:
# First and second sentence of every dev pair.
X1_dev = test_ds['Split_Text'].map(lambda parts: parts[0])
X2_dev = test_ds['Split_Text'].map(lambda parts: parts[1])

# Add the sentence columns, then discard the raw and intermediate text columns.
test_ds = (
    test_ds
    .assign(sentence1=X1_dev, sentence2=X2_dev)
    .drop(columns=['Text', 'Split_Text'])
)

test_ds

Unnamed: 0,PairID,sentence1,sentence2
0,ENG-dev-0000,The story is gripping and interesting.,"It's a brilliant, compelling, and heartfelt st..."
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,A lot of of the panhandle is part of the Tonga...
2,ENG-dev-0002,and from your post i think you are to young to...,I think it will be very bad if he acquires her...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,There have also been two sequels -LRB- follow-...
4,ENG-dev-0004,I am still confused about how I feel about thi...,"In this particular book, Blue and Gansey are s..."
...,...,...,...
245,ENG-dev-0245,thats just how they are :( its a shame lol,It is a shame that they are like that.
246,ENG-dev-0246,I feel sorry for the books that I will read af...,: More than one person recommended this book t...
247,ENG-dev-0247,Uwe Seeler -LRB- born 5 November 1936 in Hambu...,Alain Giresse -LRB- born 2 August 1952 in Lang...
248,ENG-dev-0248,Waco is a city in and the county seat of McLen...,It is the county seat of Morris County .


In [18]:
def cosine( e1 , e2 ):
  """Return the cosine similarity of two vectors.

  Args:
    e1: First vector (NumPy array or other array-like).
    e2: Second vector, same length as ``e1``.

  Returns:
    ``dot(e1, e2) / (||e1||_2 * ||e2||_2)``. When either vector has zero
    norm the ratio is undefined (0/0); ``0.0`` is returned in that case
    instead of the NaN and runtime warning the raw formula would produce.
  """
  # Accept plain lists/tuples as well as arrays.
  e1 = np.asarray( e1 )
  e2 = np.asarray( e2 )
  denom = np.linalg.norm( e1 , ord=2 ) * np.linalg.norm( e2 , ord=2 )
  if denom == 0:
    # Degenerate all-zero vector: skip the division so no NaN is produced.
    return 0.0
  return np.dot( e1.T , e2 ) / denom

# Turn the dev frame into an array of (PairID, sentence1, sentence2) rows.
# np.asarray hands an existing ndarray back unchanged, so re-running this cell
# does not fail the way a second `.to_numpy()` call on an ndarray would.
test_ds = np.asarray( test_ds )

# Encode each sentence pair and score it by the cosine similarity of the two
# embeddings. The ID column is unpacked into `_pair_id` so that the builtin
# `id` is not overwritten in the notebook namespace.
scores = []
for _pair_id , s1 , s2 in test_ds:
  e = model.encode( [ s1 , s2 ] )
  scores.append( cosine( e[0] , e[1] ) )


# Generate file for submission

Submission file has two columns: '**PairID**' and '**Pred_Score**'

In [19]:
# Two-column submission frame: the dev PairIDs next to the predicted scores.
pred_ds = pd.DataFrame( {
    "PairID": test_ds[ : , 0 ] ,
    "Pred_Score": scores ,
} )
pred_ds

Unnamed: 0,PairID,Pred_Score
0,ENG-dev-0000,0.766970
1,ENG-dev-0001,0.752152
2,ENG-dev-0002,0.268990
3,ENG-dev-0003,0.748619
4,ENG-dev-0004,0.540451
...,...,...
245,ENG-dev-0245,0.696491
246,ENG-dev-0246,0.265119
247,ENG-dev-0247,0.524571
248,ENG-dev-0248,0.343185


In [20]:
# Write the predictions to the submission CSV, leaving out the DataFrame index.
pred_ds.to_csv(path_or_buf="pred_eng_a.csv", index=False)