In [1]:
import re
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
pip install -U deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m30.7/42.3 kB[0m [31m784.0 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m769.4 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [3]:
from deep_translator import (GoogleTranslator)

In [4]:
langs_dict = GoogleTranslator().get_supported_languages(as_dict=True)  # output: {arabic: ar, french: fr, english:en etc...}
print(langs_dict)

{'afrikaans': 'af', 'albanian': 'sq', 'amharic': 'am', 'arabic': 'ar', 'armenian': 'hy', 'assamese': 'as', 'aymara': 'ay', 'azerbaijani': 'az', 'bambara': 'bm', 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bhojpuri': 'bho', 'bosnian': 'bs', 'bulgarian': 'bg', 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese (simplified)': 'zh-CN', 'chinese (traditional)': 'zh-TW', 'corsican': 'co', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dhivehi': 'dv', 'dogri': 'doi', 'dutch': 'nl', 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'ewe': 'ee', 'filipino': 'tl', 'finnish': 'fi', 'french': 'fr', 'frisian': 'fy', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el', 'guarani': 'gn', 'gujarati': 'gu', 'haitian creole': 'ht', 'hausa': 'ha', 'hawaiian': 'haw', 'hebrew': 'iw', 'hindi': 'hi', 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'ilocano': 'ilo', 'indonesian': 'id', 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'j

In [5]:
lang = "en"
translator = GoogleTranslator(source='es', target=lang)

# Data Import

The training data will have a real-values semantic textual relatedness score (between 0 and 1) for a pair of Marathi-language sentences.

The data is structured as a CSV file with the following fields:
- PairID: a unique identifier for the sentence pair
- Text: two sentences separated by a newline ('\n') character
- Score: the semantic textual relatedness score for the two sentences

Below we will show you how to load and re-format the provided data file.

In [6]:
# Load the File
df_str_rel = pd.read_csv('https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/esp/esp_train.csv')
df_str_rel.head()

Unnamed: 0,PairID,Text,Score
0,ESP-train-0000,Una mujer a punto de comer pescado.\nUna mujer...,1.0
1,ESP-train-0001,"Las protestas volvieron a Honduras, en este ca...",0.5
2,ESP-train-0002,"Acapulco: pasado espléndido, presente feroz\nO...",0.18
3,ESP-train-0003,Estuvo arriba y comenzó nuevamente desde abajo...,0.57
4,ESP-train-0004,No era más que un desierto; había artemisa en ...,0.26


In [7]:
y = df_str_rel['Score']

In [8]:
# Creating a column "Split_Text" which is a list of two sentences.
df_str_rel['Split_Text'] = df_str_rel['Text'].apply(lambda x: x.split("\n"))
df_str_rel.head()

Unnamed: 0,PairID,Text,Score,Split_Text
0,ESP-train-0000,Una mujer a punto de comer pescado.\nUna mujer...,1.0,"[Una mujer a punto de comer pescado., Una muje..."
1,ESP-train-0001,"Las protestas volvieron a Honduras, en este ca...",0.5,"[Las protestas volvieron a Honduras, en este c..."
2,ESP-train-0002,"Acapulco: pasado espléndido, presente feroz\nO...",0.18,"[Acapulco: pasado espléndido, presente feroz, ..."
3,ESP-train-0003,Estuvo arriba y comenzó nuevamente desde abajo...,0.57,[Estuvo arriba y comenzó nuevamente desde abaj...
4,ESP-train-0004,No era más que un desierto; había artemisa en ...,0.26,[No era más que un desierto; había artemisa en...


In [9]:


# Convert each text to sequences
X1 = df_str_rel['Split_Text'].apply(lambda x: x[0])
X2 = df_str_rel['Split_Text'].apply(lambda x: x[1])

type(X1)

pandas.core.series.Series

In [10]:
X1 = X1.values
X2 = X2.values
type(X1)

numpy.ndarray

In [11]:
df_str_rel = df_str_rel.assign(sentence1 = X1)
df_str_rel = df_str_rel.assign(sentence2 = X2)
df_str_rel

Unnamed: 0,PairID,Text,Score,Split_Text,sentence1,sentence2
0,ESP-train-0000,Una mujer a punto de comer pescado.\nUna mujer...,1.00,"[Una mujer a punto de comer pescado., Una muje...",Una mujer a punto de comer pescado.,Una mujer a punto de comer trucha.
1,ESP-train-0001,"Las protestas volvieron a Honduras, en este ca...",0.50,"[Las protestas volvieron a Honduras, en este c...","Las protestas volvieron a Honduras, en este ca...","Desde los sectores afectados, se considera que..."
2,ESP-train-0002,"Acapulco: pasado espléndido, presente feroz\nO...",0.18,"[Acapulco: pasado espléndido, presente feroz, ...","Acapulco: pasado espléndido, presente feroz","Otra nieta, de sólo dos años, también fue ases..."
3,ESP-train-0003,Estuvo arriba y comenzó nuevamente desde abajo...,0.57,[Estuvo arriba y comenzó nuevamente desde abaj...,Estuvo arriba y comenzó nuevamente desde abajo.,Y para mí Elio Berhanyer era como un dios que ...
4,ESP-train-0004,No era más que un desierto; había artemisa en ...,0.26,[No era más que un desierto; había artemisa en...,No era más que un desierto; había artemisa en ...,"Ciertamente, una parte de lo que necesitamos e..."
...,...,...,...,...,...,...
1557,ESP-train-1557,¿Cuál es el nombre de Goldfinger?\n¿Cuál es el...,0.14,"[¿Cuál es el nombre de Goldfinger?, ¿Cuál es e...",¿Cuál es el nombre de Goldfinger?,¿Cuál es el nombre del programa de televisión ...
1558,ESP-train-1558,¿Cuándo se publicó el primer Diario de Wall St...,0.24,[¿Cuándo se publicó el primer Diario de Wall S...,¿Cuándo se publicó el primer Diario de Wall St...,¿Qué año nuevo se celebra el 16 de febrero?
1559,ESP-train-1559,Menciona una parte del cuerpo afectada por el ...,0.74,[Menciona una parte del cuerpo afectada por el...,Menciona una parte del cuerpo afectada por el ...,Menciona un síntoma del virus Ébola.
1560,ESP-train-1560,¿En qué día se celebra el aniversario del desc...,0.50,[¿En qué día se celebra el aniversario del des...,¿En qué día se celebra el aniversario del desc...,¿Qué año nuevo se celebra el 16 de febrero?


In [12]:
df_str_rel.drop(columns = ['Text', 'Split_Text'], inplace = True)
df_str_rel

Unnamed: 0,PairID,Score,sentence1,sentence2
0,ESP-train-0000,1.00,Una mujer a punto de comer pescado.,Una mujer a punto de comer trucha.
1,ESP-train-0001,0.50,"Las protestas volvieron a Honduras, en este ca...","Desde los sectores afectados, se considera que..."
2,ESP-train-0002,0.18,"Acapulco: pasado espléndido, presente feroz","Otra nieta, de sólo dos años, también fue ases..."
3,ESP-train-0003,0.57,Estuvo arriba y comenzó nuevamente desde abajo.,Y para mí Elio Berhanyer era como un dios que ...
4,ESP-train-0004,0.26,No era más que un desierto; había artemisa en ...,"Ciertamente, una parte de lo que necesitamos e..."
...,...,...,...,...
1557,ESP-train-1557,0.14,¿Cuál es el nombre de Goldfinger?,¿Cuál es el nombre del programa de televisión ...
1558,ESP-train-1558,0.24,¿Cuándo se publicó el primer Diario de Wall St...,¿Qué año nuevo se celebra el 16 de febrero?
1559,ESP-train-1559,0.74,Menciona una parte del cuerpo afectada por el ...,Menciona un síntoma del virus Ébola.
1560,ESP-train-1560,0.50,¿En qué día se celebra el aniversario del desc...,¿Qué año nuevo se celebra el 16 de febrero?


In [13]:


df_str_rel['sentence1'] = df_str_rel['sentence1'].apply(lambda x: translator.translate(x))
df_str_rel['sentence2'] = df_str_rel['sentence2'].apply(lambda x: translator.translate(x))


In [14]:
df_str_rel = df_str_rel[['PairID', 'sentence1', 'sentence2', 'Score']]
df_str_rel

Unnamed: 0,PairID,sentence1,sentence2,Score
0,ESP-train-0000,A woman about to eat fish.,A woman about to eat trout.,1.00
1,ESP-train-0001,"The protests returned to Honduras, in this cas...","From the affected sectors, it is considered th...",0.50
2,ESP-train-0002,"Acapulco: splendid past, fierce present","Another granddaughter, only two years old, was...",0.18
3,ESP-train-0003,He was at the top and started again from the b...,And for me Elio Berhanyer was like a god who l...,0.57
4,ESP-train-0004,It was nothing but a desert; There was sagebru...,"Certainly, part of what we need is a way to di...",0.26
...,...,...,...,...
1557,ESP-train-1557,What is Goldfinger's name?,What is the name of the television show starri...,0.14
1558,ESP-train-1558,When was the first Wall Street Journal published?,What new year is celebrated on February 16?,0.24
1559,ESP-train-1559,Name a part of the body affected by the Ebola ...,Name a symptom of the Ebola virus.,0.74
1560,ESP-train-1560,On what day is the anniversary of the discover...,What new year is celebrated on February 16?,0.50


# Preprocessing is done till here.

In [15]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_trans

In [16]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers import losses

model = SentenceTransformer( "sentence-transformers/all-mpnet-base-v2" )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
ds = df_str_rel[ [ 'sentence1', 'sentence2', 'Score'] ].to_numpy()

train_examples = []
for i in range( ds.shape[0] ):
  train_examples.append( InputExample(texts=[ ds[i][0] , ds[i][1] ] , label=ds[i][2] ) )


In [18]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

In [19]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/98 [00:00<?, ?it/s]

# Training completed here

In [20]:
test_ds = pd.read_csv( "https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20C/eng/eng_dev.csv" )
test_ds

Unnamed: 0,PairID,Text
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...
2,ENG-dev-0002,and from your post i think you are to young to...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...
4,ENG-dev-0004,I am still confused about how I feel about thi...
...,...,...
245,ENG-dev-0245,thats just how they are :( its a shame lol\nIt...
246,ENG-dev-0246,I feel sorry for the books that I will read af...
247,ENG-dev-0247,Uwe Seeler -LRB- born 5 November 1936 in Hambu...
248,ENG-dev-0248,Waco is a city in and the county seat of McLen...


In [21]:
test_ds['Split_Text'] = test_ds['Text'].apply(lambda x: x.split("\n"))
test_ds.head()

Unnamed: 0,PairID,Text,Split_Text
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...,"[The story is gripping and interesting., It's ..."
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,[The majority of Southeast Alaska 's area is p...
2,ENG-dev-0002,and from your post i think you are to young to...,[and from your post i think you are to young t...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,[The film 's success also made Dreamworks Anim...
4,ENG-dev-0004,I am still confused about how I feel about thi...,[I am still confused about how I feel about th...


In [22]:
X1_dev = test_ds['Split_Text'].apply(lambda x: x[0])
X2_dev = test_ds['Split_Text'].apply(lambda x: x[1])

test_ds = test_ds.assign(sentence1 =X1_dev)
test_ds = test_ds.assign(sentence2 =X2_dev)

test_ds.drop(columns = ['Text', 'Split_Text'], inplace = True)

test_ds

Unnamed: 0,PairID,sentence1,sentence2
0,ENG-dev-0000,The story is gripping and interesting.,"It's a brilliant, compelling, and heartfelt st..."
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,A lot of of the panhandle is part of the Tonga...
2,ENG-dev-0002,and from your post i think you are to young to...,I think it will be very bad if he acquires her...
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,There have also been two sequels -LRB- follow-...
4,ENG-dev-0004,I am still confused about how I feel about thi...,"In this particular book, Blue and Gansey are s..."
...,...,...,...
245,ENG-dev-0245,thats just how they are :( its a shame lol,It is a shame that they are like that.
246,ENG-dev-0246,I feel sorry for the books that I will read af...,: More than one person recommended this book t...
247,ENG-dev-0247,Uwe Seeler -LRB- born 5 November 1936 in Hambu...,Alain Giresse -LRB- born 2 August 1952 in Lang...
248,ENG-dev-0248,Waco is a city in and the county seat of McLen...,It is the county seat of Morris County .


In [23]:
def cosine( e1 , e2 ):
  return np.dot( e1.T , e2 ) / ( np.linalg.norm( e1 , ord=2 ) * np.linalg.norm( e2 , ord=2 ) )

test_ds = test_ds.to_numpy()
scores = []
for id , s1 , s2 in test_ds:
  e = model.encode( [ s1 , s2 ] )
  scores.append( cosine( e[0] , e[1] ) )


# Generate file for submission

Submission file has two columns: '**PairID**' and '**Pred_Score**'

In [24]:
pred_ds = {
    "PairID": test_ds[ : , 0 ] ,
    "Pred_Score": scores
}
pred_ds = pd.DataFrame.from_dict( pred_ds )
pred_ds

Unnamed: 0,PairID,Pred_Score
0,ENG-dev-0000,0.759089
1,ENG-dev-0001,0.759000
2,ENG-dev-0002,0.284710
3,ENG-dev-0003,0.764621
4,ENG-dev-0004,0.569528
...,...,...
245,ENG-dev-0245,0.642856
246,ENG-dev-0246,0.310111
247,ENG-dev-0247,0.410979
248,ENG-dev-0248,0.357457


In [25]:
pred_ds.to_csv( "pred_eng_c.csv", index = False )