<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [1]:
import re
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr, linregress
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Data Import / Format / Export

Functions for importing, formatting, and exporting data

In [2]:
# Load data from csv, format into proper split
def load_data(filepath):
    data = pd.read_csv(filepath)
    data['Split Text'] = data['Text'].apply(lambda x: x.split("\n"))
    data['Pred_Score'] = 0.0
    return data

In [3]:
# Export data
def save_predictions(data, filepath):
    data[['PairID', 'Pred_Score']].to_csv(filepath, index=False)

# Baseline model : Dice Score



In [4]:
def dice_score(s1,s2):
  s1 = s1.lower()
  s1_split = re.findall(r"\w+|[^\w\s]", s1, re.UNICODE)

  s2 = s2.lower()
  s2_split = re.findall(r"\w+|[^\w\s]", s2, re.UNICODE)

  dice_coef = len(set(s1_split).intersection(set(s2_split))) / (len(set(s1_split)) + len(set(s2_split)))
  return round(dice_coef, 2)

# Load data

In [5]:
train_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_train.csv")
train_data.head()

Unnamed: 0,PairID,Text,Score,Split Text,Pred_Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.0


In [6]:
test_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_test_with_labels.csv")
test_data.head()

Unnamed: 0,PairID,Text,Score,Split Text,Pred_Score
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0


# Evaluate Baseline

In [7]:
# Seperate the actual scores
true_train_scores = train_data['Score'].values
true_test_scores = test_data['Score'].values


In [8]:
pred_train_scores = []
for index,row in train_data.iterrows():
  s1,s2 = row["Split Text"]
  pred_train_scores.append(dice_score(s1,s2))
train_data['Pred_Score'] = pred_train_scores
train_data.head()

Unnamed: 0,PairID,Text,Score,Split Text,Pred_Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.42
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.44
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.29
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.41
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.36


In [9]:
pred_test_scores = []
for index,row in test_data.iterrows():
  s1,s2 = row["Split Text"]
  pred_test_scores.append(dice_score(s1,s2))
test_data['Pred_Score'] = pred_test_scores
test_data.head()

Unnamed: 0,PairID,Text,Score,Split Text,Pred_Score
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.2
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.21
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.13
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.18
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.05


In [10]:
def calculate_metrics(preds, scores):
    pearson_corr, _ = pearsonr(scores, preds)
    spearman_corr, _ = spearmanr(scores, preds)
    _, _, r, _, _ = linregress(scores, preds)
    r2 = r**2
    mse = ((scores - preds)**2).mean()
    return (pearson_corr, spearman_corr, r2, mse)

def display_metrics(metrics, title="Metrics:"):
    print(title)
    print("Pearson Corr:", metrics[0])
    print("Spearman Corr:", metrics[1])
    print("R^2:", metrics[2])
    print("MSE:", metrics[3])

In [16]:
# Pearson correlation of predictions vs actual scores
metrics_train = calculate_metrics(true_train_scores, pred_train_scores)
metrics_test = calculate_metrics(true_test_scores, pred_test_scores)
metrics_full = calculate_metrics(np.array(list(true_train_scores) + list(true_test_scores)), 
                                 np.array(list(pred_train_scores) + list(pred_test_scores)))
display_metrics(metrics_train)
print()
display_metrics(metrics_test)
print()
display_metrics(metrics_full)

Metrics:
Pearson Corr: 0.5814607386988245
Spearman Corr: 0.5650747237186234
R^2: 0.3380965906481828
MSE: 0.13865496363636365

Metrics:
Pearson Corr: 0.7417708846670659
Spearman Corr: 0.7390091346542196
R^2: 0.5502240453397627
MSE: 0.10958134615384615

Metrics:
Pearson Corr: 0.6145593827666949
Spearman Corr: 0.6160055842594061
R^2: 0.37768323494658085
MSE: 0.12932269135802468
