<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [4]:
import pandas as pd
import re
from scipy.stats import spearmanr, pearsonr, linregress
import torch
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
import Levenshtein
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Data Import / Format / Export

Functions for importing, formatting, and exporting data

In [5]:
def load_data(filepath):
    """Read a sentence-pair CSV and add the columns used by later steps.

    The file must provide a ``Text`` column whose values are strings; each
    value is split on the newline character to separate the sentences.

    Args:
        filepath: Location of the CSV, forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        The loaded DataFrame extended with ``Split_Text`` (list of the
        newline-separated pieces of ``Text``) and ``Pred_Score`` (``0.0`` for
        every row).
    """
    def _split_pair(text):
        # One list entry per line of the original text field.
        return text.split("\n")

    frame = pd.read_csv(filepath)
    frame['Split_Text'] = frame['Text'].map(_split_pair)
    frame['Pred_Score'] = 0.0
    return frame

# Preprocessing

In [6]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates are ignored. The
    score is ``len(intersection) / len(union)``.

    Args:
        s1: First iterable of hashable items (e.g. a list of tokens).
        s2: Second iterable of hashable items.

    Returns:
        A float in ``[0, 1]``. When both inputs are empty the union is empty
        and ``0.0`` is returned instead of raising ``ZeroDivisionError``.
    """
    set1, set2 = set(s1), set(s2)
    union = set1 | set2
    if not union:
        # Nothing to compare: define the similarity as 0 rather than dividing by zero.
        return 0.0
    return len(set1 & set2) / len(union)

In [7]:
def word_overlap(s1, s2):
    """Return the fraction of distinct items of ``s1`` that also occur in ``s2``.

    Both arguments are converted to sets first. The measure is asymmetric:
    it is normalised by the number of distinct items in ``s1`` only.

    Args:
        s1: Iterable of hashable items used as the reference (denominator).
        s2: Iterable of hashable items compared against ``s1``.

    Returns:
        A float in ``[0, 1]``. If ``s1`` is empty, ``0.0`` is returned instead
        of raising ``ZeroDivisionError``.
    """
    set1, set2 = set(s1), set(s2)
    if not set1:
        # No reference items, so there is nothing that could overlap.
        return 0.0
    return len(set1 & set2) / len(set1)

In [8]:
def dice_score(s1, s2):
    """Return the Sorensen-Dice coefficient of the token sets of two strings.

    Each string is lower-cased and tokenised with the pattern
    ``\\w+|[^\\w\\s]`` (runs of word characters, or single punctuation
    characters). The coefficient is ``2 * |A & B| / (|A| + |B|)`` over the
    two sets of distinct tokens, so identical token sets score ``1.0``.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        The coefficient rounded to two decimals. ``0.0`` is returned when
        neither string contains any token, instead of dividing by zero.
    """
    token_pattern = r"\w+|[^\w\s]"
    tokens1 = set(re.findall(token_pattern, s1.lower(), re.UNICODE))
    tokens2 = set(re.findall(token_pattern, s2.lower(), re.UNICODE))

    total = len(tokens1) + len(tokens2)
    if total == 0:
        return 0.0

    # The factor 2 is part of the Dice definition; without it the score
    # tops out at 0.5 for identical inputs.
    dice_coef = 2 * len(tokens1 & tokens2) / total
    return round(dice_coef, 2)

In [9]:
# Additional features added to RoBERTa embeddings
def compute_custom_metrics(row):
    """Compute the hand-crafted similarity features for one sentence pair.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``Sentence1`` and ``Sentence2`` (strings) and ``Embedding1`` and
            ``Embedding2`` (torch tensors; presumably 1-D sentence vectors,
            since the cosine similarity is reduced with ``.item()``).

    Returns:
        A dict whose insertion order defines the feature column order:
        ``Cosine_Similarity``, ``Jaccard_Similarity``, ``Length_Diff``,
        ``Levenshtein_Distance``, ``Word_Overlap``, ``Dice_Score``.
        Overlap-style features fall back to ``0`` when a sentence has no
        tokens instead of raising ``ZeroDivisionError``.
    """
    sentence1, sentence2 = row["Sentence1"], row["Sentence2"]
    # Whitespace tokenisation is shared by several features; do it once.
    tokens1, tokens2 = sentence1.split(), sentence2.split()
    set1, set2 = set(tokens1), set(tokens2)
    union = set1 | set2

    metrics = {}

    cosine_sim = F.cosine_similarity(row["Embedding1"].unsqueeze(0), row["Embedding2"].unsqueeze(0))
    metrics["Cosine_Similarity"] = cosine_sim.item()

    metrics["Jaccard_Similarity"] = len(set1 & set2) / len(union) if union else 0

    # Difference in whitespace-token counts (duplicates included).
    metrics["Length_Diff"] = abs(len(tokens1) - len(tokens2))

    # Character-level edit distance on the raw sentence strings.
    metrics['Levenshtein_Distance'] = Levenshtein.distance(sentence1, sentence2)

    # word_overlap normalises by the first sentence; guard the empty case here
    # so an empty Sentence1 cannot abort the whole DataFrame.apply.
    metrics['Word_Overlap'] = word_overlap(tokens1, tokens2) if tokens1 else 0.0

    # Any non-whitespace character yields a dice token, so "no whitespace
    # tokens in either sentence" is exactly the case with nothing to compare.
    metrics['Dice_Score'] = dice_score(sentence1, sentence2) if (tokens1 or tokens2) else 0.0

    return metrics

In [10]:
# Single checkpoint identifier shared by tokenizer and encoder so the two
# always come from the same pretrained model.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Batching is needed because encoding everything at once runs out of memory
def get_roberta_embeddings(sentences, batch_size=32):
    """Encode sentences with the module-level ``tokenizer`` and ``model``.

    For every sentence the hidden state of the first token (position 0 of
    ``last_hidden_state``) is used as the sentence embedding.

    Args:
        sentences: Iterable of strings to encode.
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A CPU tensor with one row per sentence. An empty input yields a
        tensor of shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise so slicing yields plain lists, which the tokenizer expects.
    sentences = list(sentences)
    if not sentences:
        # torch.cat raises on an empty list; return an empty, correctly shaped result.
        return torch.empty((0, model.config.hidden_size))

    # Run on whatever device the model currently lives on.
    device = next(model.parameters()).device

    embeddings_list = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch_sentences = sentences[start:start + batch_size]
            inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs)
            # Keep only the first-token vector and move it to CPU so results
            # can be combined with CPU-side features regardless of device.
            embeddings_list.append(outputs.last_hidden_state[:, 0, :].cpu())
    return torch.cat(embeddings_list, dim=0)

In [12]:
def preprocess_with_roberta(data, batch_size=32, scaler=None, fit_scaler=True):
    """Build the model input features for a loaded sentence-pair DataFrame.

    The frame is modified in place: ``Sentence1``/``Sentence2`` (lower-cased,
    stripped) and ``Embedding1``/``Embedding2`` columns are added. The same
    frame is also returned.

    Args:
        data: DataFrame with a ``Split_Text`` column; each entry is expected
            to hold the two sentences of a pair.
        batch_size: Batch size forwarded to ``get_roberta_embeddings``.
        scaler: Optional scaler object used to standardise the custom
            metrics. When ``None`` (default) a new ``StandardScaler`` is
            created and fit on ``data``, which is the original behaviour.
        fit_scaler: When ``True`` (default) the scaler is fit on ``data``
            before transforming. Pass ``False`` together with a scaler that
            was fit on the training split so that evaluation data is scaled
            with training statistics rather than its own.

    Returns:
        ``(features, data)`` where ``features`` is a float tensor made of
        ``[embedding1 | embedding2 | standardised custom metrics]``
        concatenated along dimension 1.

    Raises:
        ValueError: If ``fit_scaler`` is ``False`` but no scaler was given.
    """
    if scaler is None:
        if not fit_scaler:
            raise ValueError("fit_scaler=False requires an already fitted scaler")
        scaler = StandardScaler()

    # Split into two sentences
    data[['Sentence1', 'Sentence2']] = pd.DataFrame(data['Split_Text'].tolist(), index=data.index)

    # Lowercase sentences, strip whitespace
    for column in ("Sentence1", "Sentence2"):
        data[column] = data[column].str.lower().str.strip()

    # Generate RoBERTa embeddings in batches (keeping everything as tensors)
    embeddings1 = get_roberta_embeddings(data["Sentence1"].tolist(), batch_size)
    embeddings2 = get_roberta_embeddings(data["Sentence2"].tolist(), batch_size)

    # Save embeddings for custom metrics (one tensor per row)
    data["Embedding1"] = list(embeddings1)
    data["Embedding2"] = list(embeddings2)

    # Compute custom metrics for each row
    metrics = data.apply(compute_custom_metrics, axis=1, result_type="expand")

    # Standardize custom metrics; only fit when asked to, so a scaler fit on
    # the training split can be reused unchanged for other splits.
    raw_metrics = metrics.to_numpy(dtype="float32")
    if fit_scaler:
        standardized_metrics = scaler.fit_transform(raw_metrics)
    else:
        standardized_metrics = scaler.transform(raw_metrics)
    standardized_metrics_tensor = torch.tensor(standardized_metrics, dtype=torch.float32)

    # Combine embeddings and metrics
    features = torch.cat([
        embeddings1,
        embeddings2,
        standardized_metrics_tensor
    ], dim=1)

    # Return processed features as a tensor together with the augmented frame
    return features, data

# Tools

In [13]:
def _to_float_array(values):
    """Convert a list, pandas Series, numpy array or torch tensor to a float64 numpy array."""
    if isinstance(values, pd.Series):
        # Go through numpy so torch never indexes the Series by label.
        values = values.to_numpy()
    return torch.as_tensor(values, dtype=torch.float64).detach().cpu().numpy()


def calculate_metrics(preds, scores):
    """Compare predictions against gold scores.

    Args:
        preds: Predicted values (list, pandas Series, numpy array or tensor).
        scores: Gold values, same length as ``preds``.

    Returns:
        A tuple ``(pearson_corr, spearman_corr, r2, mse)``. ``r2`` is the
        squared Pearson correlation, which is the same quantity the previous
        ``linregress``-based computation produced. ``mse`` is a plain Python
        float regardless of the input container type.
    """
    preds_arr = _to_float_array(preds)
    scores_arr = _to_float_array(scores)

    pearson_corr, _ = pearsonr(scores_arr, preds_arr)
    spearman_corr, _ = spearmanr(scores_arr, preds_arr)
    # The r value of a simple linear regression is the Pearson correlation,
    # so no separate regression fit is needed.
    r2 = pearson_corr ** 2
    mse = float(((scores_arr - preds_arr) ** 2).mean())
    return (pearson_corr, spearman_corr, r2, mse)

In [14]:
# Make sure these match the metrics above
def display_metrics(metrics, title="Metrics:"):
    """Print a titled report of the four evaluation metrics.

    Args:
        metrics: Indexable holding, at positions 0-3, the Pearson
            correlation, Spearman correlation, R^2 and MSE.
        title: Heading line printed before the values.
    """
    labels = ("Pearson Corr:", "Spearman Corr:", "R^2:", "MSE:")
    print(title)
    for position, label in enumerate(labels):
        print(label, metrics[position])

# Load data

In [15]:
# Load the English training split; load_data also adds the Split_Text and
# Pred_Score columns.
train_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_train.csv")
# Preview the first rows as a sanity check of the parsed columns.
train_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.0


In [16]:
# Load the labelled English test split; load_data also adds the Split_Text and
# Pred_Score columns.
test_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_test_with_labels.csv")
# Preview the first rows as a sanity check of the parsed columns.
test_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0


In [17]:
# Build the training feature tensor. preprocess_with_roberta also adds the
# Sentence1/Sentence2 and Embedding1/Embedding2 columns to train_data in place
# and returns the same frame.
train_features, train_data = preprocess_with_roberta(train_data)
print(train_data.shape)
# Preview the augmented frame.
train_data.head()

(5500, 9)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Embedding1,Embedding2
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.0,"it that happens, just pull the plug.","if that ever happens, just pull the plug.","[tensor(-0.1094), tensor(0.1345), tensor(-0.04...","[tensor(-0.1166), tensor(0.1211), tensor(-0.04..."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.0,a black dog running through water.,a black dog is running through some water.,"[tensor(-0.1038), tensor(0.0925), tensor(-0.00...","[tensor(-0.0920), tensor(0.0753), tensor(-0.00..."
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.0,i've been searchingthe entire abbey for you.,i'm looking for you all over the abbey.,"[tensor(-0.1287), tensor(0.0527), tensor(-0.01...","[tensor(-0.1227), tensor(0.0650), tensor(0.013..."
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.0,if he is good looking and has a good personali...,"if he's good looking, and a good personality, ...","[tensor(-0.0881), tensor(0.0848), tensor(-0.01...","[tensor(-0.1034), tensor(0.0648), tensor(-0.02..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.0,"she does not hate you, she is just annoyed wit...","she doesn't hate you, she is just annoyed.","[tensor(-0.0909), tensor(0.1275), tensor(0.007...","[tensor(-0.1069), tensor(0.1247), tensor(0.013..."


In [18]:
# Inspect the training feature tensor, then persist it to train_features.pt
# in the current working directory.
print(train_features.shape)
print(train_features)
torch.save(train_features, "train_features.pt")

torch.Size([5500, 1542])
tensor([[-1.0935e-01,  1.3450e-01, -4.0140e-02,  ..., -1.7812e+00,
          3.0215e+00,  2.7462e+00],
        [-1.0378e-01,  9.2506e-02, -3.3523e-03,  ..., -1.7041e+00,
          3.7620e+00,  2.9718e+00],
        [-1.2873e-01,  5.2712e-02, -1.4779e-02,  ..., -7.4082e-01,
         -6.8096e-01,  1.2792e+00],
        ...,
        [-1.2664e-01,  6.4262e-02,  4.8498e-03,  ...,  4.5370e-01,
         -8.4551e-01, -6.3912e-01],
        [-9.2601e-02,  1.0219e-01,  2.0890e-03,  ..., -5.8669e-01,
         -9.0311e-01, -8.6480e-01],
        [-9.2274e-02,  1.2332e-01, -5.6928e-03,  ..., -5.4815e-01,
         -7.7821e-03,  3.7936e-02]])


In [19]:
# Gold scores for the training split as a float32 tensor. Converting through
# .to_numpy() keeps torch from indexing the Series by label, which fails when
# the frame's index is not the default 0..n-1 range (e.g. after filtering).
train_labels = torch.tensor(train_data['Score'].to_numpy(), dtype=torch.float32)
print(train_labels.shape)
print(train_labels)
torch.save(train_labels, "train_labels.pt")

torch.Size([5500])
tensor([1., 1., 1.,  ..., 0., 0., 0.])


In [20]:
# Build the test feature tensor. preprocess_with_roberta also adds the
# Sentence1/Sentence2 and Embedding1/Embedding2 columns to test_data in place
# and returns the same frame.
# NOTE(review): called this way, the custom metrics are standardized with a
# scaler fit on the test split itself, not with the training-split statistics.
test_features, test_data = preprocess_with_roberta(test_data)
print(test_data.shape)
# Preview the augmented frame.
test_data.head()

(2600, 9)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Embedding1,Embedding2
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0,egypt's brotherhood stands ground after killings,egypt: muslim brotherhood stands behind morsi,"[tensor(-0.0477), tensor(0.0616), tensor(0.005...","[tensor(-0.0468), tensor(0.0575), tensor(0.008..."
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0,install it for fre and get to know what all u ...,"install the program, which is free to download...","[tensor(-0.0327), tensor(0.0544), tensor(-0.03...","[tensor(-0.0901), tensor(0.1307), tensor(-0.04..."
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0,"also, it was one of the debut novels that i wa...",pretty much the first thing people mentioned w...,"[tensor(-0.1055), tensor(0.0924), tensor(0.001...","[tensor(-0.1187), tensor(0.0733), tensor(-0.01..."
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0,"therefore, you can use the code brail, basil, ...",you can watch the wiggles every day on nick jr.,"[tensor(-0.0907), tensor(0.1206), tensor(-0.03...","[tensor(-0.0965), tensor(0.0422), tensor(0.012..."
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0,solid ya novel with a funky take on zombies an...,my 13-year-old son recommended this book to me...,"[tensor(-0.1368), tensor(0.0987), tensor(-0.00...","[tensor(-0.0597), tensor(0.0636), tensor(-0.06..."


In [21]:
# Inspect the test feature tensor, then persist it to test_features.pt
# in the current working directory.
print(test_features.shape)
print(test_features)
torch.save(test_features, "test_features.pt")

torch.Size([2600, 1542])
tensor([[-4.7668e-02,  6.1642e-02,  5.0457e-03,  ..., -8.5190e-01,
         -1.4553e-01, -5.6737e-02],
        [-3.2685e-02,  5.4365e-02, -3.8078e-02,  ...,  4.3447e-01,
          3.7257e-01,  3.4154e-02],
        [-1.0550e-01,  9.2436e-02,  1.2523e-03,  ...,  6.6836e-01,
         -9.1528e-01, -6.9298e-01],
        ...,
        [-1.4587e-01,  1.0338e-01,  1.2481e-02,  ..., -6.1801e-01,
         -1.4926e+00, -1.1474e+00],
        [-1.1664e-01,  8.5891e-02,  1.1200e-02,  ..., -1.1126e-01,
         -1.4926e+00, -1.5110e+00],
        [-1.0706e-01,  9.1530e-02,  1.7318e-03,  ...,  8.6326e-01,
         -7.7944e-01, -6.9298e-01]])


In [22]:
# Gold scores for the test split as a float32 tensor. Converting through
# .to_numpy() keeps torch from indexing the Series by label, which fails when
# the frame's index is not the default 0..n-1 range (e.g. after filtering).
test_labels = torch.tensor(test_data['Score'].to_numpy(), dtype=torch.float32)
print(test_labels.shape)
print(test_labels)
torch.save(test_labels, "test_labels.pt")

torch.Size([2600])
tensor([0.7000, 0.7100, 0.4900,  ..., 0.4500, 0.4500, 0.2200])
