<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [1]:
import string
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr, pearsonr, linregress
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Data Import / Format / Export

Functions for importing, formatting, and exporting data

In [2]:
# Load data from csv, format into proper split
def load_data(filepath):
    data = pd.read_csv(filepath)
    data['Split_Text'] = data['Text'].apply(lambda x: x.split("\n"))
    data['Pred_Score'] = 0.0
    return data

# Preprocessing

In [3]:
def preprocess_data(data, vectorizer=None, fit_vectorizer=True):
    # Split sentences into two columns
    data[['Sentence1', 'Sentence2']] = pd.DataFrame(data['Split_Text'].tolist(), index=data.index)
    
    # Lowercasing and removing punctuation
    data['Sentence1'] = data['Sentence1'].str.lower().str.translate(str.maketrans('', '', string.punctuation)).str.strip()
    data['Sentence2'] = data['Sentence2'].str.lower().str.translate(str.maketrans('', '', string.punctuation)).str.strip()

    # Tokenization
    data['Tokens_Sentence1'] = data['Sentence1'].apply(lambda x: x.split())
    data['Tokens_Sentence2'] = data['Sentence2'].apply(lambda x: x.split())
    
    # Use TF-IDF vectorization if not passed one
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        
    # Fit the vectorizer if fit_vectorizer=True (training data), else transform (testing data)
    tfidf_sentence1 = None
    tfidf_sentence2 = None
    if fit_vectorizer:
        tfidf_sentence1 = vectorizer.fit_transform(data['Sentence1'])
        tfidf_sentence2 = vectorizer.transform(data['Sentence2'])
    else:
        tfidf_sentence1 = vectorizer.transform(data['Sentence1'])
        tfidf_sentence2 = vectorizer.transform(data['Sentence2'])
    
    # --- Cosine Similarity Feature ---
    data['Cosine_Similarity'] = [cosine_similarity(tfidf_sentence1[i], tfidf_sentence2[i])[0][0] for i in range(tfidf_sentence1.shape[0])]
    
    # --- Jaccard Similarity Feature ---
    def jaccard_similarity(s1, s2):
        set1, set2 = set(s1.split()), set(s2.split())
        return len(set1.intersection(set2)) / len(set1.union(set2))
    
    data['Jaccard_Similarity'] = data.apply(lambda x: jaccard_similarity(x['Sentence1'], x['Sentence2']), axis=1)
    
    # --- Levenshtein Distance Feature ---
    data['Levenshtein_Distance'] = data.apply(lambda x: Levenshtein.distance(x['Sentence1'], x['Sentence2']), axis=1)
    
    # --- Sentence Length Difference Feature ---
    data['Length_Diff'] = data.apply(lambda x: abs(len(x['Sentence1']) - len(x['Sentence2'])), axis=1)
    
    # --- Word Overlap Feature ---
    def word_overlap(s1, s2):
        set1, set2 = set(s1.split()), set(s2.split())
        return len(set1.intersection(set2)) / len(set1)
    
    data['Word_Overlap'] = data.apply(lambda x: word_overlap(x['Sentence1'], x['Sentence2']), axis=1)
    
    # Convert TF-IDF sparse matrices to arrays for combining
    tfidf_sentence1_array = tfidf_sentence1.toarray()
    tfidf_sentence2_array = tfidf_sentence2.toarray()
    
    # Create a final array combining features
    additional_features = data[['Cosine_Similarity', 'Jaccard_Similarity', 'Levenshtein_Distance', 'Length_Diff', 'Word_Overlap']].values
    
    # Combine the features: You can either keep TF-IDF and features separate or concatenate them as one feature matrix
    combined_features = np.hstack([tfidf_sentence1_array, tfidf_sentence2_array, additional_features])
    
    # Return the processed DataFrame with the combined features
    return combined_features, data, vectorizer

# Initial models

Each model should have the following:
1. A train function that takes in the training data and gives back a model
2. An evaluate function that takes the model and data, giving back predicted scores

These predictions will be passed with true scores to the calculate metrics function which returns a set of evaluation metrics. They can be displayed with the display metrics function.

In [4]:
def calculate_metrics(preds, scores):
    pearson_corr, _ = pearsonr(scores, preds)
    spearman_corr, _ = spearmanr(scores, preds)
    _, _, r, _, _ = linregress(scores, preds)
    r2 = r**2
    mse = ((scores - preds)**2).mean()
    return (pearson_corr, spearman_corr, r2, mse)

In [5]:
def display_metrics(metrics, title="Metrics:"):
    print(title)
    print("Pearson Corr:", metrics[0])
    print("Spearman Corr:", metrics[1])
    print("R^2:", metrics[2])
    print("MSE:", metrics[3])

### Linear Regression

In [6]:
def train_lr(train_features, train_labels):
    model = LinearRegression()
    model.fit(train_features, train_labels)
    return model

In [7]:
def evaluate_lr(model, features):
    return model.predict(features)

# Load data

In [8]:
train_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_dev_with_labels.csv")
train_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...,0.64,"[The story is gripping and interesting., It's ...",0.0
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,0.61,[The majority of Southeast Alaska 's area is p...,0.0
2,ENG-dev-0002,and from your post i think you are to young to...,0.31,[and from your post i think you are to young t...,0.0
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,0.59,[The film 's success also made Dreamworks Anim...,0.0
4,ENG-dev-0004,I am still confused about how I feel about thi...,0.5,[I am still confused about how I feel about th...,0.0


In [9]:
test_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_test_with_labels.csv")
test_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0


In [10]:
train_features, train_data, vectorizer = preprocess_data(train_data)
print(train_data.shape)
train_data.head()

(250, 14)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Tokens_Sentence1,Tokens_Sentence2,Cosine_Similarity,Jaccard_Similarity,Levenshtein_Distance,Length_Diff,Word_Overlap
0,ENG-dev-0000,The story is gripping and interesting.\nIt's a...,0.64,"[The story is gripping and interesting., It's ...",0.0,the story is gripping and interesting,its a brilliant compelling and heartfelt story,"[the, story, is, gripping, and, interesting]","[its, a, brilliant, compelling, and, heartfelt...",0.356736,0.181818,29,9,0.333333
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,0.61,[The majority of Southeast Alaska 's area is p...,0.0,the majority of southeast alaska s area is par...,a lot of of the panhandle is part of the tonga...,"[the, majority, of, southeast, alaska, s, area...","[a, lot, of, of, the, panhandle, is, part, of,...",0.790995,0.428571,62,1,0.6
2,ENG-dev-0002,and from your post i think you are to young to...,0.31,[and from your post i think you are to young t...,0.0,and from your post i think you are to young to...,i think it will be very bad if he acquires her...,"[and, from, your, post, i, think, you, are, to...","[i, think, it, will, be, very, bad, if, he, ac...",0.157548,0.125,53,15,0.2
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,0.59,[The film 's success also made Dreamworks Anim...,0.0,the film s success also made dreamworks animat...,there have also been two sequels lrb followups...,"[the, film, s, success, also, made, dreamworks...","[there, have, also, been, two, sequels, lrb, f...",0.680547,0.269231,79,26,0.411765
4,ENG-dev-0004,I am still confused about how I feel about thi...,0.5,[I am still confused about how I feel about th...,0.0,i am still confused about how i feel about thi...,in this particular book blue and gansey are st...,"[i, am, still, confused, about, how, i, feel, ...","[in, this, particular, book, blue, and, gansey...",0.231539,0.12,89,66,0.333333


In [11]:
print(train_features.shape)
print(train_features)

(250, 2453)
[[ 0.          0.          0.         ... 29.          9.
   0.33333333]
 [ 0.          0.          0.         ... 62.          1.
   0.6       ]
 [ 0.          0.          0.         ... 53.         15.
   0.2       ]
 ...
 [ 0.          0.          0.         ... 86.         15.
   0.42105263]
 [ 0.          0.          0.         ... 31.         25.
   0.41666667]
 [ 0.          0.          0.         ... 28.         28.
   1.        ]]


In [12]:
train_labels = train_data['Score'].to_numpy()
print(train_labels.shape)
print(train_labels)

(250,)
[0.64 0.61 0.31 0.59 0.5  0.42 0.7  0.7  0.59 0.63 0.73 0.63 0.56 0.35
 0.32 0.64 0.44 0.64 0.7  0.48 0.42 0.32 0.25 0.22 0.41 0.88 0.5  0.45
 0.42 0.42 0.36 0.3  0.68 0.73 0.77 0.35 0.39 0.41 0.33 0.87 0.53 0.71
 0.5  0.76 0.77 0.3  0.59 0.57 0.73 0.4  0.33 0.41 0.61 0.48 0.61 0.59
 0.45 0.56 0.65 0.3  0.37 0.62 0.36 0.58 0.54 0.39 0.57 0.52 0.48 0.43
 0.52 0.63 0.36 0.5  0.73 0.39 0.44 0.56 0.58 0.59 0.39 0.68 0.41 0.37
 0.35 0.41 0.59 0.31 0.47 0.67 0.7  0.52 0.65 0.44 0.73 0.42 0.19 0.32
 0.82 0.61 0.54 0.68 0.42 0.73 0.46 0.21 0.77 0.55 0.57 0.29 0.55 0.66
 0.28 0.72 0.31 0.47 0.56 0.45 0.21 0.7  0.58 0.66 0.69 0.42 0.6  0.45
 0.43 0.45 0.26 0.34 0.59 0.74 0.54 0.41 0.64 0.31 0.3  0.24 0.39 0.63
 0.62 0.62 0.55 0.45 0.52 0.39 0.52 0.68 0.65 0.62 0.58 0.39 0.44 0.4
 0.38 0.46 0.27 0.39 0.53 0.6  0.68 0.63 0.48 0.49 0.36 0.64 0.63 0.52
 0.5  0.43 0.48 0.48 0.33 0.5  0.35 0.35 0.37 0.4  0.72 0.8  0.65 0.24
 0.45 0.39 0.46 0.3  0.65 0.31 0.56 0.4  0.38 0.67 0.35 0.52 0.66 0.53


In [13]:
test_features, test_data, _ = preprocess_data(test_data, vectorizer, False)
print(test_data.shape)
test_data.head()

(2600, 14)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Tokens_Sentence1,Tokens_Sentence2,Cosine_Similarity,Jaccard_Similarity,Levenshtein_Distance,Length_Diff,Word_Overlap
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0,egypts brotherhood stands ground after killings,egypt muslim brotherhood stands behind morsi,"[egypts, brotherhood, stands, ground, after, k...","[egypt, muslim, brotherhood, stands, behind, m...",0.0,0.2,22,3,0.333333
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0,install it for fre and get to know what all u ...,install the program which is free to download ...,"[install, it, for, fre, and, get, to, know, wh...","[install, the, program, which, is, free, to, d...",0.534905,0.285714,53,28,0.461538
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0,also it was one of the debut novels that i was...,pretty much the first thing people mentioned w...,"[also, it, was, one, of, the, debut, novels, t...","[pretty, much, the, first, thing, people, ment...",0.025541,0.12,61,20,0.214286
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0,therefore you can use the code brail basil etc,you can watch the wiggles every day on nick jr,"[therefore, you, can, use, the, code, brail, b...","[you, can, watch, the, wiggles, every, day, on...",0.67266,0.1875,37,0,0.333333
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0,solid ya novel with a funky take on zombies an...,my 13yearold son recommended this book to me a...,"[solid, ya, novel, with, a, funky, take, on, z...","[my, 13yearold, son, recommended, this, book, ...",0.045587,0.035714,59,25,0.083333


In [14]:
print(test_features.shape)
print(test_features)

(2600, 2453)
[[ 0.          0.          0.         ... 22.          3.
   0.33333333]
 [ 0.          0.          0.         ... 53.         28.
   0.46153846]
 [ 0.          0.          0.         ... 61.         20.
   0.21428571]
 ...
 [ 0.          0.          0.         ... 29.         16.
   0.2       ]
 [ 0.          0.          0.         ... 41.         16.
   0.        ]
 [ 0.          0.          0.         ... 65.         40.
   0.1875    ]]


In [15]:
test_labels = test_data['Score'].to_numpy()
print(test_labels.shape)
print(test_labels)

(2600,)
[0.7  0.71 0.49 ... 0.45 0.45 0.22]


# Evaluate Initial Models

In [16]:
model_lr = train_lr(train_features, train_labels)
train_preds_lr = evaluate_lr(model_lr, train_features)
test_preds_lr = evaluate_lr(model_lr, test_features)
train_metrics_lr = calculate_metrics(train_preds_lr, train_labels)
test_metrics_lr = calculate_metrics(test_preds_lr, test_labels)
display_metrics(train_metrics_lr, "Training Metrics:")
print()
display_metrics(test_metrics_lr, "Testing Metrics:")

Training Metrics:
Pearson Corr: 1.0
Spearman Corr: 0.9996848720960497
R^2: 1.0
MSE: 1.4407530624339056e-29

Testing Metrics:
Pearson Corr: 0.6440257081976024
Spearman Corr: 0.6446774366625729
R^2: 0.41476911281942375
MSE: 0.017092369388471442


In [17]:
print("Pred vs True for training data")
for i in range(10):
    print(f"{train_preds_lr[i]:.4f}, {train_labels[i]:.4f}")
print()
print("Pred vs True for testing data")
for i in range(10):
    print(f"{test_preds_lr[i]:.4f}, {test_labels[i]:.4f}")

Pred vs True for training data
0.6400, 0.6400
0.6100, 0.6100
0.3100, 0.3100
0.5900, 0.5900
0.5000, 0.5000
0.4200, 0.4200
0.7000, 0.7000
0.7000, 0.7000
0.5900, 0.5900
0.6300, 0.6300

Pred vs True for testing data
0.4637, 0.7000
0.4821, 0.7100
0.5607, 0.4900
0.5193, 0.2700
0.3441, 0.3200
0.3618, 0.4300
0.4547, 0.3100
0.3502, 0.3200
0.5306, 0.7700
0.5330, 0.3400
