<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [2]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr, linregress
import Levenshtein
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
plt.style.use('ggplot')

  from .autonotebook import tqdm as notebook_tqdm


# Data Import / Format / Export

Functions for importing, formatting, and exporting data

In [3]:
# Load data from csv, format into proper split
def load_data(filepath):
    data = pd.read_csv(filepath)
    data['Split_Text'] = data['Text'].apply(lambda x: x.split("\n"))
    data['Pred_Score'] = 0.0
    return data

# Preprocessing

In [4]:
def jaccard_similarity(s1, s2):
    set1, set2 = set(s1), set(s2)
    return len(set1.intersection(set2)) / len(set1.union(set2))

In [5]:
def word_overlap(s1, s2):
    set1, set2 = set(s1), set(s2)
    return len(set1.intersection(set2)) / len(set1)

In [6]:
# Additional features added to RoBERTa embeddings
def compute_custom_metrics(row):
    metrics = {}
    cosine_sim = F.cosine_similarity(row["Embedding1"].unsqueeze(0), row["Embedding2"].unsqueeze(0))
    metrics["Cosine_Similarity"] = cosine_sim.item()

    set1 = set(row["Sentence1"].split())
    set2 = set(row["Sentence2"].split())
    jaccard_sim = len(set1.intersection(set2)) / len(set1.union(set2)) if len(set1.union(set2)) > 0 else 0
    metrics["Jaccard_Similarity"] = jaccard_sim

    metrics["Length_Diff"] = abs(len(row["Sentence1"].split()) - len(row["Sentence2"].split()))

    metrics['Levenshtein_Distance'] = Levenshtein.distance(row['Sentence1'], row['Sentence2'])

    word_overlap_score = word_overlap(row["Sentence1"].split(), row["Sentence2"].split())
    metrics['Word_Overlap'] = word_overlap_score

    return metrics

In [7]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

2024-11-26 10:08:07.845405: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-26 10:08:08.486743: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-26 10:08:09.006321: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732633689.432969    7193 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732633689.550283    7193 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been regist

In [8]:
# Needed batch sizes due to memory issues
def get_roberta_embeddings(sentences, batch_size=32):
    embeddings_list = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings_list.append(embeddings)
    return torch.cat(embeddings_list, dim=0)

In [9]:
def preprocess_with_roberta(data, batch_size=32):
    # Split into two sentences
    data[['Sentence1', 'Sentence2']] = pd.DataFrame(data['Split_Text'].tolist(), index=data.index)
    
    # Lowercase sentences, strip whitespace
    data["Sentence1"] = data["Sentence1"].str.lower().str.strip()
    data["Sentence2"] = data["Sentence2"].str.lower().str.strip()

    # Generate RoBERTa embeddings in batches (keeping everything as tensors)
    embeddings1 = get_roberta_embeddings(data["Sentence1"].tolist(), batch_size)
    embeddings2 = get_roberta_embeddings(data["Sentence2"].tolist(), batch_size)
    
    # Save embeddings for custom metrics
    data["Embedding1"] = list(embeddings1)
    data["Embedding2"] = list(embeddings2)

    # Compute custom metrics for each row
    metrics = data.apply(compute_custom_metrics, axis=1, result_type="expand")

    # Convert metrics to tensor
    metrics_tensor = torch.tensor(metrics.values, dtype=torch.float32)

    # Standardize custom metrics
    scaler = StandardScaler()
    standardized_metrics = scaler.fit_transform(metrics_tensor.numpy())
    standardized_metrics_tensor = torch.tensor(standardized_metrics, dtype=torch.float32)

    # Combine embeddings and metrics
    features = torch.cat([
        embeddings1,
        embeddings2,
        standardized_metrics_tensor
    ], dim=1)

    # Returned processed features as tensors
    return features, data

# Tools

In [10]:
def calculate_metrics(preds, scores):
    pearson_corr, _ = pearsonr(scores, preds)
    spearman_corr, _ = spearmanr(scores, preds)
    _, _, r, _, _ = linregress(scores, preds) # probably a better way of doing this, fix later
    r2 = r**2
    mse = ((scores - preds)**2).mean() # Scikit's mean_squared_error complained about being deprecated, so this is my temp fix
    return (pearson_corr, spearman_corr, r2, mse)

In [11]:
# Make sure these match the metrics above
def display_metrics(metrics, title="Metrics:"):
    print(title)
    print("Pearson Corr:", metrics[0])
    print("Spearman Corr:", metrics[1])
    print("R^2:", metrics[2])
    print("MSE:", metrics[3])

# Load data

In [12]:
train_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_train.csv")
train_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.0
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.0
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.0
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.0
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.0


In [13]:
test_data = load_data("./Semantic_Relatedness_SemEval2024/Track A/eng/eng_test_with_labels.csv")
test_data.head()

Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0


In [14]:
train_features, train_data = preprocess_with_roberta(train_data)
print(train_data.shape)
train_data.head()

(5500, 9)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Embedding1,Embedding2
0,ENG-train-0000,"It that happens, just pull the plug.\nif that ...",1.0,"[It that happens, just pull the plug., if that...",0.0,"it that happens, just pull the plug.","if that ever happens, just pull the plug.","[tensor(-0.1094), tensor(0.1345), tensor(-0.04...","[tensor(-0.1166), tensor(0.1211), tensor(-0.04..."
1,ENG-train-0001,A black dog running through water.\nA black do...,1.0,"[A black dog running through water., A black d...",0.0,a black dog running through water.,a black dog is running through some water.,"[tensor(-0.1038), tensor(0.0925), tensor(-0.00...","[tensor(-0.0920), tensor(0.0753), tensor(-0.00..."
2,ENG-train-0002,I've been searchingthe entire abbey for you.\n...,1.0,"[I've been searchingthe entire abbey for you.,...",0.0,i've been searchingthe entire abbey for you.,i'm looking for you all over the abbey.,"[tensor(-0.1287), tensor(0.0527), tensor(-0.01...","[tensor(-0.1227), tensor(0.0650), tensor(0.013..."
3,ENG-train-0003,If he is good looking and has a good personali...,1.0,[If he is good looking and has a good personal...,0.0,if he is good looking and has a good personali...,"if he's good looking, and a good personality, ...","[tensor(-0.0881), tensor(0.0848), tensor(-0.01...","[tensor(-0.1034), tensor(0.0648), tensor(-0.02..."
4,ENG-train-0004,"She does not hate you, she is just annoyed wit...",1.0,"[She does not hate you, she is just annoyed wi...",0.0,"she does not hate you, she is just annoyed wit...","she doesn't hate you, she is just annoyed.","[tensor(-0.0909), tensor(0.1275), tensor(0.007...","[tensor(-0.1069), tensor(0.1247), tensor(0.013..."


In [16]:
print(train_features.shape)
print(train_features)
torch.save(train_features, "train_features.pt")

torch.Size([5500, 1541])
tensor([[-1.0935e-01,  1.3450e-01, -4.0140e-02,  ..., -6.6621e-01,
         -1.7812e+00,  3.0215e+00],
        [-1.0378e-01,  9.2506e-02, -3.3523e-03,  ..., -3.3545e-01,
         -1.7041e+00,  3.7620e+00],
        [-1.2873e-01,  5.2712e-02, -1.4779e-02,  ..., -6.6621e-01,
         -7.4082e-01, -6.8096e-01],
        ...,
        [-1.2664e-01,  6.4262e-02,  4.8498e-03,  ...,  1.6491e+00,
          4.5370e-01, -8.4551e-01],
        [-9.2601e-02,  1.0219e-01,  2.0890e-03,  ..., -6.6621e-01,
         -5.8669e-01, -9.0311e-01],
        [-9.2274e-02,  1.2332e-01, -5.6928e-03,  ..., -3.3545e-01,
         -5.4815e-01, -7.7821e-03]])


In [24]:
train_labels = torch.tensor(train_data['Score'], dtype=torch.float32)
print(train_labels.shape)
print(train_labels)
torch.save(train_labels, "train_labels.pt")

torch.Size([5500])
tensor([1., 1., 1.,  ..., 0., 0., 0.])


In [19]:
test_features, test_data = preprocess_with_roberta(test_data)
print(test_data.shape)
test_data.head()

(2600, 9)


Unnamed: 0,PairID,Text,Score,Split_Text,Pred_Score,Sentence1,Sentence2,Embedding1,Embedding2
0,ENG-test-0000,Egypt's Brotherhood stands ground after killin...,0.7,[Egypt's Brotherhood stands ground after killi...,0.0,egypt's brotherhood stands ground after killings,egypt: muslim brotherhood stands behind morsi,"[tensor(-0.0477), tensor(0.0616), tensor(0.005...","[tensor(-0.0468), tensor(0.0575), tensor(0.008..."
1,ENG-test-0001,install it for fre and get to know what all u ...,0.71,[install it for fre and get to know what all u...,0.0,install it for fre and get to know what all u ...,"install the program, which is free to download...","[tensor(-0.0327), tensor(0.0544), tensor(-0.03...","[tensor(-0.0901), tensor(0.1307), tensor(-0.04..."
2,ENG-test-0002,"Also, it was one of the debut novels that I wa...",0.49,"[Also, it was one of the debut novels that I w...",0.0,"also, it was one of the debut novels that i wa...",pretty much the first thing people mentioned w...,"[tensor(-0.1055), tensor(0.0924), tensor(0.001...","[tensor(-0.1187), tensor(0.0733), tensor(-0.01..."
3,ENG-test-0003,"Therefore, you can use the code BRAIL, BASIL, ...",0.27,"[Therefore, you can use the code BRAIL, BASIL,...",0.0,"therefore, you can use the code brail, basil, ...",you can watch the wiggles every day on nick jr.,"[tensor(-0.0907), tensor(0.1206), tensor(-0.03...","[tensor(-0.0965), tensor(0.0422), tensor(0.012..."
4,ENG-test-0004,Solid YA novel with a funky take on zombies an...,0.32,[Solid YA novel with a funky take on zombies a...,0.0,solid ya novel with a funky take on zombies an...,my 13-year-old son recommended this book to me...,"[tensor(-0.1368), tensor(0.0987), tensor(-0.00...","[tensor(-0.0597), tensor(0.0636), tensor(-0.06..."


In [20]:
print(test_features.shape)
print(test_features)
torch.save(test_features, "test_features.pt")

torch.Size([2600, 1541])
tensor([[-4.7668e-02,  6.1642e-02,  5.0457e-03,  ..., -1.0263e+00,
         -8.5190e-01, -1.4553e-01],
        [-3.2685e-02,  5.4365e-02, -3.8078e-02,  ..., -9.9136e-02,
          4.3447e-01,  3.7257e-01],
        [-1.0550e-01,  9.2436e-02,  1.2523e-03,  ..., -7.1725e-01,
          6.6836e-01, -9.1528e-01],
        ...,
        [-1.4587e-01,  1.0338e-01,  1.2481e-02,  ..., -4.0819e-01,
         -6.1801e-01, -1.4926e+00],
        [-1.1664e-01,  8.5891e-02,  1.1200e-02,  ..., -4.0819e-01,
         -1.1126e-01, -1.4926e+00],
        [-1.0706e-01,  9.1530e-02,  1.7318e-03,  ...,  1.1371e+00,
          8.6326e-01, -7.7944e-01]])


In [25]:
test_labels = torch.tensor(test_data['Score'], dtype=torch.float32)
print(test_labels.shape)
print(test_labels)
torch.save(test_labels, "test_labels.pt")

torch.Size([2600])
tensor([0.7000, 0.7100, 0.4900,  ..., 0.4500, 0.4500, 0.2200])


# Model

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, random_split, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

In [86]:
# Update data for torch usage
X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: torch.Size([5500, 1541])
y_train: torch.Size([5500])
X_test: torch.Size([2600, 1541])
y_test: torch.Size([2600])


### Fine-Tune RoBERTa

In [87]:
'''tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)'''

"tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\nmodel = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)"

In [88]:
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [89]:
'''training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)'''

'training_args = TrainingArguments(\n    output_dir=\'./results\',\n    num_train_epochs=3,\n    per_device_train_batch_size=16,\n    per_device_eval_batch_size=16,\n    warmup_steps=500,\n    weight_decay=0.01,\n    logging_dir=\'./logs\',\n    logging_steps=10,\n    evaluation_strategy="epoch",\n)'

In [90]:
'''train_data = torch.utils.data.TensorDataset(X_train, y_train)
test_data = torch.utils.data.TensorDataset(X_test, y_test)'''

'train_data = torch.utils.data.TensorDataset(X_train, y_train)\ntest_data = torch.utils.data.TensorDataset(X_test, y_test)'

In [91]:
'''trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=train_data,
    data_collator=collate_fn,
)'''

'trainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_data,\n    eval_dataset=train_data,\n    data_collator=collate_fn,\n)'

In [92]:
'''trainer.train()'''

'trainer.train()'

In [93]:
# Needed assistance with getting a differentiable spearman correlation for loss function
# https://forum.numer.ai/t/differentiable-spearman-in-pytorch-optimize-for-corr-directly/2287/26
import torchsort

def corrcoef(target, pred):
    pred_n = pred - pred.mean()
    target_n = target - target.mean()
    pred_n = pred_n / pred_n.norm()
    target_n = target_n / target_n.norm()
    return (pred_n * target_n).sum()

def spearman_loss(pred, target, x=1e-2):
    pred = torchsort.soft_rank(pred.reshape(1,-1),regularization_strength=x)
    target = torchsort.soft_rank(target.reshape(1,-1),regularization_strength=x)
    pred = pred - pred.mean()
    pred = pred / pred.norm()
    target = target - target.mean()
    target = target / target.norm()
    return 1 - (pred * target).sum()

In [None]:
class Model(nn.Module):
    def __init__(self, input_size=1539, hidden_size=128, num_layers=1):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, x):
        x = x.unsqueeze(1)
        output, _ = self.lstm(x)
        output = self.dropout(output)
        output = self.fc(output[:, -1, :])
        return output
    
    def split(self, X, y, s = 0.8):
        dataset = TensorDataset(torch.tensor(X, dtype=torch.float32),
                        torch.tensor(y, dtype=torch.float32))
        train_size = int(s * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        
        X_train, y_train = zip(*train_dataset)
        X_train = torch.stack(X_train)
        y_train = torch.stack(y_train)

        X_val, y_val = zip(*val_dataset)
        X_val = torch.stack(X_val)
        y_val = torch.stack(y_val)
        
        return X_train, y_train, X_val, y_val
    
    def fit(self, X, y, num_epochs=20, lr=1e-4, weight_decay=1e-4):
        X_train, y_train, X_val, y_val = self.split(X, y)
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        best_val_loss = float('inf')
        for epoch in range(num_epochs):
            self.train()
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = spearman_loss(y_train, y_pred)
            loss.backward()
            optimizer.step()

            self.eval()
            with torch.no_grad():
                val_pred = self(X_val)
                val_loss = spearman_loss(y_val, val_pred).item()

            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, Spearman Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.state_dict(), './best_model.pth')
                
    def predict(self, x):
        self.eval()
        with torch.no_grad():
            return self(x)

In [95]:
class Transformation(nn.Module):
    def __init__(self):
        super(Transformation, self).__init__()
        self.scale = nn.Parameter(torch.ones(1))
        self.shift = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        if x.ndim == 1:
            x = x.unsqueeze(-1)
        return x * self.scale + self.shift
    
    def fit(self, X_train, y_train, num_epochs=1000, lr=0.01):
        optimizer = optim.Adam(self.parameters(), lr)
        for epoch in range(num_epochs):
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = nn.functional.mse_loss(y_train.squeeze(), y_pred.squeeze())
            loss.backward()
            optimizer.step()
            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, MSE Loss: {loss.item():.4f}")
                
    def transform(self, x):
        self.eval()
        with torch.no_grad():
            return self(x)
    

In [96]:
X = X_train
y = y_train
model = Model(input_size=X_train.shape[1], hidden_size=128, num_layers=2)
model.fit(X, y, 200, 0.1, 0.0001)

raw_pred = model.predict(X)
print()

trans = Transformation()
trans.fit(raw_pred, y)


  dataset = TensorDataset(torch.tensor(X, dtype=torch.float32),
  torch.tensor(y, dtype=torch.float32))


Epoch 1/200, Spearman Loss: 1.0322, Val Loss: 0.8067
Epoch 21/200, Spearman Loss: 0.3991, Val Loss: 0.4064
Epoch 41/200, Spearman Loss: 0.3712, Val Loss: 0.3873
Epoch 61/200, Spearman Loss: 0.3297, Val Loss: 0.3417
Epoch 81/200, Spearman Loss: 0.3729, Val Loss: 0.4028
Epoch 101/200, Spearman Loss: 0.2727, Val Loss: 0.2863
Epoch 121/200, Spearman Loss: 0.2634, Val Loss: 0.2935
Epoch 141/200, Spearman Loss: 0.2497, Val Loss: 0.2892
Epoch 161/200, Spearman Loss: 0.2434, Val Loss: 0.3006
Epoch 181/200, Spearman Loss: 0.2340, Val Loss: 0.3043
Epoch 200/200, Spearman Loss: 0.3189, Val Loss: 0.3181

Epoch 1/1000, MSE Loss: 57.3320
Epoch 101/1000, MSE Loss: 2.7248
Epoch 201/1000, MSE Loss: 0.0350
Epoch 301/1000, MSE Loss: 0.0237
Epoch 401/1000, MSE Loss: 0.0237
Epoch 501/1000, MSE Loss: 0.0237
Epoch 601/1000, MSE Loss: 0.0237
Epoch 701/1000, MSE Loss: 0.0237
Epoch 801/1000, MSE Loss: 0.0237
Epoch 901/1000, MSE Loss: 0.0237
Epoch 1000/1000, MSE Loss: 0.0237


# Evaluate Model

In [97]:
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_preds_np = np.array(train_preds).flatten()
train_y_np = np.array(y_train).flatten()
test_preds_np = np.array(test_preds).flatten()
test_y_np = np.array(y_test).flatten()

train_metrics = calculate_metrics(train_y_np, train_preds_np)
test_metrics = calculate_metrics(test_y_np, test_preds_np)

display_metrics(train_metrics, "Training Metrics:")
print()
display_metrics(test_metrics, "Testing Metrics:")

Training Metrics:
Pearson Corr: 0.7178010428554639
Spearman Corr: 0.7107450570732378
R^2: 0.515238297716144
MSE: 57.331993

Testing Metrics:
Pearson Corr: 0.650061010887935
Spearman Corr: 0.6765825204764284
R^2: 0.42257923215071114
MSE: 56.384674


In [98]:
print("Pred vs True for training data")
for i in range(10):
    print(f"{train_preds_np[i]:.4f}, {train_labels[i]:.4f}")
print()
print("Pred vs True for testing data")
for i in range(10):
    print(f"{test_preds_np[i]:.4f}, {test_labels[i]:.4f}")

Pred vs True for training data
19.1608, 1.0000
16.1683, 1.0000
6.4047, 1.0000
15.1955, 1.0000
16.1668, 1.0000
20.0277, 1.0000
16.6414, 1.0000
16.7724, 1.0000
16.7846, 1.0000
18.2593, 1.0000

Pred vs True for testing data
-0.3047, 0.7000
-1.5987, 0.7100
1.1716, 0.4900
-5.7783, 0.2700
-6.8394, 0.3200
0.5209, 0.4300
-6.8074, 0.3100
-5.9296, 0.3200
5.7925, 0.7700
-6.3460, 0.3400


In [99]:
train_preds_trans = trans.transform(train_preds)
test_preds_trans = trans.transform(test_preds)

train_preds_trans_np = np.array(train_preds_trans).flatten()
train_y_trans_np = np.array(y_train).flatten()
test_preds_trans_np = np.array(test_preds_trans).flatten()
test_y_trans_np = np.array(y_test).flatten()

train_metrics_trans = calculate_metrics(train_y_trans_np, train_preds_trans_np)
test_metrics_trans = calculate_metrics(test_y_trans_np, test_preds_trans_np)

display_metrics(train_metrics_trans, "Training Metrics:")
print()
display_metrics(test_metrics_trans, "Testing Metrics:")

Training Metrics:
Pearson Corr: 0.7178010318765529
Spearman Corr: 0.710745149944463
R^2: 0.5152382992135832
MSE: 0.023741225

Testing Metrics:
Pearson Corr: 0.6500610125665887
Spearman Corr: 0.6765823541113402
R^2: 0.42257923137626024
MSE: 0.020314747


In [100]:
print("Pred vs True for training data")
for i in range(10):
    print(f"{train_preds_trans_np[i]:.4f}, {train_labels[i]:.4f}")
print()
print("Pred vs True for testing data")
for i in range(10):
    print(f"{test_preds_trans_np[i]:.4f}, {test_labels[i]:.4f}")

Pred vs True for training data
0.8733, 1.0000
0.8116, 1.0000
0.6104, 1.0000
0.7916, 1.0000
0.8116, 1.0000
0.8911, 1.0000
0.8214, 1.0000
0.8241, 1.0000
0.8243, 1.0000
0.8547, 1.0000

Pred vs True for testing data
0.4722, 0.7000
0.4455, 0.7100
0.5026, 0.4900
0.3594, 0.2700
0.3375, 0.3200
0.4892, 0.4300
0.3382, 0.3100
0.3563, 0.3200
0.5978, 0.7700
0.3477, 0.3400
