<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [46]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split
from scipy.stats import spearmanr, pearsonr, linregress

In [47]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.version.cuda)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

True
12.4
cuda


# Tools

In [48]:
def calculate_metrics(preds, scores):
    pearson_corr, _ = pearsonr(scores, preds)
    spearman_corr, _ = spearmanr(scores, preds)
    _, _, r, _, _ = linregress(scores, preds) # probably a better way of doing this, fix later
    r2 = r**2
    mse = ((scores - preds)**2).mean() # Scikit's mean_squared_error complained about being deprecated, so this is my temp fix
    return (pearson_corr, spearman_corr, r2, mse)

In [49]:
# Make sure these match the metrics above
def display_metrics(metrics, title="Metrics:"):
    print(title)
    print("Pearson Corr:", metrics[0])
    print("Spearman Corr:", metrics[1])
    print("R^2:", metrics[2])
    print("MSE:", metrics[3])

# Load data

In [50]:
# Make sure you have these files, either run "project_final_preprocessing.ipynb" or download the files
train_features = torch.load("train_features.pt", weights_only=True)
train_labels = torch.load("train_labels.pt", weights_only=True)
test_features = torch.load("test_features.pt", weights_only=True)
test_labels = torch.load("test_labels.pt", weights_only=True)

print("Train features:", train_features.shape)
print("Train labels:", train_labels.shape)
print("Test features:", test_features.shape)
print("Test labels:", test_labels.shape)

Train features: torch.Size([5500, 1542])
Train labels: torch.Size([5500])
Test features: torch.Size([2600, 1542])
Test labels: torch.Size([2600])


# Model

In [51]:
# Used different naming scheme when I merged files
X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: torch.Size([5500, 1542])
y_train: torch.Size([5500])
X_test: torch.Size([2600, 1542])
y_test: torch.Size([2600])


In [52]:
# Needed assistance with getting a differentiable spearman correlation for loss function
# https://forum.numer.ai/t/differentiable-spearman-in-pytorch-optimize-for-corr-directly/2287/26
import torchsort

def corrcoef(target, pred):
    pred_n = pred - pred.mean()
    target_n = target - target.mean()
    pred_n = pred_n / pred_n.norm()
    target_n = target_n / target_n.norm()
    return (pred_n * target_n).sum()

def spearman_loss(pred, target, x=1e-2):
    pred = torchsort.soft_rank(pred.reshape(1,-1),regularization_strength=x)
    target = torchsort.soft_rank(target.reshape(1,-1),regularization_strength=x)
    pred = pred - pred.mean()
    pred = pred / pred.norm()
    target = target - target.mean()
    target = target / target.norm()
    return 1 - (pred * target).sum()**2

In [53]:
class Model(nn.Module):
    def __init__(self, input_size=1539, hidden_size=128, num_layers=1):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, x):
        x = x.unsqueeze(1)
        output, _ = self.lstm(x)
        output = self.dropout(output)
        output = self.fc(output[:, -1, :])
        return output
    
    def split(self, X, y, s = 0.8):
        dataset = TensorDataset(X.clone().detach(), y.clone().detach())
        train_size = int(s * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        
        X_train, y_train = zip(*train_dataset)
        X_train = torch.stack(X_train)
        y_train = torch.stack(y_train)

        X_val, y_val = zip(*val_dataset)
        X_val = torch.stack(X_val)
        y_val = torch.stack(y_val)
        
        return X_train, y_train, X_val, y_val
    
    def fit(self, X, y, num_epochs=20, lr=1e-4, weight_decay=1e-4):
        X = X.to(device)
        y = y.to(device)
        X_train, y_train, X_val, y_val = self.split(X, y)
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        best_val_loss = float('inf')
        for epoch in range(num_epochs):
            self.train()
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = spearman_loss(y_train, y_pred)
            loss.backward()
            optimizer.step()

            self.eval()
            with torch.no_grad():
                val_pred = self(X_val)
                val_loss = spearman_loss(y_val, val_pred).item()

            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, Spearman Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.state_dict(), './best_model.pth')
                
    def predict(self, x):
        x = x.to(device)
        self.eval()
        with torch.no_grad():
            return self(x).cpu()

In [None]:
class Transformation(nn.Module):
    def __init__(self):
        super(Transformation, self).__init__()
        self.scale = nn.Parameter(torch.ones(1))
        self.shift = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        if x.ndim == 1:
            x = x.unsqueeze(-1)
        return x * self.scale + self.shift
    
    def fit(self, X_train, y_train, num_epochs=1000, lr=0.1):
        optimizer = optim.Adam(self.parameters(), lr)
        for epoch in range(num_epochs):
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = nn.functional.mse_loss(y_train.squeeze(), y_pred.squeeze())
            loss.backward()
            optimizer.step()
            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, MSE Loss: {loss.item():.4f}")
                
    def transform(self, x):
        self.eval()
        with torch.no_grad():
            return self(x)
    

In [None]:
X = X_train
y = y_train
model = Model(input_size=X_train.shape[1], hidden_size=512, num_layers=2).to(device)
model.fit(X, y, num_epochs=200, lr=0.1, weight_decay=0.0001)

raw_pred = model.predict(X)

trans = Transformation()
trans.fit(raw_pred, y)


Epoch 1/20, Spearman Loss: 0.9989, Val Loss: 0.8948
Epoch 3/20, Spearman Loss: 0.9815, Val Loss: 0.9283
Epoch 5/20, Spearman Loss: 0.9796, Val Loss: 0.6928
Epoch 7/20, Spearman Loss: 0.8823, Val Loss: 0.7179
Epoch 9/20, Spearman Loss: 0.7391, Val Loss: 0.6945
Epoch 11/20, Spearman Loss: 0.7175, Val Loss: 0.6825
Epoch 13/20, Spearman Loss: 0.7102, Val Loss: 0.6698
Epoch 15/20, Spearman Loss: 0.6854, Val Loss: 0.6616
Epoch 17/20, Spearman Loss: 0.6882, Val Loss: 0.6570
Epoch 19/20, Spearman Loss: 0.7225, Val Loss: 0.6658
Epoch 20/20, Spearman Loss: 0.6878, Val Loss: 0.6605
Epoch 1/1000, MSE Loss: 579.2866
Epoch 101/1000, MSE Loss: 27.3292
Epoch 201/1000, MSE Loss: 0.2578
Epoch 301/1000, MSE Loss: 0.1429
Epoch 401/1000, MSE Loss: 0.1352
Epoch 501/1000, MSE Loss: 0.1269
Epoch 601/1000, MSE Loss: 0.1183
Epoch 701/1000, MSE Loss: 0.1096
Epoch 801/1000, MSE Loss: 0.1010
Epoch 901/1000, MSE Loss: 0.0926
Epoch 1000/1000, MSE Loss: 0.0848


# Evaluate Model

In [56]:
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_preds_np = np.array(train_preds).flatten()
train_y_np = np.array(y_train).flatten()
test_preds_np = np.array(test_preds).flatten()
test_y_np = np.array(y_test).flatten()

train_metrics = calculate_metrics(train_y_np, train_preds_np)
test_metrics = calculate_metrics(test_y_np, test_preds_np)
full_metrics = calculate_metrics(np.concatenate((train_y_np, test_y_np), axis=0),
                                 np.concatenate((train_preds_np, test_preds_np), axis=0))

display_metrics(train_metrics, "Training Metrics:")
print()
display_metrics(test_metrics, "Testing Metrics:")
print()
display_metrics(full_metrics, "Full data Metrics:")

Training Metrics:
Pearson Corr: -0.6029928549730796
Spearman Corr: -0.5835302496054595
R^2: 0.36360035827483894
MSE: 579.28656

Testing Metrics:
Pearson Corr: -0.7206108533119391
Spearman Corr: -0.7079363766386325
R^2: 0.5192799820257603
MSE: 620.46063

Full data Metrics:
Pearson Corr: -0.6287314715345089
Spearman Corr: -0.6185839880233847
R^2: 0.3953033387742675
MSE: 592.5029


In [57]:
print("Pred vs True for training data")
for i in range(10):
    print(f"{train_preds_np[i]:.4f}, {train_labels[i]:.4f}")
print()
print("Pred vs True for testing data")
for i in range(10):
    print(f"{test_preds_np[i]:.4f}, {test_labels[i]:.4f}")

Pred vs True for training data
-43.9898, 1.0000
-44.1358, 1.0000
-26.5160, 1.0000
-43.2582, 1.0000
-42.0905, 1.0000
-43.7051, 1.0000
-43.4124, 1.0000
-43.6525, 1.0000
-43.5668, 1.0000
-43.7499, 1.0000

Pred vs True for testing data
-19.3353, 0.7000
-14.9599, 0.7100
-11.5634, 0.4900
-12.8064, 0.2700
-5.4662, 0.3200
-17.5851, 0.4300
-10.1705, 0.3100
-7.9478, 0.3200
-26.5567, 0.7700
-13.3385, 0.3400


In [58]:
train_preds_trans = trans.transform(train_preds)
test_preds_trans = trans.transform(test_preds)

train_preds_trans_np = np.array(train_preds_trans).flatten()
train_y_trans_np = np.array(y_train).flatten()
test_preds_trans_np = np.array(test_preds_trans).flatten()
test_y_trans_np = np.array(y_test).flatten()

train_metrics_trans = calculate_metrics(train_y_trans_np, train_preds_trans_np)
test_metrics_trans = calculate_metrics(test_y_trans_np, test_preds_trans_np)

display_metrics(train_metrics_trans, "Training Metrics:")
print()
display_metrics(test_metrics_trans, "Testing Metrics:")

Training Metrics:
Pearson Corr: -0.6029928035196528
Spearman Corr: -0.583530192301254
R^2: 0.36360035907285937
MSE: 0.084676795

Testing Metrics:
Pearson Corr: -0.7206109057057404
Spearman Corr: -0.7079363062400054
R^2: 0.5192799796160503
MSE: 0.06157634


In [59]:
print("Pred vs True for training data")
for i in range(10):
    print(f"{train_preds_trans_np[i]:.4f}, {train_labels[i]:.4f}")
print()
print("Pred vs True for testing data")
for i in range(10):
    print(f"{test_preds_trans_np[i]:.4f}, {test_labels[i]:.4f}")

Pred vs True for training data
0.4631, 1.0000
0.4622, 1.0000
0.5716, 1.0000
0.4677, 1.0000
0.4749, 1.0000
0.4649, 1.0000
0.4667, 1.0000
0.4652, 1.0000
0.4658, 1.0000
0.4646, 1.0000

Pred vs True for testing data
0.6162, 0.7000
0.6433, 0.7100
0.6644, 0.4900
0.6567, 0.2700
0.7023, 0.3200
0.6270, 0.4300
0.6731, 0.3100
0.6869, 0.3200
0.5714, 0.7700
0.6534, 0.3400
