<a href="https://colab.research.google.com/github/shmuhammadd/semantic_relatedness/blob/main/Simple_English_Baseline_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Imports

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split
from scipy.stats import spearmanr, pearsonr, linregress

In [2]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.version.cuda)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

True
12.4
cuda


# Tools

In [3]:
def calculate_metrics(preds, scores):
    pearson_corr, _ = pearsonr(scores, preds)
    spearman_corr, _ = spearmanr(scores, preds)
    _, _, r, _, _ = linregress(scores, preds) # probably a better way of doing this, fix later
    r2 = r**2
    mse = ((scores - preds)**2).mean() # Scikit's mean_squared_error complained about being deprecated, so this is my temp fix
    return (pearson_corr, spearman_corr, r2, mse)

In [4]:
# Make sure these match the metrics above
def display_metrics(metrics, title="Metrics:"):
    print(title)
    print("Pearson Corr:", metrics[0])
    print("Spearman Corr:", metrics[1])
    print("R^2:", metrics[2])
    print("MSE:", metrics[3])

# Load data

In [5]:
languages = ["eng", "arq", "amh", "hau", "kin", "mar", "ary", "tel"] # No esp, has no labeled test data
#languages = ['eng'] # Use this for much smaller data quantity, but reduce epochs heavily (~200)
train_features_list = []
train_labels_list = []
test_features_list = []
test_labels_list = []
test_data = {}
for lang in languages:
    # Load training data
    lang_features = torch.load(f"./processed_data/train_features_{lang}.pt", weights_only=True)
    lang_labels = torch.load(f"./processed_data/train_labels_{lang}.pt", weights_only=True)
    
    # Append to training lists
    train_features_list.append(lang_features)
    train_labels_list.append(lang_labels)
    
    # Load test data and store in dictionary
    lang_features = torch.load(f"./processed_data/test_features_{lang}.pt", weights_only=True)
    lang_labels = torch.load(f"./processed_data/test_labels_{lang}.pt", weights_only=True)
    test_features_list.append(lang_features)
    test_labels_list.append(lang_labels)
    test_data[lang] = {"features": lang_features, "labels": lang_labels}
    
train_features = torch.cat(train_features_list, dim=0)
train_labels = torch.cat(train_labels_list, dim=0)
test_features = torch.cat(test_features_list, dim=0)
test_labels = torch.cat(test_labels_list, dim=0)
del train_features_list
del train_labels_list
del test_features_list
del test_labels_list

In [6]:
print("Combined Train Features Shape:", train_features.shape)
print("Combined Train Labels Shape:", train_labels.shape)
print("Combined Test Features Shape:", test_features.shape)
print("Combined Test Labels Shape:", test_labels.shape)
print("Test Features Shape (English):", test_data["eng"]["features"].shape)
print("Test Labels Shape (English):", test_data["eng"]["labels"].shape)

Combined Train Features Shape: torch.Size([13561, 1542])
Combined Train Labels Shape: torch.Size([13561])
Combined Test Features Shape: torch.Size([5200, 1542])
Combined Test Labels Shape: torch.Size([5200])
Test Features Shape (English): torch.Size([2600, 1542])
Test Labels Shape (English): torch.Size([2600])


In [7]:
'''
# Make sure you have these files, either run "project_final_preprocessing.ipynb" or download the files
train_features = torch.load("train_features_eng.pt", weights_only=True)
train_labels = torch.load("train_labels_eng.pt", weights_only=True)
test_features = torch.load("test_features_eng.pt", weights_only=True)
test_labels = torch.load("test_labels_end.pt", weights_only=True)

print("Train features:", train_features.shape)
print("Train labels:", train_labels.shape)
print("Test features:", test_features.shape)
print("Test labels:", test_labels.shape)
'''

'\n# Make sure you have these files, either run "project_final_preprocessing.ipynb" or download the files\ntrain_features = torch.load("train_features_eng.pt", weights_only=True)\ntrain_labels = torch.load("train_labels_eng.pt", weights_only=True)\ntest_features = torch.load("test_features_eng.pt", weights_only=True)\ntest_labels = torch.load("test_labels_end.pt", weights_only=True)\n\nprint("Train features:", train_features.shape)\nprint("Train labels:", train_labels.shape)\nprint("Test features:", test_features.shape)\nprint("Test labels:", test_labels.shape)\n'

# Model

In [8]:
# Used different naming scheme when I merged files
X_train = train_features
y_train = train_labels
X_test = test_features
y_test = test_labels
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: torch.Size([13561, 1542])
y_train: torch.Size([13561])
X_test: torch.Size([5200, 1542])
y_test: torch.Size([5200])


In [9]:
# Needed assistance with getting a differentiable spearman correlation for loss function
# https://forum.numer.ai/t/differentiable-spearman-in-pytorch-optimize-for-corr-directly/2287/26
import torchsort

def corrcoef(target, pred):
    pred_n = pred - pred.mean()
    target_n = target - target.mean()
    pred_n = pred_n / pred_n.norm()
    target_n = target_n / target_n.norm()
    return (pred_n * target_n).sum()

def spearman_loss(pred, target, x=1e-2):
    pred = torchsort.soft_rank(pred.reshape(1,-1),regularization_strength=x)
    target = torchsort.soft_rank(target.reshape(1,-1),regularization_strength=x)
    pred = pred - pred.mean()
    pred = pred / pred.norm()
    target = target - target.mean()
    target = target / target.norm()
    return 1 - (pred * target).sum()**2

In [10]:
class Model(nn.Module):
    def __init__(self, input_size=1539, hidden_size=128, num_layers=1):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, x):
        x = x.unsqueeze(1)
        output, _ = self.lstm(x)
        output = self.dropout(output)
        output = self.fc(output[:, -1, :])
        return output
    
    def split(self, X, y, s = 0.8):
        dataset = TensorDataset(X.clone().detach(), y.clone().detach())
        train_size = int(s * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        
        X_train, y_train = zip(*train_dataset)
        X_train = torch.stack(X_train)
        y_train = torch.stack(y_train)

        X_val, y_val = zip(*val_dataset)
        X_val = torch.stack(X_val)
        y_val = torch.stack(y_val)
        
        return X_train, y_train, X_val, y_val
    
    def fit(self, X, y, num_epochs=20, lr=1e-4, weight_decay=1e-4):
        X = X.to(device)
        y = y.to(device)
        X_train, y_train, X_val, y_val = self.split(X, y)
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        best_val_loss = float('inf')
        for epoch in range(num_epochs):
            self.train()
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = spearman_loss(y_train, y_pred)
            loss.backward()
            optimizer.step()

            self.eval()
            with torch.no_grad():
                val_pred = self(X_val)
                val_loss = spearman_loss(y_val, val_pred).item()

            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, Spearman Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.state_dict(), './best_model.pth')
        print("Best val loss:", best_val_loss)
                
    def predict(self, x):
        x = x.to(device)
        self.eval()
        with torch.no_grad():
            return self(x).cpu()

In [11]:
class Transformation(nn.Module):
    def __init__(self):
        super(Transformation, self).__init__()
        self.scale = nn.Parameter(torch.ones(1))
        self.shift = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        if x.ndim == 1:
            x = x.unsqueeze(-1)
        return x * self.scale + self.shift
    
    def fit(self, X_train, y_train, num_epochs=1000, lr=0.1):
        optimizer = optim.Adam(self.parameters(), lr)
        for epoch in range(num_epochs):
            optimizer.zero_grad()
            y_pred = self(X_train)
            loss = nn.functional.mse_loss(y_train.squeeze(), y_pred.squeeze())
            loss.backward()
            optimizer.step()
            if epoch % (num_epochs // 10) == 0 or epoch == num_epochs - 1:
                print(f"Epoch {epoch+1}/{num_epochs}, MSE Loss: {loss.item():.4f}")
                
    def transform(self, x):
        self.eval()
        with torch.no_grad():
            return self(x)
    

In [12]:
X = X_train
y = y_train
model = Model(input_size=X_train.shape[1], hidden_size=512, num_layers=2).to(device)
model.fit(X, y, num_epochs=2000, lr=0.1, weight_decay=0.0001)

raw_pred = model.predict(X)

trans = Transformation()
trans.fit(raw_pred, y)


Epoch 1/2000, Spearman Loss: 0.9985, Val Loss: 0.9900
Epoch 201/2000, Spearman Loss: 0.8458, Val Loss: 0.7168
Epoch 401/2000, Spearman Loss: 0.7048, Val Loss: 0.6917
Epoch 601/2000, Spearman Loss: 0.8986, Val Loss: 0.7165
Epoch 801/2000, Spearman Loss: 0.7288, Val Loss: 0.7071
Epoch 1001/2000, Spearman Loss: 0.8906, Val Loss: 0.7251
Epoch 1201/2000, Spearman Loss: 0.7429, Val Loss: 0.7165
Epoch 1401/2000, Spearman Loss: 0.7704, Val Loss: 0.7228
Epoch 1601/2000, Spearman Loss: 0.7232, Val Loss: 0.7069
Epoch 1801/2000, Spearman Loss: 0.7584, Val Loss: 0.7126
Epoch 2000/2000, Spearman Loss: 0.7677, Val Loss: 0.7130
Best val loss: 0.6578329801559448
Epoch 1/1000, MSE Loss: 902.0722
Epoch 101/1000, MSE Loss: 0.0496
Epoch 201/1000, MSE Loss: 0.0358
Epoch 301/1000, MSE Loss: 0.0346
Epoch 401/1000, MSE Loss: 0.0340
Epoch 501/1000, MSE Loss: 0.0338
Epoch 601/1000, MSE Loss: 0.0337
Epoch 701/1000, MSE Loss: 0.0336
Epoch 801/1000, MSE Loss: 0.0336
Epoch 901/1000, MSE Loss: 0.0336
Epoch 1000/1000,

# Evaluate Model

In [13]:
state_dict = torch.load('./best_model.pth', map_location=torch.device('cpu'), weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

Model(
  (lstm): LSTM(1542, 512, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
)

In [14]:
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

train_preds_np = np.array(train_preds).flatten()
train_y_np = np.array(y_train).flatten()
test_preds_np = np.array(test_preds).flatten()
test_y_np = np.array(y_test).flatten()

train_metrics = calculate_metrics(train_y_np, train_preds_np)
test_metrics = calculate_metrics(test_y_np, test_preds_np)
full_metrics = calculate_metrics(np.concatenate((train_y_np, test_y_np), axis=0),
                                 np.concatenate((train_preds_np, test_preds_np), axis=0))

display_metrics(train_metrics, "Training Metrics:")
print()
display_metrics(test_metrics, "Testing Metrics:")
print()
display_metrics(full_metrics, "Full data Metrics:")

Training Metrics:
Pearson Corr: 0.5878588589559477
Spearman Corr: 0.5868116648719927
R^2: 0.34557802193017434
MSE: 312.34805

Testing Metrics:
Pearson Corr: 0.6134780924511531
Spearman Corr: 0.6323575545170608
R^2: 0.3763554193159644
MSE: 356.2072

Full data Metrics:
Pearson Corr: 0.5930471918841913
Spearman Corr: 0.5972918014663274
R^2: 0.3517049306460061
MSE: 324.50452


In [15]:
train_preds_trans = trans.transform(train_preds)
test_preds_trans = trans.transform(test_preds)

train_preds_trans_np = np.array(train_preds_trans).flatten()
train_y_trans_np = np.array(y_train).flatten()
test_preds_trans_np = np.array(test_preds_trans).flatten()
test_y_trans_np = np.array(y_test).flatten()

train_dice_preds = X_train[:, -1].numpy().flatten()
test_dice_preds = X_test[:, -1].numpy().flatten()

train_metrics_trans = calculate_metrics(train_y_trans_np, train_preds_trans_np)
test_metrics_trans = calculate_metrics(test_y_trans_np, test_preds_trans_np)
full_metrics_trans = calculate_metrics(np.concatenate((train_y_trans_np, test_y_trans_np), axis=0),
                                 np.concatenate((train_preds_trans_np, test_preds_trans_np), axis=0))
train_dice_metrics_trans = calculate_metrics(train_y_trans_np, train_dice_preds)
test_dice_metrics_trans = calculate_metrics(test_y_trans_np, test_dice_preds)
full_dice_metrics_trans = calculate_metrics(np.concatenate((train_y_trans_np, test_y_trans_np), axis=0),
                                 np.concatenate((train_dice_preds, test_dice_preds), axis=0))

display_metrics(train_metrics_trans, "Training Metrics:")
print()
display_metrics(test_metrics_trans, "Testing Metrics:")
print()
display_metrics(full_metrics_trans, "Full data Metrics:")
print()

Training Metrics:
Pearson Corr: 0.5878588323754048
Spearman Corr: 0.5868116616471577
R^2: 0.3455780222473866
MSE: 0.10745072

Testing Metrics:
Pearson Corr: 0.6134780988809436
Spearman Corr: 0.6323575417398761
R^2: 0.37635542077323936
MSE: 0.09497109

Full data Metrics:
Pearson Corr: 0.5930471873687991
Spearman Corr: 0.5972917978263474
R^2: 0.35170493123780394
MSE: 0.10399174



In [16]:
display_metrics(train_dice_metrics_trans, "Dice Training Metrics:")
print()
display_metrics(test_dice_metrics_trans, "Dice Testing Metrics:")
print()
display_metrics(full_dice_metrics_trans, "Dice Full Metrics:")

Dice Training Metrics:
Pearson Corr: 0.5373415969579298
Spearman Corr: 0.5224577785209864
R^2: 0.28873598082359553
MSE: 1.0594916

Dice Testing Metrics:
Pearson Corr: 0.6112762933539937
Spearman Corr: 0.6197246704911602
R^2: 0.37365871602039336
MSE: 1.0556076

Dice Full Metrics:
Pearson Corr: 0.5559559434649143
Spearman Corr: 0.5476671568020123
R^2: 0.3090870272524448
MSE: 1.058415


In [None]:
torch.save(train_preds_trans, "train_preds_all.pt")
torch.save(test_preds_trans, "test_preds_all.pt")
torch.save(test_dice_preds, "dice_preds_all.pt")
