In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed and edits to project files are picked up.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

# The parent of the current working directory is treated as the project root;
# putting it on sys.path makes the project's `utils` package importable.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)

# Guard the append so re-running this cell does not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [4]:
from utils.helpers import read_config, print_config
import numpy as np

# The global config is read first; it stores the path of the experiment
# config (`configs.trace_tune`), which is read second.
glob_cfg = read_config("../config.yaml")
trace_tune_path = glob_cfg.configs.trace_tune
cfg = read_config(trace_tune_path)

# Show the experiment settings that the rest of the notebook relies on.
print_config(cfg)

{
  "model": "distilbert",
  "dataset": "sts_traces"
}


In [5]:
from datasets import load_dataset

# Keyword arguments for `load_dataset`, looked up in the global config under
# the dataset name selected by the experiment config (`cfg.dataset`).
dataset_params = glob_cfg.datasets[cfg.dataset].hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'under-tree/sts_traces'}


In [6]:
# Shuffle with a fixed seed so the row order is repeatable.
# NOTE: this rebinds `dataset`, so running the cell a second time shuffles the
# already-shuffled data instead of the freshly loaded one.
dataset = dataset.shuffle(seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score'],
        num_rows: 3000
    })
})

In [8]:
from transformers import AutoTokenizer, AutoModel

# `from_pretrained` keyword arguments for the model selected by `cfg.model`;
# the config keeps separate entries for the tokenizer and for the model.
model_params = glob_cfg.models[cfg.model].hf_params
print("Params:", model_params)

tokenizer = AutoTokenizer.from_pretrained(**model_params.tokenizer)
model = AutoModel.from_pretrained(**model_params.model)

Params: {'model': {'pretrained_model_name_or_path': 'distilbert-base-uncased'}, 'tokenizer': {'pretrained_model_name_or_path': 'distilbert-base-uncased'}}


In [9]:
from utils.score_utils import get_score

# Sanity check on one training row before any fine-tuning: compare the value
# returned by the project's `get_score` helper (utils.score_utils) with the
# target stored in the row's "score" field.
sample = dataset["train"][0]
score = get_score(model, tokenizer, sample)

print("Cosine similarity:", score)
print("Expected score:", sample["score"])

Cosine similarity: 0.8074988126754761
Expected score: 0.6765367984771729


Evaluation

In [44]:
def tokenize(sample, max_length=None):
    """Tokenize both texts of a batch and return the fields under pair-specific names.

    Written for `dataset.map(tokenize, batched=True)`.

    Every row is padded/truncated to one common length. Padding only to the
    longest text of the current call (`padding=True`) makes the stored row
    length depend on which map batch a row belonged to, so rows coming from
    different map batches cannot be stacked into one tensor by a DataLoader.
    Truncation also keeps over-long texts within the length limit.

    Args:
        sample: Mapping with "text1" and "text2" entries (lists of strings
            when called with `batched=True`).
        max_length: Target length in tokens. With `None` the tokenizer falls
            back to the model's maximum input length; pass a smaller value,
            e.g. `dataset.map(..., fn_kwargs={"max_length": 128})`, to cut
            down on padding.

    Returns:
        dict: every field the tokenizer returned for "text1" with a "_1"
        suffix and for "text2" with a "_2" suffix (for example
        "input_ids_1", "attention_mask_2").
    """
    # Plain Python lists are returned (no `return_tensors`): `map` stores the
    # values in the dataset, and tensor conversion is done later by `set_format`.
    encode_kwargs = {"padding": "max_length", "truncation": True, "max_length": max_length}
    tokens1 = tokenizer(sample["text1"], **encode_kwargs)
    tokens2 = tokenizer(sample["text2"], **encode_kwargs)

    # Suffix every field so the two encodings cannot overwrite each other.
    tokens = {f"{key}_1": value for key, value in tokens1.items()}
    tokens.update({f"{key}_2": value for key, value in tokens2.items()})
    return tokens

# Run `tokenize` over every split in batches of 1000 rows; the result rebinds `dataset`.
# NOTE(review): the features displayed further down also list un-suffixed
# `input_ids` / `attention_mask`, which this `tokenize` does not produce —
# presumably left over from an earlier run; reload the dataset before re-mapping.
dataset = dataset.map(tokenize, batched=True, batch_size=1000)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [45]:
# Only the tokenized pair fields and the target score are exposed as torch
# tensors; the remaining columns stay in the dataset but are not returned.
tensor_columns = [
    'input_ids_1', 'attention_mask_1',
    'input_ids_2', 'attention_mask_2',
    'score',
]
dataset.set_format(type='torch', columns=tensor_columns)
dataset

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 3000
    })
})

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from utils.sts_dataset import TextSimilarityDataset


# Wrap each split in the project's TextSimilarityDataset (utils.sts_dataset).
train_dataset = TextSimilarityDataset(dataset['train'])
val_dataset = TextSimilarityDataset(dataset['val'])

# Both loaders share the same settings; only the training loader shuffles.
loader_kwargs = dict(batch_size=4, num_workers=0)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 3000
    })
})

In [None]:
# Hyperparameters are read from the `train` section of the experiment config.
# NOTE(review): the config printed earlier in this notebook shows only "model"
# and "dataset" — confirm that `cfg.train` (lr, num_epochs) is actually defined.
train_cfg = cfg.train
num_epochs = train_cfg.num_epochs

# Mean squared error between predicted and target scores, optimised with Adam
# over all model parameters.
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=train_cfg.lr)


In [None]:
import torch.nn.functional as F
def _masked_mean_pool(hidden_states, attention_mask):
    """Average token embeddings over the sequence axis, counting only real tokens.

    A plain `torch.mean(hidden_states, dim=1)` also averages the embeddings at
    padded positions, so the pooled vector changes with the amount of padding.

    Args:
        hidden_states: The encoder's `last_hidden_state`.
        attention_mask: The mask that was passed to the encoder (non-zero for
            real tokens). Assumes (batch, seq) next to (batch, seq, hidden)
            hidden states, as Hugging Face encoders use — TODO confirm.

    Returns:
        One pooled vector per sequence.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against an all-padding row
    return (hidden_states * mask).sum(dim=1) / token_count


def _pair_loss(model, loss_function, batch):
    """Encode both texts of a batch and return the loss of their cosine similarity.

    Args:
        model: Encoder called with `input_ids` / `attention_mask`.
        loss_function: Callable applied to (predicted similarity, target score).
        batch: `((input_ids_1, attention_mask_1, input_ids_2, attention_mask_2), score)`
            as yielded by the data loaders.

    Returns:
        The loss tensor for the batch.
    """
    (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2), real_similarity_score = batch

    output1 = model(input_ids=input_ids_1, attention_mask=attention_mask_1)
    output2 = model(input_ids=input_ids_2, attention_mask=attention_mask_2)

    pooled1 = _masked_mean_pool(output1.last_hidden_state, attention_mask_1)
    pooled2 = _masked_mean_pool(output2.last_hidden_state, attention_mask_2)

    cosine_similarity = F.cosine_similarity(pooled1, pooled2)
    return loss_function(cosine_similarity, real_similarity_score)


# Mean loss per epoch, one entry appended per epoch to each list.
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    losses = []
    for batch in train_dataloader:
        # Clear gradients first so nothing left over from an interrupted run is applied.
        optimizer.zero_grad()
        loss = _pair_loss(model, loss_function, batch)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    train_losses.append(np.mean(losses))

    # --- validation pass (no gradient tracking, no parameter updates) ---
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            losses.append(_pair_loss(model, loss_function, batch).item())
    val_losses.append(np.mean(losses))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set_style("darkgrid")

plt.plot(train_losses, label='train')
plt.plot(val_losses, label='val')
plt.legend()