In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [4]:
from utils.helpers import read_config, print_config
import numpy as np

glob_cfg = read_config("../config.yaml")
cfg = read_config(glob_cfg.configs.trace_tune)

print_config(cfg)

{
  "model": "distilbert",
  "dataset": "sts_traces"
}


In [5]:
from datasets import load_dataset

dataset_params = glob_cfg.datasets[cfg.dataset].hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'under-tree/sts_traces'}


In [6]:
# dataset info
dataset

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score'],
        num_rows: 3000
    })
})

In [7]:
# shuffle
dataset = dataset.shuffle(seed=42)

In [8]:
# automodel
from transformers import AutoTokenizer, AutoModel

model_params = glob_cfg.models[cfg.model].hf_params
print("Params:", model_params)

tokenizer = AutoTokenizer.from_pretrained(**model_params.tokenizer)
model = AutoModel.from_pretrained(**model_params.model)

Params: {'model': {'pretrained_model_name_or_path': 'distilbert-base-uncased'}, 'tokenizer': {'pretrained_model_name_or_path': 'distilbert-base-uncased'}}


In [9]:
# cosine similarity
from torch.nn.functional import cosine_similarity
import torch

# get embeddings
def get_embeddings(model, tokenizer, sentences, no_grad=True):
    # tokenize
    tokenized = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # get embeddings
    with torch.set_grad_enabled(not no_grad):
        embeddings = model(**tokenized).last_hidden_state.mean(dim=1)
    return embeddings

def get_score(model, tokjenizer, sample):
    emb_a, emb_b = get_embeddings(model, tokenizer, [sample["text1"], sample["text2"]])
    return cosine_similarity(emb_a.unsqueeze(0), emb_b.unsqueeze(0)).item()
sample = dataset["train"][0]
score = get_score(model, tokenizer, sample)

print("Cosine similarity:", score)
print("Expected score:", sample["score"])

Cosine similarity: 0.8074988126754761
Expected score: 0.6765367984771729


In [13]:
# diff for val
emb_size = 50
texts = dataset["val"]
diffs = []
for i in range(len(texts) // emb_size + 1):
    emb_a = get_embeddings(model, tokenizer, texts["text1"][i * emb_size: (i + 1) * emb_size], no_grad=True)
    emb_b = get_embeddings(model, tokenizer, texts["text2"][i * emb_size: (i + 1) * emb_size], no_grad=True)
    diff = cosine_similarity(emb_a, emb_b)
    diffs.append(diff)

In [None]:
print("Mean diff:", ds_diff["val"]["diff"].mean())

In [44]:
# tokenize
def tokenize(sample):
    tokens1 = tokenizer(sample["text1"], padding=True, return_tensors="pt")
    tokens2 = tokenizer(sample["text2"], padding=True, return_tensors="pt")

    # rename keys to avoid collision
    for key in ["input_ids", "attention_mask"]:
        tokens1[f"{key}_1"] = tokens1.pop(key)
        tokens2[f"{key}_2"] = tokens2.pop(key)
    # merge
    tokens = {**tokens1, **tokens2}
    return tokens

dataset = dataset.map(tokenize, batched=True, batch_size=1000)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [45]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 3000
    })
})

In [46]:
# format
dataset.set_format(type='torch', columns=['input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2', 'score'])

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

# Step 1: Create a custom dataset class
class TextSimilarityDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        input_ids_1 = self.dataset[idx]['input_ids_1']
        attention_mask_1 = self.dataset[idx]['attention_mask_1']
        input_ids_2 = self.dataset[idx]['input_ids_2']
        attention_mask_2 = self.dataset[idx]['attention_mask_2']
        score = self.dataset[idx]['score']
        return (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2), score

# Step 2: Prepare your dataset
train_dataset = TextSimilarityDataset(dataset['train'])
val_dataset = TextSimilarityDataset(dataset['val'])

# Step 3: Create a DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=0)


DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 15000
    })
    val: Dataset({
        features: ['text1', 'text2', 'score', 'input_ids', 'attention_mask', 'input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2'],
        num_rows: 3000
    })
})

In [None]:
# Define the custom loss function (e.g., Mean Squared Error)
loss_function = nn.MSELoss()

# Initialize the optimizer (e.g., Adam)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 4: Train the model
num_epochs = 10


In [None]:
train_losses = []
val_losses = []


for epoch in range(num_epochs):
    model.train()
    losses = []
    for batch in train_dataloader:
        (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2), real_similarity_score = batch

        
        output1 = model(input_ids=input_ids_1, attention_mask=attention_mask_1)
        output2 = model(input_ids=input_ids_2, attention_mask=attention_mask_2)
        embeddings1 = output1.last_hidden_state
        embeddings2 = output2.last_hidden_state

        # Perform mean pooling on the embeddings
        mean_pooled_embeddings1 = torch.mean(embeddings1, dim=1)
        mean_pooled_embeddings2 = torch.mean(embeddings2, dim=1)

        # Calculate cosine similarity between the embeddings
        cosine_similarity = F.cosine_similarity(mean_pooled_embeddings1, mean_pooled_embeddings2)

        # Compute the loss
        loss = loss_function(cosine_similarity, real_similarity_score)
        losses.append(loss.item())
        

        # Perform backpropagation
        loss.backward()

        # Update the model parameters
        optimizer.step()
        optimizer.zero_grad()
    train_losses.append(np.mean(losses))
    
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2), real_similarity_score = batch

        
            output1 = model(input_ids=input_ids_1, attention_mask=attention_mask_1)
            output2 = model(input_ids=input_ids_2, attention_mask=attention_mask_2)
            embeddings1 = output1.last_hidden_state
            embeddings2 = output2.last_hidden_state

            # Perform mean pooling on the embeddings
            mean_pooled_embeddings1 = torch.mean(embeddings1, dim=1)
            mean_pooled_embeddings2 = torch.mean(embeddings2, dim=1)

            # Calculate cosine similarity between the embeddings
            cosine_similarity = F.cosine_similarity(mean_pooled_embeddings1, mean_pooled_embeddings2)

            # Compute the loss
            loss = loss_function(cosine_similarity, real_similarity_score)
            losses.append(loss.item())
    val_losses.append(np.mean(losses))


    


In [43]:
# Average loss
print("Average loss:", loss.item())

In [None]:
num_epochs = cfg.num_epochs
lr = cfg.lr
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = torch.nn.MSELoss()

In [None]:


for epoch in range(num_epochs):
    for input_pair, real_similarity_score in dataset:
        # Compute embeddings for each text in the input pair
        

        # Calculate cosine similarity between the embeddings
        cosine_similarity = F.cosine_similarity(embedding1.unsqueeze(0), embedding2.unsqueeze(0))

        # Compute the loss
        loss = loss_function(cosine_similarity, real_similarity_score)

        # Zero the gradients
        optimizer.zero_grad()

        # Perform backpropagation
        loss.backward()

        # Update the model parameters
        optimizer.step()

In [34]:
# train
epochs = 3
from tqdm import tqdm(epochs):
    for batch in tqdm(train_dataloader, desc="Epoch"):
        # get embeddings
        

Iteration: 100%|██████████| 360/360 [01:40<00:00,  3.60it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.41it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.43it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.40it/s]
Epoch: 100%|██████████| 4/4 [06:56<00:00, 104.18s/it]


Evaluation

In [35]:
val_dataset_params = glob_cfg.datasets[cfg.val_dataset].hf_params
print("Params:", val_dataset_params)
val_dataset = load_dataset(**val_dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'test'}


In [36]:
# create emb1 and emb2 in val_dataset
val_dataset = val_dataset.map(lambda x: 
                        {f'emb1': model.encode(x['sentence1'], convert_to_tensor=True), 
                         f'emb2': model.encode(x['sentence2'], convert_to_tensor=True)},
                         batched=True, batch_size=len(val_dataset)
                        )
val_dataset

Map: 100%|██████████| 1379/1379 [00:05<00:00, 247.81 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1', 'emb2'],
    num_rows: 1379
})

In [37]:
# let's normalize score
max_score = 5
val_dataset = val_dataset.map(lambda x: {'similarity_score': x['similarity_score']/max_score})

Map: 100%|██████████| 1379/1379 [00:00<00:00, 31647.94 examples/s]


In [38]:
def get_sim_score(x):
    return util.pytorch_cos_sim(x[f'emb1'], x[f'emb2']).item()

def add_sim_score_features(x):
    score = get_sim_score(x)
    return {'model_score': score, 'diff': abs(score - x['similarity_score'])}

val_dataset = val_dataset.map(add_sim_score_features)

Map: 100%|██████████| 1379/1379 [00:00<00:00, 3146.25 examples/s]


In [39]:
# average diff
avg_diff = np.mean(val_dataset['diff'])
print(f"Average diff after fine-tuning: {avg_diff}")

Average diff after fine-tuning: 0.12818199023199175


In [51]:
import json
rnd = np.random.choice(len(val_dataset), 3)

features = ['sentence1', 'sentence2', 'similarity_score', f'model_score', f'diff']

for idx in rnd:
    idx = int(idx)
    sample = val_dataset[idx]
    sample = {k: sample[k] for k in sample if k in features}
    print(json.dumps(sample, indent=2))
    print('---'*10)
    print()

{
  "sentence1": "Results from No. 2 U.S. soft drink maker PepsiCo Inc. (nyse: PEP - news - people) were likely to be in the spotlight.",
  "sentence2": "Wall Street was also waiting for aluminum maker Alcoa Inc. (nyse: PEP - news - people) to report earnings after the close.",
  "similarity_score": 0.4000000059604645,
  "model_score": 0.35285234451293945,
  "diff": 0.047147661447525024
}
------------------------------

{
  "sentence1": "Pope canonizes 2 Palestinians",
  "sentence2": "Sweden recognizes Palestinian state",
  "similarity_score": 0.0,
  "model_score": 0.23797515034675598,
  "diff": 0.23797515034675598
}
------------------------------

{
  "sentence1": "A person wearing a helmet rides a bike near a white structure.",
  "sentence2": "A girl wearing black shorts and boots is standing next to a blue motorcycle.",
  "similarity_score": 0.24000000953674316,
  "model_score": 0.129312664270401,
  "diff": 0.11068734526634216
}
------------------------------

