In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import os
import sys

# Resolve the parent of the current working directory and expose it on the import
# path; later cells import `utils.helpers`, which presumably lives under it.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [27]:
from utils.helpers import read_config, print_config
import numpy as np

# The global config carries the dataset/model registries used below, plus a
# reference (`configs.fine_tune`) that `read_config` resolves to the fine-tuning
# config for this run (presumably a file path -- confirm in utils.helpers).
glob_cfg = read_config("../config.yaml")
fine_tune_cfg_source = glob_cfg.configs.fine_tune
cfg = read_config(fine_tune_cfg_source)

# Echo the resolved fine-tuning settings.
print_config(cfg)

{
  "model": "mini_lm",
  "dataset": "stsb_train",
  "val_dataset": "stsb_val",
  "hf_model_repo": "under-tree/STS-model"
}


In [28]:
from datasets import load_dataset

# Look up the `load_dataset` keyword arguments registered in the global config
# for the training dataset named in the fine-tuning config, then load it.
train_entry = glob_cfg.datasets[cfg.dataset]
dataset_params = train_entry.hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'train'}


In [29]:
# Rescale the gold similarity labels by dividing them by `max_score`.
# NOTE: `dataset` is rebound to its own mapped copy, so running this cell a
# second time divides the scores again; reload the dataset before re-running.
max_score = 5


def _scale_train_score(example):
    """Return the row's similarity score divided by `max_score`."""
    return {'similarity_score': example['similarity_score'] / max_score}


dataset = dataset.map(_scale_train_score)

Map: 100%|██████████| 5749/5749 [00:00<00:00, 56888.66 examples/s]


In [30]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 5749
})

In [31]:
from sentence_transformers import SentenceTransformer, util, InputExample

# Fetch the SentenceTransformer constructor arguments registered in the global
# config for the model selected in the fine-tuning config, then build the model.
model_entry = glob_cfg.models[cfg.model]
model_params = model_entry.st_params
print("Params:", model_params)
model = SentenceTransformer(**model_params)

Params: {'model_name_or_path': 'paraphrase-MiniLM-L6-v2'}


In [32]:

def convert_to_input_example(example):
    """Wrap one dataset row as a sentence-transformers `InputExample`.

    Args:
        example: Mapping with `sentence1`, `sentence2` and `similarity_score` keys.

    Returns:
        An `InputExample` whose texts are the two sentences (in order) and whose
        label is the row's similarity score.
    """
    sentence_pair = [example["sentence1"], example["sentence2"]]
    target = example["similarity_score"]
    return InputExample(texts=sentence_pair, label=target)

# Materialize every dataset row as an InputExample for training.
train_dataset = list(map(convert_to_input_example, dataset))

In [33]:
from sentence_transformers import SentencesDataset, losses
from torch.utils.data import DataLoader

# Feed the list of InputExample objects straight to the DataLoader.
# `SentencesDataset` is deprecated in sentence-transformers (it only wrapped the
# list), and rebinding `train_dataset` to the wrapper made this cell unsafe to
# re-run. `model.fit` installs its own collate function on the loader.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# The loss compares the cosine similarity of the two sentence embeddings with
# the (normalized) similarity label of each example.
train_loss = losses.CosineSimilarityLoss(model)


In [34]:
# Fine-tune with a single (dataloader, loss) objective for 4 epochs and
# 100 warmup steps; "model-st" is passed as the output path.
fit_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=fit_objectives,
    output_path="model-st",
    warmup_steps=100,
    epochs=4,
)

Iteration: 100%|██████████| 360/360 [01:40<00:00,  3.60it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.41it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.43it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.40it/s]
Epoch: 100%|██████████| 4/4 [06:56<00:00, 104.18s/it]


Evaluation

In [35]:
# Look up the `load_dataset` keyword arguments registered in the global config
# for the validation dataset named in the fine-tuning config, then load it.
val_entry = glob_cfg.datasets[cfg.val_dataset]
val_dataset_params = val_entry.hf_params
print("Params:", val_dataset_params)
val_dataset = load_dataset(**val_dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'test'}


In [36]:
def _embed_sentence_pairs(batch):
    """Encode both sentence columns of a batch with `model`.

    Args:
        batch: Batched mapping with `sentence1` and `sentence2` columns.

    Returns:
        Dict with `emb1` and `emb2`, the encodings of the respective columns.
    """
    return {
        'emb1': model.encode(batch['sentence1'], convert_to_tensor=True),
        'emb2': model.encode(batch['sentence2'], convert_to_tensor=True),
    }


# Add `emb1`/`emb2` columns to val_dataset. The batch size equals the dataset
# length, so the whole split is handed to the mapper as one batch.
val_dataset = val_dataset.map(
    _embed_sentence_pairs,
    batched=True,
    batch_size=len(val_dataset),
)
val_dataset

Map: 100%|██████████| 1379/1379 [00:05<00:00, 247.81 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1', 'emb2'],
    num_rows: 1379
})

In [37]:
# Rescale the validation gold labels by dividing them by `max_score`.
# NOTE: `val_dataset` is rebound to its own mapped copy, so running this cell a
# second time divides the scores again; reload the dataset before re-running.
max_score = 5


def _scale_val_score(example):
    """Return the row's similarity score divided by `max_score`."""
    return {'similarity_score': example['similarity_score'] / max_score}


val_dataset = val_dataset.map(_scale_val_score)

Map: 100%|██████████| 1379/1379 [00:00<00:00, 31647.94 examples/s]


In [38]:
def get_sim_score(x):
    """Return the cosine similarity between a row's `emb1` and `emb2`.

    Args:
        x: Mapping holding the two embeddings under `emb1` and `emb2`.

    Returns:
        The similarity as a Python scalar.
    """
    first_emb, second_emb = x['emb1'], x['emb2']
    similarity = util.pytorch_cos_sim(first_emb, second_emb)
    # `.item()` unwraps the single-element result into a plain number.
    return similarity.item()

def add_sim_score_features(x):
    """Compute the model's similarity score for a row and its absolute error.

    Args:
        x: Row with `emb1`, `emb2` and the gold `similarity_score`.

    Returns:
        Dict with `model_score` (the value from `get_sim_score`) and `diff`
        (absolute difference between that value and `similarity_score`).
    """
    predicted = get_sim_score(x)
    gold = x['similarity_score']
    return {'model_score': predicted, 'diff': abs(predicted - gold)}

# Add the `model_score` and `diff` columns by applying the scorer to every row.
val_dataset = val_dataset.map(add_sim_score_features)

Map: 100%|██████████| 1379/1379 [00:00<00:00, 3146.25 examples/s]


In [39]:
# Mean of the per-row absolute gap between model score and gold score.
diff_column = val_dataset['diff']
avg_diff = np.mean(diff_column)
print(f"Average diff after fine-tuning: {avg_diff}")

Average diff after fine-tuning: 0.12818199023199175


In [51]:
import json
# Print a few validation rows with gold vs. model scores.
# A seeded generator keeps the printed samples reproducible across runs, and
# `replace=False` prevents the same row from being drawn (and printed) twice.
SAMPLE_SEED = 42
N_SAMPLES = 3
rng = np.random.default_rng(SAMPLE_SEED)
# Cap the draw so a dataset with fewer than N_SAMPLES rows does not raise.
rnd = rng.choice(len(val_dataset), size=min(N_SAMPLES, len(val_dataset)), replace=False)

features = ['sentence1', 'sentence2', 'similarity_score', 'model_score', 'diff']

for idx in rnd:
    # Convert the NumPy integer to a plain int before indexing the dataset.
    idx = int(idx)
    sample = val_dataset[idx]
    # Keep only the listed columns; this leaves the embedding columns out of the printout.
    sample = {k: v for k, v in sample.items() if k in features}
    print(json.dumps(sample, indent=2))
    print('---'*10)
    print()

{
  "sentence1": "Results from No. 2 U.S. soft drink maker PepsiCo Inc. (nyse: PEP - news - people) were likely to be in the spotlight.",
  "sentence2": "Wall Street was also waiting for aluminum maker Alcoa Inc. (nyse: PEP - news - people) to report earnings after the close.",
  "similarity_score": 0.4000000059604645,
  "model_score": 0.35285234451293945,
  "diff": 0.047147661447525024
}
------------------------------

{
  "sentence1": "Pope canonizes 2 Palestinians",
  "sentence2": "Sweden recognizes Palestinian state",
  "similarity_score": 0.0,
  "model_score": 0.23797515034675598,
  "diff": 0.23797515034675598
}
------------------------------

{
  "sentence1": "A person wearing a helmet rides a bike near a white structure.",
  "sentence2": "A girl wearing black shorts and boots is standing next to a blue motorcycle.",
  "similarity_score": 0.24000000953674316,
  "model_score": 0.129312664270401,
  "diff": 0.11068734526634216
}
------------------------------

