In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# so that packages living there (e.g. `utils`) can be imported by later cells.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
# Extend sys.path only once: re-running this cell must not pile up duplicate
# entries for the same directory.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [3]:
from utils.helpers import read_config, print_config
import numpy as np

# Read the global config first, then the fine-tuning config that the global
# config references under `configs.fine_tune`.
glob_cfg = read_config("../config.yaml")
cfg = read_config(glob_cfg.configs.fine_tune)

# Show the fine-tuning settings (model key, dataset keys, ...) used below.
print_config(cfg)

{
  "model": "mini_lm",
  "dataset": "stsb_train",
  "val_dataset": "stsb_val",
  "hf_model_repo": "under-tree/STS-model"
}


In [4]:
from datasets import load_dataset

# The global config maps the dataset key chosen in `cfg.dataset` to the keyword
# arguments that are forwarded unchanged to `load_dataset`.
dataset_params = glob_cfg.datasets[cfg.dataset].hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'train'}


In [5]:
# Rescale the gold similarity scores by the maximum score (assumed to be the
# top of the annotation scale) so they are comparable with cosine similarity.
# NOTE: `dataset` is rebound to its mapped version, so running this cell twice
# divides the scores twice.
max_score = 5
dataset = dataset.map(
    lambda row: {'similarity_score': row['similarity_score'] / max_score}
)

In [6]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 5749
})

In [7]:
from sentence_transformers import SentenceTransformer, util, InputExample

# The global config maps the model key chosen in `cfg.model` to the keyword
# arguments used to construct the base SentenceTransformer.
model_params = glob_cfg.models[cfg.model].st_params
print("Params:", model_params)
model = SentenceTransformer(**model_params)

Params: {'model_name_or_path': 'paraphrase-MiniLM-L6-v2'}


In [8]:

def convert_to_input_example(example):
    """Wrap one dataset row in an ``InputExample``.

    Args:
        example: Mapping with ``sentence1``, ``sentence2`` and
            ``similarity_score`` entries.

    Returns:
        An ``InputExample`` whose ``texts`` are the two sentences (in order)
        and whose ``label`` is the row's ``similarity_score``.
    """
    sentence_pair = [example["sentence1"], example["sentence2"]]
    target = example["similarity_score"]
    return InputExample(texts=sentence_pair, label=target)

# Turn every row of the normalized dataset into an InputExample, in order.
train_dataset = list(map(convert_to_input_example, dataset))

In [9]:
from sentence_transformers import SentencesDataset, losses
from torch.utils.data import DataLoader

# `train_dataset` (the list of InputExample objects built above) is handed to
# the DataLoader directly. The SentencesDataset wrapper is deprecated in
# sentence-transformers, and dropping it keeps this cell idempotent because
# `train_dataset` is no longer rebound to a wrapped copy of itself on each run.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# Loss that compares the cosine similarity of the two sentence embeddings with
# the example's label.
train_loss = losses.CosineSimilarityLoss(model)


In [34]:
# Fine-tune the model on the single (dataloader, loss) training objective.
# output_path="model-st" is the same location the evaluation section loads the
# model from, so keep the two in sync when changing it.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    warmup_steps=100,
    output_path="model-st",
)

Iteration: 100%|██████████| 360/360 [01:40<00:00,  3.60it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.41it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.43it/s]
Iteration: 100%|██████████| 360/360 [01:45<00:00,  3.40it/s]
Epoch: 100%|██████████| 4/4 [06:56<00:00, 104.18s/it]


Evaluation

In [10]:
# Load the model from "model-st", the output_path given to model.fit, so the
# evaluation presumably runs on the saved fine-tuned weights rather than on
# whatever `model` currently holds in the kernel.
model = SentenceTransformer("model-st")

In [11]:
# Look up the load_dataset keyword arguments for the dataset key chosen in
# `cfg.val_dataset` and load that split.
# NOTE(review): the config key is "stsb_val" but the printed params select
# split='test' — confirm that evaluating on the test split is intended.
val_dataset_params = glob_cfg.datasets[cfg.val_dataset].hf_params
print("Params:", val_dataset_params)
val_dataset = load_dataset(**val_dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'test'}


In [17]:
def _embed_pairs(batch):
    """Encode both sentence columns of a batch with the fine-tuned model."""
    return {
        'emb1': model.encode(batch['sentence1'], convert_to_tensor=True),
        'emb2': model.encode(batch['sentence2'], convert_to_tensor=True),
    }

# Add the `emb1` / `emb2` columns; the batch size equals the dataset length, so
# each sentence column is encoded in one call.
val_dataset = val_dataset.map(_embed_pairs, batched=True, batch_size=len(val_dataset))
val_dataset

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1', 'emb2'],
    num_rows: 1379
})

In [18]:
# Rescale the gold similarity scores of the evaluation set by the maximum score
# (assumed to be the top of the annotation scale), mirroring the training data.
# NOTE: `val_dataset` is rebound to its mapped version, so running this cell
# twice divides the scores twice.
max_score = 5
val_dataset = val_dataset.map(
    lambda row: {'similarity_score': row['similarity_score'] / max_score}
)

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [19]:
def get_sim_score(x):
    """Return the cosine similarity of a row's two embeddings as a scalar.

    Args:
        x: Mapping holding the sentence embeddings under ``emb1`` and ``emb2``.

    Returns:
        The single value extracted with ``.item()`` from the result of
        ``util.pytorch_cos_sim(emb1, emb2)``.
    """
    first, second = x['emb1'], x['emb2']
    similarity = util.pytorch_cos_sim(first, second)
    return similarity.item()

def add_sim_score_features(x):
    """Compute the model's similarity score for a row and its absolute error.

    Args:
        x: Mapping with the fields read by ``get_sim_score`` plus the gold
            ``similarity_score``.

    Returns:
        A dict with ``model_score`` (the value from ``get_sim_score``) and
        ``diff`` (absolute difference between that value and
        ``similarity_score``).
    """
    predicted = get_sim_score(x)
    error = abs(predicted - x['similarity_score'])
    return {'model_score': predicted, 'diff': error}

# Score every row one at a time; this adds the `model_score` and `diff` columns.
val_dataset = val_dataset.map(add_sim_score_features)

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [20]:
# Mean of the per-example absolute differences between the model score and the
# normalized gold score.
abs_diffs = val_dataset['diff']
avg_diff = np.mean(abs_diffs)
print(f"Average diff after fine-tuning: {avg_diff}")

Average diff after fine-tuning: 0.12818199023199175


In [21]:
import json
# Fixed seed so the same rows are shown on every run. np.random.choice draws
# with replacement by default, so an index may appear more than once.
np.random.seed(43)
rnd = np.random.choice(len(val_dataset), 5)

# Columns worth printing; the embedding columns are left out.
features = ['sentence1', 'sentence2', 'similarity_score', 'model_score', 'diff']

separator = '---' * 10
for idx in map(int, rnd):
    row = val_dataset[idx]
    # Keep the row's own column order, restricted to the selected features.
    sample = {key: value for key, value in row.items() if key in features}
    print(json.dumps(sample, indent=2))
    print(separator)
    print()

{
  "sentence1": "US, China fail to paper over cracks in ties",
  "sentence2": "China: Relief in focus as hope for missing fades",
  "similarity_score": 0.07999999821186066,
  "model_score": 0.2659902572631836,
  "diff": 0.18599025905132294
}
------------------------------

{
  "sentence1": "A man with a bicycle at a coffee house.",
  "sentence2": "Man walking bicycle to patio of a coffee shop.",
  "similarity_score": 0.6399999856948853,
  "model_score": 0.8284282684326172,
  "diff": 0.18842828273773193
}
------------------------------

{
  "sentence1": "Two men standing in grass staring at a car.",
  "sentence2": "A woman in a pink top posing with beer.",
  "similarity_score": 0.03999999910593033,
  "model_score": -0.053494710475206375,
  "diff": 0.0934947095811367
}
------------------------------

{
  "sentence1": "Some men are sawing.",
  "sentence2": "Men are sawing logs.",
  "similarity_score": 0.6800000071525574,
  "model_score": 0.7269361615180969,
  "diff": 0.04693615436553955
