In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably the repository root that holds the `utils` package).
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
# Guard the append so re-running this cell does not add duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [12]:
from utils.helpers import read_config, print_config
import numpy as np

# Load the project-wide config first; it indexes the datasets, models and
# per-task configs used by the cells below.
glob_cfg = read_config("../config.yaml")
# `configs.fine_tune` is handed straight to read_config, so it is presumably a
# path to the fine-tuning config file -- TODO confirm against config.yaml.
cfg = read_config(glob_cfg.configs.fine_tune)

# Echo the resolved fine-tuning settings so the run documents itself.
print_config(cfg)

{
  "model": "mini_lm",
  "dataset": "stsb_train",
  "hf_model_repo": "under-tree/STS-model"
}


In [13]:
from datasets import load_dataset

# Look up the load_dataset keyword arguments for the dataset named in cfg.
dataset_params = glob_cfg.datasets[cfg.dataset].hf_params
print("Params:", dataset_params)
# NOTE(review): the printed params select split='train', so `dataset` holds the
# training split from this point on.
dataset = load_dataset(**dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'train'}


In [14]:
# Display the loaded dataset's column names and row count.
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 5749
})

In [15]:
from sentence_transformers import SentenceTransformer, util, InputExample

# Look up the SentenceTransformer constructor arguments for the model named in cfg.
model_params = glob_cfg.models[cfg.model].st_params
print("Params:", model_params)
# Instantiate the base model; the cells below fine-tune this object in place.
model = SentenceTransformer(**model_params)

Params: {'model_name_or_path': 'paraphrase-MiniLM-L6-v2'}


In [16]:

def convert_to_input_example(example, max_score=5.0):
    """Turn one dataset row into a sentence-transformers ``InputExample``.

    Args:
        example: Mapping with ``sentence1``, ``sentence2`` and
            ``similarity_score`` keys.
        max_score: Upper bound of the raw ``similarity_score`` scale. The raw
            score is divided by this value; the default of 5 matches the
            normalisation applied in the evaluation section of this notebook.

    Returns:
        An ``InputExample`` holding the sentence pair and the rescaled label.
    """
    # CosineSimilarityLoss regresses the cosine similarity of the two
    # embeddings onto the label. A cosine can never exceed 1, so raw scores on
    # the 0-5 scale give the loss targets it cannot reach; rescale them first.
    label = float(example["similarity_score"]) / max_score
    return InputExample(texts=[example["sentence1"], example["sentence2"]], label=label)

# Convert every row of the loaded dataset into a training example.
train_dataset = list(map(convert_to_input_example, dataset))

In [18]:
from sentence_transformers import SentencesDataset, losses
from torch.utils.data import DataLoader

# Feed the list of InputExample objects to the DataLoader directly.
# SentencesDataset is deprecated in sentence-transformers, and rebinding
# `train_dataset` to the wrapper made this cell wrap its own output on re-run.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
# Loss that compares the cosine similarity of each sentence pair with its label.
train_loss = losses.CosineSimilarityLoss(model)


In [19]:
# Fine-tune on the single (dataloader, loss) objective. The hyperparameters are
# gathered in one mapping so they can be read and tweaked at a glance.
fit_settings = {
    "epochs": 4,
    "warmup_steps": 100,
    "output_path": "model-st",
}
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_settings)

Iteration:   9%|▉         | 33/360 [00:09<01:38,  3.33it/s]
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Export the fine-tuned backbone in plain Hugging Face format so the cells
# below can reload it with AutoModel / AutoTokenizer from "model-hf".
# NOTE(review): `sentence_transformers.converters` does not appear to be a
# module the library provides, so the previous import could not succeed.
# Assumes the first module of `model` is a `models.Transformer` (the usual
# Transformer + Pooling stack) exposing `auto_model` and `tokenizer` -- confirm.
transformer_module = model[0]
transformer_module.auto_model.save_pretrained("model-hf")
transformer_module.tokenizer.save_pretrained("model-hf")

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; the push_to_hub calls further down pass
# use_auth_token=True and therefore need stored credentials.
notebook_login()

In [None]:
from transformers import AutoTokenizer, AutoModel

# Reload the exported artifacts from the local "model-hf" directory as plain
# transformers objects; these are what get pushed to the Hub.
tokenizer = AutoTokenizer.from_pretrained("model-hf")
model_hf = AutoModel.from_pretrained("model-hf")

In [None]:
# Push the model weights and then the tokenizer to the Hub repository named in
# the fine-tuning config (cfg.hf_model_repo).
# NOTE(review): recent transformers releases deprecate `use_auth_token` in
# favour of `token` -- confirm against the installed version.
hf_model_repo = cfg.hf_model_repo
model_hf.push_to_hub(hf_model_repo, use_auth_token=True)
tokenizer.push_to_hub(hf_model_repo, use_auth_token=True)

Evaluation

In [84]:
# Score the model on held-out data. `dataset` is still the training split at
# this point (dataset_params selects split='train'), whereas the saved output
# of this cell reports 1379 rows, not the 5749 training rows. Reload the same
# dataset with split='test' first; this also makes the cell safe to re-run.
# NOTE(review): assumes the configured dataset exposes a 'test' split -- confirm.
dataset = load_dataset(**{**dataset_params, "split": "test"})

# Encode both sentences of every pair in one batched pass and store the
# embeddings as the new `emb1` / `emb2` columns.
dataset = dataset.map(
    lambda x: {
        'emb1': model.encode(x['sentence1'], convert_to_tensor=True),
        'emb2': model.encode(x['sentence2'], convert_to_tensor=True),
    },
    batched=True,
    batch_size=len(dataset),
)
dataset

Model: mini_lm


Map: 100%|██████████| 1379/1379 [00:04<00:00, 277.81 examples/s]


Model: pp_roberta


Map: 100%|██████████| 1379/1379 [00:08<00:00, 162.73 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1_mini_lm', 'emb2_mini_lm', 'emb1_pp_roberta', 'emb2_pp_roberta'],
    num_rows: 1379
})

In [85]:
# Rescale the gold similarity_score by max_score so it is comparable with the
# model's cosine similarity.
# NOTE(review): the column is overwritten with its own scaled value, so running
# this cell twice divides by max_score twice; re-create `dataset` before re-running.
max_score = 5
dataset = dataset.map(lambda x: {'similarity_score': x['similarity_score']/max_score})

Map: 100%|██████████| 1379/1379 [00:00<00:00, 19946.15 examples/s]


In [87]:
def get_sim_score(x):
    """Return the cosine similarity of a row's ``emb1`` and ``emb2`` as a Python number."""
    first, second = x['emb1'], x['emb2']
    similarity = util.pytorch_cos_sim(first, second)
    # Unwrap the single similarity value into a plain scalar.
    return similarity.item()

def add_sim_score_features(x):
    """Build the per-row evaluation columns.

    Returns a dict with ``model_score`` (the value from ``get_sim_score``) and
    ``diff``, the absolute gap between that value and the row's
    ``similarity_score``.
    """
    predicted = get_sim_score(x)
    gap = abs(predicted - x['similarity_score'])
    return {'model_score': predicted, 'diff': gap}

# Apply the scoring function row by row, adding `model_score` and `diff` columns.
dataset = dataset.map(add_sim_score_features)

Map: 100%|██████████| 1379/1379 [00:01<00:00, 1099.79 examples/s]


In [90]:
# Mean absolute gap between the model's score and the gold score over all rows.
diff_column = dataset['diff']
avg_diff = np.mean(diff_column)
print(f"Average diff after fine-tuning: {avg_diff}")

Model: mini_lm
Average diff: 0.15014304435610004

Model: pp_roberta
Average diff: 0.16648173247389036

Best model: mini_lm
Best average diff: 0.15014304435610004


In [101]:
import json
# Print a few random rows with the gold score, the model score and their gap.
# Draw without replacement so no row is shown twice, and cap the count so a
# dataset with fewer than three rows does not raise.
n_samples = min(3, len(dataset))
rnd = np.random.choice(len(dataset), n_samples, replace=False)

# Columns worth showing; the embedding columns are left out of the printout.
features = ['sentence1', 'sentence2', 'similarity_score', 'model_score', 'diff']

for idx in rnd:
    # Convert the NumPy integer to a built-in int before indexing the dataset.
    idx = int(idx)
    sample = dataset[idx]
    sample = {k: sample[k] for k in sample if k in features}
    print(json.dumps(sample, indent=2))
    print('---'*10)
    print()

{
  "sentence1": "You may have to experiment and find what you like.",
  "sentence2": "You have to find out what works for you.",
  "similarity_score": 1.0,
  "score_mini_lm": 0.6091929078102112,
  "diff_mini_lm": 0.3908070921897888
}
------------------------------

{
  "sentence1": "In the first case, I think you don't need it.",
  "sentence2": "So I don't think you need to put it on the cover.",
  "similarity_score": 0.20000000298023224,
  "score_mini_lm": 0.5300893187522888,
  "diff_mini_lm": 0.3300893157720566
}
------------------------------

{
  "sentence1": "A man and a woman watch two dogs.",
  "sentence2": "A man in a maroon bathing suit swings on a rope on a lake.",
  "similarity_score": 0.07999999821186066,
  "score_mini_lm": -0.008783694356679916,
  "diff_mini_lm": 0.08878369256854057
}
------------------------------

