In [77]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# Re-running this cell in a live kernel prints an "already loaded" notice,
# which is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# so the project-local packages imported below can be found.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)
# Re-running this cell must not pile up duplicate entries on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [2]:
from utils.helpers import read_config, print_config
import numpy as np

# Read the repository-wide config first, then feed its
# `configs.models_comparison` entry back into read_config to obtain the
# settings that drive this notebook.
glob_cfg = read_config("../config.yaml")
models_comparison_source = glob_cfg.configs.models_comparison
cfg = read_config(models_comparison_source)

print_config(cfg)

{
  "dataset": "stsb_val",
  "models": [
    "mini_lm",
    "pp_roberta"
  ]
}


In [80]:
# Dump the whole global config to see which datasets and models are defined.
# NOTE(review): the printed output includes absolute local paths — consider
# clearing this cell's output before sharing the notebook.
print_config(glob_cfg)

{
  "root": "/Users/user010/Desktop/Programming/ML/STS",
  "datasets": {
    "stsb_train": {
      "hf_params": {
        "path": "stsb_multi_mt",
        "name": "en",
        "split": "train"
      }
    },
    "stsb_val": {
      "hf_params": {
        "path": "stsb_multi_mt",
        "name": "en",
        "split": "test"
      }
    }
  },
  "models": {
    "mini_lm": {
      "st_params": {
        "model_name_or_path": "paraphrase-MiniLM-L6-v2"
      }
    },
    "pp_roberta": {
      "st_params": {
        "model_name_or_path": "sentence-transformers/paraphrase-distilroberta-base-v1"
      }
    }
  },
  "configs": {
    "embeddings": "/Users/user010/Desktop/Programming/ML/STS/configs/embeddings.yaml"
  }
}


In [81]:
from datasets import load_dataset

# Find the config entry for the dataset named in cfg, show its loader
# arguments, and forward them unchanged to load_dataset.
selected_dataset_cfg = glob_cfg.datasets[cfg.dataset]
dataset_params = selected_dataset_cfg.hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'test'}


In [82]:
# Bare expression: displays the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [83]:
from sentence_transformers import SentenceTransformer, util

# Instantiate every model listed in cfg.models and keep each one next to its
# config name, so later cells can derive per-model column names.
models = []
for name in cfg.models:
    # Constructor keyword arguments come straight from the global config.
    st_kwargs = glob_cfg.models[name].st_params
    print("Params:", st_kwargs)
    st_model = SentenceTransformer(**st_kwargs)
    models.append({'name': name, 'model': st_model})

Params: {'model_name_or_path': 'paraphrase-MiniLM-L6-v2'}
Params: {'model_name_or_path': 'sentence-transformers/paraphrase-distilroberta-base-v1'}


In [84]:
# create emb1 and emb2 in dataset
for entry in models:
    model_name, model = entry['name'], entry['model']
    print("Model:", model_name)

    def encode_sentence_pairs(batch):
        """Encode both sentence columns of a batch with the current model."""
        first = model.encode(batch['sentence1'], convert_to_tensor=True)
        second = model.encode(batch['sentence2'], convert_to_tensor=True)
        return {f'emb1_{model_name}': first, f'emb2_{model_name}': second}

    # pass whole dataset to model: a single batch spanning every row
    dataset = dataset.map(encode_sentence_pairs, batched=True, batch_size=len(dataset))
dataset

Model: mini_lm


Map: 100%|██████████| 1379/1379 [00:04<00:00, 277.81 examples/s]


Model: pp_roberta


Map: 100%|██████████| 1379/1379 [00:08<00:00, 162.73 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1_mini_lm', 'emb2_mini_lm', 'emb1_pp_roberta', 'emb2_pp_roberta'],
    num_rows: 1379
})

In [85]:
# let's normalize score
# Divide the gold scores by max_score (maps them into [0, 1], assuming the raw
# scores lie in [0, max_score] — TODO confirm for other datasets).
max_score = 5
# Guard against re-running this cell: a second pass would divide the already
# normalized scores again and silently skew every later comparison.
# Heuristic: scores above 1.0 mean the column has not been normalized yet.
if max(dataset['similarity_score'], default=0.0) > 1.0:
    dataset = dataset.map(lambda x: {'similarity_score': x['similarity_score']/max_score})

Map: 100%|██████████| 1379/1379 [00:00<00:00, 19946.15 examples/s]


In [87]:
def get_sim_score(x, model_name):
    """Return the cosine similarity between one example's two embeddings.

    x: a single dataset example containing `emb1_<model_name>` and
       `emb2_<model_name>`.
    model_name: suffix selecting which model's embedding columns to compare.
    """
    first_embedding = x[f'emb1_{model_name}']
    second_embedding = x[f'emb2_{model_name}']
    similarity = util.pytorch_cos_sim(first_embedding, second_embedding)
    # .item() unwraps the single-element result into a plain Python number.
    return similarity.item()

def add_sim_score_features(x):
    """Build the per-model score and error columns for one example.

    For every entry in the module-level `models` list this produces
    `score_<name>` (the value returned by get_sim_score) and `diff_<name>`
    (its absolute difference from the example's `similarity_score`).

    x: a single dataset example.
    Returns a dict mapping the new column names to their values.
    """
    per_model = {}
    for name in (info['name'] for info in models):
        predicted = get_sim_score(x, name)
        per_model[f'score_{name}'] = predicted
        per_model[f'diff_{name}'] = np.abs(predicted - x['similarity_score'])
    return per_model

# Add the score_* / diff_* columns for every model.
# No batched=True here, so the function presumably receives one example at a
# time, which is what get_sim_score's .item() call needs.
dataset = dataset.map(add_sim_score_features)

Map: 100%|██████████| 1379/1379 [00:01<00:00, 1099.79 examples/s]


In [90]:
# average diff
# Rank the models by the mean of their diff_<model> column (lower is better)
# and remember the winner for the sample inspection that follows.
best_model_name = ''
# Start from +inf rather than 1.0: a mean diff can be >= 1.0, and a fixed 1.0
# threshold would then leave best_model_name empty and break later lookups of
# score_<best_model_name> / diff_<best_model_name>.
best_avg_diff = float('inf')
for model_info in models:
    model_name = model_info['name']
    avg_diff = np.mean(dataset[f'diff_{model_name}'])
    print("Model:", model_name)
    print("Average diff:", avg_diff)
    print()

    # Strict '<' keeps the earliest model on ties.
    if avg_diff < best_avg_diff:
        best_avg_diff = avg_diff
        best_model_name = model_name

print("Best model:", best_model_name)
print("Best average diff:", best_avg_diff)

Model: mini_lm
Average diff: 0.15014304435610004

Model: pp_roberta
Average diff: 0.16648173247389036

Best model: mini_lm
Best average diff: 0.15014304435610004


In [101]:
import json

# Print a few randomly chosen examples with the best model's score and error.
NUM_SAMPLES = 3
SAMPLE_SEED = 0  # fixed seed so a re-run shows the same examples
rng = np.random.default_rng(SAMPLE_SEED)
# replace=False: never show the same example twice; the size is clamped so
# datasets with fewer than NUM_SAMPLES rows do not raise.
rnd = rng.choice(len(dataset), size=min(NUM_SAMPLES, len(dataset)), replace=False)

features = ['sentence1', 'sentence2', 'similarity_score', f'score_{best_model_name}', f'diff_{best_model_name}']

for idx in rnd:
    # Convert the NumPy integer to a plain int before indexing the dataset.
    idx = int(idx)
    sample = dataset[idx]
    # Keep only the columns of interest (this drops the embedding columns).
    sample = {k: sample[k] for k in sample if k in features}
    print(json.dumps(sample, indent=2))
    print('---'*10)
    print()

{
  "sentence1": "You may have to experiment and find what you like.",
  "sentence2": "You have to find out what works for you.",
  "similarity_score": 1.0,
  "score_mini_lm": 0.6091929078102112,
  "diff_mini_lm": 0.3908070921897888
}
------------------------------

{
  "sentence1": "In the first case, I think you don't need it.",
  "sentence2": "So I don't think you need to put it on the cover.",
  "similarity_score": 0.20000000298023224,
  "score_mini_lm": 0.5300893187522888,
  "diff_mini_lm": 0.3300893157720566
}
------------------------------

{
  "sentence1": "A man and a woman watch two dogs.",
  "sentence2": "A man in a maroon bathing suit swings on a rope on a lake.",
  "similarity_score": 0.07999999821186066,
  "score_mini_lm": -0.008783694356679916,
  "diff_mini_lm": 0.08878369256854057
}
------------------------------



Well, for these samples I agree more with the model's predictions than with the ground-truth scores.