In [63]:
# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): mode 2 presumably re-imports changed modules before each cell
# runs, so edits to project code are picked up without a kernel restart — confirm
# against the IPython autoreload documentation.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
import os
import sys

# Resolve the parent of the current working directory and expose it on the
# import path so that project-local packages (e.g. `utils`, imported in a
# later cell) can be found.
root_dir = os.path.abspath('..')
print("Root dir: ", root_dir)

# Guard the append: re-running this cell must not stack duplicate entries
# on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

Root dir:  /Users/user010/Desktop/Programming/ML/STS


In [65]:
from utils.helpers import read_config, print_config
import numpy as np

# Read the global config first, then the embeddings config whose path the
# global config stores under configs.embeddings.
glob_cfg = read_config("../config.yaml")
cfg = read_config(glob_cfg.configs.embeddings)

# Show the run-specific settings; cfg.dataset and cfg.model are used in later
# cells to select entries from the global config.
print_config(cfg)

{
  "dataset": "stsb_val",
  "model": "pp_roberta"
}


In [66]:
# Print the full global config for reference.
print_config(glob_cfg)

{
  "root": "/Users/user010/Desktop/Programming/ML/STS",
  "datasets": {
    "stsb_train": {
      "hf_params": {
        "path": "stsb_multi_mt",
        "name": "en",
        "split": "train"
      }
    },
    "stsb_val": {
      "hf_params": {
        "path": "stsb_multi_mt",
        "name": "en",
        "split": "test"
      }
    }
  },
  "models": {
    "mini_lm": {
      "st_params": {
        "model_name_or_path": "paraphrase-MiniLM-L6-v2"
      }
    },
    "pp_roberta": {
      "st_params": {
        "model_name_or_path": "sentence-transformers/paraphrase-distilroberta-base-v1"
      }
    }
  },
  "configs": {
    "embeddings": "/Users/user010/Desktop/Programming/ML/STS/configs/embeddings.yaml"
  }
}


In [67]:
from datasets import load_dataset

# Look up the entry named by cfg.dataset in the global config and forward its
# hf_params as keyword arguments to load_dataset.
dataset_params = glob_cfg.datasets[cfg.dataset].hf_params
print("Params:", dataset_params)
dataset = load_dataset(**dataset_params)

Params: {'path': 'stsb_multi_mt', 'name': 'en', 'split': 'test'}


In [68]:
# Display the loaded dataset (its repr lists the features and the row count).
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [69]:
from sentence_transformers import SentenceTransformer, util

# Look up the entry named by cfg.model in the global config and forward its
# st_params as keyword arguments to the SentenceTransformer constructor.
model_params = glob_cfg.models[cfg.model].st_params
print("Params:", model_params)
model = SentenceTransformer(**model_params)

Params: {'model_name_or_path': 'sentence-transformers/paraphrase-distilroberta-base-v1'}


Downloading (…)7f4ef/.gitattributes: 100%|██████████| 391/391 [00:00<00:00, 3.23MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.41MB/s]
Downloading (…)f279f7f4ef/README.md: 100%|██████████| 3.74k/3.74k [00:00<00:00, 45.5MB/s]
Downloading (…)79f7f4ef/config.json: 100%|██████████| 718/718 [00:00<00:00, 4.82MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 1.30MB/s]
Downloading (…)279f7f4ef/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 4.24MB/s]
Downloading pytorch_model.bin: 100%|██████████| 329M/329M [00:25<00:00, 12.9MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 109kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.01MB/s]
Downloading (…)7f4ef/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 9.04MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.35k/1.35k [00:00<00:00, 3.44MB/s]
Downloading (…)279f7f4ef/vocab.json: 100%|

In [70]:

# Optional: evaluate on a small random subset for faster iteration.
# Currently disabled, so every row of the dataset is used; uncomment to enable.
# NOTE(review): np.random.choice samples with replacement by default, so the
# subset may contain duplicate rows unless replace=False is passed.
# np.random.seed(42)
# n = 50
# rnd_sample = np.random.choice(len(dataset), n)
# dataset = dataset.select(rnd_sample)

In [71]:
# Display the dataset again before the embedding columns are added.
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score'],
    num_rows: 1379
})

In [72]:
def _encode_sentence_pairs(batch):
    """Encode both sentence columns of a batch with the loaded model.

    Returns a dict with the new columns 'emb1' and 'emb2', holding the
    encodings of 'sentence1' and 'sentence2' respectively.
    """
    return {
        target: model.encode(batch[source], convert_to_tensor=True)
        for target, source in (('emb1', 'sentence1'), ('emb2', 'sentence2'))
    }


# Add the embedding columns; with batched=True the helper receives whole
# batches of rows, so each encode call gets a list of sentences.
dataset = dataset.map(_encode_sentence_pairs, batched=True)
dataset

Map: 100%|██████████| 1379/1379 [00:08<00:00, 164.35 examples/s]


Dataset({
    features: ['sentence1', 'sentence2', 'similarity_score', 'emb1', 'emb2'],
    num_rows: 1379
})

In [73]:
# Rescale the gold similarity scores by dividing them by max_score.
# NOTE(review): this overwrites 'similarity_score' in place, so running the
# cell a second time divides the scores again — reload the dataset before
# re-running.
max_score = 5


def _scale_gold_score(row):
    """Return the row's 'similarity_score' divided by max_score."""
    return {'similarity_score': row['similarity_score'] / max_score}


dataset = dataset.map(_scale_gold_score)

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map: 100%|██████████| 1379/1379 [00:00<00:00, 26915.59 examples/s]


In [74]:
def get_sim_score(x):
    """Return the cosine similarity of a row's two embeddings as a Python scalar.

    x: mapping that provides the embeddings under 'emb1' and 'emb2'.
    """
    first_emb, second_emb = x['emb1'], x['emb2']
    similarity = util.pytorch_cos_sim(first_emb, second_emb)
    # .item() only works when the result holds exactly one element.
    return similarity.item()

def add_sim_score_features(x):
    """Build the per-row columns derived from the model's similarity score.

    Returns a dict with 'model_score' (the value from get_sim_score) and
    'diff' (the absolute difference between that value and the row's
    'similarity_score').
    """
    predicted = get_sim_score(x)
    abs_error = np.abs(predicted - x['similarity_score'])
    return {'model_score': predicted, 'diff': abs_error}

# Run add_sim_score_features over the dataset (no batched=True here, so the
# function is written to handle one row at a time) and keep the columns it returns.
dataset = dataset.map(add_sim_score_features)

Map: 100%|██████████| 1379/1379 [00:00<00:00, 1678.00 examples/s]


In [75]:
# Mean of the per-row 'diff' column (absolute gap between model and gold score).
mean_abs_diff = np.mean(dataset['diff'])
print("Average diff:", mean_abs_diff)

Average diff: 0.1664817318903786


In [76]:
import json
# Print a few random rows (without the bulky embedding columns) for a quick
# qualitative look at gold vs. model scores.
# A seeded Generator keeps the printed examples the same on every re-run, and
# replace=False guarantees the sampled rows are distinct.
n_examples = min(3, len(dataset))  # never ask for more rows than exist
rnd = np.random.default_rng(42).choice(len(dataset), size=n_examples, replace=False)

for idx in rnd:
    idx = int(idx)  # convert the NumPy integer to a plain int before indexing
    sample = dataset[idx]
    sample_without_emb = {k: v for k, v in sample.items() if k not in ('emb1', 'emb2')}
    print(json.dumps(sample_without_emb, indent=2))
    print('---'*10)
    print()

{
  "sentence1": "Bahraini protesters and police clash after funeral",
  "sentence2": "Greek protesters, police clash as bailout deal in limbo",
  "similarity_score": 0.12000000476837158,
  "model_score": 0.6286015510559082,
  "diff": 0.5086015462875366
}
------------------------------

{
  "sentence1": "People on motorcycles wearing racing gear ride around a racetrack",
  "sentence2": "People on motorcycles ride around a racetrack",
  "similarity_score": 0.7599999904632568,
  "model_score": 0.9040279984474182,
  "diff": 0.14402800798416138
}
------------------------------

{
  "sentence1": "A dog is running through a pond",
  "sentence2": "A pale dog is running along a dirt path.",
  "similarity_score": 0.2800000011920929,
  "model_score": 0.6049938201904297,
  "diff": 0.3249938189983368
}
------------------------------



Well, for these examples I agree more with the model's predictions than with the ground-truth scores.