In [None]:
from transformers import AutoTokenizer, AutoModel
import yaml
from utils.helpers import normalize_score, cos_sim_score, mean_pooling
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import torch

# Benchmark definitions, keyed by benchmark name. gen_statistics() reads the
# 'data_dir', 'min' and 'max' entries of each benchmark.
# The encoding is pinned so parsing does not depend on the platform's locale default.
with open('./benchmarks.yaml', 'r', encoding='utf-8') as file:
    benchmarks = yaml.safe_load(file)


def gen_statistics(model, tokenizer):
    """Score ``model`` against every benchmark in the module-level ``benchmarks`` mapping.

    For each benchmark, the tab-separated file at its ``data_dir`` is read, every
    distinct term found in the ``Term1``/``Term2`` columns is embedded once
    (tokenizer -> model -> ``mean_pooling``), each row's term pair is scored with
    ``cos_sim_score``, and the Pearson correlation between those predictions and
    the normalized ``Score`` column is reported through ``tqdm.write``.

    Args:
        model: Called as ``model(**encoded)`` on the tokenizer output.
        tokenizer: Called as ``tokenizer(term, padding=True, truncation=False,
            return_tensors="pt")``; its result must expose ``'attention_mask'``.

    Returns:
        dict mapping each benchmark name to the ``(corr, p_value)`` pair obtained
        from ``scipy.stats.pearsonr``. Existing callers that ignore the return
        value are unaffected.

    Raises:
        KeyError: if a benchmark entry lacks ``data_dir``, ``min`` or ``max``.
    """
    results = {}
    for name, cfg in benchmarks.items():
        tqdm.write(f'Processing {name} benchmark.')
        data = pd.read_csv(cfg['data_dir'], sep='\t')

        first_terms = data['Term1'].tolist()
        second_terms = data['Term2'].tolist()

        # Kept under distinct names so the builtins min()/max() are not shadowed.
        score_min, score_max = cfg['min'], cfg['max']

        ## normalize scores
        scores = normalize_score(np.asarray(data['Score'].tolist()), min=score_min, max=score_max)

        # Embed each distinct term exactly once; the pair loop below only does lookups.
        embs = {}
        for term in set(first_terms) | set(second_terms):
            encoded = tokenizer(term, padding=True, truncation=False, return_tensors="pt")
            with torch.no_grad():
                output = model(**encoded)
            # NOTE(review): cos_sim_score receives these {'mean': ...} wrappers rather
            # than the bare pooled embedding — confirm that matches the helper's contract.
            embs[term] = {'mean': mean_pooling(output, encoded['attention_mask'])}

        # Walk the two term columns in lockstep instead of materializing a row
        # Series per index with data.iloc[i].
        pred = [
            cos_sim_score(embs[left], embs[right])
            for left, right in zip(first_terms, second_terms)
        ]

        corr, p_value = pearsonr(scores, pred)
        tqdm.write(f'Corr: {round(corr, 4)}, p-value: {p_value}')

        results[name] = (corr, p_value)

    return results

In [None]:
# Evaluate the checkpoint stored at ./umlsbert (a relative path — presumably a
# local copy of UmlsBERT). Tokenizer and model are loaded from the same location.
checkpoint = './umlsbert'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

gen_statistics(model=model, tokenizer=tokenizer)

In [None]:
# Evaluate medicalai/ClinicalBERT; tokenizer and model share one checkpoint name.
checkpoint = 'medicalai/ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

gen_statistics(model=model, tokenizer=tokenizer)

In [None]:
# Evaluate emilyalsentzer/Bio_ClinicalBERT; tokenizer and model share one checkpoint name.
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

gen_statistics(model=model, tokenizer=tokenizer)

In [None]:
# Evaluate sentence-transformers/all-mpnet-base-v1; tokenizer and model share one checkpoint name.
checkpoint = 'sentence-transformers/all-mpnet-base-v1'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

gen_statistics(model=model, tokenizer=tokenizer)