<a href="https://colab.research.google.com/github/PavleSavic/MLM_consistency/blob/main/consistency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import string
import logging
from typing import Callable
from collections import OrderedDict
import pandas as pd
import numpy as np
import tensorflow as tf
#!pip install transformers datasets evaluate
from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFAutoModel

In [2]:
# Seed every random number generator the imported libraries may draw from,
# so that a fresh "Restart & Run All" is repeatable.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Run Keras layers in mixed float16/float32 precision. TensorFlow warns that this
# is only fast on Nvidia GPUs with compute capability >= 7.0 and may be slow on CPU.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Only show errors from the transformers library.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


## Relations used in analysis

In [3]:
# Read the relation names used in the analysis, one per line.
# The encoding is pinned so the result does not depend on the platform default,
# and blank lines are skipped so they cannot show up as an empty relation name.
with open("final_19_relations.txt", encoding="utf-8") as relations_file:
    stripped_lines = (line.strip() for line in relations_file)
    relations = sorted(name for name in stripped_lines if name)
print(len(relations))
print(relations)

19
['associated_morphology_of', 'disease_has_abnormal_cell', 'disease_has_associated_anatomic_site', 'disease_has_normal_cell_origin', 'disease_has_normal_tissue_origin', 'disease_mapped_to_gene', 'disease_may_have_associated_disease', 'disease_may_have_finding', 'disease_may_have_molecular_abnormality', 'gene_associated_with_disease', 'gene_encodes_gene_product', 'gene_product_encoded_by_gene', 'gene_product_has_associated_anatomy', 'gene_product_has_biochemical_function', 'gene_product_plays_role_in_biological_process', 'has_physiologic_effect', 'may_prevent', 'may_treat', 'occurs_after']


## Prompts

In [4]:
# Load the prompt templates; later cells select a row by its 'pid' (relation name).
prompts = pd.read_csv('prompts.csv')

In [5]:
# Display the table: one row per relation with its default and human-written prompt.
prompts

Unnamed: 0,pid,default_prompt,human_prompt
0,associated_morphology_of,[X] associated morphology of [Y] .,[X] is associated morphology of [Y] .
1,disease_has_abnormal_cell,[X] disease has abnormal cell [Y] .,[X] has the abnormal cell [Y] .
2,disease_has_associated_anatomic_site,[X] disease has associated anatomic site [Y] .,The disease [X] can stem from the associated a...
3,disease_has_normal_cell_origin,[X] disease has normal cell origin [Y] .,The disease [X] stems from the normal cell [Y] .
4,disease_has_normal_tissue_origin,[X] disease has normal tissue origin [Y] .,The disease [X] stems from the normal tissue [...
5,disease_mapped_to_gene,[X] disease mapped to gene [Y] .,The disease [X] is mapped to gene [Y] .
6,disease_may_have_associated_disease,[X] disease may have associated disease [Y] .,The disease [X] might have the associated dise...
7,disease_may_have_finding,[X] disease may have finding [Y] .,[X] may have [Y] .
8,disease_may_have_molecular_abnormality,[X] disease may have molecular abnormality [Y] .,The disease [X] may have molecular abnormality...
9,gene_associated_with_disease,[X] gene associated with disease [Y] .,The gene [X] is associatied with disease [Y] .


## Masked Language Models

In [6]:
# Display name -> Hugging Face checkpoint, grouped by model family.

# BERT (uncased)
bert_models = {
    'BERT_base': "google-bert/bert-base-uncased",
    'BERT_large': "google-bert/bert-large-uncased",
    'BERT_large_wwm': "google-bert/bert-large-uncased-whole-word-masking",
}

# RoBERTa (cased)
roberta_models = {
    'RoBERTa_base': "FacebookAI/roberta-base",
    'RoBERTa_large': "FacebookAI/roberta-large",
}

# ALBERT (uncased)
albert_models = {
    'ALBERT_base': "albert/albert-base-v2",
    'ALBERT_xxlarge': "albert/albert-xxlarge-v2",
}

# BioBERT (cased)
biobert_models = {
    'BioBERT': "dmis-lab/biobert-base-cased-v1.2",
}

# BioMedBERT (uncased)
biomedbert_models = {
    'BioMedBERT_base_abstract': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
    'BioMedBERT_base_full': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    'BioMedBERT_large_abstract': "microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract",
}

## Example

In [7]:
# Example sentences; each one marks the word to be predicted with [MASK].
texts = [
    "She is from the city of [MASK].",
    "This is a great [MASK].",
    "He is an excellent [MASK].",
]

In [8]:
def change_input_format(input):
  """Return `input` with every '[MASK]' placeholder rewritten as '<mask>'."""
  pieces = input.split('[MASK]')
  return '<mask>'.join(pieces)

In [None]:
def analyze_tokenizer(model_checkpoint:str, inputs):
  """Print how the tokenizer of `model_checkpoint` encodes `inputs`.

  Args:
    model_checkpoint: Hugging Face checkpoint name handed to AutoTokenizer.
    inputs: list of sentences that mark the blank with '[MASK]'.

  Prints the padded input-id matrix, every row decoded back to text, and the
  tokenizer's end-of-sequence token, mask-token id, special tokens and maximum
  input length. Returns nothing.
  """
  tokz = AutoTokenizer.from_pretrained(model_checkpoint)

  # Rewrite the generic '[MASK]' placeholder into the mask token this tokenizer
  # really uses (RoBERTa: '<mask>'). Asking the tokenizer is more reliable than
  # looking for 'roberta' in the checkpoint name, which is case-sensitive and
  # breaks when a checkpoint's name and vocabulary disagree.
  if tokz.mask_token is not None and tokz.mask_token != '[MASK]':
    inputs = [text.replace('[MASK]', tokz.mask_token) for text in inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokz.model_max_length = 512

  tokenization = tokz(inputs, return_tensors='tf', padding=True)  # truncation=True, max_length=tokz.model_max_length
  print(f"Tokenization example: {tokenization['input_ids']}")
  for ids in tokenization['input_ids']:
    print(f"Decoded tokens: {tokz.decode(ids)}")
  print(f"End of sequence token: {tokz.eos_token}")
  print(f"Mask token id: {tokz.mask_token_id}")
  print(f"All special tokens ids: {tokz.all_special_ids}")
  print(f"All special tokens: {tokz.decode(tokz.all_special_ids)}")
  print(f"Maximum model input length: {tokz.model_max_length}")

In [None]:
# One model for each group
separator = '------------------------------------------------------------------------------'
showcased_models = [
    ('BERT_base', bert_models),
    ('BERT_large_wwm', bert_models),
    ('RoBERTa_base', roberta_models),
    ('ALBERT_base', albert_models),
    ('BioBERT', biobert_models),
    ('BioMedBERT_base_full', biomedbert_models),
]
for label, family in showcased_models:
  print(label)
  analyze_tokenizer(family[label], texts)
  print(separator)

BERT_base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenization example: [[ 101 2016 2003 2013 1996 2103 1997  103 1012  102]
 [ 101 2023 2003 1037 2307  103 1012  102    0    0]
 [ 101 2002 2003 2019 6581  103 1012  102    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellent [MASK]. [SEP] [PAD] [PAD]
End of sequence token: None
Mask token id: 103
All special tokens ids: [100, 102, 0, 101, 103]
All special tokens: [UNK] [SEP] [PAD] [CLS] [MASK]
Maximum model input length: 512
------------------------------------------------------------------------------
BERT_large_wwm


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenization example: [[ 101 2016 2003 2013 1996 2103 1997  103 1012  102]
 [ 101 2023 2003 1037 2307  103 1012  102    0    0]
 [ 101 2002 2003 2019 6581  103 1012  102    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellent [MASK]. [SEP] [PAD] [PAD]
End of sequence token: None
Mask token id: 103
All special tokens ids: [100, 102, 0, 101, 103]
All special tokens: [UNK] [SEP] [PAD] [CLS] [MASK]
Maximum model input length: 512
------------------------------------------------------------------------------
RoBERTa_base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenization example: [[    0  2515    16    31     5   343     9 50264     4     2]
 [    0   713    16    10   372 50264     4     2     1     1]
 [    0   894    16    41  4206 50264     4     2     1     1]]
Decoded tokens: <s>She is from the city of<mask>.</s>
Decoded tokens: <s>This is a great<mask>.</s><pad><pad>
Decoded tokens: <s>He is an excellent<mask>.</s><pad><pad>
End of sequence token: </s>
Mask token id: 50264
All special tokens ids: [0, 2, 3, 1, 50264]
All special tokens: <s></s><unk><pad><mask>
Maximum model input length: 512
------------------------------------------------------------------------------
ALBERT_base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Tokenization example: [[   2   39   25   37   14  136   16    4   13    9    3]
 [   2   48   25   21  374    4   13    9    3    0    0]
 [   2   24   25   40 5977    4   13    9    3    0    0]]
Decoded tokens: [CLS] she is from the city of[MASK].[SEP]
Decoded tokens: [CLS] this is a great[MASK].[SEP]<pad><pad>
Decoded tokens: [CLS] he is an excellent[MASK].[SEP]<pad><pad>
End of sequence token: [SEP]
Mask token id: 4
All special tokens ids: [2, 3, 1, 0, 4]
All special tokens: [CLS][SEP]<unk><pad>[MASK]
Maximum model input length: 512
------------------------------------------------------------------------------
BioBERT


config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Tokenization example: [[ 101 1131 1110 1121 1103 1331 1104  103  119  102]
 [ 101 1142 1110  170 1632  103  119  102    0    0]
 [ 101 1119 1110 1126 6548  103  119  102    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellent [MASK]. [SEP] [PAD] [PAD]
End of sequence token: None
Mask token id: 103
All special tokens ids: [100, 102, 0, 101, 103]
All special tokens: [UNK] [SEP] [PAD] [CLS] [MASK]
Maximum model input length: 512
------------------------------------------------------------------------------
BioMedBERT_base_full


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Tokenization example: [[   2 4693 1977 2037 1920 8318 1927    4   18    3]
 [   2 2052 1977   43 4733    4   18    3    0    0]
 [   2 2234 1977 1925 7526    4   18    3    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellent [MASK]. [SEP] [PAD] [PAD]
End of sequence token: None
Mask token id: 4
All special tokens ids: [1, 3, 0, 2, 4]
All special tokens: [UNK] [SEP] [PAD] [CLS] [MASK]
Maximum model input length: 512
------------------------------------------------------------------------------


In [9]:
def _is_blank_or_punctuation(token:str) -> bool:
  """Return True when `token` is empty or only punctuation once surrounding whitespace is trimmed."""
  return all(char in string.punctuation for char in token.strip())


def get_model_predictions(model_checkpoint:str, inputs:list[str], top_n=5, verbose=0):
  """Return the `top_n` most likely fillers for the mask in every input sentence.

  Args:
    model_checkpoint: Hugging Face checkpoint name of a masked language model.
    inputs: sentences that mark the blank with '[MASK]'. Only the first mask
      position found in each tokenized sentence is scored.
    top_n: number of predictions to keep per sentence.
    verbose: when truthy, print the model summary and every filled-in sentence.

  Returns:
    np.ndarray built from one list of decoded token strings per input, best
    prediction first. Tokens are returned exactly as the tokenizer decodes them
    (RoBERTa predictions therefore keep their leading space). Special tokens and
    tokens that are empty or pure punctuation are never returned.

  Raises:
    IndexError: if a tokenized sentence contains no mask token, e.g. because the
      sentence had no placeholder or truncation removed it.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

  # Rewrite the generic '[MASK]' placeholder into the mask token this tokenizer
  # really uses (RoBERTa: '<mask>'). Asking the tokenizer is more reliable than
  # looking for 'roberta' in the checkpoint name.
  mask_token = tokenizer.mask_token if tokenizer.mask_token is not None else '[MASK]'
  if mask_token != '[MASK]':
    inputs = [text.replace('[MASK]', mask_token) for text in inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  if verbose:
    print(f'Choosen model: {model_checkpoint}')
    model.summary()

  # Tokenizing the inputs
  tokenized_inputs = tokenizer(inputs, return_tensors="tf", padding=True, truncation=True) # max_length=128 (by default max_length = tokenizer.model_max_length)

  # Getting the token logits from the model
  token_logits = model(**tokenized_inputs).logits

  # Loop-invariant lookups, computed once instead of once per candidate token.
  input_ids = tokenized_inputs["input_ids"].numpy()
  special_ids = set(tokenizer.all_special_ids)
  # fallback not necessary in practice (all tokenizers used here have mask_token_id defined)
  mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids(["[MASK]"])[0]

  outputs = []

  for i, text in enumerate(inputs):
    # position of the first mask token in this (padded) sentence
    mask_token_index = np.argwhere(input_ids[i] == mask_token_id)[0, 0]

    mask_token_logits = token_logits[i, mask_token_index, :]

    # vocabulary ids ordered from most to least likely
    top_tokens = np.argsort(-mask_token_logits.numpy()).tolist()

    predictions = []
    if verbose:
      print(f"Input: {text}")

    for token_id in top_tokens:
      # Skip special tokens
      if token_id in special_ids:
        continue

      predicted_token = tokenizer.decode([token_id])

      # Skip punctuation tokens. The token is trimmed first because some
      # tokenizers decode with a leading space (' .'), and every character is
      # checked so multi-character punctuation ('...') is skipped as well.
      if _is_blank_or_punctuation(predicted_token):
        continue

      predictions.append(predicted_token)
      if verbose:
        print(f">>> {text.replace(mask_token, predicted_token)}")

      if len(predictions) == top_n:
        break
    if verbose:
      print()

    outputs.append(predictions)

  return np.array(outputs)

In [None]:
# Top-10 fillers from RoBERTa base for each of the three example sentences.
pred = get_model_predictions(model_checkpoint=roberta_models['RoBERTa_base'], inputs=texts, top_n=10)
pred

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

array([[' Chicago', ' London', ' Toronto', ' Seattle', ' Boston',
        ' Portland', ' Philadelphia', ' Vancouver', ' Minneapolis',
        ' Houston'],
       [' idea', ' example', ' article', ' video', ' post', ' story',
        ' read', ' question', ' book', ' game'],
       [' player', ' writer', ' athlete', ' student', ' shooter',
        ' coach', ' hitter', ' defender', ' guy', ' broadcaster']],
      dtype='<U13')

# Relations

In [10]:
# Load the 'occurs_after' examples, keeping only the head name, relation and tail names columns.
occurs_data = pd.read_csv('occurs_after_1000.csv', usecols=["head_name", "rel", "tail_names"])
# Preview the first ten rows.
occurs_data.head(10)

Unnamed: 0,head_name,rel,tail_names
0,Post influenza vaccination encephalitis,occurs_after,Administration of influenza vaccine
1,Basal cell carcinoma recurrent following cryos...,occurs_after,Cryosurgery
2,Adverse effect from PUVA photochemotherapy,occurs_after,Light therapy || Photochemotherapy with psoral...
3,Allergy to pea,occurs_after,Allergic sensitization
4,Bite of unidentified snake with neurological s...,occurs_after,Animal bite
5,Allergy to hypothalamic hormone,occurs_after,Allergic sensitization
6,Late effect of accidental injury,occurs_after,Traumatic injury
7,Radiotherapy scar,occurs_after,Procedure || Radiation oncology AND/OR radioth...
8,Atonic postpartum hemorrhage,occurs_after,Delivery procedure
9,Late effect of skin and subcutaneous tissue in...,occurs_after,Injury || Traumatic injury


In [11]:
# Prompts to use: both templates registered for the relation of the first dataset row
rel_name = occurs_data['rel'][0]
relation_prompts = prompts.loc[prompts['pid'] == rel_name]
default_prompt = relation_prompts['default_prompt'].tolist()[0]
human_prompt = relation_prompts['human_prompt'].tolist()[0]
print(f"Default prompt: {default_prompt}\nHuman prompt: {human_prompt}")

Default prompt: [X] occurs after [Y] .
Human prompt: [X] occurs after [Y] .


In [12]:
# Preparing inputs
def prepare_inputs(data, prompt:str):
  """Turn a prompt template into one masked sentence per row of `data`.

  Args:
    data: table whose 'head_name' column holds the text inserted for '[X]'
      (read via data['head_name'].tolist()).
    prompt: template containing the '[X]' slot and the '[Y]' slot.

  Returns:
    list[str]: the template with '[X]' replaced by each head name and '[Y]'
    replaced by '[MASK]'. A space between '[Y]' and a following punctuation
    mark is dropped, so '... [Y] .' becomes '... [MASK].'.
  """
  # Mask the '[Y]' slot on the template itself, before any head name is inserted,
  # so text inside a head name can never be mistaken for the slot.
  template = prompt
  for mark in '.,;:!?':
    template = template.replace(f'[Y] {mark}', f'[MASK]{mark}')
  # Any remaining slot (mid-sentence, or written without a trailing space) keeps
  # its surrounding spacing.
  template = template.replace('[Y]', '[MASK]')

  heads = data['head_name'].tolist()
  return [template.replace('[X]', head) for head in heads]

In [None]:
# Build the masked sentences from the default prompt and preview the first ten.
inputs = prepare_inputs(occurs_data, default_prompt)
print(inputs[:10])

['Post influenza vaccination encephalitis occurs after [MASK].', 'Basal cell carcinoma recurrent following cryosurgery occurs after [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK].', 'Allergy to pea occurs after [MASK].', 'Bite of unidentified snake with neurological signs occurs after [MASK].', 'Allergy to hypothalamic hormone occurs after [MASK].', 'Late effect of accidental injury occurs after [MASK].', 'Radiotherapy scar occurs after [MASK].', 'Atonic postpartum hemorrhage occurs after [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK].']


In [None]:
# Getting predictions: the single best filler per sentence from the
# BioMedBERT_base_abstract checkpoint.
pred_1 = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_abstract'], inputs=inputs, top_n=1)
pred_1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

array([['vaccination'],
       ['years'],
       ['transplantation'],
       ['birth'],
       ['birth'],
       ['surgery'],
       ['birth'],
       ['radiotherapy'],
       ['delivery'],
       ['surgery'],
       ['trauma'],
       ['stroke'],
       ['surgery'],
       ['trauma'],
       ['injection'],
       ['surgery'],
       ['surgery'],
       ['transplantation'],
       ['surgery'],
       ['trauma'],
       ['surgery'],
       ['transplantation'],
       ['surgery'],
       ['immunization'],
       ['chemotherapy'],
       ['cholecystectomy'],
       ['pregnancy'],
       ['esophagectomy'],
       ['varicella'],
       ['treatment'],
       ['surgery'],
       ['surgery'],
       ['cesarean'],
       ['laminectomy'],
       ['surgery'],
       ['trauma'],
       ['colonoscopy'],
       ['catheterization'],
       ['surgery'],
       ['surgery'],
       ['surgery'],
       ['ingestion'],
       ['surgery'],
       ['injection'],
       ['delivery'],
       ['surgery'],
     

In [13]:
# A row can list several accepted tails separated by ' || '; split each row into a list.
tails = [tail_names.split(' || ') for tail_names in occurs_data['tail_names'].tolist()]
tails

[['Administration of influenza vaccine'],
 ['Cryosurgery'],
 ['Light therapy', 'Photochemotherapy with psoralens and ultraviolet A'],
 ['Allergic sensitization'],
 ['Animal bite'],
 ['Allergic sensitization'],
 ['Traumatic injury'],
 ['Procedure', 'Radiation oncology AND/OR radiotherapy'],
 ['Delivery procedure'],
 ['Injury', 'Traumatic injury'],
 ['Fat necrosis'],
 ['Spontaneous cerebral hemorrhage'],
 ['Transplantation',
  'Implantation of prosthetic device',
  'Surgical construction of arteriovenous shunt'],
 ['Injury of knee', 'Traumatic event'],
 ['Procedure', 'Injection'],
 ['Colostomy', 'Procedure'],
 ['Procedure'],
 ['Corneal transplant'],
 ['Allergic sensitization'],
 ['Traumatic injury', 'Traumatic event'],
 ['Extraction of cataract', 'Implantation of phakic intraocular lens implant'],
 ['Transplantation of bone marrow', 'Grafting procedure'],
 ['Allergic sensitization'],
 ['Active or passive immunization'],
 ['Allergic sensitization'],
 ['Implantation of prosthetic device', 

In [14]:
# Computing top_n accuracy - percentage of examples where one of the top n answers is in the set of correct tails (ignore case)
def compute_accuracy(predictions, tails, verbose=0):
  """Return the percentage of examples with at least one correct prediction.

  Args:
    predictions: one sequence of predicted strings per example.
    tails: one sequence of accepted answers per example, aligned with `predictions`.
    verbose: when truthy, print the normalized predictions and answers of every
      example and flag the examples that count as a hit.

  Returns:
    float: hits / number of examples * 100, or 0.0 when `predictions` is empty.
    Strings are compared after trimming surrounding whitespace and lower-casing.
  """
  n = len(predictions)
  if n == 0:
    # nothing to score: report 0 % instead of dividing by zero
    return 0.0

  hits = 0
  for i in range(n):
    # strip - RoBERTa models predict extra space at the beginning
    preds = [prediction.strip().lower() for prediction in predictions[i]]
    tls = [tail.strip().lower() for tail in tails[i]]

    if verbose:
      print(f"Predictions: {preds}")
      print(f"True answers: {tls}")

    # a hit as soon as any prediction matches any accepted answer
    if set(preds).intersection(tls):
      hits += 1
      if verbose:
        print("\033[1mTrue prediction!\033[0m")

  return (hits/n)*100

In [None]:
# Share of examples whose single best prediction matches one of the accepted tails.
top_1_acc = compute_accuracy(pred_1, tails)
print(f'Top 1 accuracy: {top_1_acc:.2f} %')

Top 1 accuracy: 4.00 %


In [None]:
# Ten best fillers per sentence from the BioMedBERT_base_full checkpoint.
pred_10 = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_full'], inputs=inputs, top_n=10)
pred_10

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

array([['vaccination', 'birth', 'immunization', 'childhood',
        'vaccinations', 'infection', 'hospitalization', 'delivery',
        'influenza', 'seroconversion'],
       ['radiotherapy', 'years', 'surgery', 'chemotherapy', 'decades',
        'months', 'recurrence', 'treatment', 'excision', 'irradiation'],
       ['surgery', 'radiotherapy', 'treatment', 'chemotherapy',
        'irradiation', 'menopause', 'delivery', 'birth', 'radiation',
        'therapy'],
       ['surgery', 'birth', 'pregnancy', 'exercise', 'childbirth',
        'childhood', 'weaning', 'vaccination', 'transplantation',
        'trauma'],
       ['childhood', 'trauma', 'birth', 'childbirth', 'accidents',
        'death', 'delivery', 'adolescence', 'surgery', 'ingestion'],
       ['birth', 'childbirth', 'surgery', 'puberty', 'trauma',
        'childhood', 'menopause', 'vaccination', 'pregnancy',
        'splenectomy'],
       ['surgery', 'treatment', 'discharge', 'trauma', 'operation',
        'recovery', 'hospita

In [None]:
# Share of examples where any of the ten predictions matches one of the accepted tails.
top_10_acc = compute_accuracy(pred_10, tails)
print(f'Top 10 accuracy: {top_10_acc:.2f} %')

Top 10 accuracy: 12.10 %


In [15]:
def cosine_similarity(v1, v2):
    """Return np.dot(v1, v2) divided by the product of the two norms.

    Both arguments are converted with np.array first. For 1-D inputs this is the
    usual cosine similarity (a scalar); for higher-dimensional inputs the result
    has whatever shape np.dot produces.
    """
    a, b = np.array(v1), np.array(v2)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / scale

print(f'Cosine Similarity: {cosine_similarity([1,2,3], [4,5,6])}')

Cosine Similarity: 0.9746318461970762


In [None]:
model_name = bert_models['BERT_base']
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name, from_pt=True, output_hidden_states=True)

input_texts = [
    "The cat sat on the mat.",
    "The dog sat on the mat.",
    "The kitten sat on the mat.",
    "The mailman sat on the mat."
    ]

# Stored under its own name on purpose: `inputs` already holds the masked prompt
# sentences that other cells read, and must not be replaced by tokenizer output.
encoded_inputs = tokenizer(input_texts, return_tensors="tf", padding=True, truncation=True)

print(f"Tokens: {encoded_inputs['input_ids']}")
for i, input_ids in enumerate(encoded_inputs['input_ids']):
    print(f"Decoded tokens for sentence {i}: {tokenizer.decode(input_ids)}")

outputs = model(**encoded_inputs)

hidden_states = outputs.hidden_states

if not isinstance(hidden_states, tuple):
    raise ValueError("Model configuration does not support returning hidden states!")

# combining last 4 (TRY OTHER) layers for more robust results
last_four_layers = [hidden_states[i] for i in range(-4, 0)]
combined_layers = tf.reduce_mean(tf.stack(last_four_layers), axis=0)

# normalizing the combined embeddings
normalized_embeddings = tf.nn.l2_normalize(combined_layers, axis=2)

# Token at index 2 of every sentence, i.e. the one right after "[CLS] the".
# For a word split into several word pieces this is only its first piece.
third_token_embeddings = normalized_embeddings[:, 2, :]

print(third_token_embeddings)

# compare word embeddings: the word that varies in each sentence, and the pairs to report
compared_words = ["cat", "dog", "kitten", "mailman"]
compared_pairs = [(0, 1), (0, 2), (1, 2), (0, 3), (1, 3), (2, 3)]
for first, second in compared_pairs:
    # row vector times column vector gives a (1, 1) result, hence the [0][0]
    row_vector = tf.reshape(third_token_embeddings[first], shape=(1, -1))
    column_vector = tf.reshape(third_token_embeddings[second], shape=(-1, 1))
    similarity = cosine_similarity(row_vector, column_vector)[0][0]
    print(f'Cosine similarity ("{compared_words[first]}", "{compared_words[second]}"): {similarity}')

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokens: [[  101  1996  4937  2938  2006  1996 13523  1012   102     0]
 [  101  1996  3899  2938  2006  1996 13523  1012   102     0]
 [  101  1996 18401  2938  2006  1996 13523  1012   102     0]
 [  101  1996  5653  2386  2938  2006  1996 13523  1012   102]]
Decoded tokens for sentence 0: [CLS] the cat sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 1: [CLS] the dog sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 2: [CLS] the kitten sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 3: [CLS] the mailman sat on the mat. [SEP]
tf.Tensor(
[[-0.01194338 -0.0051559   0.0104109  ... -0.02537592  0.03168032
   0.04000831]
 [ 0.02839966  0.02874123  0.00292385 ... -0.04067679  0.02913046
   0.03197852]
 [-0.04018649 -0.03780384 -0.012027   ... -0.03123694  0.027043
  -0.03035366]
 [ 0.05976407 -0.01487961  0.02051217 ...  0.01845756  0.0075776
  -0.05614597]], shape=(4, 768), dtype=float32)
Cosine similarity ("cat", "dog"): 0.8437002897262573
Cosine similarity ("cat", "k

In [None]:
def replace_masks(model_checkpoint, masked_inputs, tails):
  """Fill the mask placeholder of every sentence with each of its candidate words.

  Args:
    model_checkpoint: checkpoint name; when it contains 'roberta' the placeholder
      looked for is '<mask>', otherwise '[MASK]'.
    masked_inputs: sentences containing the placeholder.
    tails: for every sentence, the candidate words to insert.

  Returns:
    list[list[str]]: for sentence i, one filled-in sentence per entry of tails[i];
    every candidate is trimmed and lower-cased before insertion.
  """
  placeholder = '<mask>' if 'roberta' in model_checkpoint else '[MASK]'

  full_sentences = []
  for i, sentence in enumerate(masked_inputs):
    # strip - decoded RoBERTa predictions start with an extra space (the 'Ġ' word-start marker)
    candidates = (tail.strip().lower() for tail in tails[i])
    full_sentences.append([sentence.replace(placeholder, word) for word in candidates])

  return full_sentences

# Fill the first ten masked sentences, first with the accepted tails and then with
# the top-10 predictions. `inputs` must be the list of masked sentence strings here,
# because replace_masks calls .replace() on every element.
print(replace_masks(biomedbert_models['BioMedBERT_base_full'], inputs[:10], tails[:10]))
print(replace_masks(biomedbert_models['BioMedBERT_base_full'], inputs[:10], pred_10[:10]))

[['Post influenza vaccination encephalitis occurs after administration of influenza vaccine.'], ['Basal cell carcinoma recurrent following cryosurgery occurs after cryosurgery.'], ['Adverse effect from PUVA photochemotherapy occurs after light therapy.', 'Adverse effect from PUVA photochemotherapy occurs after photochemotherapy with psoralens and ultraviolet a.'], ['Allergy to pea occurs after allergic sensitization.'], ['Bite of unidentified snake with neurological signs occurs after animal bite.'], ['Allergy to hypothalamic hormone occurs after allergic sensitization.'], ['Late effect of accidental injury occurs after traumatic injury.'], ['Radiotherapy scar occurs after procedure.', 'Radiotherapy scar occurs after radiation oncology and/or radiotherapy.'], ['Atonic postpartum hemorrhage occurs after delivery procedure.'], ['Late effect of skin and subcutaneous tissue injury occurs after injury.', 'Late effect of skin and subcutaneous tissue injury occurs after traumatic injury.']]
[

In [None]:
# 1. obtain embeddings of the expected token and the predicted token from the word embedding layer of a model
# 2. compute cosine similarity metric (TO DO: Multi Token scenario: Averaging cosine similarities, BLEU score (???))

# single token scenario: a) only first token of true answer taken into consideration
def compute_cos_sim_accuracy_v1(model_checkpoint:str, original_inputs, predictions, tails, verbose=0, threshold=0.95):
  """Percentage of examples whose best prediction is embedding-similar to a true answer.

  Each prediction and each true answer is substituted for the mask of its
  prompt (via `replace_masks`) and the resulting sentences are encoded. The
  embedding read at the original mask position (mean of the last four hidden
  layers, L2-normalised) is compared pairwise with `cosine_similarity`.
  Because only the mask position is read, only the first token of a
  multi-token answer is taken into consideration.

  Args:
    model_checkpoint: checkpoint name given to `AutoTokenizer` / `TFAutoModel`.
    original_inputs: prompts, each expected to contain a mask token.
    predictions: per-example collection of predicted words.
    tails: per-example collection of true answers.
    verbose: truthy to print every pairwise similarity and the best pair.
    threshold: minimum best similarity for an example to count as a hit
      (defaults to the previously hard-coded 0.95).

  Returns:
    Hit rate as a percentage; 0.0 when `original_inputs` is empty.

  Raises:
    ValueError: if a prompt contains no mask token after tokenization, or the
      model does not return its hidden states as a tuple.
  """
  n = len(original_inputs)
  if n == 0:
    # Nothing to score: skip the model download and the division by zero below.
    return 0.0

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModel.from_pretrained(model_checkpoint, from_pt=True, output_hidden_states=True)

  # Adjusting inputs for RoBERTa models
  if 'roberta' in model_checkpoint:
    original_inputs = [change_input_format(prompt) for prompt in original_inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  # Tokenizing original inputs (max_length defaults to tokenizer.model_max_length)
  tokenized_inputs = tokenizer(original_inputs, return_tensors="tf", padding=True, truncation=True)
  # Converted once; the original re-converted the whole batch for every example.
  input_ids = tokenized_inputs["input_ids"].numpy()

  # Fallback kept from the original, although all tokenizers used here define mask_token_id.
  mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids(["[MASK]"])[0]

  # Position of the first mask token of every example
  mask_token_indexes = []
  for example_number, example_ids in enumerate(input_ids):
    mask_positions = np.flatnonzero(example_ids == mask_token_id)
    if mask_positions.size == 0:
      raise ValueError(f"No mask token found in input {example_number} after tokenization (was it truncated?)")
    mask_token_indexes.append(mask_positions[0])

  predicted_sentences = replace_masks(model_checkpoint, original_inputs, predictions)
  true_sentences = replace_masks(model_checkpoint, original_inputs, tails)

  def embeddings_at(sentences, position):
    """Tokenize `sentences` and return (encoding, normalised embeddings at `position`)."""
    tokenized = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True)
    hidden_states = model(**tokenized).hidden_states

    if not isinstance(hidden_states, tuple):
      raise ValueError("Model configuration does not support returning hidden states!")

    # combining last 4 (TRY OTHER) layers for more robust results
    combined_last_four_layers = tf.reduce_mean(tf.stack(list(hidden_states[-4:])), axis=0)
    # normalizing the combined embeddings, then keeping only the requested position
    return tokenized, tf.nn.l2_normalize(combined_last_four_layers, axis=2)[:, position, :]

  hits = 0
  # for every example in relation dataset
  for i in range(n):
    mask_index = mask_token_indexes[i]

    predicted_sentences_tkz, pred_embeddings = embeddings_at(predicted_sentences[i], mask_index)
    true_sentences_tkz, true_embeddings = embeddings_at(true_sentences[i], mask_index)

    highest_similarity = -1
    most_similar_pred = None
    most_similar_true = None

    for j in range(len(pred_embeddings)):
      for k in range(len(true_embeddings)):

        similarity = cosine_similarity(tf.reshape(pred_embeddings[j], shape=(1, -1)), tf.reshape(true_embeddings[k], shape=(-1, 1)))[0][0]

        if verbose:
          pred_token = tokenizer.decode(predicted_sentences_tkz["input_ids"][j].numpy()[mask_index])
          true_token = tokenizer.decode(true_sentences_tkz["input_ids"][k].numpy()[mask_index])
          # some words from true answers cut into multiple tokens (!!!)
          print(f"Cosine similarity between '{pred_token}' and '{true_token}': {similarity}")

        if similarity > highest_similarity:
          highest_similarity = similarity
          if verbose:
            most_similar_pred = pred_token
            most_similar_true = true_token

    if verbose:
      print(f"\033[1mHighest similarity is between '{most_similar_pred}' and '{most_similar_true}': {highest_similarity}\033[0m")
      print('--------------------------------------------')

    if highest_similarity >= threshold:
      hits += 1

  return (hits/n) * 100

# Verbose v1 run on the first 10 prompts, using RoBERTa-base top-5 predictions.
# BioMedBERT variant kept for reference:
# print(compute_cos_sim_accuracy_v1(biomedbert_models['BioMedBERT_base_full'], inputs[:10], pred_10[:10], tails[:10], verbose=1))
pred_5_roberta = get_model_predictions(model_checkpoint=roberta_models['RoBERTa_base'], inputs=inputs[:10], top_n=5)
print(compute_cos_sim_accuracy_v1(roberta_models['RoBERTa_base'], inputs[:10], pred_5_roberta[:10], tails[:10], verbose=1))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Cosine similarity between ' vaccination' and ' administration': 0.8891960978507996
Cosine similarity between ' vaccinations' and ' administration': 0.8829200863838196
Cosine similarity between ' infection' and ' administration': 0.8946338891983032
Cosine similarity between ' exposure' and ' administration': 0.9050487875938416
Cosine similarity between ' influenza' and ' administration': 0.8568709492683411
[1mHighest similarity is between ' exposure' and ' administration': 0.9050487875938416[0m
--------------------------------------------
Cosine similarity between ' surgery' and ' cry': 0.8358896374702454
Cosine similarity between ' chemotherapy' and ' cry': 0.858271598815918
Cosine similarity between ' treatment' and ' cry': 0.8431753516197205
Cosine similarity between ' death' and ' cry': 0.8333811163902283
Cosine similarity between ' diagnosis' and ' cry': 0.8337224125862122
[1mHighest similarity is between ' chemotherapy' and ' cry': 0.858271598815918[0m
------------------------

In [None]:
# v1 cosine-similarity accuracy over all inputs using the `pred_1` predictions.
# NOTE(review): `pred_1` comes from an earlier cell; confirm it was produced by the
# same checkpoint (BioMedBERT_base_abstract) that is used here for the embeddings.
top_1_cos_acc = compute_cos_sim_accuracy_v1(biomedbert_models['BioMedBERT_base_abstract'], inputs, pred_1, tails)
print(f'Top 1 accuracy: {top_1_cos_acc:.2f} %')

Top 1 accuracy: 16.60 %


In [None]:
# v1 cosine-similarity accuracy over all inputs using the `pred_10` predictions.
# NOTE(review): this cell embeds with BioMedBERT_base_full while the top-1 cell uses
# BioMedBERT_base_abstract, so the two figures are not directly comparable - confirm intent.
top_10_cos_acc = compute_cos_sim_accuracy_v1(biomedbert_models['BioMedBERT_base_full'], inputs, pred_10, tails)
print(f'Top 10 accuracy: {top_10_cos_acc:.2f} %')

Top 10 accuracy: 37.30 %


In [None]:
# single token scenario: b) combine embeddings of all tokens of the answer - from mask_index to end ('.' token) - Problem: different granularity between predictions and true phrases
def compute_cos_sim_accuracy_v2(model_checkpoint:str, original_inputs, predictions, tails, verbose=0, threshold=0.95):
  """Percentage of examples whose best prediction is embedding-similar to a true phrase.

  Predictions and true answers are substituted for the mask of their prompt
  (via `replace_masks`) and encoded; embeddings are the mean of the last four
  hidden layers. A prediction is represented by the L2-normalised embedding
  at the original mask position. A true answer is represented by the
  L2-normalised mean of the embeddings from the mask position up to (not
  including) the first '.' token that follows it, so multi-token answers
  contribute all of their tokens.

  Args:
    model_checkpoint: checkpoint name given to `AutoTokenizer` / `TFAutoModel`.
    original_inputs: prompts, each expected to contain a mask token.
    predictions: per-example collection of predicted words.
    tails: per-example collection of true answers.
    verbose: truthy to print every pairwise similarity and the best pair.
    threshold: minimum best similarity for an example to count as a hit
      (defaults to the previously hard-coded 0.95).

  Returns:
    Hit rate as a percentage; 0.0 when `original_inputs` is empty.

  Raises:
    ValueError: if a prompt contains no mask token after tokenization, or the
      model does not return its hidden states as a tuple.
  """
  n = len(original_inputs)
  if n == 0:
    # Nothing to score: skip the model download and the division by zero below.
    return 0.0

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModel.from_pretrained(model_checkpoint, from_pt=True, output_hidden_states=True)

  # Adjusting inputs for RoBERTa models
  if 'roberta' in model_checkpoint:
    original_inputs = [change_input_format(prompt) for prompt in original_inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  # Tokenizing original inputs (max_length defaults to tokenizer.model_max_length)
  tokenized_inputs = tokenizer(original_inputs, return_tensors="tf", padding=True, truncation=True)
  # Converted once; the original re-converted the whole batch for every example.
  input_ids = tokenized_inputs["input_ids"].numpy()

  # Fallback kept from the original, although all tokenizers used here define mask_token_id.
  mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids(["[MASK]"])[0]

  # Position of the first mask token of every example
  mask_token_indexes = []
  for example_number, example_ids in enumerate(input_ids):
    mask_positions = np.flatnonzero(example_ids == mask_token_id)
    if mask_positions.size == 0:
      raise ValueError(f"No mask token found in input {example_number} after tokenization (was it truncated?)")
    mask_token_indexes.append(mask_positions[0])

  predicted_sentences = replace_masks(model_checkpoint, original_inputs, predictions)
  true_sentences = replace_masks(model_checkpoint, original_inputs, tails)

  # Loop invariants: the id of the sentence-final period and the special-token ids.
  period_token_id = tokenizer.convert_tokens_to_ids(".")
  special_ids = tokenizer.all_special_ids

  def mean_last_four_layers(tokenized):
    """Run the model and average its last four hidden layers (batch, seq, dim)."""
    hidden_states = model(**tokenized).hidden_states

    if not isinstance(hidden_states, tuple):
      raise ValueError("Model configuration does not support returning hidden states!")

    # combining last 4 (TRY OTHER) layers for more robust results
    return tf.reduce_mean(tf.stack(list(hidden_states[-4:])), axis=0)

  hits = 0
  # for every example in relation dataset
  for i in range(n):
    mask_index = mask_token_indexes[i]

    predicted_sentences_tkz = tokenizer(predicted_sentences[i], return_tensors="tf", padding=True, truncation=True)
    # normalizing the combined embeddings, then keeping only the mask position
    pred_embeddings = tf.nn.l2_normalize(mean_last_four_layers(predicted_sentences_tkz), axis=2)[:, mask_index, :]

    true_sentences_tkz = tokenizer(true_sentences[i], return_tensors="tf", padding=True, truncation=True)
    true_layers = mean_last_four_layers(true_sentences_tkz)
    true_ids = true_sentences_tkz["input_ids"].numpy()

    # One embedding and one end index per true phrase (the end differs per phrase).
    end_indexes = []
    true_embeddings = []

    for m, sentence_ids in enumerate(true_ids):
      # first full stop located after the start of the phrase
      full_stop_indices = np.where(sentence_ids == period_token_id)[0]
      later_stops = full_stop_indices[full_stop_indices > mask_index]

      if later_stops.size > 0:
        end_index = later_stops[0]
      else:
        # No '.' after the phrase (should not happen): stop at the first special
        # token AFTER the phrase start. Searching from position 0 would always
        # find the leading CLS/<s> token and produce an empty (NaN) phrase.
        special_positions = np.where(np.isin(sentence_ids, special_ids))[0]
        later_specials = special_positions[special_positions > mask_index]
        end_index = later_specials[0] if later_specials.size > 0 else len(sentence_ids)

      # embeddings from the phrase start (mask position) up to the end index
      phrase_embedding = true_layers[m, mask_index:end_index, :]

      # averaging over the phrase tokens, then normalizing
      true_embeddings.append(tf.nn.l2_normalize(tf.reduce_mean(phrase_embedding, axis=0), axis=0))
      end_indexes.append(end_index)

    highest_similarity = -1
    most_similar_pred = None
    most_similar_true = None

    for j in range(len(pred_embeddings)):
      for k in range(len(true_embeddings)):

        similarity = cosine_similarity(tf.reshape(pred_embeddings[j], shape=(1, -1)), tf.reshape(true_embeddings[k], shape=(-1, 1)))[0][0]

        if verbose:
          pred_token = tokenizer.decode(predicted_sentences_tkz["input_ids"][j].numpy()[mask_index])
          true_phrase = tokenizer.decode(true_ids[k][mask_index:end_indexes[k]])
          # some words from true answers cut into multiple tokens (!!!)
          print(f"Cosine similarity between '{pred_token}' and '{true_phrase}': {similarity}")

        if similarity > highest_similarity:
          highest_similarity = similarity
          if verbose:
            most_similar_pred = pred_token
            most_similar_true = true_phrase

    if verbose:
      print(f"\033[1mHighest similarity is between '{most_similar_pred}' and '{most_similar_true}': {highest_similarity}\033[0m")
      print('--------------------------------------------')

    if highest_similarity >= threshold:
      hits += 1

  return (hits/n) * 100

# Verbose v2 run on the first 10 prompts, using RoBERTa-large top-5 predictions.
# Note: this rebinds `pred_5_roberta`, which an earlier cell filled with RoBERTa-base predictions.
pred_5_roberta = get_model_predictions(model_checkpoint=roberta_models['RoBERTa_large'], inputs=inputs[:10], top_n=5)
print(compute_cos_sim_accuracy_v2(roberta_models['RoBERTa_large'], inputs[:10], pred_5_roberta[:10], tails[:10], verbose=1))

Cosine similarity between ' vaccination' and ' administration of influenza vaccine': 0.9667031764984131
Cosine similarity between ' infection' and ' administration of influenza vaccine': 0.933690071105957
Cosine similarity between ' treatment' and ' administration of influenza vaccine': 0.9394055008888245
Cosine similarity between ' exposure' and ' administration of influenza vaccine': 0.9482665061950684
Cosine similarity between ' vaccinations' and ' administration of influenza vaccine': 0.954810619354248
[1mHighest similarity is between ' vaccination' and ' administration of influenza vaccine': 0.9667031764984131[0m
--------------------------------------------
Cosine similarity between ' chemotherapy' and ' cryosurgery': 0.8986359238624573
Cosine similarity between ' treatment' and ' cryosurgery': 0.9033290147781372
Cosine similarity between ' surgery' and ' cryosurgery': 0.9313793778419495
Cosine similarity between ' years' and ' cryosurgery': 0.8706271648406982
Cosine similarity 

In [None]:
# Verbose v2 run on the first 20 prompts, using BioMedBERT (full text) top-5 predictions.
pred_5_biomed = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_full'], inputs=inputs[:20], top_n=5)
print(compute_cos_sim_accuracy_v2(biomedbert_models['BioMedBERT_base_full'], inputs[:20], pred_5_biomed[:20], tails[:20], verbose=1))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Cosine similarity between 'vaccination' and 'administration of influenza vaccine': 0.9442157745361328
Cosine similarity between 'birth' and 'administration of influenza vaccine': 0.8761966824531555
Cosine similarity between 'immunization' and 'administration of influenza vaccine': 0.9306254386901855
Cosine similarity between 'childhood' and 'administration of influenza vaccine': 0.8729021549224854
Cosine similarity between 'vaccinations' and 'administration of influenza vaccine': 0.9219234585762024
[1mHighest similarity is between 'vaccination' and 'administration of influenza vaccine': 0.9442157745361328[0m
--------------------------------------------
Cosine similarity between 'radiotherapy' and 'cryosurgery': 0.9210992455482483
Cosine similarity between 'years' and 'cryosurgery': 0.8444016575813293
Cosine similarity between 'surgery' and 'cryosurgery': 0.9139726161956787
Cosine similarity between 'chemotherapy' and 'cryosurgery': 0.897855818271637
Cosine similarity between 'decades

In [None]:
# Top-1 v2 cosine-similarity accuracy on the first 200 prompts (BioMedBERT, full text).
# Note: this rebinds `top_1_cos_acc`, which an earlier cell set from the v1 metric.
pred_1_biomed = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_full'], inputs=inputs[:200], top_n=1)
top_1_cos_acc = compute_cos_sim_accuracy_v2(biomedbert_models['BioMedBERT_base_full'], inputs[:200], pred_1_biomed[:200], tails[:200])
print(f'Top 1 accuracy: {top_1_cos_acc:.2f} %')

Top 1 accuracy: 17.00 %


# MLM accuracy measuring function

In [None]:
def compute_mlm_top_n_accuracy(model_checkpoint:str, accuracy_function:str, relation_dataset:str, dataset_frac=1 , top_n=5, random_state=123, verbose=0):
  """Top-n accuracy of a masked language model on one relation dataset.

  Reads the dataset, optionally samples a fraction of it, builds the model
  inputs from the relation's default prompt, collects the model's top-n
  predictions and scores them with the requested accuracy function.

  Args:
    model_checkpoint: checkpoint name passed on to the prediction/accuracy helpers.
    accuracy_function: 'exact' (`compute_accuracy`), 'cosine'
      (`compute_cos_sim_accuracy_v1`) or 'cosine2' (`compute_cos_sim_accuracy_v2`).
    relation_dataset: path of a CSV with columns head_name, rel and tail_names
      (true answers separated by ' || ').
    dataset_frac: fraction of rows to sample, for quicker testing.
    top_n: number of predictions requested per input.
    random_state: seed of the row sampling.
    verbose: forwarded to the accuracy function (predictions always run quietly).

  Returns:
    Whatever the selected accuracy function returns.

  Raises:
    ValueError: for an unknown `accuracy_function`, an empty sample, or a
      relation that has no entry in `prompts`.
  """
  # Validate first: the original only rejected a bad name after the CSV read
  # and the (expensive) model inference had already run.
  if accuracy_function not in ('exact', 'cosine', 'cosine2'):
    raise ValueError("Accuracy must be either 'exact', 'cosine' or 'cosine2'.")

  data = pd.read_csv(relation_dataset, usecols=["head_name", "rel", "tail_names"])
  # For quicker testing due to resource limitations
  data_chunk = data.sample(frac=dataset_frac, random_state=random_state).reset_index(drop=True)

  if data_chunk.empty:
    raise ValueError(f"No rows left in '{relation_dataset}' after sampling with dataset_frac={dataset_frac}.")

  # The relation name is taken from the first sampled row.
  rel_name = data_chunk['rel'][0]
  matching_prompts = prompts.loc[prompts['pid'] == rel_name, 'default_prompt'].tolist()
  if not matching_prompts:
    raise ValueError(f"No default prompt defined for relation '{rel_name}'.")
  default_prompt = matching_prompts[0]

  model_inputs = prepare_inputs(data_chunk, default_prompt)

  predicted_objects = get_model_predictions(model_checkpoint=model_checkpoint, inputs=model_inputs, top_n=top_n, verbose=0)

  # every row may list several acceptable answers
  true_objects = [names.split(' || ') for names in data_chunk['tail_names'].tolist()]

  if accuracy_function == 'exact':
    return compute_accuracy(predicted_objects, true_objects, verbose)
  if accuracy_function == 'cosine':
    return compute_cos_sim_accuracy_v1(model_checkpoint, model_inputs, predicted_objects, true_objects, verbose)
  return compute_cos_sim_accuracy_v2(model_checkpoint, model_inputs, predicted_objects, true_objects, verbose)

In [None]:
# Comparing BERT models: top-5 exact-match accuracy on one relation.
# `relation` is set here and reused by the following per-family cells.
print('Top 5 accuracy')

relation = 'occurs_after'
print(f"Relation: \033[1m{relation}\033[0m")

for k, v in bert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'exact', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

Top 5 accuracy
Relation: [1moccurs_after[0m


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT_base: 5.40%



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BERT_large: 5.40%



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

BERT_large_wwm: 6.40%



In [None]:
# RoBERTa models: top-5 exact-match accuracy (uses `relation` from the BERT cell above).
for k, v in roberta_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'exact', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

RoBERTa_base: 2.60%

RoBERTa_large: 4.10%



In [None]:
# ALBERT models: top-5 exact-match accuracy (uses `relation` from an earlier cell).
for k, v in albert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'exact', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

ALBERT_base: 4.20%



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/893M [00:00<?, ?B/s]

ALBERT_xxlarge: 8.60%



In [None]:
# BioBERT models: top-5 exact-match accuracy (uses `relation` from an earlier cell).
for k, v in biobert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'exact', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")



config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BioBERT: 3.10%



In [None]:
# BioMedBERT models: top-5 exact-match accuracy (uses `relation` from an earlier cell).
for k, v in biomedbert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'exact', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BioMedBERT_base_abstract: 10.20%



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BioMedBERT_base_full: 8.70%



tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BioMedBERT_large_abstract: 11.20%



In [None]:
# Comparing BERT models: top-5 cosine-similarity accuracy ('cosine' = v1 metric) on one relation.
# `relation` is set here and reused by the following per-family cells.
print('Top 5 accuracy')

relation = 'occurs_after'
print(f"Relation: \033[1m{relation}\033[0m")

for k, v in bert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'cosine', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

Top 5 accuracy
Relation: [1moccurs_after[0m
BERT_base: 6.10

BERT_large: 9.80

BERT_large_wwm: 8.00



In [None]:
# RoBERTa models: top-5 cosine-similarity accuracy (uses `relation` from the BERT cell above).
for k, v in roberta_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'cosine', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

RoBERTa_base: 27.00



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

RoBERTa_large: 24.00



In [None]:
# ALBERT models: top-5 cosine-similarity accuracy (uses `relation` from an earlier cell).
for k, v in albert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'cosine', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

ALBERT_base: 8.00



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/893M [00:00<?, ?B/s]

ALBERT_xxlarge: 9.00



In [None]:
# BioBERT models: top-5 cosine-similarity accuracy (uses `relation` from an earlier cell).
for k, v in biobert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'cosine', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")



config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BioBERT: 7.00



In [None]:
# BioMedBERT models: top-5 cosine-similarity accuracy (uses `relation` from an earlier cell).
for k, v in biomedbert_models.items():
  acc = compute_mlm_top_n_accuracy(v, 'cosine', f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}%\n")

BioMedBERT_base_abstract: 11.00

BioMedBERT_base_full: 35.00



tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BioMedBERT_large_abstract: 36.00



# Multi-token issue

In [16]:
# Conditional MLM
# filling masks in parallel independently (Independent approach)
# TO DO - top_n predictions (how to combine predictions of particular tokens (???))
def fill_masks_independently(model_checkpoint: str, inputs: list[str], top_n=5, verbose=0):
    """Fill every mask token of every input with its single most likely token.

    All inputs are encoded in one batch and the model is run once; each mask
    position is then resolved on its own from that single forward pass, so
    the tokens chosen for neighbouring masks do not condition on each other.

    Args:
        model_checkpoint: checkpoint name given to `AutoTokenizer` /
            `TFAutoModelForMaskedLM`.
        inputs: sentences containing one or more mask tokens.
        top_n: currently unused; only the best token per mask is returned.
        verbose: truthy to print the chosen checkpoint and the model summary.

    Returns:
        A pair `(outputs, outputs_decoded)`: for every input, the list of
        predicted token ids (one per mask, left to right) and the string
        obtained by decoding those ids together. Both lists are empty when
        `inputs` is empty.
    """
    if len(inputs) == 0:
        # Nothing to predict: avoid loading a model for an empty batch.
        return [], []

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(text) for text in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    tokenized_inputs = tokenizer(inputs, return_tensors="tf", padding=True, truncation=True)

    # getting the token logits from the model
    token_logits = model(**tokenized_inputs).logits

    # Loop invariants: convert the id batch once and look the mask id up once.
    all_input_ids = tokenized_inputs["input_ids"].numpy()
    mask_token_id = tokenizer.mask_token_id

    outputs = []
    outputs_decoded = []

    for i, input_ids in enumerate(all_input_ids):
        # finding all positions of the mask token in this example
        mask_token_indices = np.where(input_ids == mask_token_id)[0]

        # most likely vocabulary id at every mask position
        prediction = [
            np.argmax(token_logits[i, mask_token_index, :].numpy())
            for mask_token_index in mask_token_indices
        ]

        outputs.append(prediction)
        outputs_decoded.append(tokenizer.decode(prediction, skip_special_tokens=True))

    return outputs, outputs_decoded

In [None]:
# tokenizer.convert_ids_to_tokens:
# Converts each token ID into its corresponding token string and returns a list of token strings.
# Special tokens and subword tokens are kept as they are (unless skip_special_tokens=True is passed).

# tokenizer.decode:
# Converts a list of token IDs into a single string.
# It joins the token strings and handles subwords by removing their markers (like ## in BERT).
# Special tokens are kept by default; pass skip_special_tokens=True to drop them.

# NOTE(review): "Jupyter" in the second sentence is presumably a typo for "Jupiter";
# fixing it changes the model input and therefore the recorded output below.
test_input = ["Paris is [MASK] [MASK] to visit.", "Jupyter is the largest planet of the [MASK] [MASK].", "The weather forecast predicts [MASK] [MASK] for tomorrow.", "The weather forecast predicts heavy rain and [MASK] [MASK].", "He wanted to visit the museum and explore the [MASK] [MASK].", "She was excited about the promotion and [MASK] [MASK].", "He is known for his dedication and [MASK] [MASK] [MASK].", "They plan to travel to Italy and enjoy the beautiful [MASK] [MASK] [MASK].",  "She decided to go to the market and buy some fresh [MASK] [MASK] [MASK] [MASK].", "He set a new world record at the [MASK] [MASK] [MASK] [MASK] event."]
outputs, outputs_dec = fill_masks_independently(bert_models['BERT_base'], test_input)

# separate tokenizer instance, used only to show the raw token strings of the predicted ids
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])
# getting less coherent and contextually appropriate predictions as the sequence of [MASK] tokens becomes longer
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

[2036, 4569] : also fun : ['also', 'fun']
[3103, 2155] : sun family : ['sun', 'family']
[1996, 4633] : the weather : ['the', 'weather']
[4586, 7266] : snow winds : ['snow', 'winds']
[2334, 2088] : local world : ['local', 'world']
[1996, 2769] : the money : ['the', 'money']
[2010, 3167, 3241] : his personal thinking : ['his', 'personal', 'thinking']
[3059, 3059, 10833] : italian italian countryside : ['italian', 'italian', 'countryside']
[11546, 1998, 2000, 2014] : vegetables and to her : ['vegetables', 'and', 'to', 'her']
[2286, 2088, 2399, 2399] : 2013 world games games : ['2013', 'world', 'games', 'games']


In [17]:
# Conditional MLM
# Fills the masks autoregressively (Order approach - left to right)
# TODO: top_n predictions
def fill_masks_autoregressively(model_checkpoint: str, inputs: list[str], top_n=5, verbose=0):
    """Fill the mask tokens of each sentence one at a time, from left to right.

    For every mask position the model is run on the current input, the
    highest-scoring vocabulary id at that position is taken, and that id is
    written into the input before the next mask position is predicted. Later
    predictions are therefore conditioned on the earlier ones.

    Args:
        model_checkpoint: Checkpoint name handed to the tokenizer and model loaders.
        inputs: Sentences containing one or more mask placeholders.
        top_n: Currently unused (see the TODO above).
        verbose: When truthy, print the model summary and every intermediate step.

    Returns:
        A tuple ``(outputs, outputs_decoded)``. ``outputs`` holds, per sentence,
        the predicted token ids in mask order; ``outputs_decoded`` holds the
        same ids decoded to a string with special tokens skipped.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # RoBERTa checkpoints get their sentences rewritten first
    # (presumably swapping the [MASK] placeholder for <mask> - see change_input_format)
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(sentence) for sentence in inputs]

    # model_max_length is not set by default for BioBERT and BioMedBERT tokenizers
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    all_predictions = []
    all_decoded = []

    for sentence in inputs:
        if verbose:
            print(f"input sentence: {sentence}")

        model_inputs = tokenizer(sentence, return_tensors="tf")
        if verbose:
            print(f"tokenized input: {model_inputs}")

        # RoBERTa tokenizers do not emit token_type_ids, so remember whether to forward them
        has_token_type_ids = 'token_type_ids' in model_inputs

        ids = model_inputs["input_ids"]
        if verbose:
            print(f"input_ids: {ids}")

        # positions of every mask token in the (single) encoded sequence
        mask_positions = np.where(ids.numpy()[0] == tokenizer.mask_token_id)[0]
        if verbose:
            print(f"mask positions: {mask_positions}")

        chosen_tokens = []
        for position in mask_positions:
            # re-run the model on the current input and keep only this position's scores
            position_logits = model(**model_inputs).logits[0][position, :]
            best_token = np.argmax(position_logits.numpy())

            if verbose:
                print(f"{best_token}: {tokenizer.convert_ids_to_tokens([best_token])} ")

            chosen_tokens.append(best_token)

            # write the chosen id over the mask at coordinate [0, position]
            ids = tf.tensor_scatter_nd_update(ids, [[0, position]], [best_token])
            if verbose:
                print(f"input_ids: {ids}")

            # rebuild the model inputs around the updated ids
            refreshed = {
                'input_ids': ids,
                'attention_mask': model_inputs['attention_mask'],
            }
            if has_token_type_ids:
                refreshed['token_type_ids'] = model_inputs['token_type_ids']
            model_inputs = refreshed

            if verbose:
                print(f"tokenized input: {model_inputs}")

        all_predictions.append(chosen_tokens)
        all_decoded.append(tokenizer.decode(chosen_tokens, skip_special_tokens=True))

    return all_predictions, all_decoded

In [None]:
# Left-to-right filling with BERT-base; verbose=1 prints the model summary and every intermediate step
fill_masks_autoregressively(bert_models['BERT_base'], test_input, verbose=1)

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
input sentence: Paris is [MASK] [MASK] to visit.
tokenized input: {'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 3000, 2003,  103,  103, 2000, 3942, 1012,  102]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.T

([[2036, 2825],
  [3103, 2291],
  [1996, 4633],
  [4586, 3785],
  [2334, 2381],
  [1996, 3105],
  [2010, 5541, 2943],
  [3059, 2406, 17363],
  [11546, 2005, 1996, 2154],
  [2286, 2621, 3783, 6042]],
 ['also possible',
  'sun system',
  'the weather',
  'snow conditions',
  'local history',
  'the job',
  'his creative energy',
  'italian country scenery',
  'vegetables for the day',
  '2013 summer olympics qualifying'])

In [None]:
# Same verbose left-to-right run with RoBERTa-base
fill_masks_autoregressively(roberta_models['RoBERTa_base'], test_input, verbose=1)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Chosen model: FacebookAI/roberta-base
Model: "tf_roberta_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 lm_head (TFRobertaLMHead)   multiple                  39642969  
                                                                 
Total params: 124697433 (475.68 MB)
Trainable params: 124697433 (475.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
input sentence: Paris is <mask> <mask> to visit.
tokenized input: {'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[    0, 32826,    16, 50264, 50264,     7,   825,     4,     2]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array

([[10, 343],
  [4118, 467],
  [2779, 12034],
  [2016, 1958],
  [92, 18293],
  [5, 515],
  [657, 9, 3122],
  [26815, 8, 2040],
  [12849, 13, 69, 5671],
  [232, 18, 934, 12731]],
 [' a city',
  ' solar system',
  ' cloudy skies',
  ' heavy snow',
  ' new exhibits',
  ' the event',
  ' love of animals',
  ' scenery and culture',
  ' fruits for her garden',
  " world's biggest cycling"])

In [None]:
# Left-to-right filling with RoBERTa-large (non-verbose)
outputs, outputs_dec = fill_masks_autoregressively(roberta_models['RoBERTa_large'], test_input)

# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(roberta_models['RoBERTa_large'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

[10, 317] :  a place : ['Ġa', 'Ġplace']
[4118, 467] :  solar system : ['Ġsolar', 'Ġsystem']
[5419, 12034] :  sunny skies : ['Ġsunny', 'Ġskies']
[4775, 5621] :  thunderstorms : ['Ġthunder', 'storms']
[1808, 2783] :  art collection : ['Ġart', 'Ġcollection']
[5, 945] :  the opportunity : ['Ġthe', 'Ġopportunity']
[2720, 7, 12411] :  commitment to excellence : ['Ġcommitment', 'Ġto', 'Ġexcellence']
[8, 3575, 247] :  and historic country : ['Ġand', 'Ġhistoric', 'Ġcountry']
[8942, 13, 69, 284] :  vegetables for her family : ['Ġvegetables', 'Ġfor', 'Ġher', 'Ġfamily']
[386, 9, 5, 12731] :  start of the cycling : ['Ġstart', 'Ġof', 'Ġthe', 'Ġcycling']


In [None]:
# Left-to-right filling with ALBERT-base
outputs, outputs_dec = fill_masks_autoregressively(albert_models['ALBERT_base'], test_input)
# Observation: the predictions are poor for this model (see the repeated tokens in the printed output)
test_tokenizer = AutoTokenizer.from_pretrained(albert_models['ALBERT_base'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

[105, 209] : them place : ['▁them', '▁place']
[24913, 161] : milky way : ['▁milky', '▁way']
[105, 7575] : them preparing : ['▁them', '▁preparing']
[23876, 18] : thunderstorms : ['▁thunderstorm', 's']
[22557, 22557] : museo museo : ['▁museo', '▁museo']
[105, 29833] : them evalle : ['▁them', '▁evalle']
[105, 28153, 28153] : them joyah joyah : ['▁them', '▁joyah', '▁joyah']
[105, 29833, 20614] : them evalleitaly : ['▁them', '▁evalle', 'italy']
[105, 105, 29833, 29833] : them them evalle evalle : ['▁them', '▁them', '▁evalle', '▁evalle']
[105, 105, 29833, 1446] : them them evalle championships : ['▁them', '▁them', '▁evalle', '▁championships']


In [None]:
# Left-to-right filling with BioBERT
outputs, outputs_dec = fill_masks_autoregressively(biobert_models['BioBERT'], test_input)

# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(biobert_models['BioBERT'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

[1167, 2846] : more difficult : ['more', 'difficult']
[5015, 1227] : planet known : ['planet', 'known']
[1103, 4250] : the weather : ['the', 'weather']
[16076, 1958] : drought events : ['drought', 'events']
[1469, 3750] : local environment : ['local', 'environment']
[1103, 4166] : the promotion : ['the', 'promotion']
[13314, 1106, 1250] : dedication to work : ['dedication', 'to', 'work']
[2731, 1104, 1122] : nature of it : ['nature', 'of', 'it']
[11872, 1111, 1103, 1482] : vegetables for the children : ['vegetables', 'for', 'the', 'children']
[1148, 1159, 1107, 1142] : first time in this : ['first', 'time', 'in', 'this']


In [None]:
# Left-to-right filling with BioMedBERT (base, full-text variant)
outputs, outputs_dec = fill_masks_autoregressively(biomedbert_models['BioMedBERT_base_full'], test_input)
# Observation: the model is pretrained on a domain-specific dataset, so its tokenizer splits
# common words into several subtokens and the masks end up filled with word pieces
test_tokenizer = AutoTokenizer.from_pretrained(biomedbert_models['BioMedBERT_base_full'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

[4843, 13043] : easiest : ['eas', '##iest']
[7893, 1031] : planet : ['plane', '##t']
[1920, 4115] : the future : ['the', 'future']
[2149, 7750] : high temperatures : ['high', 'temperatures']
[7510, 1036] : surroundings : ['surrounding', '##s']
[1920, 2659] : the research : ['the', 'research']
[4333, 6419, 1972] : diligence : ['dil', '##igen', '##ce']
[4267, 3947, 1036] : ceremons : ['cere', '##mon', '##s']
[4551, 4879, 5148, 2057] : chocolate : ['cho', '##co', '##la', '##te']
[3684, 17, 18162, 1008] : mar - 1910 : ['mar', '-', '191', '##0']


In [18]:
# Conditional MLM
# filling masks sorted by the maximum confidence (Greedy approach)
# TODO: top_n predictions
def fill_masks_by_confidence(model_checkpoint: str, inputs: list[str], top_n=5, verbose=0):
    """Fill the mask tokens of each sentence greedily, most confident position first.

    In every step the model is run on the current input, the best-scoring
    vocabulary id is taken for each still-unfilled mask position, and the
    position whose best score is highest is committed. The committed id is
    written into the input before the next step, so the remaining positions
    are predicted conditioned on it.

    Args:
        model_checkpoint: Checkpoint name handed to the tokenizer and model loaders.
        inputs: Sentences containing one or more mask placeholders.
        top_n: Currently unused (see the TODO above).
        verbose: When truthy, print the model summary and every intermediate step.

    Returns:
        A tuple ``(outputs, outputs_decoded)``. ``outputs`` holds, per sentence,
        the predicted token ids ordered by position in the sentence (not by the
        order in which they were committed); ``outputs_decoded`` holds the same
        ids decoded to a string with special tokens skipped.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(sentence) for sentence in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    outputs = []
    outputs_decoded = []
    mask_token_id = tokenizer.mask_token_id

    for input_text in inputs:
        if verbose:
            print(f"input sentence: {input_text}")

        tokenized_input = tokenizer(input_text, return_tensors="tf")
        if verbose:
            print(f"tokenized input: {tokenized_input}")

        # checking if the model uses token_type_ids (not used in RoBERTa models)
        use_token_type_ids = 'token_type_ids' in tokenized_input

        input_ids = tokenized_input["input_ids"]
        if verbose:
            print(f"input_ids: {input_ids}")

        # Mask positions that still need a prediction. They are tracked explicitly
        # rather than rediscovered by rescanning input_ids: if the model's top
        # token at a position happened to be the mask token itself, a rescan
        # would keep finding that position and the loop would never terminate.
        remaining_positions = np.where(input_ids.numpy()[0] == mask_token_id)[0]

        prediction_dict = {}

        while len(remaining_positions) > 0:
            if verbose:
                print(f"mask positions: {remaining_positions}")

            # getting token logits at the remaining mask positions
            token_logits = model(**tokenized_input).logits[0]
            mask_token_logits = tf.gather(token_logits, remaining_positions)

            # tf.math.top_k returns the k top values and their indices along the last dimension
            top_k_values, top_k_indices = tf.math.top_k(mask_token_logits, k=1)
            # drop only the k axis so both tensors stay 1-D (one entry per remaining
            # position) even when a single mask is left - no scalar special case needed
            top_k_values = tf.squeeze(top_k_values, axis=-1)
            top_k_indices = tf.squeeze(top_k_indices, axis=-1)

            if verbose:
                print(top_k_values)
                print(top_k_indices)

            # NOTE(review): "confidence" here is the raw top logit of each position,
            # not a softmax probability - confirm that comparing logits across
            # positions is the intended ranking.
            best = int(tf.argmax(top_k_values).numpy())
            most_confident_mask_position = remaining_positions[best]
            most_confident_token = top_k_indices[best].numpy()

            if verbose:
                print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

            prediction_dict[most_confident_mask_position] = most_confident_token
            remaining_positions = np.delete(remaining_positions, best)

            # write the committed id over the mask at coordinate [0, position]
            input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
            if verbose:
                print(f"input_ids: {input_ids}")

            # making new tokenized_input around the updated ids
            updated_input = {
                'input_ids': input_ids,
                'attention_mask': tokenized_input['attention_mask'],
            }
            if use_token_type_ids:
                updated_input['token_type_ids'] = tokenized_input['token_type_ids']
            tokenized_input = updated_input

            if verbose:
                print(f"tokenized input: {tokenized_input}")

        # report the tokens in sentence order, regardless of the order they were committed in
        prediction = [token for _, token in sorted(prediction_dict.items())]
        outputs.append(prediction)
        prediction_decoded = tokenizer.decode(prediction, skip_special_tokens=True)
        outputs_decoded.append(prediction_decoded)

    return outputs, outputs_decoded

In [None]:
# Confidence-ordered filling with BERT-base; verbose=1 prints the model summary and every intermediate step
fill_masks_by_confidence(bert_models['BERT_base'], test_input, verbose=1)

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
input sentence: Paris is [MASK] [MASK] to visit.
tokenized input: {'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 3000, 2003,  103,  103, 2000, 3942, 1012,  102]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.

([[2467, 4569],
  [3103, 2291],
  [1996, 4633],
  [2844, 7266],
  [3019, 2088],
  [1996, 3105],
  [2844, 4187, 3241],
  [2670, 3059, 10833],
  [11546, 2005, 2014, 4596],
  [2286, 3467, 3783, 6042]],
 ['always fun',
  'sun system',
  'the weather',
  'strong winds',
  'natural world',
  'the job',
  'strong critical thinking',
  'southern italian countryside',
  'vegetables for her dinner',
  '2013 winter olympics qualifying'])

In [None]:
# Confidence-ordered filling with BERT-large
outputs, outputs_dec = fill_masks_by_confidence(bert_models['BERT_large'], test_input)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_large'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

[2200, 3733] : very easy : ['very', 'easy']
[5943, 2291] : solar system : ['solar', 'system']
[3082, 4542] : heavy rain : ['heavy', 'rain']
[5956, 9451] : flash flooding : ['flash', 'flooding']
[2396, 2088] : art world : ['art', 'world']
[1996, 3336] : the baby : ['the', 'baby']
[2010, 2652, 2806] : his playing style : ['his', 'playing', 'style']
[17363, 1997, 3304] : scenery of italy : ['scenery', 'of', 'italy']
[5909, 2005, 1996, 2154] : fruit for the day : ['fruit', 'for', 'the', 'day']
[1021, 1012, 1019, 2463] : 7. 5 km : ['7', '.', '5', 'km']


In [None]:
# Confidence-ordered filling with BERT-large (whole-word-masking checkpoint)
outputs, outputs_dec = fill_masks_by_confidence(bert_models['BERT_large_wwm'], test_input)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_large_wwm'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

[1037, 2173] : a place : ['a', 'place']
[5943, 2291] : solar system : ['solar', 'system']
[1996, 4633] : the weather : ['the', 'weather']
[2844, 7266] : strong winds : ['strong', 'winds']
[3019, 2088] : natural world : ['natural', 'world']
[1996, 3336] : the baby : ['the', 'baby']
[2566, 3366, 21998] : perseverance : ['per', '##se', '##verance']
[2103, 1997, 4199] : city of rome : ['city', 'of', 'rome']
[7852, 2005, 2014, 2155] : bread for her family : ['bread', 'for', 'her', 'family']
[2034, 3179, 1997, 1996] : first edition of the : ['first', 'edition', 'of', 'the']


In [None]:
# Confidence-ordered filling with RoBERTa-base
outputs, outputs_dec = fill_masks_by_confidence(roberta_models['RoBERTa_base'], test_input)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(roberta_models['RoBERTa_base'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

[10, 343] :  a city : ['Ġa', 'Ġcity']
[4118, 467] :  solar system : ['Ġsolar', 'Ġsystem']
[2016, 1895] :  heavy rain : ['Ġheavy', 'Ġrain']
[670, 2372] :  strong winds : ['Ġstrong', 'Ġwinds']
[430, 18293] :  different exhibits : ['Ġdifferent', 'Ġexhibits']
[5, 515] :  the event : ['Ġthe', 'Ġevent']
[27022, 7, 643] :  devotion to others : ['Ġdevotion', 'Ġto', 'Ġothers']
[26815, 8, 2040] :  scenery and culture : ['Ġscenery', 'Ġand', 'Ġculture']
[12849, 13, 69, 5671] :  fruits for her garden : ['Ġfruits', 'Ġfor', 'Ġher', 'Ġgarden']
[78, 1289, 9, 5] :  first stage of the : ['Ġfirst', 'Ġstage', 'Ġof', 'Ġthe']


In [None]:
# Confidence-ordered filling with ALBERT-xxlarge
outputs, outputs_dec = fill_masks_by_confidence(albert_models['ALBERT_xxlarge'], test_input)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(albert_models['ALBERT_xxlarge'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/893M [00:00<?, ?B/s]

[253, 25349] : very tempting : ['▁very', '▁tempting']
[4535, 329] : solar system : ['▁solar', '▁system']
[23876, 8791] : thunderstorm threats : ['▁thunderstorm', '▁threats']
[23876, 5699] : thunderstorm opportunities : ['▁thunderstorm', '▁opportunities']
[27597, 4764] : zoological gardens : ['▁zoological', '▁gardens']
[28153, 10525] : joyahresponsibilities : ['▁joyah', 'responsibilities']
[13704, 1016, 5673] : dedication towards athletics : ['▁dedication', '▁towards', '▁athletics']
[22271, 16, 24483] : vineyards of tuscany : ['▁vineyards', '▁of', '▁tuscany']
[13, 24068, 6230, 38] : yoghurt : ['▁', 'yog', 'hur', 't']
[13226, 1866, 13, 28421] : 3000 metres steeplechase : ['▁3000', '▁metres', '▁', 'steeplechase']


In [None]:
# Confidence-ordered filling with BioBERT
outputs, outputs_dec = fill_masks_by_confidence(biobert_models['BioBERT'], test_input)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(biobert_models['BioBERT'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

[3860, 3123] : relatively easy : ['relatively', 'easy']
[5015, 2746] : planet Earth : ['planet', 'Earth']
[1103, 4250] : the weather : ['the', 'weather']
[4883, 119] : snow. : ['snow', '.']
[3480, 119] : museum. : ['museum', '.']
[1103, 4166] : the promotion : ['the', 'promotion']
[13314, 1106, 1250] : dedication to work : ['dedication', 'to', 'work']
[19335, 1104, 1122] : scenery of it : ['scenery', 'of', 'it']
[11872, 1107, 1103, 3440] : vegetables in the evening : ['vegetables', 'in', 'the', 'evening']
[1322, 1104, 1103, 1269] : end of the same : ['end', 'of', 'the', 'same']


In [None]:
# Confidence-ordered filling with BioMedBERT (large, abstract-only variant)
outputs, outputs_dec = fill_masks_by_confidence(biomedbert_models['BioMedBERT_large_abstract'], test_input)
# Observation: the model is pretrained on a domain-specific dataset, so its tokenizer splits
# common words into several subtokens and the masks end up filled with word pieces
test_tokenizer = AutoTokenizer.from_pretrained(biomedbert_models['BioMedBERT_large_abstract'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

[4707, 13428] : easiest : ['eas', '##iest']
[8177, 1007] : universe : ['univers', '##e']
[1680, 17242] : the weather : ['the', 'weather']
[5624, 1007] : haze : ['haz', '##e']
[7620, 1026] : surroundings : ['surrounding', '##s']
[18293, 2377] : marketing process : ['marketing', 'process']
[23133, 1711, 26532] : enthusiasm : ['enth', '##us', '##iasm']
[2967, 1862, 1026] : paintings : ['pain', '##ting', '##s']
[5624, 1699, 26709, 1026] : hazelnuts : ['haz', '##el', '##nut', '##s']
[1725, 3589, 15160, 1700] : forgotting : ['for', '##go', '##tt', '##ing']


In [19]:
# Conditional MLM
# Initial predictions (Order) + Refinement (Order) until predictions converge or maximum number of iterations is reached
# TODO: top_n predictions
def fill_masks_autoregressively_with_refinement(model_checkpoint: str, inputs: list[str], top_n=5, max_iter=10, verbose=0):
    """Fill masks left to right, then re-predict them until nothing changes.

    Iteration 0 fills every mask position from left to right, writing each
    chosen id into the input before predicting the next position. Every
    following iteration walks over the same positions again and re-predicts
    each one given the current input. The loop stops as soon as a full pass
    changes no token, or after ``max_iter`` refinement passes.

    Args:
        model_checkpoint: Checkpoint name handed to the tokenizer and model loaders.
        inputs: Sentences containing one or more mask placeholders.
        top_n: Currently unused (see the TODO above).
        max_iter: Maximum number of refinement passes after the initial fill.
        verbose: When truthy, print the model summary and per-iteration progress.

    Returns:
        A tuple ``(outputs, outputs_decoded)``. ``outputs`` holds, per sentence,
        the final token ids in mask order; ``outputs_decoded`` holds the same
        ids decoded to a string with special tokens skipped. A sentence without
        any mask token yields an empty list and an empty string.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(sentence) for sentence in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    outputs = []
    outputs_decoded = []

    for input_text in inputs:
        if verbose:
            print(f"Initial input sentence: {input_text}")

        tokenized_input = tokenizer(input_text, return_tensors="tf")

        # checking if the model uses token_type_ids (not used in RoBERTa models)
        use_token_type_ids = 'token_type_ids' in tokenized_input

        input_ids = tokenized_input["input_ids"]

        # finding all positions of the [MASK] tokens
        mask_token_id = tokenizer.mask_token_id
        mask_token_indices = np.where(input_ids.numpy()[0] == mask_token_id)[0]

        if verbose:
            print(f"mask positions: {mask_token_indices}")

        # current prediction per mask position, in sentence order; starts out as the mask id
        prediction_dict = OrderedDict((mask_index, mask_token_id) for mask_index in mask_token_indices)

        for i in range(max_iter + 1):
            if verbose:
                print(f"Iteration: {i}")

            updated_tokens = 0

            # NOTE(review): during refinement the position being re-predicted still
            # holds its previous prediction (it is not re-masked first) - confirm
            # this is intended, since it may explain the early convergence.
            for mask_index in mask_token_indices:

                token_logits = model(**tokenized_input).logits[0]
                mask_token_logits = token_logits[mask_index, :]

                # getting the top predicted token
                top_token = np.argmax(mask_token_logits.numpy())

                if verbose:
                    print(f"{mask_index}: {top_token}")

                if prediction_dict[mask_index] != top_token:
                    prediction_dict[mask_index] = top_token
                    updated_tokens += 1

                # write the chosen id at coordinate [0, mask_index]
                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])

                # making new tokenized_input around the updated ids
                updated_input = {
                    'input_ids': input_ids,
                    'attention_mask': tokenized_input['attention_mask'],
                }
                if use_token_type_ids:
                    updated_input['token_type_ids'] = tokenized_input['token_type_ids']
                tokenized_input = updated_input

            if updated_tokens == 0:
                if verbose:
                    print(f"\033[1mConvergence reached in iteration {i}!\033[0m")
                break

            if verbose:
                prediction_i = list(prediction_dict.values())
                prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)
                print(f"Predicted tokens in iteration {i}: {prediction_i}: {prediction_i_decoded}")

        if verbose:
            print('---------------------------------------------------------------------------------------------')

        # Always derive the result from prediction_dict for *this* sentence. Relying
        # on a variable assigned inside the loop after the convergence check would
        # leave it unset (or stale from the previous sentence) whenever the very
        # first pass changes nothing, e.g. for a sentence without mask tokens.
        final_prediction = list(prediction_dict.values())
        final_prediction_decoded = tokenizer.decode(final_prediction, skip_special_tokens=True)
        outputs.append(final_prediction)
        outputs_decoded.append(final_prediction_decoded)

    return outputs, outputs_decoded

In [None]:
# Left-to-right filling with refinement, BERT-base, verbose.
# Observation: convergence is reached early (already in iteration 1 for the first sentence in the recorded output)
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(bert_models['BERT_base'], test_input, verbose=1)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
mask positions: [3 4]
Iteration: 0
Predicted tokens in iteration 0: [2036, 2825]: also possible
Iteration: 1
[1mConvergence reached in iteration 1![0m
---------------------------------------------------------------------------------------------
Initial input sentence: Jupyter is the larges

In [None]:
# Left-to-right filling with refinement, BERT-large (whole-word-masking checkpoint), verbose
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(bert_models['BERT_large_wwm'], test_input, verbose=1)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_large_wwm'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: google-bert/bert-large-uncased-whole-word-masking
Model: "tf_bert_for_masked_lm_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  334092288 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  32865082  
                                                                 
Total params: 335174458 (1.25 GB)
Trainable params: 335174458 (1.25 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
mask positions: [3 4]
Iteration: 0
Predicted tokens in iteration 0: [1037, 2173]: a place
Iteration: 1
[1mConvergence reached in iteration 1![0m
---------------------------------------------------------------------------------------------
Initial input sentence: Jupyter is 

In [None]:
# Left-to-right filling with refinement, RoBERTa-large, verbose
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(roberta_models['RoBERTa_large'], test_input, verbose=1)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(roberta_models['RoBERTa_large'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: FacebookAI/roberta-large
Model: "tf_roberta_for_masked_lm_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  354310144 
 r)                                                              
                                                                 
 lm_head (TFRobertaLMHead)   multiple                  53102681  
                                                                 
Total params: 355412057 (1.32 GB)
Trainable params: 355412057 (1.32 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is <mask> <mask> to visit.
mask positions: [3 4]
Iteration: 0
Predicted tokens in iteration 0: [10, 317]:  a place
Iteration: 1
[1mConvergence reached in iteration 1![0m
---------------------------------------------------------------------------------------

In [None]:
# Left-to-right filling with refinement, ALBERT-xxlarge, verbose
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(albert_models['ALBERT_xxlarge'], test_input, verbose=1)
# Tokenizer used only to map the predicted ids back to token strings for display
test_tokenizer = AutoTokenizer.from_pretrained(albert_models['ALBERT_xxlarge'])
# One line per sentence: predicted ids : decoded text : token strings
for output, output_dec in zip(outputs, outputs_dec):
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: albert/albert-xxlarge-v2
Model: "tf_albert_for_masked_lm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  205814272 
                                                                 
 predictions (TFAlbertMLMHe  multiple                  4490720   
 ad)                                                             
                                                                 
Total params: 206398944 (787.35 MB)
Trainable params: 206398944 (787.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
mask positions: [3 4]
Iteration: 0
Predicted tokens in iteration 0: [253, 25349]: very tempting
Iteration: 1
[1mConvergence reached in iteration 1![0m
----------------------------------------------------------------------------

In [None]:
# Run fill_masks_autoregressively_with_refinement on test_input with the BioBERT checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(biobert_models['BioBERT'], test_input, verbose=1)
# Load the tokenizer of the same checkpoint so the predicted ids can be displayed as individual sub-tokens.
test_tokenizer = AutoTokenizer.from_pretrained(biobert_models['BioBERT'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")



config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Chosen model: dmis-lab/biobert-base-cased-v1.2
Model: "tf_bert_for_masked_lm_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  23286340  
                                                                 
Total params: 108340804 (413.29 MB)
Trainable params: 108340804 (413.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
mask positions: [4 5]
Iteration: 0
Predicted tokens in iteration 0: [1167, 2846]: more difficult
Iteration: 1
[1mConvergence reached in iteration 1![0m
---------------------------------------------------------------------------------------------
Initial input sentence: Jupyter is the la

In [None]:
# Run fill_masks_autoregressively_with_refinement on test_input with the BioMedBERT (abstract) checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(biomedbert_models['BioMedBERT_base_abstract'], test_input, verbose=1)
# NOTE: this model was pretrained on a domain-specific corpus, so its tokenizer may split common words into several sub-tokens
test_tokenizer = AutoTokenizer.from_pretrained(biomedbert_models['BioMedBERT_base_abstract'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Chosen model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract
Model: "tf_bert_for_masked_lm_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
mask positions: [3 4]
Iteration: 0
Predicted tokens in iteration 0: [42, 3831]: a place
Iteration: 1
[1mConvergence reached in iteration 1![0m
---------------------------------------------------------------------------------------------
Initial input sentence: Jupyte

In [20]:
def _with_updated_input_ids(previous_input, new_input_ids):
    """Return a fresh model-input dict carrying new_input_ids and reusing the other tensors of previous_input."""
    rebuilt = {
        'input_ids': new_input_ids,
        'attention_mask': previous_input['attention_mask'],
    }
    # token_type_ids are not produced by every tokenizer (not used in RoBERTa models): forward them only when present
    if 'token_type_ids' in previous_input:
        rebuilt['token_type_ids'] = previous_input['token_type_ids']
    return rebuilt


# Conditional MLM
# Initial predictions (Greedy) + Refinement (Order) until predictions converge or maximum number of iterations is reached
# TO DO: top_n predictions
def fill_masks_by_confidence_order_refinement(model_checkpoint: str, inputs: list[str], top_n=5, max_iter=10, verbose=0):
    """Fill all mask tokens of every input sentence: greedy confidence-ordered decoding, then iterative refinement.

    Greedy phase: the model is run on the sentence, the mask position whose best token has the
    highest logit is filled with that token, and the model is run again on the updated sentence.
    This repeats until every mask position has been filled once.

    Refinement phase: for up to ``max_iter`` sweeps, every original mask position is visited from
    left to right and re-predicted given the current content of all other positions. The sweep
    loop stops early as soon as a full sweep changes no token.

    Args:
        model_checkpoint: checkpoint name passed to ``AutoTokenizer`` / ``TFAutoModelForMaskedLM``.
            If it contains ``'roberta'`` the inputs are first passed through ``change_input_format``;
            if it contains ``'bio'`` (case-insensitive) ``tokenizer.model_max_length`` is set to 512.
        inputs: sentences containing one or more mask tokens.
        top_n: currently unused (see the TO DO above); only the single best token per mask is kept.
        max_iter: maximum number of refinement sweeps.
        verbose: when truthy, prints the model summary and a trace of both phases.

    Returns:
        ``(outputs, outputs_decoded)``: for every input, the list of predicted token ids ordered by
        mask position, and the string obtained by decoding those ids with ``skip_special_tokens=True``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(input) for input in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    mask_token_id = tokenizer.mask_token_id

    outputs = []
    outputs_decoded = []

    for input_text in inputs:
        if verbose:
            print(f"Initial input sentence: {input_text}")

        tokenized_input = tokenizer(input_text, return_tensors="tf")
        input_ids = tokenized_input["input_ids"]

        # positions of the mask tokens in the original sentence (also needed for the refinement phase)
        initial_mask_token_indices = np.where(input_ids.numpy()[0] == mask_token_id)[0]

        if verbose:
            print(f"Mask position: {initial_mask_token_indices}")

        mask_positions = [int(position) for position in initial_mask_token_indices]
        prediction_dict = {}

        # Greedy initial
        # The positions still to fill are tracked explicitly instead of re-scanning input_ids for the
        # mask id: if the model's best token for a position were the mask token itself, a re-scan
        # would find the same position again and the loop would never terminate.
        remaining_positions = list(mask_positions)
        while remaining_positions:
            # getting token logits at the positions that are still masked
            token_logits = model(**tokenized_input).logits[0]
            mask_token_logits = tf.gather(token_logits, remaining_positions)

            # tf.math.top_k returns the k top values and indices along the last dimension;
            # with k=1 column 0 holds the best logit / token id of every remaining position.
            # Slicing (instead of tf.squeeze) keeps a 1-D tensor even when only one mask is left.
            top_k_values, top_k_indices = tf.math.top_k(mask_token_logits, k=1)
            best_values = top_k_values[:, 0]
            best_tokens = top_k_indices[:, 0]

            # the position whose best token has the highest logit is filled first
            k = int(tf.argmax(best_values).numpy())
            most_confident_token = int(best_tokens[k].numpy())
            most_confident_mask_position = remaining_positions.pop(k)

            if verbose:
                print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

            prediction_dict[most_confident_mask_position] = most_confident_token

            # [[0, position]] is the list of tensor coordinates to change (batch index 0)
            input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
            tokenized_input = _with_updated_input_ids(tokenized_input, input_ids)

        # sorting prediction_dict by mask position so predictions read left to right
        prediction_dict = OrderedDict((position, prediction_dict[position]) for position in sorted(prediction_dict))
        prediction_i = list(prediction_dict.values())
        prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)

        if verbose:
            print(f"Initial predicted tokens after Greedy approach: {prediction_i}: {prediction_i_decoded}")

        # Order refinement
        for i in range(max_iter):
            if verbose:
                print(f"Refinement iteration: {i}")

            updated_tokens = 0

            for mask_index in mask_positions:
                # re-predict this position given the current content of every other position
                token_logits = model(**tokenized_input).logits[0]
                top_token = int(np.argmax(token_logits[mask_index, :].numpy()))

                if verbose:
                    print(f"{mask_index}: {top_token}")

                if prediction_dict[mask_index] != top_token:
                    prediction_dict[mask_index] = top_token
                    updated_tokens += 1
                    # input_ids already holds the previous prediction at this position,
                    # so the tensor only needs rewriting when the prediction changed
                    input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                    tokenized_input = _with_updated_input_ids(tokenized_input, input_ids)

            if updated_tokens == 0:
                if verbose:
                    print(f"\033[1mConvergence reached in refinement iteration {i}!\033[0m")
                break

            prediction_i = list(prediction_dict.values())
            prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)
            if verbose:
                print(f"Predicted tokens in refinement iteration {i}: {prediction_i}: {prediction_i_decoded}")

        if verbose:
            print('---------------------------------------------------------------------------------------------')

        outputs.append(prediction_i)
        outputs_decoded.append(prediction_i_decoded)

    return outputs, outputs_decoded

In [None]:
# Run the greedy + order-refinement mask filling on test_input with the BERT base checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(bert_models['BERT_base'], test_input, verbose=1)
# Load the tokenizer of the same checkpoint so the predicted ids can be displayed as individual sub-tokens.
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
Mask position: [3 4]
4569: ['fun'] - index: 4
2467: ['always'] - index: 3
Initial predicted tokens after Greedy approach: [2467, 4569]: always fun
Refinement iteration: 0
3: 2467
4: 4569
[1mConvergence reached in refinement iteration 0![0m
-------------------------------------------------

In [None]:
# Run the greedy + order-refinement mask filling on test_input with the BioMedBERT (abstract + full text) checkpoint.
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(biomedbert_models['BioMedBERT_base_full'], test_input, verbose=1)
# NOTE: this model was pretrained on a domain-specific corpus, so its tokenizer may split common words into several sub-tokens
test_tokenizer = AutoTokenizer.from_pretrained(biomedbert_models['BioMedBERT_base_full'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Chosen model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
Model: "tf_bert_for_masked_lm_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
Mask position: [3 4]
13043: ['##iest'] - index: 4
1920: ['the'] - index: 3
Initial predicted tokens after Greedy approach: [1920, 13043]: theiest
Refinement iteration: 0
3: 1920
4: 2561
Predicted tokens in refinement iteration 0: [1920, 2561]: the first
Refine

In [None]:
# Run the greedy + order-refinement mask filling on test_input with the RoBERTa large checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(roberta_models['RoBERTa_large'], test_input, verbose=1)
# Load the tokenizer of the same checkpoint so the predicted ids can be displayed as individual sub-tokens.
test_tokenizer = AutoTokenizer.from_pretrained(roberta_models['RoBERTa_large'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

Chosen model: FacebookAI/roberta-large
Model: "tf_roberta_for_masked_lm_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  354310144 
 r)                                                              
                                                                 
 lm_head (TFRobertaLMHead)   multiple                  53102681  
                                                                 
Total params: 355412057 (1.32 GB)
Trainable params: 355412057 (1.32 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is <mask> <mask> to visit.
Mask position: [3 4]
10: ['Ġa'] - index: 3
317: ['Ġplace'] - index: 4
Initial predicted tokens after Greedy approach: [10, 317]:  a place
Refinement iteration: 0
3: 10
4: 317
[1mConvergence reached in refinement iteration 0![0m
--

In [None]:
# Run the greedy + order-refinement mask filling on test_input with the ALBERT base checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(albert_models['ALBERT_base'], test_input, verbose=1)
# Load the tokenizer of the same checkpoint so the predicted ids can be displayed as individual sub-tokens.
test_tokenizer = AutoTokenizer.from_pretrained(albert_models['ALBERT_base'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Chosen model: albert/albert-base-v2
Model: "tf_albert_for_masked_lm_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11092992  
                                                                 
 predictions (TFAlbertMLMHe  multiple                  4064736   
 ad)                                                             
                                                                 
Total params: 11251680 (42.92 MB)
Trainable params: 11251680 (42.92 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
Mask position: [3 4]
105: ['▁them'] - index: 3
209: ['▁place'] - index: 4
Initial predicted tokens after Greedy approach: [105, 209]: them place
Refinement iteration: 0
3: 226
4: 209
Predicted tokens in refinement iteration 0: [226, 209

In [None]:
# Run the greedy + order-refinement mask filling on test_input with the BioBERT checkpoint (verbose trace enabled).
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(biobert_models['BioBERT'], test_input, verbose=1)
# Load the tokenizer of the same checkpoint so the predicted ids can be displayed as individual sub-tokens.
test_tokenizer = AutoTokenizer.from_pretrained(biobert_models['BioBERT'])
for output, output_dec in zip(outputs, outputs_dec):
  # printed as: predicted token ids : decoded string : sub-token strings
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")



Chosen model: dmis-lab/biobert-base-cased-v1.2
Model: "tf_bert_for_masked_lm_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  23286340  
                                                                 
Total params: 108340804 (413.29 MB)
Trainable params: 108340804 (413.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Paris is [MASK] [MASK] to visit.
Mask position: [4 5]
3123: ['easy'] - index: 5
3860: ['relatively'] - index: 4
Initial predicted tokens after Greedy approach: [3860, 3123]: relatively easy
Refinement iteration: 0
4: 3860
5: 3123
[1mConvergence reached in refinement iteration 0![0m
------------------------------------

In [None]:
# Different number of tokens generated for same input text when using different tokenizers
# The same text is tokenized with each model family's tokenizer; a single loop replaces five copy-pasted blocks.
tokenizer_comparison_text = 'Anastomosis of superior vena cava to pulmonary artery'
tokenizer_comparison_checkpoints = [
    ('BERT', bert_models['BERT_base']),
    ('BioMedBERT', biomedbert_models['BioMedBERT_base_full']),
    ('RoBERTa', roberta_models['RoBERTa_base']),
    ('ALBERT', albert_models['ALBERT_base']),
    ('BioBERT', biobert_models['BioBERT']),
]

for tokenizer_position, (tokenizer_label, tokenizer_checkpoint) in enumerate(tokenizer_comparison_checkpoints):
    # separator line between consecutive tokenizers (none after the last one)
    if tokenizer_position > 0:
        print('---------------------------------------------------')
    print(f"{tokenizer_label} model tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    tokenization = tokenizer([tokenizer_comparison_text], return_tensors="tf")
    # single sentence in the batch -> row 0 holds its token ids (special tokens included)
    tokenizer_comparison_ids = tokenization['input_ids'].numpy()[0]
    print(tokenizer_comparison_ids)
    print(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(tokenizer_comparison_ids)}")

BERT model tokenizer
[  101  9617 16033 15530  2483  1997  6020  2310  2532  6187  3567  2000
 21908 16749   102]
Decoded tokens: ['[CLS]', 'ana', '##sto', '##mos', '##is', 'of', 'superior', 've', '##na', 'ca', '##va', 'to', 'pulmonary', 'artery', '[SEP]']
---------------------------------------------------
BioMedBERT model tokenizer
[    2 17331  1927  6566 22876 23326  1942  5352  5511     3]
Decoded tokens: ['[CLS]', 'anastomosis', 'of', 'superior', 'vena', 'cava', 'to', 'pulmonary', 'artery', '[SEP]']
---------------------------------------------------
RoBERTa model tokenizer
[    0  4688  1988  1075 13310     9 10295   748  4242   740  8604     7
 34049 30404     2]
Decoded tokens: ['<s>', 'An', 'ast', 'om', 'osis', 'Ġof', 'Ġsuperior', 'Ġv', 'ena', 'Ġc', 'ava', 'Ġto', 'Ġpulmonary', 'Ġartery', '</s>']
---------------------------------------------------
ALBERT model tokenizer


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

[    2    40   472  6015  8076    16  4475  7287    58 24073    20  6971
  2111  1857 19759     3]
Decoded tokens: ['[CLS]', '▁an', 'as', 'tom', 'osis', '▁of', '▁superior', '▁ven', 'a', '▁cava', '▁to', '▁pul', 'mon', 'ary', '▁artery', '[SEP]']
---------------------------------------------------
BioBERT model tokenizer


config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[  101  1126 12788 18445  4863  1104  7298  1396  1605 11019  2497  1106
 26600 18593   102]
Decoded tokens: ['[CLS]', 'an', '##ast', '##omo', '##sis', 'of', 'superior', 've', '##na', 'ca', '##va', 'to', 'pulmonary', 'artery', '[SEP]']


In [21]:
# Preparing inputs in multi-token scenario
def prepare_inputs_multi_token(model_checkpoint:str, data, prompt:str):
  """Build one masked sentence per row of data for the multi-token setting.

  For every row the '[X]' placeholder of prompt is replaced with the row's 'head_name', and the
  '[Y]' placeholder is replaced with a run of space-separated '[MASK]' strings. The number of masks
  equals the number of tokens (special tokens excluded) of the row's longest true answer under the
  tokenizer of model_checkpoint, so it may differ between tokenizers.

  Args:
    model_checkpoint: checkpoint name passed to AutoTokenizer.from_pretrained.
    data: table with a 'head_name' column and a 'tail_names' column; each 'tail_names' value holds
      the true answers of the row separated by ' || '.
    prompt: template containing the '[X]' and '[Y]' placeholders.

  Returns:
    List of sentences, one per row. Masks are always written as '[MASK]', whatever mask string the
    tokenizer of model_checkpoint uses.
  """
  heads = data['head_name'].tolist()
  inputs = [prompt.replace('[X]', head) for head in heads]

  tails = [tail_names.split(' || ') for tail_names in data['tail_names'].tolist()]

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  num_masks = []
  for answers in tails:
    # Tokenize without special tokens and without padding, so the length of each id list is directly
    # the answer's token count (no hard-coded pad / [CLS] / [SEP] bookkeeping per model family).
    answer_token_ids = tokenizer(answers, add_special_tokens=False, truncation=True)['input_ids']
    # multiple true objects - use longest in terms of number of tokens (???)
    num_masks.append(max((len(token_ids) for token_ids in answer_token_ids), default=0))

  # replacing object placeholder with as much mask tokens as we get from tokenizing true object (may differ depending on the tokenizer used)
  prepared_inputs = []
  for sentence, mask_count in zip(inputs, num_masks):
    masks = ' '.join(['[MASK]'] * mask_count)
    # '[Y] ' (with its trailing space) is consumed so that '[Y] .' becomes '[MASK].';
    # a placeholder that is not followed by a space is still replaced instead of being left as-is
    placeholder = '[Y] ' if '[Y] ' in sentence else '[Y]'
    prepared_inputs.append(sentence.replace(placeholder, masks))

  return prepared_inputs

In [None]:
# Build the multi-token prompts for the occurs_data examples, with the mask count taken from the RoBERTa base tokenizer.
inputs_mt_roberta = prepare_inputs_multi_token(roberta_models['RoBERTa_base'], occurs_data, default_prompt)
print(inputs_mt_roberta)
# making predictions using Greedy initial + Order refinement approach
# NOTE: only the first 100 prompts are passed to the model
pred_tokens_roberta, pred_decoded_roberta = fill_masks_by_confidence_order_refinement(roberta_models['RoBERTa_base'], inputs=inputs_mt_roberta[:100], verbose=0)

['Post influenza vaccination encephalitis occurs after [MASK] [MASK] [MASK] [MASK] [MASK].', 'Basal cell carcinoma recurrent following cryosurgery occurs after [MASK] [MASK] [MASK] [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Allergy to pea occurs after [MASK] [MASK] [MASK] [MASK].', 'Bite of unidentified snake with neurological signs occurs after [MASK] [MASK].', 'Allergy to hypothalamic hormone occurs after [MASK] [MASK] [MASK] [MASK].', 'Late effect of accidental injury occurs after [MASK] [MASK] [MASK].', 'Radiotherapy scar occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Atonic postpartum hemorrhage occurs after [MASK] [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK] [MASK] [MASK].', 'Calcinosis following localized fat necrosis occurs after [MASK] [MASK] [MASK].', 'Apraxia due to and following spontaneous intracerebr

In [None]:
# Build the multi-token prompts for the occurs_data examples, with the mask count taken from the BioMedBERT tokenizer.
inputs_mt_biomed = prepare_inputs_multi_token(biomedbert_models['BioMedBERT_base_full'], occurs_data, default_prompt)
print(inputs_mt_biomed)
# making predictions using Greedy initial + Order refinement approach
# NOTE: only the first 100 prompts are passed to the model
pred_tokens_biomed, pred_decoded_biomed = fill_masks_by_confidence_order_refinement(biomedbert_models['BioMedBERT_base_full'], inputs=inputs_mt_biomed[:100], verbose=0)

['Post influenza vaccination encephalitis occurs after [MASK] [MASK] [MASK] [MASK].', 'Basal cell carcinoma recurrent following cryosurgery occurs after [MASK] [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Allergy to pea occurs after [MASK] [MASK].', 'Bite of unidentified snake with neurological signs occurs after [MASK] [MASK].', 'Allergy to hypothalamic hormone occurs after [MASK] [MASK].', 'Late effect of accidental injury occurs after [MASK] [MASK].', 'Radiotherapy scar occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Atonic postpartum hemorrhage occurs after [MASK] [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK] [MASK].', 'Calcinosis following localized fat necrosis occurs after [MASK] [MASK].', 'Apraxia due to and following spontaneous intracerebral hemorrhage occurs after [MASK] [MASK] [MASK].', 'Arteriovenous shunt stenosis occurs after [MAS

In [None]:
# Build the multi-token prompts for the occurs_data examples, with the mask count taken from the BERT base tokenizer.
inputs_mt_bert = prepare_inputs_multi_token(bert_models['BERT_base'], occurs_data, default_prompt)
print(inputs_mt_bert)
# making predictions using Greedy initial + Order refinement approach
# NOTE: only the first 100 prompts are passed to the model
pred_tokens_bert, pred_decoded_bert = fill_masks_by_confidence_order_refinement(bert_models['BERT_base'], inputs=inputs_mt_bert[:100], verbose=0)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['Post influenza vaccination encephalitis occurs after [MASK] [MASK] [MASK] [MASK].', 'Basal cell carcinoma recurrent following cryosurgery occurs after [MASK] [MASK] [MASK] [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Allergy to pea occurs after [MASK] [MASK] [MASK] [MASK].', 'Bite of unidentified snake with neurological signs occurs after [MASK] [MASK].', 'Allergy to hypothalamic hormone occurs after [MASK] [MASK] [MASK] [MASK].', 'Late effect of accidental injury occurs after [MASK] [MASK].', 'Radiotherapy scar occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].', 'Atonic postpartum hemorrhage occurs after [MASK] [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK] [MASK].', 'Calcinosis following localized fat necrosis occurs after [MASK] [MASK] [MASK].', 'Apraxia due to and following spontaneous intracerebral hemo

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# must be list of lists for top_n scenario (TO DO - change fill masks functions return value type)
# Only bare predictions are wrapped, so re-running this cell does not nest the lists a second time.
pred_decoded_roberta = [pred if isinstance(pred, list) else [pred] for pred in pred_decoded_roberta]

top_1_acc_multitoken = compute_accuracy(pred_decoded_roberta, tails)
print(f"RoBERTa base multitoken top 1 accuracy: {top_1_acc_multitoken} %")

RoBERTa base multitoken top 1 accuracy: 4.0 %


In [None]:
# compute_accuracy is given one list of candidate answers per example.
# Only bare predictions are wrapped, so re-running this cell does not nest the lists a second time.
pred_decoded_bert = [pred if isinstance(pred, list) else [pred] for pred in pred_decoded_bert]

top_1_acc_multitoken = compute_accuracy(pred_decoded_bert, tails)
print(f"BERT base multitoken top 1 accuracy: {top_1_acc_multitoken} %")

BERT base multitoken top 1 accuracy: 2.0 %


In [None]:
# compute_accuracy is given one list of candidate answers per example.
# Only bare predictions are wrapped, so re-running this cell does not nest the lists a second time.
pred_decoded_biomed = [pred if isinstance(pred, list) else [pred] for pred in pred_decoded_biomed]

top_1_acc_multitoken = compute_accuracy(pred_decoded_biomed, tails)
print(f"BioMedBERT base full multitoken top 1 accuracy: {top_1_acc_multitoken} %")

BioMedBERT base full multitoken top 1 accuracy: 6.0 %


In [22]:
# needed for compute_cos_sim_accuracy_multitoken
def replace_masks_multitoken(model_checkpoint, masked_inputs, tails):
  """Turn masked sentences into full sentences by substituting candidate answers for the masks.

  For every masked input i and every answer in tails[i], the first mask is replaced with the
  answer (stripped and lower-cased), all remaining masks are removed, and the sentence is
  normalised to end with exactly one '.' and no whitespace before it.

  Args:
    model_checkpoint: checkpoint name; if it contains 'roberta' the mask string is '<mask>',
      otherwise '[MASK]'.
    masked_inputs: sentences containing one or more mask strings.
    tails: for every masked input, the list of answers to insert (tails[i] belongs to masked_inputs[i]).

  Returns:
    List with one entry per masked input; each entry is the list of full sentences, one per answer.
  """
  mask_string = '<mask>' if 'roberta' in model_checkpoint else '[MASK]'

  full_sentences = []
  for i, masked_input in enumerate(masked_inputs):
    example_sentences = []
    for tail in tails[i]:
      # replacing first occurence of mask_token with tail
      sentence = masked_input.replace(mask_string, tail.strip().lower(), 1)
      # removing the other occurences together with the space in front of them, so that masks in the
      # middle of a sentence do not leave double spaces behind; masks without a leading space are
      # handled by the second replace
      sentence = sentence.replace(' ' + mask_string, '').replace(mask_string, '')
      # removing trailing '.' and extra spaces, then closing the sentence with a single '.'
      example_sentences.append(sentence.rstrip('.').rstrip() + '.')

    full_sentences.append(example_sentences)

  return full_sentences

In [None]:
# Preview the reference sentences: the first 100 BioMedBERT prompts with their masks replaced by each true answer in tails.
print(replace_masks_multitoken(biomedbert_models['BioMedBERT_base_full'], inputs_mt_biomed[:100], tails[:100]))

[['Post influenza vaccination encephalitis occurs after administration of influenza vaccine.'], ['Basal cell carcinoma recurrent following cryosurgery occurs after cryosurgery.'], ['Adverse effect from PUVA photochemotherapy occurs after light therapy.', 'Adverse effect from PUVA photochemotherapy occurs after photochemotherapy with psoralens and ultraviolet a.'], ['Allergy to pea occurs after allergic sensitization.'], ['Bite of unidentified snake with neurological signs occurs after animal bite.'], ['Allergy to hypothalamic hormone occurs after allergic sensitization.'], ['Late effect of accidental injury occurs after traumatic injury.'], ['Radiotherapy scar occurs after procedure.', 'Radiotherapy scar occurs after radiation oncology and/or radiotherapy.'], ['Atonic postpartum hemorrhage occurs after delivery procedure.'], ['Late effect of skin and subcutaneous tissue injury occurs after injury.', 'Late effect of skin and subcutaneous tissue injury occurs after traumatic injury.'], [

In [None]:
# Preview the predicted sentences: the same prompts with their masks replaced by the decoded BioMedBERT predictions.
# NOTE: pred_decoded_biomed must already be a list of lists here (one list of predictions per example).
print(replace_masks_multitoken(biomedbert_models['BioMedBERT_base_full'], inputs_mt_biomed[:100], pred_decoded_biomed[:100]))

[['Post influenza vaccination encephalitis occurs after a single influenza vaccination.'], ['Basal cell carcinoma recurrent following cryosurgery occurs after 5 years.'], ['Adverse effect from PUVA photochemotherapy occurs after photo - mold mold mold mold mold mold mold mold.'], ['Allergy to pea occurs after pea exposure.'], ['Bite of unidentified snake with neurological signs occurs after head trauma.'], ['Allergy to hypothalamic hormone occurs after adrenalectomy.'], ['Late effect of accidental injury occurs after 5 years.'], ['Radiotherapy scar occurs after the molding of the mold.'], ['Atonic postpartum hemorrhage occurs after cesarean delivery.'], ['Late effect of skin and subcutaneous tissue injury occurs after the surgery.'], ['Calcinosis following localized fat necrosis occurs after bariatric surgery.'], ['Apraxia due to and following spontaneous intracerebral hemorrhage occurs after craniotomy.'], ['Arteriovenous shunt stenosis occurs after transatrial shunt surgery.'], ['Pos

In [23]:
# multi token scenario - using inputs format (mask tokens in a sequence at the end of sentence):
# combine embeddings of all tokens of the predicted answer - from first index of mask token to '.' token (stop at any special or punctuation token (???))
# combine embeddings of all tokens of the true answer - from first index of mask token to '.' token (handling different lengths of answers to the same question)
def compute_cos_sim_accuracy_multitoken(model_checkpoint:str, original_inputs, predictions, tails, verbose=0, threshold=0.95):
  """Percentage of examples whose predicted answer phrase is close enough to a true answer phrase.

  For every example the predicted and the true sentences are built with replace_masks_multitoken,
  each answer phrase (tokens from the first mask position up to a stop token) is embedded as the
  L2-normalized mean of the averaged last four hidden layers, and the example counts as a hit when
  the highest cosine similarity between any predicted and any true phrase reaches the threshold.

  Args:
    model_checkpoint: checkpoint name passed to AutoTokenizer / TFAutoModel.
    original_inputs: masked input sentences, either one string per example or one list of strings
      per example (then only the first string of every example is used to locate the first mask).
    predictions: predicted answers, forwarded to replace_masks_multitoken
      (presumably one list of answer strings per example - confirm against that helper).
    tails: true answers, forwarded to replace_masks_multitoken in the same way.
    verbose: if truthy, print every pairwise similarity and the best pair of each example.
    threshold: minimum cosine similarity that counts as a hit (default 0.95, the value that
      used to be hard-coded).

  Returns:
    Accuracy in percent (0.0 for empty input).

  Raises:
    ValueError: if the model does not return its hidden states as a tuple.
    IndexError: if an input sentence contains no mask token.
  """
  # Different true answers lengths adjustment:
  # if original_inputs is a list of lists, use the first input of every example for the initial
  # tokenization - the first mask token index is the same for every input of the same example
  if isinstance(original_inputs, list) and all(isinstance(original_input, list) for original_input in original_inputs):
    original_inputs = [original_input[0] for original_input in original_inputs]

  n = len(original_inputs)
  if n == 0:
    # nothing to score: avoid loading the model and dividing by zero
    return 0.0

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModel.from_pretrained(model_checkpoint, from_pt=True, output_hidden_states=True)

  # Adjusting inputs for RoBERTa models
  if 'roberta' in model_checkpoint:
    original_inputs = [change_input_format(original_input) for original_input in original_inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  # Tokenizing original inputs (by default max_length = tokenizer.model_max_length)
  tokenized_inputs = tokenizer(original_inputs, return_tensors="tf", padding=True, truncation=True)

  # Index of the first mask token of every input = start of the answer phrase
  input_id_rows = tokenized_inputs["input_ids"].numpy()
  first_mask_token_indices = [np.argwhere(row == tokenizer.mask_token_id)[0, 0] for row in input_id_rows]

  predicted_sentences = replace_masks_multitoken(model_checkpoint, original_inputs, predictions)
  true_sentences = replace_masks_multitoken(model_checkpoint, original_inputs, tails)

  # Stop tokens do not depend on the example, so they are looked up once:
  # a predicted phrase ends at the first punctuation token (except ',', ':', ';', '-'),
  # a true phrase ends at the first '.' token
  punctuation_ids = tokenizer.convert_tokens_to_ids([char for char in string.punctuation if char not in {',', ':', ';', '-'}])
  period_token_ids = [tokenizer.convert_tokens_to_ids(".")]

  hits = 0
  # for every example in relation dataset
  for i in range(n):
    start_index = first_mask_token_indices[i]

    pred_ids, pred_embeddings, end_indices_pred = _embed_answer_phrases(tokenizer, model, predicted_sentences[i], punctuation_ids, start_index)
    true_ids, true_embeddings, end_indices_true = _embed_answer_phrases(tokenizer, model, true_sentences[i], period_token_ids, start_index)

    highest_similarity = -1

    if verbose:
      # decoded phrases are only needed for the printed report
      pred_phrases = [tokenizer.decode(pred_ids[j][start_index:end_indices_pred[j]]) for j in range(len(pred_embeddings))]
      true_phrases = [tokenizer.decode(true_ids[k][start_index:end_indices_true[k]]) for k in range(len(true_embeddings))]
      most_similar_pred = None
      most_similar_true = None

    for j in range(len(pred_embeddings)):
      for k in range(len(true_embeddings)):

        similarity = cosine_similarity(tf.reshape(pred_embeddings[j], shape=(1, -1)), tf.reshape(true_embeddings[k], shape=(-1, 1)))[0][0]

        if verbose:
          print(f"Cosine similarity between '{pred_phrases[j]}' and '{true_phrases[k]}': {similarity}")

        if similarity > highest_similarity:
          highest_similarity = similarity
          if verbose:
            most_similar_pred = pred_phrases[j]
            most_similar_true = true_phrases[k]

    if verbose:
      print(f"\033[1mHighest similarity is between '{most_similar_pred}' and '{most_similar_true}': {highest_similarity}\033[0m")
      print('--------------------------------------------')

    if highest_similarity >= threshold:
      hits += 1

  return (hits/n) * 100


def _embed_answer_phrases(tokenizer, model, sentences, stop_token_ids, start_index):
  """Embed the answer phrase of every sentence of one example.

  Returns a triple (token id rows, phrase embeddings, phrase end indices) with one entry per sentence.
  """
  sentences_tkz = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True)
  hidden_states = model(**sentences_tkz).hidden_states

  if not isinstance(hidden_states, tuple):
    raise ValueError("Model configuration does not support returning hidden states!")

  # combining last 4 layers for more robust results
  combined_last_four_layers = tf.reduce_mean(tf.stack([hidden_states[layer] for layer in range(-4, 0)]), axis=0)

  token_id_rows = sentences_tkz["input_ids"].numpy()
  embeddings = []
  end_indices = []

  for row, token_ids in enumerate(token_id_rows):
    end_index = _phrase_end_index(token_ids, stop_token_ids, tokenizer.all_special_ids, start_index)

    # embeddings of the phrase tokens: from the start index up to (not including) the end index
    phrase_embedding = combined_last_four_layers[row, start_index:end_index, :]

    # combining (mean) and normalizing the embeddings of the whole phrase
    embeddings.append(tf.nn.l2_normalize(tf.reduce_mean(phrase_embedding, axis=0), axis=0))
    end_indices.append(end_index)

  return token_id_rows, embeddings, end_indices


def _phrase_end_index(token_ids, stop_token_ids, special_token_ids, start_index):
  """Return the exclusive end position of the answer phrase that starts at start_index.

  The phrase ends at the first stop token after start_index; without one it ends at the first
  special token after start_index, and without that at the end of the row.
  """
  stop_positions = np.where(np.isin(token_ids, stop_token_ids))[0]
  stop_positions = stop_positions[stop_positions > start_index]
  if stop_positions.size > 0:
    return stop_positions[0]

  # Only special tokens AFTER the phrase start may close the phrase: the row begins with
  # [CLS] / <s>, so taking the first special token of the whole row would give an empty span
  # (and a NaN mean embedding)
  special_positions = np.where(np.isin(token_ids, special_token_ids))[0]
  special_positions = special_positions[special_positions > start_index]
  if special_positions.size > 0:
    return special_positions[0]

  return len(token_ids)

In [None]:
# Cosine-similarity accuracy on the first 100 examples; verbose=1 also prints every prediction / true answer similarity
print(f"{compute_cos_sim_accuracy_multitoken(biomedbert_models['BioMedBERT_base_full'], inputs_mt_biomed[:100], pred_decoded_biomed[:100], tails[:100], verbose=1)} %")

Cosine similarity between 'a single influenza vaccination' and 'administration of influenza vaccine': 0.9735644459724426
[1mHighest similarity is between 'a single influenza vaccination' and 'administration of influenza vaccine': 0.9735644459724426[0m
--------------------------------------------
Cosine similarity between '5 years' and 'cryosurgery': 0.870233952999115
[1mHighest similarity is between '5 years' and 'cryosurgery': 0.870233952999115[0m
--------------------------------------------
Cosine similarity between 'photo - mold mold mold mold mold mold mold mold' and 'light therapy': 0.8694904446601868
Cosine similarity between 'photo - mold mold mold mold mold mold mold mold' and 'photochemotherapy with psoralens and ultraviolet a': 0.8768989443778992
[1mHighest similarity is between 'photo - mold mold mold mold mold mold mold mold' and 'photochemotherapy with psoralens and ultraviolet a': 0.8768989443778992[0m
--------------------------------------------
Cosine similarity b

# Restricted candidate set

In [24]:
# For every relation:
# Idea: 1) Extract all possible true objects from relation (v2: all relations (???)) dataset and tokenize them (different tokens for different tokenizers)
#       2) Merge all extracted tokens in a list, add special tokens and punctuation tokens (???)
#       3) In predicting process: consider only the logits of the tokens from the list  - Additional parameter for get_predictions and fill_masks functions -> candidate_set_tokens

# making one list of all answers
def make_token_candidate_set(model_checkpoint:str, tails):
  """Collect the ids of all tokens that may be predicted when the candidate set is restricted.

  Args:
    model_checkpoint: checkpoint name passed to AutoTokenizer (token ids are tokenizer specific).
    tails: list of lists of true answer strings (one inner list per example).

  Returns:
    Sorted list of unique token ids: every token of every answer, every special token of the
    tokenizer and every punctuation character of string.punctuation.
  """
  # One-pass flattening (sum(tails, []) re-copies the growing list for every example)
  # NOTE(review): answers are lower-cased before tokenization; with a cased tokenizer this may
  # produce different tokens than the original-case answers - confirm this is intended
  tails_flattened = [tail.lower().strip() for answers in tails for tail in answers]

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  # No padding needed: only the set of ids matters (the tokenizer wraps every answer in
  # [CLS] / <s> and [SEP] / </s>, which are special tokens and are added below anyway)
  tail_token_ids = []
  if tails_flattened:
    for token_ids in tokenizer(tails_flattened, truncation=True)["input_ids"]:
      tail_token_ids.extend(token_ids)

  special_token_ids = list(tokenizer.all_special_ids)
  punctuation_token_ids = list(tokenizer.convert_tokens_to_ids([char for char in string.punctuation]))

  all_tokens = np.unique(np.array(tail_token_ids + special_token_ids + punctuation_token_ids, dtype=np.int64))

  return list(all_tokens)

In [None]:
# Build the restricted candidate token set from `tails` and report how many token ids it contains
all_tokens_biomed = make_token_candidate_set(biomedbert_models['BioMedBERT_base_full'], tails)
print(len(all_tokens_biomed))

484


In [25]:
# restricted candidate_set_tokens added as parameter
def fill_masks_by_confidence_order_refinement_r(model_checkpoint: str, inputs: list[str], candidate_set_tokens, top_n=5, max_iter=10, verbose=0):
    """Fill the mask tokens of every sentence using only tokens from a restricted candidate set.

    Phase 1 (greedy, confidence order): the model is run repeatedly; in every step the still
    unfilled mask position whose best candidate token has the highest logit is filled.
    Phase 2 (refinement): every originally masked position is re-predicted in sentence order
    with all other predictions in place, until nothing changes or max_iter passes were made.

    Args:
        model_checkpoint: checkpoint name passed to AutoTokenizer / TFAutoModelForMaskedLM.
        inputs: sentences containing mask tokens.
        candidate_set_tokens: list of token ids the predictions are restricted to.
        top_n: currently unused (only the single best prediction is produced).
        max_iter: maximum number of refinement passes.
        verbose: if truthy, print the model summary and every prediction step.

    Returns:
        (outputs, outputs_decoded): for every input the predicted token ids in sentence order
        and their decoded text.
    """
    # Nothing to predict - do not load a model for it
    if len(inputs) == 0:
        return [], []

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(input_text) for input_text in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    mask_token_id = tokenizer.mask_token_id
    # tensor version of the candidate ids, used to map positions inside the candidate set back to token ids
    candidate_set_tokens_tensor = tf.constant(candidate_set_tokens, dtype=tf.int32)

    outputs = []
    outputs_decoded = []

    for input_text in inputs:
        if verbose:
            print(f"Initial input sentence: {input_text}")

        tokenized_input = tokenizer(input_text, return_tensors="tf")
        input_ids = tokenized_input["input_ids"]

        # model inputs that never change while tokens are filled in
        # (token_type_ids are not produced by every tokenizer, e.g. not for RoBERTa models)
        fixed_model_inputs = {'attention_mask': tokenized_input['attention_mask']}
        if 'token_type_ids' in tokenized_input:
            fixed_model_inputs['token_type_ids'] = tokenized_input['token_type_ids']

        # needed for refinement phase
        initial_mask_token_indices = np.where(input_ids.numpy()[0] == mask_token_id)[0]

        if verbose:
            print(f"Mask position: {initial_mask_token_indices}")

        prediction_dict = {}

        # Greedy initial phase.
        # The unfilled positions are tracked explicitly instead of re-scanning input_ids for the
        # mask id: the candidate set may contain the mask token id itself (make_token_candidate_set
        # adds every special token id), and a predicted mask token would otherwise keep its
        # position "unfilled" forever.
        pending_positions = initial_mask_token_indices.copy()
        while pending_positions.size > 0:

            # logits at the unfilled positions, restricted to the candidate tokens
            token_logits = model(input_ids=input_ids, **fixed_model_inputs).logits[0]
            mask_token_logits = tf.gather(token_logits, pending_positions)
            mask_token_logits_candidates = tf.gather(mask_token_logits, candidate_set_tokens, axis=1)

            # tf.math.top_k returns the k top values and indices along the last dimension
            top_k_values, top_k_indices = tf.math.top_k(mask_token_logits_candidates, k=1)

            # flatten (num_positions, 1) -> (num_positions,); works for a single position too
            top_k_values = tf.reshape(top_k_values, [-1])
            # indices inside the candidate set -> original token ids
            top_k_token_ids = tf.gather(candidate_set_tokens_tensor, tf.reshape(top_k_indices, [-1]))

            best = int(tf.argmax(top_k_values).numpy())
            most_confident_mask_position = pending_positions[best]
            most_confident_token = top_k_token_ids[best].numpy()

            if verbose:
                print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

            prediction_dict[most_confident_mask_position] = most_confident_token
            # [[row, column]] = list of tensor coordinates to change
            input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
            pending_positions = np.delete(pending_positions, best)

        # predictions in sentence order (dict keeps this order while values are updated below)
        prediction_dict = dict(sorted(prediction_dict.items()))
        prediction_i = list(prediction_dict.values())
        prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)

        if verbose:
            print(f"Initial predicted tokens after Greedy approach: {prediction_i}: {prediction_i_decoded}")

        # Order refinement
        for iteration in range(max_iter):
            if verbose:
                print(f"Refinement iteration: {iteration}")

            updated_tokens = 0

            for mask_index in initial_mask_token_indices:

                token_logits = model(input_ids=input_ids, **fixed_model_inputs).logits[0]
                mask_token_logits = token_logits[mask_index, :]

                # getting the top predicted token from candidate set
                top_token = candidate_set_tokens[np.argmax(mask_token_logits.numpy()[candidate_set_tokens])]

                if verbose:
                    print(f"{mask_index}: {top_token}")

                if prediction_dict[mask_index] != top_token:
                    prediction_dict[mask_index] = top_token
                    updated_tokens += 1

                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])

            if updated_tokens == 0:
                if verbose:
                    print(f"\033[1mConvergence reached in refinement iteration {iteration}!\033[0m")
                break

            prediction_i = list(prediction_dict.values())
            prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)
            if verbose:
                print(f"Predicted tokens in refinement iteration {iteration}: {prediction_i}: {prediction_i_decoded}")

        if verbose:
            print('---------------------------------------------------------------------------------------------')

        outputs.append(prediction_i)
        outputs_decoded.append(prediction_i_decoded)

    return outputs, outputs_decoded

In [None]:
# Predict the first 100 examples with candidates restricted to all_tokens_biomed; verbose=1 prints the model summary and every prediction step
pred_tokens_biomed_r, pred_decoded_biomed_r = fill_masks_by_confidence_order_refinement_r(biomedbert_models['BioMedBERT_base_full'], inputs=inputs_mt_biomed[:100], candidate_set_tokens=all_tokens_biomed, verbose=1)

Chosen model: microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
Model: "tf_bert_for_masked_lm_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial input sentence: Post influenza vaccination encephalitis occurs after [MASK] [MASK] [MASK] [MASK].
Mask position: [ 7  8  9 10]
7507: ['vaccination'] - index: 10
7108: ['influenza'] - index: 9
43: ['a'] - index: 7
4825: ['double'] - index: 8
Initial predicted tokens after Greedy approach: [43, 4825, 7108, 750

In [None]:
# Wrap every decoded prediction in a one-element list (one prediction per example).
# The isinstance check keeps the cell idempotent: re-running it does not nest the predictions again.
pred_decoded_biomed_r = [pred if isinstance(pred, list) else [pred] for pred in pred_decoded_biomed_r]

top_1_acc_multitoken = compute_accuracy(pred_decoded_biomed_r, tails)
print(f"BioMedBERT base full multitoken top 1 accuracy after restricting candidate set: {top_1_acc_multitoken:.2f} %")

BioMedBERT base full multitoken top 1 accuracy after restricting candidate set: 7.00 %


In [None]:
# Cosine-similarity accuracy of the restricted-candidate predictions on the first 100 examples (verbose=1 prints every compared pair)
print(f"{compute_cos_sim_accuracy_multitoken(biomedbert_models['BioMedBERT_base_full'], inputs_mt_biomed[:100], pred_decoded_biomed_r[:100], tails[:100], verbose=1):.2f} %")

Cosine similarity between 'a double influenza vaccination' and 'administration of influenza vaccine': 0.9685256481170654
[1mHighest similarity is between 'a double influenza vaccination' and 'administration of influenza vaccine': 0.9685256481170654[0m
--------------------------------------------
Cosine similarity between 'surgical excision' and 'cryosurgery': 0.9180818200111389
[1mHighest similarity is between 'surgical excision' and 'cryosurgery': 0.9180818200111389[0m
--------------------------------------------
Cosine similarity between 'the patient in the event of photochemotherapy accident' and 'light therapy': 0.9287478923797607
Cosine similarity between 'the patient in the event of photochemotherapy accident' and 'photochemotherapy with psoralens and ultraviolet a': 0.9429712295532227
[1mHighest similarity is between 'the patient in the event of photochemotherapy accident' and 'photochemotherapy with psoralens and ultraviolet a': 0.9429712295532227[0m
---------------------

## Adjustment for true answers of different lengths

In [26]:
# Problem: in the approach used so far, the model always tries to fill in as many tokens as there are in the longest correct answer
# New approach: the model tries to fill in as many mask tokens as there are in each correct answer individually (minor adjustments in functions)
def prepare_inputs_multi_token_v2(model_checkpoint:str, data, prompt:str):
  """Build the masked input sentences of every example, one sentence per true answer.

  Args:
    model_checkpoint: checkpoint name passed to AutoTokenizer (the number of tokens of an
      answer, and therefore the number of masks, depends on the tokenizer).
    data: table with a 'head_name' column (subject of the relation) and a 'tail_names' column
      (true answers of the example joined by ' || ').
    prompt: template in which '[X]' is replaced by the head name and '[Y] ' by the mask tokens.

  Returns:
    One list per row of data; it holds, for every true answer of that row, the prompt with as
    many '[MASK]' tokens (joined by spaces) as the answer has tokens.
  """
  heads = data['head_name'].tolist()
  tails = [tail_names.split(' || ') for tail_names in data['tail_names'].tolist()]

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  final_inputs = []
  for head, answers in zip(heads, tails):
    sentence = prompt.replace('[X]', head)

    # Answers are tokenized without padding, so every id list has the true length of its answer
    # and no model specific pad token id has to be guessed from the checkpoint name
    answers_token_ids = tokenizer(answers, truncation=True)["input_ids"]

    example_inputs = []
    for token_ids in answers_token_ids:
      # Ignoring [CLS] / <s> token at the beginning and [SEP] / </s> token at the end
      num_mask = len(token_ids) - 2
      # replacing object placeholder with as many mask tokens as the true answer has tokens
      example_inputs.append(sentence.replace('[Y] ', ' '.join('[MASK]' for _ in range(num_mask))))

    final_inputs.append(example_inputs)

  return final_inputs

In [None]:
# Build the v2 inputs (one masked sentence per true answer of every example) and print all of them
inputs_mt_biomed_v2 = prepare_inputs_multi_token_v2(biomedbert_models['BioMedBERT_base_full'], occurs_data, default_prompt)
print(inputs_mt_biomed_v2)

[['Post influenza vaccination encephalitis occurs after [MASK] [MASK] [MASK] [MASK].'], ['Basal cell carcinoma recurrent following cryosurgery occurs after [MASK] [MASK].'], ['Adverse effect from PUVA photochemotherapy occurs after [MASK] [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].'], ['Allergy to pea occurs after [MASK] [MASK].'], ['Bite of unidentified snake with neurological signs occurs after [MASK] [MASK].'], ['Allergy to hypothalamic hormone occurs after [MASK] [MASK].'], ['Late effect of accidental injury occurs after [MASK] [MASK].'], ['Radiotherapy scar occurs after [MASK].', 'Radiotherapy scar occurs after [MASK] [MASK] [MASK] [MASK] [MASK] [MASK].'], ['Atonic postpartum hemorrhage occurs after [MASK] [MASK].'], ['Late effect of skin and subcutaneous tissue injury occurs after [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK] [MASK].'], ['Calcinosis fo

In [27]:
# inputs type changed
# TO DO: top_n predictions
def fill_masks_by_confidence_order_refinement_v2(model_checkpoint: str, inputs: list[list[str]], candidate_set_tokens=None, top_n=5, max_iter=10, verbose=0):
    """Fill the mask tokens of every input sentence of every example.

    Every sentence is processed on its own: a greedy pass fills the masks in order of confidence,
    then refinement passes re-predict every masked position until nothing changes or max_iter
    passes were made.

    Args:
        model_checkpoint: checkpoint name passed to AutoTokenizer / TFAutoModelForMaskedLM.
        inputs: one list of masked sentences per example.
        candidate_set_tokens: list of token ids the predictions are restricted to;
            None means every token of the tokenizer vocabulary.
        top_n: currently unused (only the single best prediction is produced).
        max_iter: maximum number of refinement passes per sentence.
        verbose: if truthy, print the model summary and every prediction step.

    Returns:
        (outputs, outputs_decoded): per example a list with, for every sentence, the predicted
        token ids in sentence order, and the same structure with the decoded text.
    """
    # Nothing to predict - do not load a model for it
    if len(inputs) == 0:
        return [], []

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    # Adjusting inputs for RoBERTa models: inputs are nested here, and change_input_format is
    # applied to single sentences everywhere else, so it is applied per sentence (not per example)
    if 'roberta' in model_checkpoint:
        inputs = [[change_input_format(input_text) for input_text in example_inputs] for example_inputs in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values())

    # tensor version of the candidate ids, used to map positions inside the candidate set back to token ids
    candidate_set_tokens_tensor = tf.constant(candidate_set_tokens, dtype=tf.int32)

    outputs = []
    outputs_decoded = []

    # for every example
    for example_inputs in inputs:
        outputs_example = []
        outputs_example_decoded = []

        # for every input in example
        for input_text in example_inputs:
            prediction, prediction_decoded = _fill_sentence_by_confidence_order_refinement(
                tokenizer, model, input_text, candidate_set_tokens, candidate_set_tokens_tensor, max_iter, verbose)
            outputs_example.append(prediction)
            outputs_example_decoded.append(prediction_decoded)

        if verbose:
            print('---------------------------------------------------------------------------------------------')

        outputs.append(outputs_example)
        outputs_decoded.append(outputs_example_decoded)

    return outputs, outputs_decoded


def _fill_sentence_by_confidence_order_refinement(tokenizer, model, input_text, candidate_set_tokens, candidate_set_tokens_tensor, max_iter, verbose):
    """Fill every mask token of one sentence; returns (predicted token ids in sentence order, decoded text)."""
    if verbose:
        print(f"Initial input sentence: {input_text}")

    tokenized_input = tokenizer(input_text, return_tensors="tf")
    input_ids = tokenized_input["input_ids"]

    # model inputs that never change while tokens are filled in
    # (token_type_ids are not produced by every tokenizer, e.g. not for RoBERTa models)
    fixed_model_inputs = {'attention_mask': tokenized_input['attention_mask']}
    if 'token_type_ids' in tokenized_input:
        fixed_model_inputs['token_type_ids'] = tokenized_input['token_type_ids']

    # needed for refinement phase
    initial_mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

    if verbose:
        print(f"Mask position: {initial_mask_token_indices}")

    prediction_dict = {}

    # Greedy initial phase.
    # The unfilled positions are tracked explicitly instead of re-scanning input_ids for the mask
    # id: the candidate set may contain the mask token id itself, and a predicted mask token would
    # otherwise keep its position "unfilled" forever.
    pending_positions = initial_mask_token_indices.copy()
    while pending_positions.size > 0:

        # logits at the unfilled positions, restricted to the candidate tokens
        token_logits = model(input_ids=input_ids, **fixed_model_inputs).logits[0]
        mask_token_logits = tf.gather(token_logits, pending_positions)
        mask_token_logits_candidates = tf.gather(mask_token_logits, candidate_set_tokens, axis=1)

        # tf.math.top_k returns the k top values and indices along the last dimension
        top_k_values, top_k_indices = tf.math.top_k(mask_token_logits_candidates, k=1)

        # flatten (num_positions, 1) -> (num_positions,); works for a single position too
        top_k_values = tf.reshape(top_k_values, [-1])
        # indices inside the candidate set -> original token ids
        top_k_token_ids = tf.gather(candidate_set_tokens_tensor, tf.reshape(top_k_indices, [-1]))

        best = int(tf.argmax(top_k_values).numpy())
        most_confident_mask_position = pending_positions[best]
        most_confident_token = top_k_token_ids[best].numpy()

        if verbose:
            print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

        prediction_dict[most_confident_mask_position] = most_confident_token
        # [[row, column]] = list of tensor coordinates to change
        input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
        pending_positions = np.delete(pending_positions, best)

    # predictions in sentence order (dict keeps this order while values are updated below)
    prediction_dict = dict(sorted(prediction_dict.items()))
    prediction_i = list(prediction_dict.values())
    prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)

    if verbose:
        print(f"Initial predicted tokens after Greedy approach: {prediction_i}: {prediction_i_decoded}")

    # Order refinement
    for iteration in range(max_iter):
        if verbose:
            print(f"Refinement iteration: {iteration}")

        updated_tokens = 0

        for mask_index in initial_mask_token_indices:

            token_logits = model(input_ids=input_ids, **fixed_model_inputs).logits[0]
            mask_token_logits = token_logits[mask_index, :]

            # getting the top predicted token from candidate set
            top_token = candidate_set_tokens[np.argmax(mask_token_logits.numpy()[candidate_set_tokens])]

            if verbose:
                print(f"{mask_index}: {top_token}")

            if prediction_dict[mask_index] != top_token:
                prediction_dict[mask_index] = top_token
                updated_tokens += 1

            input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])

        if updated_tokens == 0:
            if verbose:
                print(f"\033[1mConvergence reached in refinement iteration {iteration}!\033[0m")
            break

        prediction_i = list(prediction_dict.values())
        prediction_i_decoded = tokenizer.decode(prediction_i, skip_special_tokens=True)
        if verbose:
            print(f"Predicted tokens in refinement iteration {iteration}: {prediction_i}: {prediction_i_decoded}")

    if verbose:
        print('---------------------------------------------------------------------------------------------')

    return prediction_i, prediction_i_decoded

In [None]:
# small adjustment made in compute_cos_sim_accuracy_multitoken (!!!): for nested inputs (a list of lists) only the first input of every example is used

In [None]:
# using whole relation dataset!
# Predictions are restricted to the tokens in all_tokens_biomed; verbose=1 prints the model summary and every prediction step
pred_tokens_biomed_v2, pred_decoded_biomed_v2 = fill_masks_by_confidence_order_refinement_v2(biomedbert_models['BioMedBERT_base_full'], inputs=inputs_mt_biomed_v2, candidate_set_tokens=all_tokens_biomed, verbose=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1mConvergence reached in refinement iteration 0![0m
---------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------
Initial input sentence: Traumatic arthropathy of the lower leg occurs after [MASK].
Mask position: [10]
7176: ['trauma'] - index: 10
Initial predicted tokens after Greedy approach: [7176]: trauma
Refinement iteration: 0
10: 7176
[1mConvergence reached in refinement iteration 0![0m
---------------------------------------------------------------------------------------------
Initial input sentence: Traumatic arthropathy of the lower leg occurs after [MASK] [MASK].
Mask position: [10 11]
7176: ['trauma'] - index: 11
18047: ['blunt'] - index: 10
Initial predicted tokens after Greedy approach: [18047, 7176]: blunt trauma
Refinement iteration: 0
10: 18047
11: 7176
[1mConvergence reached

In [None]:
# on whole relation dataset: accuracy reported by compute_accuracy for the v2 predictions against `tails`
top_1_acc_multitoken_v2 = compute_accuracy(pred_decoded_biomed_v2, tails)
print(f"BioMedBERT base full multitoken top 1 accuracy after restricting candidate set (v2): {top_1_acc_multitoken_v2:.2f} %")

BioMedBERT base full multitoken top 1 accuracy after restricting candidate set (v2): 8.60 %


In [None]:
# on 10% of relation dataset
# (first 100 examples; the nested v2 inputs are reduced to one input per example inside compute_cos_sim_accuracy_multitoken)
print(f"{compute_cos_sim_accuracy_multitoken(biomedbert_models['BioMedBERT_base_full'], inputs_mt_biomed_v2[:100], pred_decoded_biomed_v2[:100], tails[:100], verbose=1):.2f} %")

Cosine similarity between 'a double influenza vaccination' and 'administration of influenza vaccine': 0.9685256481170654
[1mHighest similarity is between 'a double influenza vaccination' and 'administration of influenza vaccine': 0.9685256481170654[0m
--------------------------------------------
Cosine similarity between 'surgical excision' and 'cryosurgery': 0.9180818200111389
[1mHighest similarity is between 'surgical excision' and 'cryosurgery': 0.9180818200111389[0m
--------------------------------------------
Cosine similarity between 'the administration' and 'light therapy': 0.9177975058555603
Cosine similarity between 'the administration' and 'photochemotherapy with psoralens and ultraviolet a': 0.9099854230880737
Cosine similarity between 'the patient in the event of photochemotherapy accident' and 'light therapy': 0.9287481307983398
Cosine similarity between 'the patient in the event of photochemotherapy accident' and 'photochemotherapy with psoralens and ultraviolet a': 0

## Final MLM accuracy measuring function (multi-token)

In [28]:
# TODO: top_n
# using different true answers lengths adjustment
def compute_mlm_top_n_accuracy_multitoken(model_checkpoint:str, accuracy_function:Callable, multitoken_fun:Callable, relation_dataset:str, restricted_candidate_set:bool, dataset_frac=1 , top_n=5, random_state=123, verbose=0, **kwargs):
  """Measure multitoken MLM accuracy of one model checkpoint on one relation dataset.

  Args:
    model_checkpoint: checkpoint identifier forwarded to the input-preparation,
      prediction and accuracy helpers.
    accuracy_function: either compute_accuracy or compute_cos_sim_accuracy_multitoken.
    multitoken_fun: mask-filling strategy; currently only
      fill_masks_by_confidence_order_refinement_v2 is supported.
    relation_dataset: path of a CSV file with columns "head_name", "rel" and "tail_names".
    restricted_candidate_set: when exactly True, a token candidate set is built from the
      true answers and passed to multitoken_fun; otherwise None is passed.
    dataset_frac: fraction of the dataset to sample (for quicker testing).
    top_n: forwarded to multitoken_fun.
    random_state: seed used for sampling the dataset.
    verbose: forwarded to the accuracy function (prediction always runs with verbose=0).
    **kwargs: only 'max_iter' is read; when present it is forwarded to multitoken_fun
      (approaches with refinement).

  Returns:
    Whatever the selected accuracy function returns.

  Raises:
    ValueError: for an unsupported multitoken_fun / accuracy_function, an empty data
      sample, or a relation without a default prompt.
  """

  # Validate both callables up front so that a wrong argument fails immediately,
  # not after the dataset was loaded and the (expensive) mask filling has run.
  # Add restricted candidate set and different true answers lengths adjustment for other approaches too (like for confidence_order v2) (!!!)
  # allowed_funs = [fill_masks_independently_v2, fill_masks_autoregressively_v2, fill_masks_by_confidence_v2, fill_masks_autoregressively_with_refinement_v2, fill_masks_by_confidence_order_refinement_v2]
  allowed_funs = [fill_masks_by_confidence_order_refinement_v2]
  if multitoken_fun not in allowed_funs:
    raise ValueError("Unsupported multitoken prediction function")

  if accuracy_function not in [compute_accuracy, compute_cos_sim_accuracy_multitoken]:
    raise ValueError("Unsupported accuracy functions")

  data = pd.read_csv(relation_dataset, usecols=["head_name", "rel", "tail_names"])
  # For quicker testing due to resource limitations
  data_chunk = data.sample(frac=dataset_frac, random_state=random_state).reset_index(drop=True)
  if data_chunk.empty:
    raise ValueError(f"No rows sampled from '{relation_dataset}' (dataset_frac={dataset_frac})")

  # The relation name of the first sampled row selects the prompt for the whole chunk
  rel_name = data_chunk['rel'][0]
  matching_prompts = prompts.loc[prompts['pid'] == rel_name, 'default_prompt'].tolist()
  if not matching_prompts:
    raise ValueError(f"No default prompt found for relation '{rel_name}'")
  default_prompt = matching_prompts[0]

  model_inputs = prepare_inputs_multi_token_v2(model_checkpoint, data_chunk, default_prompt)

  # Every example can have several true answers separated by ' || '
  true_objects = [tail_names.split(' || ') for tail_names in data_chunk['tail_names'].tolist()]

  candidate_set_tokens = None
  if restricted_candidate_set is True:
    candidate_set_tokens = make_token_candidate_set(model_checkpoint, true_objects)

  # max_iter is only forwarded when given (approaches with refinement)
  refinement_kwargs = {'max_iter': kwargs['max_iter']} if 'max_iter' in kwargs else {}
  _ , predictions = multitoken_fun(model_checkpoint, model_inputs, candidate_set_tokens, top_n, verbose=0, **refinement_kwargs)

  if accuracy_function == compute_accuracy:
    return compute_accuracy(predictions, true_objects, verbose=verbose)
  return compute_cos_sim_accuracy_multitoken(model_checkpoint, model_inputs, predictions, true_objects, verbose=verbose)

In [None]:
relation = 'occurs_after'
print(f"Relation: \033[1m{relation}\033[0m")
print('BERT models multitoken exact accuracy:')

# Every BERT checkpoint is evaluated on a 10 % sample of the relation dataset
for k, v in bert_models.items():
  acc_mt = compute_mlm_top_n_accuracy_multitoken(
      model_checkpoint=v,
      accuracy_function=compute_accuracy,
      multitoken_fun=fill_masks_by_confidence_order_refinement_v2,
      relation_dataset=f'{relation}_1000.csv',
      restricted_candidate_set=True,
      dataset_frac=0.1,
      verbose=1,
      max_iter=10,
  )
  print(f"{k}: {acc_mt:.2f}%\n")

Relation: [1moccurs_after[0m
BERT models multitoken exact accuracy:
Predictions: ['termination of pregnancy', 'surgical termination of pregnancy']
True answers: ['termination of pregnancy', 'induced termination of pregnancy']
[1mTrue prediction![0m
Predictions: ['trauma : traumatic event or traumatic event', 'blood injection']
True answers: ['implantation of prosthetic device', 'surgical procedure']
Predictions: ['acute acute blood test']
True answers: ['congenital syphilis']
Predictions: ['acute traumatic brain injury']
True answers: ['disorder due to infection']
Predictions: ['acute blood blood test']
True answers: ['allergic sensitization']
Predictions: ['her pregnancy pregnancy test']
True answers: ['allergic sensitization']
Predictions: ['procedure', 'surgical procedure. surgical, or surgical procedure']
True answers: ['procedure', 'radiation oncology and/or radiotherapy']
[1mTrue prediction![0m
Predictions: ['patent duct or patent ductular duct repair']
True answers: ['repa

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Predictions: ['birth - birth', '. birth..']
True answers: ['termination of pregnancy', 'induced termination of pregnancy']
Predictions: ['birth. congenital heart valve repair test', 'heart transplant']
True answers: ['implantation of prosthetic device', 'surgical procedure']
Predictions: ['birth. open neck']
True answers: ['congenital syphilis']
Predictions: ['birth, or pregnancy']
True answers: ['disorder due to infection']
Predictions: ['acute injection..']
True answers: ['allergic sensitization']
Predictions: ['birth, or pregnancy']
True answers: ['allergic sensitization']
Predictions: ['procedure', 'procedure. - - ( trans. )']
True answers: ['procedure', 'radiation oncology and/or radiotherapy']
[1mTrue prediction![0m
Predictions: ['patent ductus repair and patent duct repair']
True answers: ['repair of patent ductus arteriosus']
Predictions: ['birth.... )']
True answers: ['acute poliomyelitis']
Predictions: ['injury']
True answers: ['injury']
[1mTrue prediction![0m
Predictions

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Predictions: ['termination of pregnancy', 'misorctation']
True answers: ['termination of pregnancy', 'induced termination of pregnancy']
[1mTrue prediction![0m
Predictions: ['acute blood transfusion ( trans )', 'heart injury']
True answers: ['implantation of prosthetic device', 'surgical procedure']
Predictions: ['syphilis infection']
True answers: ['congenital syphilis']
Predictions: ['rheuma arthritis']
True answers: ['disorder due to infection']
Predictions: ['surgical termination of pregnancy']
True answers: ['allergic sensitization']
Predictions: ['insertion of synthetic implant']
True answers: ['allergic sensitization']
Predictions: ['procedure', 'procedure of cardioctomyctomy']
True answers: ['procedure', 'radiation oncology and/or radiotherapy']
[1mTrue prediction![0m
Predictions: ['acute patent ductus arteriosus repair']
True answers: ['repair of patent ductus arteriosus']
Predictions: ['acute poliomyelitis']
True answers: ['acute poliomyelitis']
[1mTrue prediction![0m
P

# Consistency analysis

In [29]:
# Load the relation paraphrases table; per the displayed output it has an
# 'original_relation' column followed by 'paraphrase_1' ... 'paraphrase_9'.
paraphrases = pd.read_csv('relations_paraphrases_beta.csv')
# Bare last expression so that the notebook renders the frame
paraphrases

Unnamed: 0,original_relation,paraphrase_1,paraphrase_2,paraphrase_3,paraphrase_4,paraphrase_5,paraphrase_6,paraphrase_7,paraphrase_8,paraphrase_9
0,disease has normal tissue origin,disease originates from normal tissue,disease arises from normal tissue,disease starts in normal tissue,normal tissue is where the disease begins,normal tissue is the source of the disease,disease's origin is found in normal tissue,disease is derived from normal tissue,disease traces back to normal tissue,the origin of the disease is normal tissue
1,disease has normal cell origin,disease originates from normal cells,disease arises from normal cells,disease starts in normal cells,normal cells are where the disease begins,normal cells are the source of the disease,disease's origin is found in normal cells,disease is derived from normal cells,disease traces back to normal cells,the origin of the disease is normal cells
2,disease may have molecular abnormality,disease might involve molecular abnormality,disease could have molecular abnormality,disease may show molecular abnormality,molecular abnormality may be present in disease,disease might present molecular abnormality,molecular abnormality can be seen in disease,disease may exhibit molecular abnormality,disease potentially has molecular abnormality,disease might demonstrate molecular abnormality
3,disease has associated anatomic site,disease is linked to an anatomic site,disease is associated with an anatomic site,disease occurs at an anatomic site,an anatomic site is related to the disease,disease manifests in an anatomic site,anatomic site has an association with the disease,anatomic site is connected to the disease,disease is found in an anatomic site,the anatomic site is where the disease is found
4,gene product has associated anatomy,gene product is linked to an anatomy,gene product is associated with an anatomy,gene product occurs in an anatomy,an anatomy is related to the gene product,anatomy has an association with the gene product,anatomy is connected to the gene product,gene product is found in an anatomy,gene product manifests in an anatomy,the anatomy is where the gene product is found
5,gene product has biochemical function,gene product exhibits biochemical function,gene product has a role in biochemical function,biochemical function involves gene product,gene product performs biochemical function,gene product is responsible for biochemical fu...,biochemical function is carried out by the gen...,gene product engages in biochemical function,gene product shows biochemical function,the biochemical function is demonstrated by th...
6,may prevent,can prevent,might prevent,could prevent,is able to prevent,has the potential to prevent,possibly prevents,may be preventive,has the capacity to prevent,potentially prevents
7,disease may have associated disease,disease might be linked with another disease,disease may be associated with another disease,disease could have a related disease,disease can have an associated disease,another disease may be linked to this disease,disease might exhibit an associated disease,disease may have a related disease,disease could be associated with another disease,disease potentially has an associated disease
8,gene associated with disease,gene is linked to disease,gene is associated with disease,disease is connected to the gene,gene is related to disease,disease has an association with the gene,gene is found in disease,disease is related to the gene,gene is implicated in disease,the disease is associated with the gene
9,disease mapped to gene,disease is linked to a gene,disease corresponds to a gene,gene is associated with the disease,disease is connected to a gene,gene mapping shows the disease,disease has a mapped gene,disease aligns with a gene,disease is traced to a gene,gene is identified for the disease


In [30]:
# Pick the first row whose 'original_relation' is 'occurs after'.
# NOTE: despite its name, filtered_df is a single row (a pandas Series) because of .iloc[0];
# .iloc[0] raises IndexError when no row matches.
filtered_df = paraphrases[paraphrases['original_relation'] == 'occurs after'].iloc[0]
print(filtered_df['paraphrase_1'])

happens after


In [43]:
def _embed_predicted_phrases(tokenizer, model, sentences, first_mask_index, punctuation_ids):
  """Embed the predicted phrase inside every candidate sentence of one example.

  The phrase spans from first_mask_index up to (excluding) the first punctuation token
  found after that index. Its embedding is the L2-normalised mean, over the phrase
  tokens, of the average of the model's last four hidden-state layers.

  Returns:
    (input_ids, embeddings, end_indices): the tokenised sentences as a numpy array,
    one embedding per sentence and the exclusive end index of every phrase.
  """
  sentences_tkz = tokenizer(sentences, return_tensors="tf", padding=True, truncation=True)
  hidden_states = model(**sentences_tkz).hidden_states

  if not isinstance(hidden_states, tuple):
    raise ValueError("Model configuration does not support returning hidden states!")

  # combining last 4 (TRY OTHER) layers for more robust results - experiment with combining embeddings from earlier layers (???), Dimensionality reduction - PCA (???)
  combined_last_four_layers = tf.reduce_mean(tf.stack(hidden_states[-4:]), axis=0)

  input_ids = sentences_tkz["input_ids"].numpy()
  embeddings = []
  end_indices = []

  for p in range(len(input_ids)):
    # end of the phrase: first punctuation token after the first mask position (different for every predicted phrase!)
    punctuation_indices = np.where(np.isin(input_ids[p], punctuation_ids))[0]
    following_punctuation = punctuation_indices[punctuation_indices > first_mask_index]

    if following_punctuation.size > 0:
      end_index = following_punctuation[0]
    else:
      # Fallback: first special token AFTER the phrase start. Taking the very first
      # special token of the sequence would normally select the leading one at
      # position 0, which yields an empty phrase and a NaN embedding.
      special_indices = np.where(np.isin(input_ids[p], tokenizer.all_special_ids))[0]
      following_special = special_indices[special_indices > first_mask_index]
      end_index = following_special[0] if following_special.size > 0 else len(input_ids[p])

    phrase_embedding = combined_last_four_layers[p, first_mask_index:end_index, :]
    embeddings.append(tf.nn.l2_normalize(tf.reduce_mean(phrase_embedding, axis=0), axis=0))
    end_indices.append(end_index)

  return input_ids, embeddings, end_indices


# Minor change compared to compute_cos_sim_accuracy_multitoken:
# Different model_inputs for different paraphrases (tokenization - indices of mask tokens can be different)
def compute_cos_sim_consistency_multitoken(model_checkpoint:str, inputs_par_1, inputs_par_2, predictions_par_1, predictions_par_2, verbose=0, threshold=0.95):
  """Percentage of examples whose predictions for two paraphrases are similar enough.

  For every example, each predicted phrase for paraphrase 1 is compared with each
  predicted phrase for paraphrase 2 via cosine_similarity on their phrase embeddings.
  The example counts as a hit when the highest similarity reaches `threshold`.

  Args:
    model_checkpoint: checkpoint used to load the tokenizer and the encoder model.
    inputs_par_1, inputs_par_2: masked input sentences for the two paraphrases. When
      given as a list of lists, only the first variant of every example is used.
    predictions_par_1, predictions_par_2: predictions forwarded to
      replace_masks_multitoken together with the tokenised inputs.
    verbose: when truthy, prints every pairwise similarity and the best pair.
    threshold: minimal highest similarity for a hit (default 0.95 - TRY OTHER).

  Returns:
    (hits / number of examples) * 100.

  Raises:
    ValueError: if the two input lists differ in length or the model does not return
      hidden states as a tuple.
  """

  # Different true answers lengths adjustment for inputs of both paraphrases (!!!)
  if isinstance(inputs_par_1, list) and all(isinstance(input_par_1, list) for input_par_1 in inputs_par_1):
    inputs_par_1 = [input_par_1[0] for input_par_1 in inputs_par_1]

  if isinstance(inputs_par_2, list) and all(isinstance(input_par_2, list) for input_par_2 in inputs_par_2):
    inputs_par_2 = [input_par_2[0] for input_par_2 in inputs_par_2]

  if len(inputs_par_1) != len(inputs_par_2):
    raise ValueError("Both paraphrases must provide the same number of inputs")

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModel.from_pretrained(model_checkpoint, from_pt=True, output_hidden_states=True)

  # Adjusting inputs for RoBERTa models
  # (fixed: the comprehension previously passed `input` instead of its own loop variable)
  if 'roberta' in model_checkpoint:
    inputs_par_1 = [change_input_format(input_par_1) for input_par_1 in inputs_par_1]
    inputs_par_2 = [change_input_format(input_par_2) for input_par_2 in inputs_par_2]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  # Tokenizing inputs for both paraphrases
  tokenized_inputs_par_1 = tokenizer(inputs_par_1, return_tensors="tf", padding=True, truncation=True) # max_length=128 (by default max_length = tokenizer.model_max_length)
  tokenized_inputs_par_2 = tokenizer(inputs_par_2, return_tensors="tf", padding=True, truncation=True) # max_length=128 (by default max_length = tokenizer.model_max_length)

  # Getting first mask token indices for both paraphrases inputs
  first_mask_token_indices_par_1 = [np.argwhere(ids == tokenizer.mask_token_id)[0, 0] for ids in tokenized_inputs_par_1["input_ids"].numpy()]
  first_mask_token_indices_par_2 = [np.argwhere(ids == tokenizer.mask_token_id)[0, 0] for ids in tokenized_inputs_par_2["input_ids"].numpy()]

  predicted_sentences_par_1 = replace_masks_multitoken(model_checkpoint, tokenized_inputs_par_1, predictions_par_1)
  predicted_sentences_par_2 = replace_masks_multitoken(model_checkpoint, tokenized_inputs_par_2, predictions_par_2)

  # Punctuation that terminates a predicted phrase; ',', ':', ';' and '-' are kept
  # out of this set so they can stay inside a phrase. Loop invariant, computed once.
  punctuation_ids = tokenizer.convert_tokens_to_ids([char for char in string.punctuation if char not in {',', ':', ';', '-'}])

  hits = 0
  n = len(inputs_par_1) # == len(inputs_par_2)
  # for every example in relation dataset
  for i in range(n):
    start_par_1 = first_mask_token_indices_par_1[i]
    start_par_2 = first_mask_token_indices_par_2[i]

    ids_par_1, pred_embeddings_par_1, end_indices_pred_par_1 = _embed_predicted_phrases(tokenizer, model, predicted_sentences_par_1[i], start_par_1, punctuation_ids)
    ids_par_2, pred_embeddings_par_2, end_indices_pred_par_2 = _embed_predicted_phrases(tokenizer, model, predicted_sentences_par_2[i], start_par_2, punctuation_ids)

    if verbose:
      # Decode every phrase once from the TOKENISED sentences (the raw sentence lists
      # cannot be indexed with "input_ids"); paraphrase 2 is indexed with k below.
      pred_phrases_par_1 = [tokenizer.decode(ids_par_1[j][start_par_1: end_indices_pred_par_1[j]]) for j in range(len(pred_embeddings_par_1))]
      pred_phrases_par_2 = [tokenizer.decode(ids_par_2[k][start_par_2: end_indices_pred_par_2[k]]) for k in range(len(pred_embeddings_par_2))]

    highest_similarity = -1
    most_similar_pred_par_1 = None
    most_similar_pred_par_2 = None

    # looking for highest similarity (???)
    for j in range(len(pred_embeddings_par_1)):
      for k in range(len(pred_embeddings_par_2)):

        similarity = cosine_similarity(tf.reshape(pred_embeddings_par_1[j], shape=(1, -1)), tf.reshape(pred_embeddings_par_2[k], shape=(-1, 1)))[0][0]

        if verbose:
          print(f"Cosine similarity between '{pred_phrases_par_1[j]}' and '{pred_phrases_par_2[k]}': {similarity}")

        if similarity > highest_similarity:
          highest_similarity = similarity
          if verbose:
            most_similar_pred_par_1 = pred_phrases_par_1[j]
            most_similar_pred_par_2 = pred_phrases_par_2[k]

    if verbose:
      print(f"\033[1mHighest similarity is between '{most_similar_pred_par_1}' and '{most_similar_pred_par_2}': {highest_similarity}\033[0m")
      print('--------------------------------------------')

    if highest_similarity >= threshold:
      hits += 1

  return (hits/n) * 100

In [44]:
# consistency in predictions for two paraphrases of the same relation
def compute_mlm_top_n_consistency(model_checkpoint:str, accuracy_function:Callable, multitoken_fun:Callable, relation_dataset:str, restricted_candidate_set:bool, par_1, par_2, dataset_frac=1 , top_n=5, random_state=123, verbose=0, **kwargs):
  """Measure how consistent a model's multitoken predictions are across two paraphrases.

  The default prompt inputs of the relation are built once; the relation text
  (relation name with '_' replaced by ' ', lower-cased) is then replaced by each
  paraphrase, predictions are made for both variants and compared.

  Args:
    model_checkpoint: checkpoint identifier forwarded to the helpers.
    accuracy_function: compute_accuracy (predictions of paraphrase 2 take the place of
      the true answers) or compute_cos_sim_consistency_multitoken.
    multitoken_fun: mask-filling strategy; currently only
      fill_masks_by_confidence_order_refinement_v2 is supported.
    relation_dataset: path of a CSV file with columns "head_name", "rel" and "tail_names".
    restricted_candidate_set: when exactly True, a token candidate set is built from the
      true answers and passed to multitoken_fun; otherwise None is passed.
    par_1, par_2: the two paraphrases of the relation text.
    dataset_frac: fraction of the dataset to sample (for quicker testing).
    top_n: forwarded to multitoken_fun.
    random_state: seed used for sampling the dataset.
    verbose: forwarded to the selected comparison function.
    **kwargs: only 'max_iter' is read; when present it is forwarded to multitoken_fun.

  Returns:
    Whatever the selected comparison function returns.

  Raises:
    ValueError: for an unsupported multitoken_fun / accuracy_function, an empty data
      sample, or a relation without a default prompt.
  """

  # Validate both callables before any expensive work is done
  # Add restricted candidate set and different true answers lengths adjustment for other approaches too (like for confidence_order v2) (!!!)
  # allowed_funs = [fill_masks_independently_v2, fill_masks_autoregressively_v2, fill_masks_by_confidence_v2, fill_masks_autoregressively_with_refinement_v2, fill_masks_by_confidence_order_refinement_v2]
  allowed_funs = [fill_masks_by_confidence_order_refinement_v2]
  if multitoken_fun not in allowed_funs:
    raise ValueError("Unsupported multitoken prediction function")

  if accuracy_function not in [compute_accuracy, compute_cos_sim_consistency_multitoken]:
    raise ValueError("Unsupported accuracy functions")

  data = pd.read_csv(relation_dataset, usecols=["head_name", "rel", "tail_names"])
  # For quicker testing due to resource limitations
  data_chunk = data.sample(frac=dataset_frac, random_state=random_state).reset_index(drop=True)
  if data_chunk.empty:
    raise ValueError(f"No rows sampled from '{relation_dataset}' (dataset_frac={dataset_frac})")

  rel_name = data_chunk['rel'][0]
  matching_prompts = prompts.loc[prompts['pid'] == rel_name, 'default_prompt'].tolist()
  if not matching_prompts:
    raise ValueError(f"No default prompt found for relation '{rel_name}'")
  default_prompt = matching_prompts[0]

  model_inputs_original = prepare_inputs_multi_token_v2(model_checkpoint, data_chunk, default_prompt)

  # Swap the relation text inside every input sentence for the respective paraphrase
  relation_text = rel_name.replace("_", " ").lower().strip()
  par_1_text = par_1.lower().strip()
  par_2_text = par_2.lower().strip()
  model_inputs_par_1 = [[sentence.replace(relation_text, par_1_text) for sentence in inputs] for inputs in model_inputs_original]
  model_inputs_par_2 = [[sentence.replace(relation_text, par_2_text) for sentence in inputs] for inputs in model_inputs_original]

  # True answers are only needed to build the restricted candidate set
  true_objects = [tail_names.split(' || ') for tail_names in data_chunk['tail_names'].tolist()]

  candidate_set_tokens = None
  if restricted_candidate_set is True:
    candidate_set_tokens = make_token_candidate_set(model_checkpoint, true_objects)

  # max_iter is only forwarded when given (approaches with refinement)
  refinement_kwargs = {'max_iter': kwargs['max_iter']} if 'max_iter' in kwargs else {}
  _ , predictions_par_1 = multitoken_fun(model_checkpoint, model_inputs_par_1, candidate_set_tokens, top_n, verbose=0, **refinement_kwargs)
  _ , predictions_par_2 = multitoken_fun(model_checkpoint, model_inputs_par_2, candidate_set_tokens, top_n, verbose=0, **refinement_kwargs)

  if accuracy_function == compute_accuracy:
    # same approach like for accuracy, comparing two sets of predictions instead of predictions and true answers (???)
    return compute_accuracy(predictions_par_1, predictions_par_2, verbose=verbose)

  # Fixed: this branch used to compare against compute_cos_sim_accuracy_multitoken, so a
  # validated compute_cos_sim_consistency_multitoken fell through and None was returned.
  return compute_cos_sim_consistency_multitoken(model_checkpoint, model_inputs_par_1, model_inputs_par_2, predictions_par_1, predictions_par_2, verbose=verbose)

In [42]:
# Exact-match consistency between two paraphrases, on a 10% sample of the relation dataset
relation = 'occurs_after'
paraphrase_1 = filtered_df['original_relation']
paraphrase_2 = filtered_df['paraphrase_1']
model = biomedbert_models['BioMedBERT_base_full']

print(f"Relation: \033[1m{relation}\033[0m")
print(f"Paraphrases: {paraphrase_1} and {paraphrase_2}")

con_exact = compute_mlm_top_n_consistency(
    model_checkpoint=model,
    accuracy_function=compute_accuracy,
    multitoken_fun=fill_masks_by_confidence_order_refinement_v2,
    relation_dataset=f'{relation}_1000.csv',
    restricted_candidate_set=True,
    par_1=paraphrase_1,
    par_2=paraphrase_2,
    dataset_frac=0.1,
    verbose=1,
    max_iter=10,
)
print(f"{model} :{con_exact: .2f} %")

Relation: [1moccurs_after[0m
Paraphrases: occurs after and happens after


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Predictions: ['termination of pregnancy', 'total colectomy']
True answers: ['termination of pregnancy', 'total colectomy']
[1mTrue prediction![0m
Predictions: ['heart heart device implantation', 'brain injury']
True answers: ['termination of ventricular block', 'device implantation']
Predictions: ['head trauma']
True answers: ['syphilis infection']
Predictions: ['trauma and acute infection']
True answers: ['trauma and acute infection']
[1mTrue prediction![0m
Predictions: ['intrauterine injection']
True answers: ['intrauterine injection']
[1mTrue prediction![0m
Predictions: ['testicular transplantation']
True answers: ['contraceptive injection']
Predictions: ['radiotherapy', 'arterioplasty and colectomy']
True answers: ['radiotherapy', 'stomoplasty and colectomy']
[1mTrue prediction![0m
Predictions: ['aort - ductus arteriosus shunt']
True answers: ['ductatoctomycty procedure']
Predictions: ['acute poliomyelitis']
True answers: ['acute poliomyelitis']
[1mTrue prediction![0m
Pre

In [None]:
# Cosine-similarity consistency on a 2% sample of the relation dataset
# (reuses relation, paraphrase_1 and paraphrase_2 from the previous cell)
model = biomedbert_models['BioMedBERT_base_full']
con_exact = compute_mlm_top_n_consistency(
    model_checkpoint=model,
    accuracy_function=compute_cos_sim_consistency_multitoken,
    multitoken_fun=fill_masks_by_confidence_order_refinement_v2,
    relation_dataset=f'{relation}_1000.csv',
    restricted_candidate_set=True,
    par_1=paraphrase_1,
    par_2=paraphrase_2,
    dataset_frac=0.02,
    verbose=1,
    max_iter=10,
)
print(f"{model} :{con_exact: .2f} %")