<a href="https://colab.research.google.com/github/PavleSavic/MLM_consistency/blob/main/consistency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import random
import string
import logging
import pandas as pd
import numpy as np
import tensorflow as tf
#!pip install transformers datasets evaluate
from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFAutoModel

In [3]:
random.seed(123)
tf.keras.mixed_precision.set_global_policy('mixed_float16')
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

## Relations used in analysis

In [4]:
relations = []
with open("final_19_relations.txt") as f:
    lines = f.readlines()
    for l in lines:
        relations.append(l.strip())
print(len(relations))
relations.sort()
print(relations)

19
['associated_morphology_of', 'disease_has_abnormal_cell', 'disease_has_associated_anatomic_site', 'disease_has_normal_cell_origin', 'disease_has_normal_tissue_origin', 'disease_mapped_to_gene', 'disease_may_have_associated_disease', 'disease_may_have_finding', 'disease_may_have_molecular_abnormality', 'gene_associated_with_disease', 'gene_encodes_gene_product', 'gene_product_encoded_by_gene', 'gene_product_has_associated_anatomy', 'gene_product_has_biochemical_function', 'gene_product_plays_role_in_biological_process', 'has_physiologic_effect', 'may_prevent', 'may_treat', 'occurs_after']


## Prompts

In [5]:
prompts = pd.read_csv('prompts.csv')

In [6]:
prompts

Unnamed: 0,pid,default_prompt,human_prompt
0,associated_morphology_of,[X] associated morphology of [Y] .,[X] is associated morphology of [Y] .
1,disease_has_abnormal_cell,[X] disease has abnormal cell [Y] .,[X] has the abnormal cell [Y] .
2,disease_has_associated_anatomic_site,[X] disease has associated anatomic site [Y] .,The disease [X] can stem from the associated a...
3,disease_has_normal_cell_origin,[X] disease has normal cell origin [Y] .,The disease [X] stems from the normal cell [Y] .
4,disease_has_normal_tissue_origin,[X] disease has normal tissue origin [Y] .,The disease [X] stems from the normal tissue [...
5,disease_mapped_to_gene,[X] disease mapped to gene [Y] .,The disease [X] is mapped to gene [Y] .
6,disease_may_have_associated_disease,[X] disease may have associated disease [Y] .,The disease [X] might have the associated dise...
7,disease_may_have_finding,[X] disease may have finding [Y] .,[X] may have [Y] .
8,disease_may_have_molecular_abnormality,[X] disease may have molecular abnormality [Y] .,The disease [X] may have molecular abnormality...
9,gene_associated_with_disease,[X] gene associated with disease [Y] .,The gene [X] is associatied with disease [Y] .


## Masked Language Models

In [7]:
# uncased
bert_models = {'BERT_base' : "google-bert/bert-base-uncased", 'BERT_large': "google-bert/bert-large-uncased",
                'BERT_large_wwm': "google-bert/bert-large-uncased-whole-word-masking"}
# cased
roberta_models = {'RoBERTa_base': "FacebookAI/roberta-base", 'RoBERTa_large': "FacebookAI/roberta-large"}
# uncased
albert_models = {'ALBERT_base': "albert/albert-base-v2", 'ALBERT_xxlarge': "albert/albert-xxlarge-v2"}
# cased
biobert_models = {'BioBERT': "dmis-lab/biobert-base-cased-v1.2"}
# uncased
biomedbert_models = {'BioMedBERT_base_abstract' : "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
                     'BioMedBERT_base_full': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
                     'BioMedBERT_large_abstract': "microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract"}

## Example

In [8]:
# List of input texts with masked tokens
texts = ["She is from the city of [MASK].", "This is a great [MASK].", "He is an excellent [MASK]."]

In [9]:
def change_input_format(input):
  new_input = input.replace('[MASK]','<mask>')
  return new_input

In [10]:
def analyze_tokenizer(model_checkpoint:str, inputs):

  if 'roberta' in model_checkpoint:
    inputs = [change_input_format(input) for input in inputs]

  tokz = AutoTokenizer.from_pretrained(model_checkpoint)

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokz.model_max_length = 512

  tokenization = tokz(inputs, return_tensors='tf', padding=True)  # truncation=True, max_length=tokz.model_max_length
  print(f"Tokenization example: {tokenization['input_ids']}")
  for l in tokenization['input_ids']:
    print(f"Decoded tokens: {tokz.decode(l)}")
  print(f"End of sequence token: {tokz.eos_token}")
  print(f"Mask token id: {tokz.mask_token_id}")
  print(f"All special tokens ids: {tokz.all_special_ids}")
  print(f"All special tokens: {tokz.decode(tokz.all_special_ids)}")
  print(f"Maximum model input length: {tokz.model_max_length}")

In [None]:
# One model for each group
print('BERT_base')
analyze_tokenizer(bert_models['BERT_base'], texts)
print('------------------------------------------------------------------------------')
print('BERT_large_wwm')
analyze_tokenizer(bert_models['BERT_large_wwm'], texts)
print('------------------------------------------------------------------------------')
print('RoBERTa_base')
analyze_tokenizer(roberta_models['RoBERTa_base'], texts)
print('------------------------------------------------------------------------------')
print('ALBERT_base')
analyze_tokenizer(albert_models['ALBERT_base'], texts)
print('------------------------------------------------------------------------------')
print('BioBERT')
analyze_tokenizer(biobert_models['BioBERT'], texts)
print('------------------------------------------------------------------------------')
print('BioMedBERT_base_full')
analyze_tokenizer(biomedbert_models['BioMedBERT_base_full'], texts)
print('------------------------------------------------------------------------------')

BERT_base
Tokenization example: [[ 101 2016 2003 2013 1996 2103 1997  103 1012  102]
 [ 101 2023 2003 1037 2307  103 1012  102    0    0]
 [ 101 2002 2003 2019 6581  103 1012  102    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellent [MASK]. [SEP] [PAD] [PAD]
End of sequence token: None
Mask token id: 103
All special tokens ids: [100, 102, 0, 101, 103]
All special tokens: [UNK] [SEP] [PAD] [CLS] [MASK]
Maximum model input length: 512
------------------------------------------------------------------------------
BERT_large_wwm
Tokenization example: [[ 101 2016 2003 2013 1996 2103 1997  103 1012  102]
 [ 101 2023 2003 1037 2307  103 1012  102    0    0]
 [ 101 2002 2003 2019 6581  103 1012  102    0    0]]
Decoded tokens: [CLS] she is from the city of [MASK]. [SEP]
Decoded tokens: [CLS] this is a great [MASK]. [SEP] [PAD] [PAD]
Decoded tokens: [CLS] he is an excellen

In [11]:
def get_model_predictions(model_checkpoint:str, inputs:list[str], top_n=5, verbose=0):
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

  # Adjusting inputs for RoBERTa models
  if 'roberta' in model_checkpoint:
    inputs = [change_input_format(input) for input in inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  if verbose:
    print(f'Choosen model: {model_checkpoint}')
    model.summary()

  # Tokenizing the inputs
  tokenized_inputs = tokenizer(inputs, return_tensors="tf", padding=True, truncation=True) # max_length=128 (by default max_length = tokenizer.model_max_length)

  # Getting the token logits from the model
  token_logits = model(**tokenized_inputs).logits

  outputs = []

  for i, input in enumerate(inputs):
                                                                                   # not necessary (all tokenizers have mask_token_id defined)
    mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids(["[MASK]"])[0]

    mask_token_index = np.argwhere(tokenized_inputs["input_ids"].numpy()[i] == mask_token_id)[0, 0]

    mask_token_logits = token_logits[i, mask_token_index, :]

    top_tokens = np.argsort(-mask_token_logits.numpy()).tolist()

    predictions = []
    if verbose:
      print(f"Input: {input}")

    for token_id in top_tokens:
        # Skip special tokens
        if token_id in tokenizer.all_special_ids:
          continue

        predicted_token = tokenizer.decode([token_id])

        # Skip punctutation tokens
        if predicted_token in string.punctuation:
          continue

        predictions.append(predicted_token)
        if verbose:
          print(f">>> {input.replace(tokenizer.mask_token, predicted_token)}")

        if len(predictions) == top_n:
          break
    if verbose:
      print()

    outputs.append(predictions)

  return np.array(outputs)

In [None]:
pred = get_model_predictions(model_checkpoint=roberta_models['RoBERTa_base'], inputs=texts, top_n=10)
pred

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

array([[' Chicago', ' London', ' Toronto', ' Seattle', ' Boston',
        ' Portland', ' Philadelphia', ' Vancouver', ' Minneapolis',
        ' Houston'],
       [' idea', ' example', ' article', ' video', ' post', ' story',
        ' read', ' question', ' book', ' game'],
       [' player', ' writer', ' athlete', ' student', ' shooter',
        ' coach', ' hitter', ' defender', ' guy', ' broadcaster']],
      dtype='<U13')

# Relations

In [12]:
occurs_data = pd.read_csv('occurs_after_1000.csv', usecols=["head_name", "rel", "tail_names"])
occurs_data.head(10)

Unnamed: 0,head_name,rel,tail_names
0,Post influenza vaccination encephalitis,occurs_after,Administration of influenza vaccine
1,Basal cell carcinoma recurrent following cryos...,occurs_after,Cryosurgery
2,Adverse effect from PUVA photochemotherapy,occurs_after,Light therapy || Photochemotherapy with psoral...
3,Allergy to pea,occurs_after,Allergic sensitization
4,Bite of unidentified snake with neurological s...,occurs_after,Animal bite
5,Allergy to hypothalamic hormone,occurs_after,Allergic sensitization
6,Late effect of accidental injury,occurs_after,Traumatic injury
7,Radiotherapy scar,occurs_after,Procedure || Radiation oncology AND/OR radioth...
8,Atonic postpartum hemorrhage,occurs_after,Delivery procedure
9,Late effect of skin and subcutaneous tissue in...,occurs_after,Injury || Traumatic injury


In [13]:
# Prompts to use
rel_name = occurs_data['rel'][0]
default_prompt = prompts.loc[prompts['pid'] == rel_name]['default_prompt'].tolist()[0]
human_prompt  = prompts.loc[prompts['pid'] == rel_name]['human_prompt'].tolist()[0]
print(f"Default prompt: {default_prompt}\nHuman prompt: {human_prompt}")

Default prompt: [X] occurs after [Y] .
Human prompt: [X] occurs after [Y] .


In [14]:
# Preparing inputs
def prepare_inputs(data, prompt:str):
  # number of rows
  n = len(data)

  inputs = [prompt for _ in range(n)]
  heads = data['head_name'].tolist()

  inputs = [input.replace('[X]', head) for input, head in zip(inputs, heads)]
  inputs = [input.replace('[Y] ', '[MASK]') for input in inputs]
  return inputs

In [42]:
inputs = prepare_inputs(occurs_data, default_prompt)
print(inputs[:10])

['Post influenza vaccination encephalitis occurs after [MASK].', 'Basal cell carcinoma recurrent following cryosurgery occurs after [MASK].', 'Adverse effect from PUVA photochemotherapy occurs after [MASK].', 'Allergy to pea occurs after [MASK].', 'Bite of unidentified snake with neurological signs occurs after [MASK].', 'Allergy to hypothalamic hormone occurs after [MASK].', 'Late effect of accidental injury occurs after [MASK].', 'Radiotherapy scar occurs after [MASK].', 'Atonic postpartum hemorrhage occurs after [MASK].', 'Late effect of skin and subcutaneous tissue injury occurs after [MASK].']


In [None]:
# Getting predictions
pred_1 = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_abstract'], inputs=inputs, top_n=1)
pred_1

array([['vaccination'],
       ['years'],
       ['transplantation'],
       ['birth'],
       ['birth'],
       ['surgery'],
       ['birth'],
       ['radiotherapy'],
       ['delivery'],
       ['surgery'],
       ['trauma'],
       ['stroke'],
       ['surgery'],
       ['trauma'],
       ['injection'],
       ['surgery'],
       ['surgery'],
       ['transplantation'],
       ['surgery'],
       ['trauma'],
       ['surgery'],
       ['transplantation'],
       ['surgery'],
       ['immunization'],
       ['chemotherapy'],
       ['cholecystectomy'],
       ['pregnancy'],
       ['esophagectomy'],
       ['varicella'],
       ['treatment'],
       ['surgery'],
       ['surgery'],
       ['cesarean'],
       ['laminectomy'],
       ['surgery'],
       ['trauma'],
       ['colonoscopy'],
       ['catheterization'],
       ['surgery'],
       ['surgery'],
       ['surgery'],
       ['ingestion'],
       ['surgery'],
       ['injection'],
       ['delivery'],
       ['surgery'],
     

In [16]:
tails = occurs_data['tail_names'].tolist()
tails = list(map(lambda x: x.split(' || '), tails))
tails

[['Administration of influenza vaccine'],
 ['Cryosurgery'],
 ['Light therapy', 'Photochemotherapy with psoralens and ultraviolet A'],
 ['Allergic sensitization'],
 ['Animal bite'],
 ['Allergic sensitization'],
 ['Traumatic injury'],
 ['Procedure', 'Radiation oncology AND/OR radiotherapy'],
 ['Delivery procedure'],
 ['Injury', 'Traumatic injury'],
 ['Fat necrosis'],
 ['Spontaneous cerebral hemorrhage'],
 ['Transplantation',
  'Implantation of prosthetic device',
  'Surgical construction of arteriovenous shunt'],
 ['Injury of knee', 'Traumatic event'],
 ['Procedure', 'Injection'],
 ['Colostomy', 'Procedure'],
 ['Procedure'],
 ['Corneal transplant'],
 ['Allergic sensitization'],
 ['Traumatic injury', 'Traumatic event'],
 ['Extraction of cataract', 'Implantation of phakic intraocular lens implant'],
 ['Transplantation of bone marrow', 'Grafting procedure'],
 ['Allergic sensitization'],
 ['Active or passive immunization'],
 ['Allergic sensitization'],
 ['Implantation of prosthetic device', 

In [17]:
# Computing top_n accuracy - percentage of examples where one of the top n answers is in the set of correct tails (ignore case)
def compute_accuracy(predictions, tails):
  hits = 0
  n = len(predictions)

  for i in range(n):
    preds = [prediction.strip().lower() for prediction in predictions[i]]
    tls = [tail.strip().lower() for tail in tails[i]]
    if set(preds).intersection(tls):
      hits += 1

  return (hits/n)*100

In [None]:
top_1_acc = compute_accuracy(pred_1, tails)
print(f'Top 1 accuracy: {top_1_acc:.2f} %')

Top 1 accuracy: 4.00 %


In [18]:
pred_10 = get_model_predictions(model_checkpoint=biomedbert_models['BioMedBERT_base_full'], inputs=inputs, top_n=10)
pred_10

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

array([['vaccination', 'birth', 'immunization', ..., 'delivery',
        'influenza', 'seroconversion'],
       ['radiotherapy', 'years', 'surgery', ..., 'treatment', 'excision',
        'irradiation'],
       ['surgery', 'radiotherapy', 'treatment', ..., 'birth',
        'radiation', 'therapy'],
       ...,
       ['menopause', 'childbirth', 'birth', ..., 'thyroidectomy',
        'puberty', 'splenectomy'],
       ['trauma', 'surgery', 'stroke', ..., 'earthquake', 'operation',
        'accidents'],
       ['surgery', 'transplantation', 'exposure', ..., 'grafting',
        'injection', 'dialysis']], dtype='<U17')

In [19]:
top_10_acc = compute_accuracy(pred_10, tails)
print(f'Top 10 accuracy: {top_10_acc:.2f} %')

Top 10 accuracy: 12.10 %


In [20]:
def cosine_similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)

    dot_product = np.dot(v1, v2)

    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)

    cosine_sim = dot_product / (norm_v1 * norm_v2)

    return cosine_sim

print(f'Cosine Similarity: {cosine_similarity([1,2,3], [4,5,6])}')

Cosine Similarity: 0.9746318461970762


In [40]:
model_name = bert_models['BERT_base']
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name, from_pt=True, output_hidden_states=True)

input_texts = [
    "The cat sat on the mat.",
    "The dog sat on the mat.",
    "The kitten sat on the mat.",
    "The mailman sat on the mat."
    ]

inputs = tokenizer(input_texts, return_tensors="tf", padding=True, truncation=True)

print(f"Tokens: {inputs['input_ids']}")
for i, input_ids in enumerate(inputs['input_ids']):
    print(f"Decoded tokens for sentence {i}: {tokenizer.decode(input_ids)}")

outputs = model(**inputs)

hidden_states = outputs.hidden_states

if not isinstance(hidden_states, tuple):
    raise ValueError("Model configuration does not support returning hidden states!")

# combining last 4 (TRY OTHER) layers for more robust results
last_four_layers = [hidden_states[i] for i in range(-4, 0)]
combined_layers = tf.reduce_mean(tf.stack(last_four_layers), axis=0)

# normalizing the combined embeddings
normalized_embeddings = tf.nn.l2_normalize(combined_layers, axis=2)

# position of mask token in original sentence
third_token_embeddings = normalized_embeddings[:, 2, :]

print(third_token_embeddings)

# compare word embeddings ()
print(f"Cosine similarity (\"cat\", \"dog\"): {cosine_similarity(tf.reshape(third_token_embeddings[0], shape=(1, -1)), tf.reshape(third_token_embeddings[1], shape=(-1, 1)))[0][0]}")
print(f"Cosine similarity (\"cat\", \"kitten\"): {cosine_similarity(tf.reshape(third_token_embeddings[0], shape=(1, -1)), tf.reshape(third_token_embeddings[2], shape=(-1, 1)))[0][0]}")
print(f"Cosine similarity (\"dog\", \"kitten\"): {cosine_similarity(tf.reshape(third_token_embeddings[1], shape=(1, -1)), tf.reshape(third_token_embeddings[2], shape=(-1, 1)))[0][0]}")
print(f"Cosine similarity (\"cat\", \"mailman\"): {cosine_similarity(tf.reshape(third_token_embeddings[0], shape=(1, -1)), tf.reshape(third_token_embeddings[3], shape=(-1, 1)))[0][0]}")
print(f"Cosine similarity (\"dog\", \"mailman\"): {cosine_similarity(tf.reshape(third_token_embeddings[1], shape=(1, -1)), tf.reshape(third_token_embeddings[3], shape=(-1, 1)))[0][0]}")
print(f"Cosine similarity (\"kitten\", \"mailman\"): {cosine_similarity(tf.reshape(third_token_embeddings[2], shape=(1, -1)), tf.reshape(third_token_embeddings[3], shape=(-1, 1)))[0][0]}")

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokens: [[  101  1996  4937  2938  2006  1996 13523  1012   102     0]
 [  101  1996  3899  2938  2006  1996 13523  1012   102     0]
 [  101  1996 18401  2938  2006  1996 13523  1012   102     0]
 [  101  1996  5653  2386  2938  2006  1996 13523  1012   102]]
Decoded tokens for sentence 0: [CLS] the cat sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 1: [CLS] the dog sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 2: [CLS] the kitten sat on the mat. [SEP] [PAD]
Decoded tokens for sentence 3: [CLS] the mailman sat on the mat. [SEP]
tf.Tensor(
[[-0.01194338 -0.0051559   0.0104109  ... -0.02537592  0.03168032
   0.04000831]
 [ 0.02839966  0.02874123  0.00292385 ... -0.04067679  0.02913046
   0.03197852]
 [-0.04018649 -0.03780384 -0.012027   ... -0.03123694  0.027043
  -0.03035366]
 [ 0.05976407 -0.01487961  0.02051217 ...  0.01845756  0.0075776
  -0.05614597]], shape=(4, 768), dtype=float32)
Cosine similarity ("cat", "dog"): 0.8437002897262573
Cosine similarity ("cat", "k

In [24]:
def replace_masks(masked_inputs, tails):

  n = len(masked_inputs)
  full_sentences = []

  for i in range(n):
   example_sentences = [masked_inputs[i].replace('[MASK]', tail) for tail in tails[i]]
   full_sentences.append(example_sentences)

  return full_sentences

print(replace_masks(inputs[:10], tails[:10]))
print(replace_masks(inputs[:10], pred_10[:10]))

[['Post influenza vaccination encephalitis occurs after Administration of influenza vaccine.'], ['Basal cell carcinoma recurrent following cryosurgery occurs after Cryosurgery.'], ['Adverse effect from PUVA photochemotherapy occurs after Light therapy.', 'Adverse effect from PUVA photochemotherapy occurs after Photochemotherapy with psoralens and ultraviolet A.'], ['Allergy to pea occurs after Allergic sensitization.'], ['Bite of unidentified snake with neurological signs occurs after Animal bite.'], ['Allergy to hypothalamic hormone occurs after Allergic sensitization.'], ['Late effect of accidental injury occurs after Traumatic injury.'], ['Radiotherapy scar occurs after Procedure.', 'Radiotherapy scar occurs after Radiation oncology AND/OR radiotherapy.'], ['Atonic postpartum hemorrhage occurs after Delivery procedure.'], ['Late effect of skin and subcutaneous tissue injury occurs after Injury.', 'Late effect of skin and subcutaneous tissue injury occurs after Traumatic injury.']]
[

In [47]:
# 1. obtain embeddings of the expected token and the predicted token from the word embedding layer of a model
# 2. compute cosine similarity metric (TO DO: Multi Token scenario: Averaging cosine similarities, BLEU score (???))

# single token scenario ( a) only first token of true answer taken into consideration, b) token with highest cosine similarity with predicted token)
def word_embedding_similarity(model_checkpoint:str, original_inputs, predictions, tails):
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = TFAutoModel.from_pretrained(model_checkpoint, from_pt=True, output_hidden_states=True)

  # Adjusting inputs for RoBERTa models
  if 'roberta' in model_checkpoint:
    original_inputs = [change_input_format(input) for input in original_inputs]

  # model_max_length field not set by default for BioBERT and BioMedBERT models
  if 'bio' in model_checkpoint.lower():
    tokenizer.model_max_length = 512

  # Tokenizing original inputs
  tokenized_inputs = tokenizer(original_inputs, return_tensors="tf", padding=True, truncation=True) # max_length=128 (by default max_length = tokenizer.model_max_length)
  #print(tokenized_inputs['input_ids'])

  # Getting mask_token indexes for every example
  mask_token_indexes = []

  for i, input in enumerate(tokenized_inputs["input_ids"]):
                                                                                   # not necessary (all tokenizers have mask_token_id defined)
    mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids(["[MASK]"])[0]

    mask_token_indexes.append(np.argwhere(tokenized_inputs["input_ids"].numpy()[i] == mask_token_id)[0, 0])

  #print(mask_token_indexes)

  predicted_sentences = replace_masks(original_inputs, predictions)
  true_sentences = replace_masks(original_inputs, tails)

  # for every example in relation
  for i in range(len(original_inputs)):

    predicted_sentences_tkz = tokenizer(predicted_sentences[i], return_tensors="tf", padding=True, truncation=True)

    hidden_states = model(**predicted_sentences_tkz).hidden_states

    if not isinstance(hidden_states, tuple):
      raise ValueError("Model configuration does not support returning hidden states!")

    # combining last 4 (TRY OTHER) layers for more robust results
    combined_last_four_layers = tf.reduce_mean(tf.stack([hidden_states[i] for i in range(-4, 0)]), axis=0)
    # normalizing the combined embeddings
    pred_embeddings = tf.nn.l2_normalize(combined_last_four_layers, axis=2) [:, mask_token_indexes[i], :]

    true_sentences_tkz = tokenizer(true_sentences[i], return_tensors="tf", padding=True, truncation=True)
    hidden_states = model(**true_sentences_tkz).hidden_states
    # combining last 4 (TRY OTHER) layers for more robust results
    combined_last_four_layers = tf.reduce_mean(tf.stack([hidden_states[i] for i in range(-4, 0)]), axis=0)
    # normalizing the combined embeddings
    true_embeddings = tf.nn.l2_normalize(combined_last_four_layers, axis=2) [:, mask_token_indexes[i], :]

    print(f"Cosine similarity: {cosine_similarity(tf.reshape(pred_embeddings[0], shape=(1, -1)), tf.reshape(true_embeddings[0], shape=(-1, 1)))[0][0]}")

word_embedding_similarity(biomedbert_models['BioMedBERT_base_full'], inputs[:10], pred_10[:10], tails[:10])

Cosine similarity: 0.8990528583526611
Cosine similarity: 0.8991973996162415
Cosine similarity: 0.8882724046707153
Cosine similarity: 0.850302517414093
Cosine similarity: 0.8405792117118835
Cosine similarity: 0.852641761302948
Cosine similarity: 0.8753180503845215
Cosine similarity: 0.8966502547264099
Cosine similarity: 0.9692301154136658
Cosine similarity: 0.9248853325843811


# MLM acccuracy measuring function

In [None]:
def compute_mml_top_n_accuracy(model_checkpoint:str, relation_dataset:str, dataset_frac=1 , top_n=5, random_state=123, verbose=0):

  data = pd.read_csv(relation_dataset, usecols=["head_name", "rel", "tail_names"])
  # For quicker testing due to resource limitations
  data_chunk = data.sample(frac=dataset_frac, random_state=random_state).reset_index(drop=True)

  rel_name = data_chunk['rel'][0]
  default_prompt = prompts.loc[prompts['pid'] == rel_name]['default_prompt'].tolist()[0]

  inputs = prepare_inputs(data_chunk, default_prompt)

  predicted_objects = get_model_predictions(model_checkpoint=model_checkpoint, inputs=inputs, top_n=top_n, verbose=verbose)

  true_objects = data_chunk['tail_names'].tolist()
  true_objects = list(map(lambda x: x.split(' || '), true_objects))

  return compute_accuracy(predicted_objects, true_objects)

In [None]:
# Comparing BERT models top 10 acc on a chunk of data (10%)
print('Top 5 accuracy')

relation = 'occurs_after'
print(f"Relation: \033[1m{relation}\033[0m")

for k, v in bert_models.items():
  acc = compute_mml_top_n_accuracy(v, f'{relation}_1000.csv', top_n=5)
  print(f"{k}: {acc:.2f}\n")

Top 5 accuracy
Relation: [1moccurs_after[0m
BERT_base: 5.40

BERT_large: 5.40

BERT_large_wwm: 6.40



In [None]:
for k, v in roberta_models.items():
  acc = compute_mml_top_n_accuracy(v, f'{relation}_1000.csv', dataset_frac=0.1, top_n=5)
  print(f"{k}: {acc:.2f}\n")

RoBERTa_base: 2.00

RoBERTa_large: 3.00



In [None]:
for k, v in albert_models.items():
  acc = compute_mml_top_n_accuracy(v, f'{relation}_1000.csv', dataset_frac=0.1, top_n=5)
  print(f"{k}: {acc:.2f}\n")

ALBERT_base: 8.00



pytorch_model.bin:  74%|#######3  | 661M/893M [00:00<?, ?B/s]

ALBERT_xxlarge: 8.00



In [None]:
for k, v in biobert_models.items():
  acc = compute_mml_top_n_accuracy(v, f'{relation}_1000.csv', dataset_frac=0.1, top_n=5)
  print(f"{k}: {acc:.2f}\n")



pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BioBERT: 2.00



In [None]:
for k, v in biomedbert_models.items():
  acc = compute_mml_top_n_accuracy(v, f'{relation}_1000.csv', dataset_frac=0.1, top_n=5)
  print(f"{k}: {acc:.2f}\n")

BioMedBERT_base_abstract: 8.00

BioMedBERT_base_full: 8.00



tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BioMedBERT_large_abstract: 10.00



# Multi-token issue

In [None]:
# Conditional MLM
def fill_masks_independently(model_checkpoint: str, input_query: str, mask_token="[MASK]", top_n=5):

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_input = tokenizer(input_query, return_tensors="tf", padding=True, truncation=True)

    mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids([mask_token])[0]
    mask_indices = tf.where(tf.equal(tokenized_input["input_ids"], mask_token_id))

    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
    token_logits = model(**tokenized_input).logits

    predictions = []
    for mask_index in mask_indices:
        mask_position = mask_index[1].numpy()
        mask_logits = token_logits[0, mask_position, :]
        top_token_ids = tf.argsort(-mask_logits)[:top_n].numpy()
        top_tokens = tokenizer.decode(top_token_ids)
        predictions.append(top_tokens)

    return predictions

In [None]:
def fill_masks_autoregressively(model_checkpoint: str, input_query: str, mask_token="[MASK]", top_n=5):

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_input = tokenizer(input_query, return_tensors="tf", padding=True, truncation=True)

    mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids([mask_token])[0]
    mask_indices = tf.where(tf.equal(tokenized_input["input_ids"], mask_token_id))

    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
    #token_logits = model(**tokenized_input).logits

    predictions = []
    for mask_index in mask_indices:
        mask_position = mask_index[1].numpy()
        context = tokenized_input["input_ids"][:, :mask_position]

        for _ in range(top_n):
            mask_logits = model(input_ids=context).logits[0, -1, :]
            predicted_token_id = tf.argmax(mask_logits, axis=-1).numpy()
            predicted_token = tokenizer.decode(predicted_token_id)

            context = tf.concat([context, [[predicted_token_id]]], axis=-1)

            if tokenizer.decode(predicted_token_id) in tokenizer.all_special_tokens or predicted_token in ['.', '!', '?']:
              break

        predictions.append(predicted_token)

    return predictions

In [None]:
# TO DO
def fill_masks_by_confidence(model_checkpoint: str, input_query: str, mask_token="[MASK]", top_n=5):

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_input = tokenizer(input_query, return_tensors="tf", padding=True, truncation=True)

    mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids([mask_token])[0]
    mask_indices = tf.where(tf.equal(tokenized_input["input_ids"], mask_token_id))

    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
    token_logits = model(**tokenized_input).logits

    predictions = []
    for mask_index in mask_indices:
        mask_position = mask_index[1].numpy()
        mask_logits = token_logits[0, mask_position, :]
        top_token_ids = tf.argsort(-mask_logits)[:top_n].numpy()
        top_tokens = tokenizer.decode(top_token_ids)
        predictions.append(top_tokens)

    return predictions

In [None]:
fill_masks_independently(bert_models['BERT_base'], "Paris is [MASK][MASK] to visit.")

['also not always a definitely', 'fun easy pleasant welcome hard']

In [None]:
fill_masks_autoregressively(bert_models['BERT_base'], "Paris is [MASK][MASK] to visit.")

In [None]:
fill_masks_by_confidence(bert_models['BERT_base'], "Paris is [MASK][MASK] to visit.")

# Restricted candidate set