<a href="https://colab.research.google.com/github/PavleSavic/MLM_consistency/blob/main/consistency_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook presents the final versions of the functions required for testing model accuracy and consistency, and applies these measures to some of the relations from the dataset. The final part of the notebook presents a method for fine-tuning the model with the aim of increasing consistency.

In [1]:
import random
import string
import logging
import heapq
from typing import Callable
from collections import OrderedDict
import pandas as pd
import numpy as np
import tensorflow as tf
#!pip install transformers datasets evaluate
from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFAutoModel

In [2]:
# Only report ERROR-level (and above) messages from the Hugging Face `transformers` logger.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

# Seed Python's `random` module.
random.seed(123)

# Global Keras dtype policy. The warning printed below this cell notes that
# 'mixed_float16' may run slowly on machines without a suitable GPU.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


# Analyzed models

In [None]:
# Display name -> Hugging Face checkpoint, grouped by model family.

# uncased
bert_models = {
    'BERT_base': "google-bert/bert-base-uncased",
    'BERT_large': "google-bert/bert-large-uncased",
    'BERT_large_wwm': "google-bert/bert-large-uncased-whole-word-masking",
}

# cased
roberta_models = {
    'RoBERTa_base': "FacebookAI/roberta-base",
    'RoBERTa_large': "FacebookAI/roberta-large",
}

# uncased
albert_models = {
    'ALBERT_base': "albert/albert-base-v2",
    'ALBERT_xxlarge': "albert/albert-xxlarge-v2",
}

# cased
biobert_models = {
    'BioBERT': "dmis-lab/biobert-base-cased-v1.2",
}

# uncased
biomedbert_models = {
    'BioMedBERT_base_abstract': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
    'BioMedBERT_base_full': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    'BioMedBERT_large_abstract': "microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract",
}

# Proposed functions for making multi-token predictions - Conditional MLM

In [3]:
def change_input_format(input):
  """Return `input` with every '[MASK]' placeholder rewritten as '<mask>'.

  Used by the mask-filling functions to adapt prompts for RoBERTa checkpoints.
  """
  return '<mask>'.join(input.split('[MASK]'))

In [4]:
def reduce_masks(text: str, num_masks_to_keep: int, mask_str: str = '[MASK]'):
    """Keep the first `num_masks_to_keep` mask tokens of `text` and drop the rest.

    Fixes over the previous version:
      * `mask_str` is now honoured. It used to be ignored in favour of a
        hard-coded '[MASK]', so every call with '<mask>' raised ValueError.
      * The original spacing around kept masks is preserved; text following a
        kept mask is no longer glued to it ("[MASK]to visit.").
      * Each dropped mask is removed together with the whitespace in front of
        it, so "the [MASK] [MASK]." reduces to "the [MASK]." rather than
        "the [MASK] .".

    Args:
        text: prompt containing zero or more occurrences of `mask_str`.
        num_masks_to_keep: number of leading masks to keep (>= 0).
        mask_str: literal mask placeholder used in `text`.

    Returns:
        The reduced prompt, stripped of leading/trailing whitespace.

    Raises:
        ValueError: if `num_masks_to_keep` is negative or larger than the
            number of masks present in `text`.
    """
    if num_masks_to_keep < 0:
        raise ValueError(f"num_masks_to_keep must be non-negative, got {num_masks_to_keep}!")

    parts = text.split(mask_str)
    num_masks = len(parts) - 1

    if num_masks_to_keep > num_masks:
        raise ValueError(f"The text only contains {num_masks} '{mask_str}' tokens, but {num_masks_to_keep} were requested to keep!")

    # parts[j] is the text that precedes mask j+1; the final part is the tail of the prompt
    kept_parts = parts[:num_masks_to_keep + 1]
    dropped_parts = parts[num_masks_to_keep + 1:]

    if dropped_parts:
        # remove the whitespace that used to separate each dropped mask from the text before it
        kept_parts[-1] = kept_parts[-1].rstrip()
        dropped_parts[:-1] = [part.rstrip() for part in dropped_parts[:-1]]

    reduced_text = mask_str.join(kept_parts) + ''.join(dropped_parts)
    return reduced_text.strip()

In [10]:
# Conditional MLM
# filling masks in parallel independently (Independent approach)
# trying different mask token sequence lengths
def fill_masks_independently(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, verbose=0):
    """Predict all masks of each prompt in parallel from one forward pass (Independent approach).

    For a prompt with M masks, the prompt is reduced to 1, 2, ..., M masks with
    `reduce_masks`. For every length the best candidate token is taken at each
    mask position from a single forward pass, and the length whose predicted
    tokens have the highest mean probability is kept.

    Args:
        model_checkpoint: Hugging Face checkpoint name; the tokenizer and the
            model are loaded from it on every call.
        inputs: prompts written with '[MASK]' placeholders (rewritten to
            '<mask>' when the checkpoint name contains 'roberta').
        candidate_set_tokens: token ids the predictions are restricted to;
            None means the whole tokenizer vocabulary.
        verbose: when truthy, prints the model summary and every intermediate
            prediction with its confidence.

    Returns:
        (outputs, outputs_decoded): for every prompt, the list of predicted
        token ids and the string obtained by decoding them.

    Raises:
        ValueError: if a prompt contains no mask token (previously `None` was
            handed to `tokenizer.decode` in that case).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(text) for text in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values())  # .keys() - decoded tokens (words/subwords)

    # loop-invariant: built once instead of once per forward pass
    candidate_ids = tf.constant(candidate_set_tokens, dtype=tf.int32)

    outputs = []
    outputs_decoded = []

    for text in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        num_masks = text.count(mask_str)
        if num_masks == 0:
            raise ValueError(f"Input contains no '{mask_str}' token: {text!r}")

        # -inf (rather than 0) so that a prediction is always selected, even if a confidence is exactly 0
        max_confidence = float('-inf')
        most_confident_prediction = None

        for i in range(1, num_masks + 1):
            input_text = reduce_masks(text, i, mask_str)
            tokenized_input = tokenizer(input_text, return_tensors="tf")

            # probabilities over the vocabulary for every position of the (single) sequence
            token_logits = model(**tokenized_input).logits[0]
            token_probs = tf.nn.softmax(token_logits, axis=-1)

            input_ids = tokenized_input["input_ids"]
            mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]
            mask_token_probs = tf.gather(token_probs, mask_token_indices)

            # getting probs of tokens that are present in a candidate set
            mask_token_probs_candidates = tf.gather(mask_token_probs, candidate_ids, axis=1)

            # tf.math.top_k returns the k top values and their indices along the last dimension
            top_values, top_indices = tf.math.top_k(mask_token_probs_candidates, k=1)

            # top_indices index into the candidate set; map them back to token ids
            top_indices_original = tf.gather(candidate_ids, top_indices)

            # confidence - probs of the predicted tokens / number of predicted tokens
            confidence = np.sum(tf.squeeze(top_values).numpy()) / len(mask_token_indices)
            prediction = list(np.atleast_1d(tf.squeeze(top_indices_original).numpy()))

            if verbose:
                print(f'Prediction: {prediction} : {tokenizer.convert_ids_to_tokens(prediction)}')
                print(f'Confidence: {confidence}')

            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = prediction

        outputs.append(most_confident_prediction)
        outputs_decoded.append(tokenizer.decode(most_confident_prediction, skip_special_tokens=True))
        if verbose:
            print('-----------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [11]:
# Shared demo prompts; the later demo cells reuse `test_inputs`.
test_inputs = [
    "Paris is a [MASK] [MASK] to visit.",
    "Jupyter is the largest planet of the [MASK] [MASK].",
    "The weather forecast predicts [MASK] [MASK] for tomorrow.",
    "The weather forecast predicts heavy rain and [MASK] [MASK].",
    "He wanted to visit the museum and explore the [MASK] [MASK].",
    "She was excited about the promotion and [MASK] [MASK].",
    "He is known for his dedication and [MASK] [MASK] [MASK].",
    "They plan to travel to Italy and enjoy the beautiful [MASK] [MASK] [MASK].",
    "She decided to go to the market and buy some fresh [MASK] [MASK] [MASK] [MASK].",
    "He set a new world record at the [MASK] [MASK] [MASK] [MASK] event.",
]
outputs, outputs_dec = fill_masks_independently(bert_models['BERT_base'], test_inputs, verbose=1)

# tokenizer used only to show the predicted ids as tokens
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# zip keeps prompt, ids and decoded text aligned without a manual counter
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Prediction: [2173] : ['place']
Confidence: 0.7230950593948364
Prediction: [2307, 2173] : ['great', 'place']
Confidence: 0.5290212631225586
-----------------------------------------------------------------------------------
Prediction: [3103] : ['sun']
Confidence: 0.20109564065933228
Prediction: [3103, 2155] : ['sun', 'family']
Confidence: 0.23593088

In [None]:
# Conditional MLM
# filling masks autoregressively (Order approach - left to right)
# trying different mask token sequence lengths
def fill_masks_autoregressively(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, verbose=0):
    """Fill the masks of each prompt one at a time, left to right (Order approach).

    For a prompt with M masks, the prompt is reduced to 1, 2, ..., M masks with
    `reduce_masks`. For every length the masks are filled in position order,
    each prediction being written into the input before the next forward pass.
    The confidence of a length is then recomputed: every predicted token is
    re-masked in turn and scored in the context of all other predicted tokens,
    and the mean of those probabilities is used to pick the best length.

    Args:
        model_checkpoint: Hugging Face checkpoint name; the tokenizer and the
            model are loaded from it on every call.
        inputs: prompts written with '[MASK]' placeholders (rewritten to
            '<mask>' when the checkpoint name contains 'roberta').
        candidate_set_tokens: token ids the predictions are restricted to;
            None means the whole tokenizer vocabulary.
        verbose: when truthy, prints the model summary and, per length, the
            prediction and its confidence before and after recomputation.

    Returns:
        (outputs, outputs_decoded): for every prompt, the list of predicted
        token ids and the string obtained by decoding them.

    Raises:
        ValueError: if a prompt contains no mask token (previously `None` was
            handed to `tokenizer.decode` in that case).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    def _with_input_ids(encoding, new_input_ids):
        # Model inputs identical to `encoding` except for input_ids;
        # token_type_ids is carried over only when present (not used by RoBERTa models).
        updated = {'input_ids': new_input_ids, 'attention_mask': encoding['attention_mask']}
        if 'token_type_ids' in encoding:
            updated['token_type_ids'] = encoding['token_type_ids']
        return updated

    def _probs_at(encoding, position):
        # Probability distribution over the vocabulary at `position`, as a NumPy vector.
        token_logits = model(**encoding).logits[0]
        return tf.nn.softmax(token_logits, axis=-1)[position, :].numpy()

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        inputs = [change_input_format(text) for text in inputs]
        mask_str = '<mask>'

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values())  # .keys() - decoded tokens (words/subwords)

    outputs = []
    outputs_decoded = []

    for text in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        num_masks = text.count(mask_str)
        if num_masks == 0:
            raise ValueError(f"Input contains no '{mask_str}' token: {text!r}")

        # -inf (rather than 0) so that a prediction is always selected, even if a confidence is exactly 0
        max_confidence = float('-inf')
        most_confident_prediction = None

        for i in range(1, num_masks + 1):
            input_text = reduce_masks(text, i, mask_str)
            tokenized_input = tokenizer(input_text, return_tensors="tf")
            input_ids = tokenized_input["input_ids"]

            # finding all positions of the mask tokens
            mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            # left-to-right confidence; only reported in verbose mode
            initial_confidence = 0
            prediction = []
            for mask_index in mask_token_indices:
                mask_token_probs = _probs_at(tokenized_input, mask_index)

                # getting the top predicted token from candidate set
                top_token = candidate_set_tokens[np.argmax(mask_token_probs[candidate_set_tokens])]
                initial_confidence += mask_token_probs[top_token]
                prediction.append(top_token)

                # write the prediction into the sequence so later masks are conditioned on it
                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                tokenized_input = _with_input_ids(tokenized_input, input_ids)

            if verbose:
                initial_confidence /= i
                print(f'Prediction: {prediction} : {tokenizer.convert_ids_to_tokens(prediction)}')
                print(f'Confidence before recomputing: {initial_confidence}')

            # recompute confidence of every predicted token (this provides the probability of each token
            # in the context of the entire sequence - bidirectional conditional distributions)
            confidence = 0
            for mask_index, predicted_token in zip(mask_token_indices, prediction):
                # score the token with its own position masked to remove bias; `input_ids`
                # itself is left untouched, so nothing has to be restored afterwards
                masked_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])
                mask_token_probs = _probs_at(_with_input_ids(tokenized_input, masked_ids), mask_index)
                confidence += mask_token_probs[predicted_token]

            confidence /= i
            if verbose:
                print(f'Confidence after recomputing: {confidence}')

            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = prediction

        outputs.append(most_confident_prediction)
        outputs_decoded.append(tokenizer.decode(most_confident_prediction, skip_special_tokens=True))
        if verbose:
            print('-----------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [None]:
# Runs the left-to-right filler on the shared demo prompts (`test_inputs` is defined in an earlier cell).
outputs, outputs_dec = fill_masks_autoregressively(bert_models['BERT_base'], test_inputs, verbose=1)

# tokenizer used only to show the predicted ids as tokens
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# zip keeps prompt, ids and decoded text aligned without a manual counter
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Prediction: [2173] : ['place']
Confidence before recomputing: 0.7230950593948364
Confidence after recomputing: 0.7230950593948364
Prediction: [2307, 2173] : ['great', 'place']
Confidence before recomputing: 0.5600524544715881
Confidence after recomputing: 0.6126892864704132
--------------------------------------------------------------------------

In [None]:
# Conditional MLM
# filling masks sorted by the maximum confidence (Greedy approach)
# trying different mask token sequence lengths
def fill_masks_by_confidence(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, verbose=0):
    """Fill the masks of each prompt most-confident-first (Greedy approach).

    For a prompt with M masks, the prompt is reduced to 1, 2, ..., M masks with
    `reduce_masks`. For every length, each step runs the model, takes the best
    candidate at every still-unfilled position and commits only the single
    most probable one. The confidence of a length is then recomputed: every
    predicted token is re-masked in turn and scored in the context of all other
    predicted tokens, and the mean of those probabilities picks the best length.

    The unfilled positions are tracked explicitly. The previous version
    re-scanned `input_ids` for the mask id on every step, which never terminates
    when the best candidate at a position is the mask token itself (possible
    with the default candidate set, which is the whole vocabulary).

    Args:
        model_checkpoint: Hugging Face checkpoint name; the tokenizer and the
            model are loaded from it on every call.
        inputs: prompts written with '[MASK]' placeholders (rewritten to
            '<mask>' when the checkpoint name contains 'roberta').
        candidate_set_tokens: token ids the predictions are restricted to;
            None means the whole tokenizer vocabulary.
        verbose: when truthy, prints the model summary, every committed token
            and, per length, the prediction and its confidences.

    Returns:
        (outputs, outputs_decoded): for every prompt, the list of predicted
        token ids (in position order) and the string obtained by decoding them.

    Raises:
        ValueError: if a prompt contains no mask token (previously `None` was
            handed to `tokenizer.decode` in that case).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    def _with_input_ids(encoding, new_input_ids):
        # Model inputs identical to `encoding` except for input_ids;
        # token_type_ids is carried over only when present (not used by RoBERTa models).
        updated = {'input_ids': new_input_ids, 'attention_mask': encoding['attention_mask']}
        if 'token_type_ids' in encoding:
            updated['token_type_ids'] = encoding['token_type_ids']
        return updated

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(text) for text in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values())  # .keys() - decoded tokens (words/subwords)

    # loop-invariant: built once instead of once per forward pass
    candidate_ids = tf.constant(candidate_set_tokens, dtype=tf.int32)

    outputs = []
    outputs_decoded = []

    for text in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        num_masks = text.count(mask_str)
        if num_masks == 0:
            raise ValueError(f"Input contains no '{mask_str}' token: {text!r}")

        # -inf (rather than 0) so that a prediction is always selected, even if a confidence is exactly 0
        max_confidence = float('-inf')
        most_confident_prediction = None

        for i in range(1, num_masks + 1):
            input_text = reduce_masks(text, i, mask_str)
            tokenized_input = tokenizer(input_text, return_tensors="tf")
            input_ids = tokenized_input["input_ids"]

            # all mask positions of this prompt; also needed for confidence recomputation
            initial_mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            # greedy-order confidence; only reported in verbose mode
            initial_confidence = 0
            prediction_dict = {}

            # positions that still have to be filled (kept in ascending order)
            remaining_positions = initial_mask_token_indices
            while len(remaining_positions) > 0:
                token_logits = model(**tokenized_input).logits[0]
                token_probs = tf.nn.softmax(token_logits, axis=-1)

                mask_token_probs = tf.gather(token_probs, remaining_positions)
                mask_token_probs_candidates = tf.gather(mask_token_probs, candidate_ids, axis=1)

                # tf.math.top_k returns the k top values and their indices along the last dimension
                top_values, top_indices = tf.math.top_k(mask_token_probs_candidates, k=1)

                # top_indices index into the candidate set; map them back to token ids
                top_indices_original = tf.gather(candidate_ids, top_indices)

                top_values = np.atleast_1d(tf.squeeze(top_values).numpy())
                top_indices_original = np.atleast_1d(tf.squeeze(top_indices_original).numpy())

                # commit only the single most confident position of this step
                k = int(np.argmax(top_values))
                most_confident_mask_position = remaining_positions[k]
                most_confident_token = top_indices_original[k]

                initial_confidence += top_values[k]

                if verbose:
                    print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

                prediction_dict[most_confident_mask_position] = most_confident_token
                remaining_positions = np.delete(remaining_positions, k)

                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
                tokenized_input = _with_input_ids(tokenized_input, input_ids)

            # predictions in position order
            prediction = [prediction_dict[mask_index] for mask_index in initial_mask_token_indices]
            if verbose:
                initial_confidence /= i
                print(f'Prediction: {prediction} : {tokenizer.convert_ids_to_tokens(prediction)}')
                print(f'Confidence before recomputing: {initial_confidence}')

            # recompute confidence of every predicted token (this provides the probability of each token
            # in the context of the entire sequence - bidirectional conditional distributions)
            confidence = 0
            for mask_index in initial_mask_token_indices:
                predicted_token = prediction_dict[mask_index]

                # score the token with its own position masked to remove bias; `input_ids`
                # itself is left untouched, so nothing has to be restored afterwards
                masked_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])
                token_logits = model(**_with_input_ids(tokenized_input, masked_ids)).logits[0]
                mask_token_probs = tf.nn.softmax(token_logits, axis=-1)[mask_index, :].numpy()

                # getting prob of predicted token in the context of the entire predicted sequence
                confidence += mask_token_probs[predicted_token]

            confidence /= i
            if verbose:
                print(f'Confidence after recomputing: {confidence}')

            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = prediction

        outputs.append(most_confident_prediction)
        outputs_decoded.append(tokenizer.decode(most_confident_prediction, skip_special_tokens=True))
        if verbose:
            print('-----------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [None]:
# Runs the greedy filler on the shared demo prompts (`test_inputs` is defined in an earlier cell).
outputs, outputs_dec = fill_masks_by_confidence(bert_models['BERT_base'], test_inputs, verbose=1)

# tokenizer used only to show the predicted ids as tokens
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# zip keeps prompt, ids and decoded text aligned without a manual counter
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
2173: ['place'] - index: 4
Prediction: [2173] : ['place']
Confidence before recomputing: 0.7230950593948364
Confidence after recomputing: 0.7230950593948364
2173: ['place'] - index: 5
2307: ['great'] - index: 4
Prediction: [2307, 2173] : ['great', 'place']
Confidence before recomputing: 0.5816580951213837
Confidence after recomputing: 0.6126892864

In [None]:
# Conditional MLM
# Initial predictions (Order) + Refinement (Order) until predictions converge or maximum number of iterations is reached
# trying different mask token sequence lengths
# ADD prediction length penalty (???)
def fill_masks_autoregressively_with_refinement(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, max_iter=10, verbose=0):
    """Left-to-right mask filling followed by iterative left-to-right refinement.

    For a prompt with M masks, the prompt is reduced to 1, 2, ..., M masks with
    `reduce_masks`. For every length:
      1. the masks are filled in position order, each prediction being written
         into the input before the next forward pass;
      2. up to `max_iter` refinement passes follow; in a pass every predicted
         position is re-masked in turn and re-predicted in the context of the
         other current predictions. Refinement stops as soon as a pass changes
         no token.
    The mean probability of the final tokens picks the best length.

    When refinement ends without convergence, the probabilities gathered in the
    last pass were computed before later positions of that pass changed. In
    that case the confidence is now recomputed once for the final tokens; the
    previous version reported the stale value.

    Args:
        model_checkpoint: Hugging Face checkpoint name; the tokenizer and the
            model are loaded from it on every call.
        inputs: prompts written with '[MASK]' placeholders (rewritten to
            '<mask>' when the checkpoint name contains 'roberta').
        candidate_set_tokens: token ids the predictions are restricted to;
            None means the whole tokenizer vocabulary.
        max_iter: maximum number of refinement passes per length.
        verbose: when truthy, prints the model summary and the prediction and
            confidence after the initial pass and after every refinement pass.

    Returns:
        (outputs, outputs_decoded): for every prompt, the list of predicted
        token ids (in position order) and the string obtained by decoding them.

    Raises:
        ValueError: if a prompt contains no mask token (previously `None` was
            handed to `tokenizer.decode` in that case).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    def _with_input_ids(encoding, new_input_ids):
        # Model inputs identical to `encoding` except for input_ids;
        # token_type_ids is carried over only when present (not used by RoBERTa models).
        updated = {'input_ids': new_input_ids, 'attention_mask': encoding['attention_mask']}
        if 'token_type_ids' in encoding:
            updated['token_type_ids'] = encoding['token_type_ids']
        return updated

    def _probs_at(encoding, position):
        # Probability distribution over the vocabulary at `position`, as a NumPy vector.
        token_logits = model(**encoding).logits[0]
        return tf.nn.softmax(token_logits, axis=-1)[position, :].numpy()

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(text) for text in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values())  # .keys() - decoded tokens (words/subwords)

    outputs = []
    outputs_decoded = []

    for text in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        num_masks = text.count(mask_str)
        if num_masks == 0:
            raise ValueError(f"Input contains no '{mask_str}' token: {text!r}")

        # -inf (rather than 0) so that a prediction is always selected, even if a confidence is exactly 0
        max_confidence = float('-inf')
        most_confident_prediction = None

        for i in range(1, num_masks + 1):
            input_text = reduce_masks(text, i, mask_str)
            tokenized_input = tokenizer(input_text, return_tensors="tf")
            input_ids = tokenized_input["input_ids"]

            # finding all positions of the mask tokens
            mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            confidence = 0
            # position -> predicted token id, in position order
            prediction_dict = OrderedDict((mask_index, tokenizer.mask_token_id) for mask_index in mask_token_indices)

            # making initial predictions (left to right)
            for mask_index in mask_token_indices:
                mask_token_probs = _probs_at(tokenized_input, mask_index)

                # getting the top predicted token from candidate set
                top_token = candidate_set_tokens[np.argmax(mask_token_probs[candidate_set_tokens])]
                confidence += mask_token_probs[top_token]

                prediction_dict[mask_index] = top_token

                # write the prediction into the sequence so later masks are conditioned on it
                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                tokenized_input = _with_input_ids(tokenized_input, input_ids)

            confidence /= i

            if verbose:
                prediction_initial = list(prediction_dict.values())
                prediction_initial_decoded = tokenizer.decode(prediction_initial, skip_special_tokens=True)
                print(f'Initial prediction: {prediction_initial} : {tokenizer.convert_ids_to_tokens(prediction_initial)} : {prediction_initial_decoded}')
                print(f'Initial confidence: {confidence}')

            # refining predictions: the old token is replaced by the mask token before re-predicting, to remove bias
            converged = False
            for j in range(max_iter):

                if verbose:
                    print(f"Iteration: {j}")

                updated_tokens = 0
                new_confidence = 0
                for mask_index in mask_token_indices:
                    # re-predict this position with its current token masked out
                    masked_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])
                    mask_token_probs = _probs_at(_with_input_ids(tokenized_input, masked_ids), mask_index)

                    # getting the top predicted token from candidate set
                    top_token = candidate_set_tokens[np.argmax(mask_token_probs[candidate_set_tokens])]
                    new_confidence += mask_token_probs[top_token]

                    if prediction_dict[mask_index] != top_token:
                        prediction_dict[mask_index] = top_token
                        updated_tokens += 1

                    # putting the (possibly new) token into the context for the following positions
                    input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                    tokenized_input = _with_input_ids(tokenized_input, input_ids)

                # confidence can change even if no tokens are updated (each token's probability is
                # recomputed in the context of the entire predicted sequence)
                confidence = new_confidence / i

                if verbose:
                    prediction_j = list(prediction_dict.values())
                    prediction_j_decoded = tokenizer.decode(prediction_j, skip_special_tokens=True)
                    print(f'Prediction in iteration {j}: {prediction_j} : {tokenizer.convert_ids_to_tokens(prediction_j)} : {prediction_j_decoded}')
                    print(f'Confidence in iteration {j}: {confidence}')

                # checking if convergence happened
                if updated_tokens == 0:
                    converged = True
                    if verbose:
                        print(f"\033[1mConvergence reached in iteration {j}!\033[0m")
                    break

            if not converged:
                # The last pass (or the initial pass when max_iter == 0) scored tokens before later
                # positions changed, so score the final tokens once more in their final context.
                confidence = 0
                for mask_index in mask_token_indices:
                    masked_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])
                    mask_token_probs = _probs_at(_with_input_ids(tokenized_input, masked_ids), mask_index)
                    confidence += mask_token_probs[prediction_dict[mask_index]]
                confidence /= i
                if verbose:
                    print(f'Confidence after final recomputation (no convergence): {confidence}')

            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = list(prediction_dict.values())

            if verbose:
                print('-------------------------------------------------------')

        outputs.append(most_confident_prediction)
        outputs_decoded.append(tokenizer.decode(most_confident_prediction, skip_special_tokens=True))
        if verbose:
            print('------------------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [None]:
# Runs the refinement-based filler on the shared demo prompts (`test_inputs` is defined in an earlier cell).
outputs, outputs_dec = fill_masks_autoregressively_with_refinement(bert_models['BERT_base'], test_inputs, verbose=1)

# tokenizer used only to show the predicted ids as tokens
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# zip keeps prompt, ids and decoded text aligned without a manual counter
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial prediction: [2173] : ['place'] : place
Initial confidence: 0.7230950593948364
Iteration: 0
Prediction in iteration 0: [2173] : ['place'] : place
Confidence in iteration 0: 0.7230950593948364
[1mConvergence reached in iteration 0![0m
-------------------------------------------------------
Initial prediction: [2307, 2173] : ['great', 'plac

In [None]:
# Conditional MLM
# Initial predictions (Greedy) + Refinement (Order) until predictions converge or maximum number of iterations is reached
# trying different mask token sequence lengths
# ADD prediction length penalty (???)
def fill_masks_by_confidence_order_refinement(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, top_n=5, max_iter=10, verbose=0):
    """Predict masked spans: greedy most-confident-first filling followed by in-order refinement.

    For each input, every span length from 1 up to the number of mask tokens in
    the input is tried (``reduce_masks`` builds the shortened input). For one
    span length:

    1. Greedy initial pass - the model is run once per mask; each time the
       still-masked position whose best candidate token has the highest
       probability is filled with that token.
    2. Refinement - for at most ``max_iter`` sweeps, every span position is
       re-masked in turn, re-predicted given the other predicted tokens and
       written back. A sweep that changes no token ends the refinement.

    The confidence of a span is the sum of the probabilities of its tokens
    divided by the span length ``i``. The span with the highest confidence over
    all tried lengths becomes the prediction for the input.

    Args:
        model_checkpoint: Hugging Face checkpoint name, loaded with
            ``TFAutoModelForMaskedLM`` (``from_pt=True``).
        inputs: texts whose masks are written as ``'[MASK]'``. For checkpoints
            with ``'roberta'`` in the name the inputs are first passed through
            ``change_input_format`` and ``'<mask>'`` is counted instead.
        candidate_set_tokens: token ids predictions are restricted to; ``None``
            means the whole tokenizer vocabulary.
        top_n: not used by this function; kept so existing calls keep working.
        max_iter: maximum number of refinement sweeps per span length.
        verbose: truthy value prints the model summary and a decoding trace.

    Returns:
        ``(outputs, outputs_decoded)``: for every input the predicted token ids
        and their decoded text. An input for which no span could be predicted
        (e.g. it contains no mask token) yields ``[]`` and the decoding of an
        empty id list instead of failing inside ``tokenizer.decode``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(input) for input in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values()) # .keys() - decoded tokens (words/subwords)

    # loop invariant: used to map positions inside the candidate set back to token ids
    candidate_set_tokens_tensor = tf.constant(candidate_set_tokens, dtype=tf.int32)

    def with_input_ids(encoded, new_input_ids):
        # Same model inputs with only input_ids swapped; token_type_ids is carried
        # over only when the tokenizer produced it (not the case for RoBERTa models)
        model_inputs = {key: encoded[key] for key in ('attention_mask', 'token_type_ids') if key in encoded}
        model_inputs['input_ids'] = new_input_ids
        return model_inputs

    outputs = []
    outputs_decoded = []

    for input in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        M = input.count(mask_str)

        # -inf (instead of 0) guarantees that the first tried length is always recorded
        max_confidence = float('-inf')
        most_confident_prediction = []

        for i in range(1, M+1):
            input_text = reduce_masks(input, i, mask_str)

            tokenized_input = tokenizer(input_text, return_tensors="tf")
            input_ids = tokenized_input["input_ids"]

            # span positions; needed again in the refinement phase
            initial_mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            confidence = 0
            prediction_dict = {}

            # Greedy initial pass. The still-open positions are tracked explicitly rather than
            # re-scanned from input_ids, so the loop runs exactly once per mask even if the
            # model's best candidate happens to be the mask token itself.
            remaining_mask_indices = initial_mask_token_indices.copy()
            while len(remaining_mask_indices) > 0:

                # probabilities at the still-masked positions, restricted to the candidate set
                token_logits = model(**tokenized_input).logits[0]
                token_probs = tf.nn.softmax(token_logits, axis=-1)
                mask_token_probs = tf.gather(token_probs, remaining_mask_indices)
                mask_token_probs_candidates = tf.gather(mask_token_probs, candidate_set_tokens, axis=1)

                # tf.math.top_k returns k top values and indices from the input tensor along last dimension (by default)
                top_values, top_indices = tf.math.top_k(mask_token_probs_candidates, k=1)

                # transforming positions inside the candidate set to the original token ids
                top_indices_original = tf.gather(candidate_set_tokens_tensor, top_indices)

                top_values = np.atleast_1d(tf.squeeze(top_values).numpy())
                top_indices_original = np.atleast_1d(tf.squeeze(top_indices_original).numpy())

                # the open position whose best candidate is the most probable gets filled now
                k = np.argmax(top_values)
                most_confident_mask_position, most_confident_token = remaining_mask_indices[k], top_indices_original[k]
                confidence += top_values[k]

                if verbose:
                    print(f"{most_confident_token}: {tokenizer.convert_ids_to_tokens([most_confident_token])} - index: {most_confident_mask_position}")

                prediction_dict[most_confident_mask_position] = most_confident_token
                remaining_mask_indices = np.delete(remaining_mask_indices, k)

                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, most_confident_mask_position]], [most_confident_token])
                tokenized_input = with_input_ids(tokenized_input, input_ids)

            confidence /= i
            # sorting prediction_dict by mask positions
            prediction_dict = OrderedDict({mask_position : prediction_dict[mask_position] for mask_position in sorted(prediction_dict)})

            if verbose:
                prediction_initial = list(prediction_dict.values())
                prediction_initial_decoded = tokenizer.decode(prediction_initial, skip_special_tokens=True)
                print(f'Initial greedy prediction: {prediction_initial} : {tokenizer.convert_ids_to_tokens(prediction_initial)} : {prediction_initial_decoded}')
                print(f'Initial confidence: {confidence}')

            # refining predictions - the old predicted token is replaced with the mask token before predicting to remove bias
            for j in range(max_iter):

                if verbose:
                    print(f"Iteration: {j}")

                updated_tokens = 0
                new_confidence = 0
                for mask_index in initial_mask_token_indices:

                    # replacing predicted token with mask token to remove bias
                    input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])
                    tokenized_input = with_input_ids(tokenized_input, input_ids)

                    token_logits = model(**tokenized_input).logits[0]
                    token_probs = tf.nn.softmax(token_logits, axis=-1)
                    mask_token_probs = token_probs[mask_index, :].numpy()

                    # getting the top predicted token from candidate set
                    top_token = candidate_set_tokens[np.argmax(mask_token_probs[candidate_set_tokens])]
                    new_confidence += mask_token_probs[top_token]

                    if prediction_dict[mask_index] != top_token:
                        prediction_dict[mask_index] = top_token
                        updated_tokens += 1

                    # putting predicted token to the context
                    input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                    tokenized_input = with_input_ids(tokenized_input, input_ids)

                # confidence can change even if no tokens are updated (we want the probability of each token recomputed in the context of the entire predicted sequence - bidirectional conditional distributions)
                # SMALL ISSUE: if for max_iter iterations prediction changes (no convergence) final prediction confidence won't be recomputed - not happening in tested examples
                confidence = new_confidence / i

                if verbose:
                    prediction_j = list(prediction_dict.values())
                    prediction_j_decoded = tokenizer.decode(prediction_j, skip_special_tokens=True)
                    print(f'Prediction in iteration {j} : {prediction_j} : {tokenizer.convert_ids_to_tokens(prediction_j)} : {prediction_j_decoded}')
                    print(f'Confidence in iteration {j}: {confidence}')

                # checking if convergence happened
                if updated_tokens == 0:
                    if verbose:
                        print(f"\033[1mConvergence reached in iteration {j}!\033[0m")
                    break

            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = list(prediction_dict.values())

            if verbose:
                print('-------------------------------------------------------')

        outputs.append(most_confident_prediction)
        prediction_decoded = tokenizer.decode(most_confident_prediction, skip_special_tokens=True)
        outputs_decoded.append(prediction_decoded)
        if verbose:
            print('------------------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [None]:
# Demo: run the greedy + in-order refinement decoder on the shared test inputs (verbose trace on)
outputs, outputs_dec = fill_masks_by_confidence_order_refinement(bert_models['BERT_base'], test_inputs, verbose=1)

# Tokenizer is only needed here to turn predicted ids back into word pieces for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# Walk inputs and predictions together instead of keeping a manual index,
# so no loop counter is left behind in the notebook namespace
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
2173: ['place'] - index: 4
Initial greedy prediction: [2173] : ['place'] : place
Initial confidence: 0.7230950593948364
Iteration: 0
Prediction in iteration 0 : [2173] : ['place'] : place
Confidence in iteration 0: 0.7230950593948364
[1mConvergence reached in iteration 0![0m
-------------------------------------------------------
2173: ['place']

In [None]:
# Conditional MLM
# Initial predictions (Order) + Refinement (Greedy) until predictions converge or maximum number of iterations is reached
# trying different mask token sequence lengths
# TODO: consider adding a prediction length penalty (open question)
def fill_masks_autoregressively_greedy_refinement(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, max_iter=10, verbose=0):
    """Predict masked spans: in-order initial filling followed by lowest-confidence refinement.

    For each input, every span length from 1 up to the number of mask tokens in
    the input is tried (``reduce_masks`` builds the shortened input). For one
    span length:

    1. Initial pass - the mask positions are filled one after another in
       ascending position order, each with the most probable candidate token
       given the tokens filled so far.
    2. Refinement - for at most ``max_iter`` iterations the probability of
       every predicted token is recomputed with that token masked and all other
       predictions in place; the position with the lowest probability is then
       re-predicted. If the re-prediction equals the old token the refinement
       stops.

    The confidence of a span is the sum of the per-token probabilities divided
    by the span length ``i``. The span with the highest confidence over all
    tried lengths becomes the prediction for the input.

    Args:
        model_checkpoint: Hugging Face checkpoint name, loaded with
            ``TFAutoModelForMaskedLM`` (``from_pt=True``).
        inputs: texts whose masks are written as ``'[MASK]'``. For checkpoints
            with ``'roberta'`` in the name the inputs are first passed through
            ``change_input_format`` and ``'<mask>'`` is counted instead.
        candidate_set_tokens: token ids predictions are restricted to; ``None``
            means the whole tokenizer vocabulary.
        max_iter: maximum number of refinement iterations per span length. With
            ``0`` the probabilities of the initial pass are used as confidence.
        verbose: truthy value prints the model summary and a decoding trace.

    Returns:
        ``(outputs, outputs_decoded)``: for every input the predicted token ids
        and their decoded text. An input for which no span could be predicted
        (e.g. it contains no mask token) yields ``[]`` and the decoding of an
        empty id list instead of failing inside ``tokenizer.decode``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(input) for input in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values()) # .keys() - decoded tokens (words/subwords)

    def with_input_ids(encoded, new_input_ids):
        # Same model inputs with only input_ids swapped; token_type_ids is carried
        # over only when the tokenizer produced it (not the case for RoBERTa models)
        model_inputs = {key: encoded[key] for key in ('attention_mask', 'token_type_ids') if key in encoded}
        model_inputs['input_ids'] = new_input_ids
        return model_inputs

    outputs = []
    outputs_decoded = []

    for input in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        M = input.count(mask_str)

        # -inf (instead of 0) guarantees that the first tried length is always recorded
        max_confidence = float('-inf')
        most_confident_prediction = []

        for i in range(1, M+1):
            input_text = reduce_masks(input, i, mask_str)

            tokenized_input = tokenizer(input_text, return_tensors="tf")
            input_ids = tokenized_input["input_ids"]

            # finding all positions of the [MASK] tokens
            mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            prediction_dict = OrderedDict((mask_index, tokenizer.mask_token_id) for mask_index in mask_token_indices)
            # probabilities of the initial pass; they also seed probs_dict so that a
            # confidence exists even when no refinement iteration runs (max_iter=0)
            initial_probs_dict = OrderedDict((mask_index, -1) for mask_index in mask_token_indices)

            # making initial predictions
            for mask_index in mask_token_indices:

                token_logits = model(**tokenized_input).logits[0]
                token_probs = tf.nn.softmax(token_logits, axis=-1)
                mask_token_probs = token_probs[mask_index, :].numpy()

                # getting the top predicted token from candidate set
                top_token = candidate_set_tokens[np.argmax(mask_token_probs[candidate_set_tokens])]
                prediction_dict[mask_index] = top_token
                initial_probs_dict[mask_index] = mask_token_probs[top_token]

                # the predicted token becomes context for the following mask positions
                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [top_token])
                tokenized_input = with_input_ids(tokenized_input, input_ids)

            if verbose:
                prediction_initial = list(prediction_dict.values())
                initial_probs = list(initial_probs_dict.values())
                prediction_initial_decoded = tokenizer.decode(prediction_initial, skip_special_tokens=True)
                print(f'Initial prediction: {prediction_initial} : {tokenizer.convert_ids_to_tokens(prediction_initial)} : {prediction_initial_decoded}')
                print(f'Initial probs: {initial_probs}')

            probs_dict = OrderedDict(initial_probs_dict)

            # refining predictions
            for j in range(max_iter):

                if verbose:
                    print(f"Iteration: {j}")

                # recomputing prob of every token in the context of the entire predicted sequence - bidirectional conditional distributions
                masked_distributions = {}
                for mask_index in mask_token_indices:
                    predicted_token = prediction_dict[mask_index]

                    # replacing predicted token with mask to remove bias; input_ids itself keeps the prediction
                    masked_input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, mask_index]], [tokenizer.mask_token_id])

                    token_logits = model(**with_input_ids(tokenized_input, masked_input_ids)).logits[0]
                    token_probs = tf.nn.softmax(token_logits, axis=-1)
                    mask_token_probs = token_probs[mask_index, :].numpy()

                    # kept so the weakest position can be re-predicted without another forward pass
                    masked_distributions[mask_index] = mask_token_probs
                    # prob of predicted token in the context of the entire predicted sequence
                    probs_dict[mask_index] = mask_token_probs[predicted_token]

                if verbose:
                    probs = list(probs_dict.values())
                    print(f'Recomputed probs {j}: {probs}')

                # finding token with the lowest prob
                min_mask_index = min(probs_dict, key=lambda mask_index: probs_dict[mask_index])
                min_token = prediction_dict[min_mask_index]

                if verbose:
                    print(f'Token with lowest confidence: {tokenizer.convert_ids_to_tokens(min_token)}')

                # repredicting token with lowest confidence: the distribution stored in the sweep above
                # was computed on exactly this model input (that position masked, all other predictions in place)
                min_mask_index_probs = masked_distributions[min_mask_index]

                new_predicted_token = candidate_set_tokens[np.argmax(min_mask_index_probs[candidate_set_tokens])]

                prediction_dict[min_mask_index] = new_predicted_token
                probs_dict[min_mask_index] = min_mask_index_probs[new_predicted_token]

                input_ids = tf.tensor_scatter_nd_update(input_ids, [[0, min_mask_index]], [new_predicted_token])
                tokenized_input = with_input_ids(tokenized_input, input_ids)

                if verbose:
                    prediction_j = list(prediction_dict.values())
                    prediction_j_decoded = tokenizer.decode(prediction_j, skip_special_tokens=True)
                    probs_j = list(probs_dict.values())
                    print(f'Prediction after iteration {j} : {prediction_j} : {tokenizer.convert_ids_to_tokens(prediction_j)} : {prediction_j_decoded}')
                    print(f'Probs after iteration {j}: {probs_j}')

                # checking if convergence happened
                if new_predicted_token == min_token:
                    if verbose:
                        print(f"\033[1mConvergence reached in iteration {j}!\033[0m")
                    break

            confidence = sum(probs_dict.values()) / i
            if confidence > max_confidence:
                max_confidence = confidence
                most_confident_prediction = list(prediction_dict.values())

            if verbose:
                print('-------------------------------------------------------')

        outputs.append(most_confident_prediction)
        prediction_decoded = tokenizer.decode(most_confident_prediction, skip_special_tokens=True)
        outputs_decoded.append(prediction_decoded)
        if verbose:
            print('------------------------------------------------------------------------------------------')

    return outputs, outputs_decoded

In [None]:
# Demo: run the in-order + lowest-confidence refinement decoder on the shared test inputs (verbose trace on)
outputs, outputs_dec = fill_masks_autoregressively_greedy_refinement(bert_models['BERT_base'], test_inputs, verbose=1)

# Tokenizer is only needed here to turn predicted ids back into word pieces for display
test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])

# Walk inputs and predictions together instead of keeping a manual index,
# so no loop counter is left behind in the notebook namespace
for test_input, output, output_dec in zip(test_inputs, outputs, outputs_dec):
  print(test_input)
  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Initial prediction: [2173] : ['place'] : place
Initial probs: [0.72309506]
Iteration: 0
Recomputed probs 0: [0.72309506]
Token with lowest confidence: place
Prediction after iteration 0 : [2173] : ['place'] : place
Probs after iteration 0: [0.72309506]
[1mConvergence reached in iteration 0![0m
---------------------------------------------------

In [43]:
# Conditional MLM
# Beam search approach
# Update: confidence based (confidence - average of predicted tokens probs)
# restricted candidate_set_tokens added as parameter
def fill_masks_beam_search(model_checkpoint: str, inputs: list[str], candidate_set_tokens=None, top_n=5, beam_width=5, verbose=0):
    """Predict masked spans with beam search over the mask positions.

    For each input, every span length from 1 up to the number of mask tokens in
    the input is tried (``reduce_masks`` builds the shortened input). For one
    span length the mask positions are filled in ascending position order:
    every sequence in the beam is expanded with its most probable candidate
    tokens at the current position, and the ``beam_width`` expansions with the
    highest summed token probability survive. The ``top_n`` best sequences of
    each length are kept with their confidence (summed probability divided by
    the span length ``i``), and the ``top_n`` most confident ones over all
    lengths are returned.

    Args:
        model_checkpoint: Hugging Face checkpoint name, loaded with
            ``TFAutoModelForMaskedLM`` (``from_pt=True``).
        inputs: texts whose masks are written as ``'[MASK]'``. For checkpoints
            with ``'roberta'`` in the name the inputs are first passed through
            ``change_input_format`` and ``'<mask>'`` is counted instead.
        candidate_set_tokens: token ids predictions are restricted to; ``None``
            means the whole tokenizer vocabulary.
        top_n: number of predictions returned per input.
        beam_width: number of partial sequences kept after each mask position.
        verbose: truthy value prints the model summary and all candidates.

    Returns:
        ``(all_outputs, all_outputs_decoded)``: for every input a list of up to
        ``top_n`` predictions ordered by decreasing confidence - token id lists
        in the first element, the matching decoded strings in the second.

    Raises:
        ValueError: if ``top_n`` is greater than ``beam_width``.
    """

    if top_n > beam_width:
        raise ValueError("top_n must be less than or equal to beam_width")

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint, from_pt=True)

    mask_str = '[MASK]'
    # Adjusting inputs for RoBERTa models
    if 'roberta' in model_checkpoint:
        mask_str = '<mask>'
        inputs = [change_input_format(input) for input in inputs]

    # model_max_length field not set by default for BioBERT and BioMedBERT models
    if 'bio' in model_checkpoint.lower():
        tokenizer.model_max_length = 512

    if verbose:
        print(f'Chosen model: {model_checkpoint}')
        model.summary()

    # if candidate_set_tokens is None, setting it to all tokens of the model
    if candidate_set_tokens is None:
        candidate_set_tokens = list(tokenizer.get_vocab().values()) # .keys() - decoded tokens (words/subwords)

    # loop invariant: used to map positions inside the candidate set back to token ids
    candidate_set_tokens_tensor = tf.constant(candidate_set_tokens, dtype=tf.int32)
    # tf.math.top_k cannot return more entries than the candidate set contains
    expansions_per_sequence = min(beam_width, len(candidate_set_tokens))

    all_outputs = []
    all_outputs_decoded = []
    for input in inputs:
        # trying different mask token sequence lengths - from 1 to M (length of longest true answer)
        M = input.count(mask_str)

        all_candidates = []

        for i in range(1, M+1):
            input_text = reduce_masks(input, i, mask_str)

            tokenized_input = tokenizer(input_text, return_tensors="tf")

            # checking if the model uses token_type_ids (not used in RoBERTa models)
            use_token_type_ids = 'token_type_ids' in tokenized_input

            input_ids = tokenized_input["input_ids"]

            mask_token_indices = np.where(input_ids.numpy()[0] == tokenizer.mask_token_id)[0]

            # Initializing the beam with the initial tokenized text (input_ids, attention_mask and token_type_ids (optional)) and a confidence of 0
            beam = [(tokenized_input, 0.0)]

            # Continue until all masks are filled
            for mask_token_index in mask_token_indices:
                candidates = []

                for seq, confidence in beam:
                    token_logits = model(**seq).logits[0]
                    token_probs = tf.nn.softmax(token_logits, axis=-1)
                    mask_token_probs = token_probs[mask_token_index, :]

                    # getting probabilities of tokens that are present in a candidate set
                    mask_token_probs_candidates = tf.gather(mask_token_probs, candidate_set_tokens)

                    # tf.math.top_k returns k top values and indices from the input tensor along last dimension (by default)
                    top_k_values, top_k_indices = tf.math.top_k(mask_token_probs_candidates, k=expansions_per_sequence)

                    # transforming positions inside the candidate set to the original token ids
                    top_k_indices_original = tf.gather(candidate_set_tokens_tensor, top_k_indices)

                    for token_idx, token_prob in zip(top_k_indices_original, top_k_values):

                        # creating a new sequence with the predicted token
                        new_input_ids = tf.tensor_scatter_nd_update(seq['input_ids'], [[0, mask_token_index]], [token_idx])

                        new_seq = {
                            'input_ids': new_input_ids,
                            'attention_mask': seq['attention_mask'],
                        }
                        if use_token_type_ids:
                            new_seq['token_type_ids'] = seq['token_type_ids']

                        # adding the new sequence and its accumulated probability to the candidates list
                        candidates.append((new_seq, confidence + token_prob.numpy()))

                # selecting the top beam_width candidates (all candidates have mask sequences of same length - no need to divide confidence with i)
                beam = heapq.nlargest(beam_width, candidates, key=lambda x: x[1])

            # position of the first predicted token for THIS span length; stored with every candidate
            # so that extraction later does not rely on indices left over from another length
            span_start = mask_token_indices[0]

            # extracting top_n candidate sequences of the length i, normalizing confidences
            # (dividing with sequence length i) and remembering span length and start
            top_n_candidates = [
                (candidate_seq, summed_prob / i, i, span_start)
                for candidate_seq, summed_prob in heapq.nlargest(top_n, beam, key=lambda x: x[1])
            ]

            if verbose:
                print(f'Candidates of a length {i}:')

                for candidate_seq, candidate_confidence, span_length, start in top_n_candidates:
                    # prediction: from predicted token at the first to the predicted token at the last mask index (all mask tokens in a sequence)
                    prediction = candidate_seq['input_ids'].numpy()[0][start:start+span_length]
                    prediction_decoded = tokenizer.decode(prediction, skip_special_tokens=True)
                    print(f'Prediction: {prediction} : {tokenizer.convert_ids_to_tokens(prediction)} : {prediction_decoded}')
                    print(f'Confidence: {candidate_confidence}')
                print('--------------------------------------------------------')

            all_candidates += top_n_candidates

        top_n_final_candidates = heapq.nlargest(top_n, all_candidates, key=lambda x: x[1])

        if verbose:
            print('\033[1mFinal predictions:\033[0m')

        outputs = []
        outputs_decoded = []

        for candidate_seq, candidate_confidence, span_length, start in top_n_final_candidates:
            prediction = candidate_seq['input_ids'].numpy()[0][start:start+span_length]
            prediction_decoded = tokenizer.decode(prediction, skip_special_tokens=True)
            outputs.append(list(prediction))
            # decoded text belongs in its own list (it used to be appended to outputs)
            outputs_decoded.append(prediction_decoded)

            if verbose:
                print(f'Prediction: {prediction} : {tokenizer.convert_ids_to_tokens(prediction)} : {prediction_decoded}')
                print(f'Confidence: {candidate_confidence}')

        if verbose:
            print('----------------------------------------------------------------------------------------------------')

        all_outputs.append(outputs)
        all_outputs_decoded.append(outputs_decoded)

    return all_outputs, all_outputs_decoded

In [44]:
# Demo: beam search on the shared test inputs. With verbose=1 the function itself prints
# the candidates found for every span length and the final predictions for each input.
outputs, outputs_dec = fill_masks_beam_search(bert_models['BERT_base'], test_inputs, top_n=5, beam_width=5, verbose=1)

# NOTE(review): the display loop below is disabled. It treats each `output` as one token id
# list, whereas fill_masks_beam_search collects several candidates per input - adapt it
# before re-enabling.
#test_tokenizer = AutoTokenizer.from_pretrained(bert_models['BERT_base'])
#i = 0
#for output, output_dec in zip(outputs, outputs_dec):
#  print(test_inputs[i])
#  print(f"{output} : {output_dec} : {test_tokenizer.convert_ids_to_tokens(output)}")
#  i += 1
#  print('-------------------------------------------------------------------------')

Chosen model: google-bert/bert-base-uncased
Model: "tf_bert_for_masked_lm_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109514298 (417.76 MB)
Trainable params: 109514298 (417.76 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Candidates of a length 1:
Prediction: [2173] : ['place'] : place
Confidence: 0.7230950593948364
Prediction: [2103] : ['city'] : city
Confidence: 0.03346916660666466
Prediction: [7688] : ['destination'] : destination
Confidence: 0.026779673993587494
Prediction: [5165] : ['pleasure'] : pleasure
Confidence: 0.016575613990426064
Prediction: [5440] : 

In [5]:
# Display name -> Hugging Face checkpoint id, grouped by model family.

# BERT (uncased)
bert_models = {
    'BERT_base': "google-bert/bert-base-uncased",
    'BERT_large': "google-bert/bert-large-uncased",
    'BERT_large_wwm': "google-bert/bert-large-uncased-whole-word-masking",
}

# RoBERTa (cased)
roberta_models = {
    'RoBERTa_base': "FacebookAI/roberta-base",
    'RoBERTa_large': "FacebookAI/roberta-large",
}

# ALBERT (uncased)
albert_models = {
    'ALBERT_base': "albert/albert-base-v2",
    'ALBERT_xxlarge': "albert/albert-xxlarge-v2",
}

# BioBERT (cased)
biobert_models = {
    'BioBERT': "dmis-lab/biobert-base-cased-v1.2",
}

# BioMedBERT (uncased)
biomedbert_models = {
    'BioMedBERT_base_abstract': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract",
    'BioMedBERT_base_full': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    'BioMedBERT_large_abstract': "microsoft/BiomedNLP-BiomedBERT-large-uncased-abstract",
}