In [60]:
# Import required libraries
from transformers import AutoTokenizer, AutoModelForMaskedLM
from scipy.special import softmax
import numpy as np

In [61]:
# Specify the pre-trained model to use: BERT-base-cased
model_name = "bert-base-cased"

# Instantiate the tokenizer and model for the specified pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
# Get the mask token from the tokenizer
mask = tokenizer.mask_token
print(mask)

[MASK]


In [87]:
# Create a sentence with a mask token to be filled in by the model
sentence = f"Sakthi is a {mask} boy."
# Tokenize the sentence
tokens = tokenizer.tokenize(sentence)
print(tokens)

['Sa', '##kt', '##hi', 'is', 'a', '[MASK]', 'boy', '.']


In [88]:
# Encode the sentence using the tokenizer and return the input tensors
encoded_inputs = tokenizer(sentence, return_tensors='pt')
print(encoded_inputs)

{'input_ids': tensor([[  101, 17784, 21270,  3031,  1110,   170,   103,  2298,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [89]:
# Get the model's output for the input tensors
outputs = model(**encoded_inputs)
print(outputs)

MaskedLMOutput(loss=None, logits=tensor([[[ -7.4375,  -7.4023,  -7.4773,  ...,  -6.2403,  -5.8518,  -6.3604],
         [ -8.0492,  -8.2752,  -8.0847,  ...,  -6.7218,  -6.4944,  -6.9420],
         [ -2.6042,  -3.2494,  -3.2575,  ...,  -2.1986,  -2.3829,  -3.2883],
         ...,
         [ -8.1409,  -8.2819,  -8.2319,  ...,  -7.1964,  -5.6401,  -7.9355],
         [-11.5900, -11.7678, -12.3174,  ...,  -8.5492,  -9.8181, -11.9257],
         [-10.6881, -11.0931, -10.8000,  ...,  -8.1838,  -7.5953,  -9.8777]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [90]:
# Detach the logits from the model's output and convert them to numpy arrays
logits = outputs.logits.detach().numpy()[0]
logits.shape

(10, 28996)

In [91]:
len(tokens)

8

In [92]:
# Extract the logits for the mask token
mask_logits = logits[tokens.index(mask) + 1]
print(mask_logits)

[-4.629001  -5.2313967 -4.738371  ... -5.890812  -4.155588  -5.180725 ]


In [93]:
# Calculate the confidence scores for each possible token using softmax
confidence_scores= softmax(mask_logits)
print(confidence_scores)

[5.0484790e-08 2.7640334e-08 4.5254488e-08 ... 1.4294299e-08 8.1051489e-08
 2.9077006e-08]


In [94]:
confidence_scores.sum()

0.99999994

In [95]:
# Print the top 5 predicted tokens and their confidence scores
for i in np.argsort(confidence_scores)[::-1][:10]:
    pred_token = tokenizer.decode(i)
    score = confidence_scores[i]

    # Print the predicted sentence with the mask token replaced by the predicted token, and the confidence score
    print(sentence.replace(mask, pred_token), score)

Sakthi is a young boy. 0.12496645
Sakthi is a village boy. 0.0932745
Sakthi is a little boy. 0.06266502
Sakthi is a teenage boy. 0.04408817
Sakthi is a beautiful boy. 0.03869896
Sakthi is a small boy. 0.033369146
Sakthi is a baby boy. 0.0219025
Sakthi is a rich boy. 0.016079716
Sakthi is a shy boy. 0.014640891
Sakthi is a simple boy. 0.010145077
