In [1]:
import warnings
# Ignore every Python warning raised for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# are hidden as well — consider narrowing it to a specific category/module.
warnings.filterwarnings('ignore')

In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import pandas as pd
import numpy as np
from scipy.special import softmax

In [3]:
# Checkpoint identifier shared by the tokenizer and the masked-LM model.
model_name = "bert-base-cased"

# Load the tokenizer that matches the checkpoint, then the model with its
# masked-language-modeling head; both are resolved from the same name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Take the mask placeholder string from the tokenizer instead of hard-coding it,
# so prompts built below always use the token this checkpoint expects.
mask = tokenizer.mask_token
mask

'[MASK]'

In [5]:
# Build a prompt containing a single masked slot, then show how the tokenizer
# splits it. tokenize() returns only the word pieces of the text itself; the
# listing below contains no start/end special tokens.
sentence = f"I want to {mask} pizza for tonight."
tokens = tokenizer.tokenize(sentence)
tokens

['I', 'want', 'to', '[MASK]', 'pizza', 'for', 'tonight', '.']

In [6]:
# Encode the prompt as PyTorch tensors. The encoded sequence is longer than the
# tokenize() listing (10 ids vs. 8 word pieces) because special tokens are added.
encoded_inputs = tokenizer(sentence, return_tensors='pt')
print('encoded_inputs size = ', encoded_inputs.input_ids.size())
# Forward pass through the masked-LM model; the result exposes `.logits`.
# NOTE(review): this runs with autograd enabled (the printed logits carry a
# grad_fn), which is why the logits are detached before conversion to NumPy.
output = model(**encoded_inputs)
print(output)

encoded_inputs size =  torch.Size([1, 10])
MaskedLMOutput(loss=None, logits=tensor([[[ -7.3723,  -7.2489,  -7.4421,  ...,  -6.3119,  -5.9369,  -6.4257],
         [ -7.9311,  -8.2282,  -8.0326,  ...,  -6.7387,  -6.4877,  -6.9525],
         [-12.0500, -11.7972, -12.5776,  ...,  -8.4518,  -6.7310,  -8.2586],
         ...,
         [-10.2204, -10.4315,  -9.9993,  ...,  -7.9570,  -6.7194,  -9.3618],
         [-12.4471, -12.5367, -12.5614,  ...,  -9.9086,  -9.4219, -11.1769],
         [-14.3657, -14.5227, -15.0017,  ..., -11.9715, -11.6569, -13.4498]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [7]:
output.logits.size()
# logits shape = (batch_size, sequence_length, vocab_size): one score per vocab entry at every input position

torch.Size([1, 10, 28996])

In [8]:
# Detach the logits from the autograd graph, convert them to a NumPy array,
# and select the first (only) batch element with [0].
logits = output.logits.detach().numpy()[0]
# logits holds raw, unnormalized scores (not probabilities) for each vocab
# token at each position; a softmax is needed to turn a row into probabilities.
logits.shape

(10, 28996)

In [9]:
# Position of the mask within the tokenize() listing, which does not include
# the special tokens that the encoded model input carries.
tokens.index(mask)

3

In [10]:
# Locate the mask directly in the encoded ids that were fed to the model.
# This avoids assuming that exactly one special token is prepended (the old
# hard-coded "+1") and that tokenize() lines up with the encoded sequence, so
# it keeps working if the checkpoint/tokenizer adds different special tokens.
# .index() raises ValueError if the prompt contains no mask token.
mask_position = encoded_inputs.input_ids[0].tolist().index(tokenizer.mask_token_id)
# Row of vocabulary scores predicted for the masked position.
mask_logits = logits[mask_position]
mask_logits

array([-6.7146316, -6.3791103, -6.1184897, ..., -5.651311 , -3.657279 ,
       -4.9947295], dtype=float32)

In [11]:
# One score per vocabulary entry for the masked position.
mask_logits.shape

(28996,)

In [12]:
# Softmax normalizes the mask-position logits into a probability distribution
# over the vocabulary (non-negative values that sum to 1).
confidence_score = softmax(mask_logits)
confidence_score

array([2.9159913e-10, 4.0785011e-10, 5.2928223e-10, ..., 8.4446233e-10,
       6.2026513e-09, 1.6282841e-09], dtype=float32)

In [13]:
# Highest probability assigned to any single vocabulary token. ndarray.max()
# is vectorized, whereas the Python builtin max() walks the array element by
# element in the interpreter.
confidence_score.max()

0.2572899

In [14]:
# Vocabulary id of the most probable token for the masked slot.
confidence_score.argmax()

1138

In [15]:
# Probability of the top-ranked token, looked up through its computed id rather
# than a constant copied from an earlier output, so the cell stays correct when
# the sentence or the model changes.
confidence_score[np.argmax(confidence_score)]

0.2572899

In [16]:
# Decode the top-ranked vocabulary id back to text. The id is computed from the
# scores (and cast to a plain int) instead of being hard-coded, so the cell
# stays correct when the sentence or the model changes.
tokenizer.decode(int(np.argmax(confidence_score)))

'have'

In [17]:
# Ids of the five most probable tokens, best first: take the tail of the
# ascending sort and reverse it.
np.argsort(confidence_score)[-5:][::-1]

array([1138, 1243, 3940, 1294, 1546], dtype=int64)

In [18]:
# Fill the masked slot of "I want to {mask} pizza for tonight." with each of
# the five most probable tokens and print the completed sentence together with
# the probability of that token.
top_token_ids = np.argsort(confidence_score)[::-1][:5]
for token_id in top_token_ids:
    pred_token = tokenizer.decode(token_id)
    score = confidence_score[token_id]
    print(sentence.replace(mask, pred_token), ': Confidence score:', score)

I want to have pizza for tonight. : Confidence score: 0.2572899
I want to get pizza for tonight. : Confidence score: 0.17849591
I want to eat pizza for tonight. : Confidence score: 0.1555557
I want to make pizza for tonight. : Confidence score: 0.11422386
I want to order pizza for tonight. : Confidence score: 0.09823085
