### MLM Demo using BERT

In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# set BERT model to evaluation mode
model.eval()
# to move model to GPU for faster execution
# model.to('cuda')  # if you have gpu

# prediction function that takes a sentence with a masked token as input and predicts the top-k most likely words
def predict_masked_sent(text, top_k=5):
    # Tokenize input
    # add [CLS] classifier token to start of sequence & [SEP] separator token to input sequence.
    text = "[CLS] %s [SEP]"%text
    # store the tokenized text
    tokenized_text = tokenizer.tokenize(text)
    # finds the index of the MASKED token in the tokenized text
    masked_index = tokenized_text.index("[MASK]")
    # converts the tokenized text to token_ids which can be processed by model
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # create a pytorch tensor from the indexed_tokens
    tokens_tensor = torch.tensor([indexed_tokens])
    # tokens_tensor = tokens_tensor.to('cuda')    # if you have gpu

    # Predict all tokens
    # disable gradient calculation to reduce memory consumption
    with torch.no_grad():
        # pass tokensID through model to get outputs
        outputs = model(tokens_tensor)
        # retrueves the predicted logits for masked token
        predictions = outputs[0]


    # compute softmax probabilities for predicted logits of the masked token
    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    # find the top-k most likely tokens and their corresponding probabilities
    top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)


    # iterate over the top-K predictions, converts token ID's back to tokens and prints each token along with its probability weight
    for i, pred_idx in enumerate(top_k_indices):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx])[0]
        token_weight = top_k_weights[i]
        print("[MASK]: '%s'"%predicted_token, " | weights:", float(token_weight))


predict_masked_sent("My [MASK] is so cute.", top_k=5)


'''
The above code will output:
[MASK]: 'mom'  | weights: 0.10288725048303604
[MASK]: 'brother'  | weights: 0.08429113030433655
[MASK]: 'dad'  | weights: 0.08260555565357208
[MASK]: 'girl'  | weights: 0.06902255117893219
[MASK]: 'sister'  | weights: 0.04804788902401924
'''

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[MASK]: 'mom'  | weights: 0.10979422181844711
[MASK]: 'dad'  | weights: 0.086820088326931
[MASK]: 'brother'  | weights: 0.08564966917037964
[MASK]: 'girl'  | weights: 0.06944077461957932
[MASK]: 'sister'  | weights: 0.05375329777598381


"\nThe above code will output:\n[MASK]: 'mom'  | weights: 0.10288725048303604\n[MASK]: 'brother'  | weights: 0.08429113030433655\n[MASK]: 'dad'  | weights: 0.08260555565357208\n[MASK]: 'girl'  | weights: 0.06902255117893219\n[MASK]: 'sister'  | weights: 0.04804788902401924\n"