In [3]:
# Install required libraries
!pip install transformers torch bertviz

import torch
from transformers import BertTokenizer, BertForMaskedLM
import torch.nn.functional as F
import pandas as pd

# Load pre-trained model tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval() # Set the model to evaluation mode

Collecting transformers
  Using cached transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
Collecting torch
  Using cached torch-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
[0mCollecting bertviz
  Using cached bertviz-1.4.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
[0mCollecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting n

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [6]:
# --- Scenario 1: Factual Recall ---
text = "The capital of France is [MASK]."

# Tokenize the text
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
masked_index = tokenized_text.index('[MASK]')

# Convert to a tensor
tokens_tensor = torch.tensor([indexed_tokens])

# Predict the masked token
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Get the top 5 predictions for the masked token
logits = predictions[0, masked_index]
probabilities = F.softmax(logits, dim=-1)
top_5_tokens = torch.topk(probabilities, 5).indices
top_5_probs = torch.topk(probabilities, 5).values

predicted_tokens = tokenizer.convert_ids_to_tokens(top_5_tokens)

# Display the results
print(f"Original Text: {text}")
results_df = pd.DataFrame({
    'Token': predicted_tokens,
    'Probability': [f"{p.item()*100:.2f}%" for p in top_5_probs]
})
print(results_df)

  return forward_call(*args, **kwargs)


Original Text: The capital of France is [MASK].
  Token Probability
0     .      28.44%
1     "       4.72%
2   the       2.25%
3     ,       1.60%
4     )       1.50%


In [9]:
# --- Scenario 2: Context Sensitivity ---
text_bank = "After a long day at work, I went to the [MASK] to deposit a check."
text_river = "The boat sailed peacefully along the river [MASK]."

def predict_masked_word(text):
    """A helper function to predict and display masked words."""
    tokenized_text = tokenizer.tokenize(text)
    if '[MASK]' not in tokenized_text:
        print("Error: '[MASK]' token not found in the text.")
        return
        
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = tokenized_text.index('[MASK]')
    tokens_tensor = torch.tensor([indexed_tokens])

    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    logits = predictions[0, masked_index]
    probabilities = F.softmax(logits, dim=-1)
    top_5_tokens = torch.topk(probabilities, 5).indices
    predicted_tokens = tokenizer.convert_ids_to_tokens(top_5_tokens)
    
    print(f"Prediction for: '{text}'")
    print(predicted_tokens)
    print("-" * 30)

# Run predictions for both sentences
predict_masked_word(text_bank)
predict_masked_word(text_river)

  return forward_call(*args, **kwargs)


Prediction for: 'After a long day at work, I went to the [MASK] to deposit a check.'
['office', 'bank', 'door', 'back', 'kitchen']
------------------------------
Prediction for: 'The boat sailed peacefully along the river [MASK].'
['.', '"', ')', ',', '(']
------------------------------


In [11]:

from transformers import BertTokenizer, BertModel
from bertviz import head_view

# Use the base model (not the MLM one) to get attention scores
model_for_viz = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
tokenizer_for_viz = BertTokenizer.from_pretrained('bert-base-uncased')

# A sentence to analyze
sentence = "The dog chased the cat until it was tired"
inputs = tokenizer_for_viz.encode(sentence, return_tensors='pt')
outputs = model_for_viz(inputs)
attention = outputs[-1]  # Output includes attention weights

# Visualize
tokens = tokenizer_for_viz.convert_ids_to_tokens(inputs[0])
head_view(attention, tokens)

  return forward_call(*args, **kwargs)


<IPython.core.display.Javascript object>