# In [None]:
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize one self-attention head of a pre-trained BERT model as a heatmap.

# Load the pre-trained BERT tokenizer and model from HuggingFace.
# output_attentions=True makes the forward pass return the attention weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

# Explicitly select evaluation mode so dropout cannot perturb the attention
# weights that are plotted below.
model.eval()

# Sentence whose token-to-token attention is visualized.
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence and return PyTorch tensors ready for the model.
inputs = tokenizer(sentence, return_tensors='pt')

# Run the forward pass without autograd bookkeeping: this is pure inference,
# so building a computation graph would only cost memory and time.
with torch.no_grad():
    outputs = model(**inputs)

# Tuple with one attention tensor per layer; per the Transformers docs each
# tensor is shaped (batch, num_heads, seq_len, seq_len).
attention = outputs.attentions

# Last layer, first (and only) batch element, first head. No .detach() is
# needed because the tensors were produced under torch.no_grad().
attention_matrix = attention[-1][0][0]

# Map the input ids back to token strings (including the special tokens the
# tokenizer added) so the heatmap axes can be labelled.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

# Heatmap of the attention weights: rows are query tokens, columns are key
# tokens.
plt.figure(figsize=(10, 8))
sns.heatmap(
    attention_matrix.numpy(),
    annot=True,
    cmap='Blues',
    xticklabels=tokens,
    yticklabels=tokens,
)
plt.title('Attention Matrix from BERT (Last Layer, Head 0)')
plt.xlabel('Key Words')
plt.ylabel('Query Words')
plt.show()
