In [1]:
# BertModel
#  └── encoder
#      └── layer[i]
#          └── attention
#              └── self
#                  ├── query (Linear)
#                  ├── key   (Linear)
#                  └── value (Linear)

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel


model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModel.from_pretrained(
    model_name,
    output_attentions=True   # attention weights ထုတ်ခိုင်းတာ
)

model.eval()


  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

#### EmbeddingOutput = Dropout( LayerNorm ( WordEmb + PositionEmb + TokenTypeEmb ) )

In [3]:
model.config.num_hidden_layers      
model.config.num_attention_heads    

12

### (Tokenizing)

In [4]:
sentence = ["Transformers are powerful", "Models for natural language processing."]
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True) # with more sentences needs padding and truncation
input_ids = inputs["input_ids"]

### (word_embeddings)

In [5]:
word_embeddings = model.embeddings.word_embeddings(input_ids)

print(input_ids)
print("Shape:", input_ids.shape)
print(word_embeddings)
print("Word_embeddings Shape:", word_embeddings.shape)

tensor([[  101, 19081,  2024,  3928,   102,     0,     0,     0],
        [  101,  4275,  2005,  3019,  2653,  6364,  1012,   102]])
Shape: torch.Size([2, 8])
tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [ 0.0189, -0.0289, -0.0768,  ...,  0.0116, -0.0212,  0.0171],
         [-0.0134, -0.0135,  0.0250,  ...,  0.0013, -0.0183,  0.0227],
         ...,
         [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
         [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
         [-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098]],

        [[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0067,  0.0282, -0.0428,  ..., -0.0632, -0.0404,  0.0090],
         [-0.0106,  0.0054, -0.0464,  ...,  0.0083,  0.0194,  0.0253],
         ...,
         [ 0.0037, -0.0340, -0.0513,  ..., -0.0440, -0.0129, -0.0368],
         [-0.0207, -0.0020, -0.0118,  ...,  0.0128,  0.0200,  0.0259],
         [-0.0145, -0.0100,  0

### (position_embeddings)

In [6]:
seq_len = input_ids.size(1)
# shape: (1, seq_len) → batch dimension align
position_ids = torch.arange(seq_len).unsqueeze(0)

# repeat for batch size
batch_size = input_ids.size(0)
position_ids = position_ids.expand(batch_size, seq_len)  # (batch, seq_len)

position_embeddings = model.embeddings.position_embeddings(position_ids)
print("Positional embeddings shape:", position_embeddings.shape)
print(position_embeddings)

Positional embeddings shape: torch.Size([2, 8, 768])
tensor([[[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
           6.8312e-04,  1.5441e-02],
         [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
           2.9753e-02, -5.3247e-03],
         [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
           1.8741e-02, -7.3140e-03],
         ...,
         [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
           2.0183e-02,  3.4077e-03],
         [ 6.4257e-03, -1.7664e-02, -2.2067e-02,  ...,  6.7531e-04,
           1.1108e-02,  3.7521e-03],
         [ 6.2613e-04, -1.6089e-02, -7.6365e-03,  ...,  5.3390e-03,
           1.5909e-02,  1.8119e-03]],

        [[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
           6.8312e-04,  1.5441e-02],
         [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
           2.9753e-02, -5.3247e-03],
         [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
           1.8741e-02, 

### (token_type_embeddings)

In [7]:
token_type_ids = inputs.get("token_type_ids", torch.zeros_like(input_ids))
token_type_embeds = model.embeddings.token_type_embeddings(token_type_ids)

print(input_ids)
print(token_type_ids)
print("token type Shape:", token_type_ids.shape)

print("Token type embeddings shape:", token_type_embeds.shape)
print(token_type_embeds)


tensor([[  101, 19081,  2024,  3928,   102,     0,     0,     0],
        [  101,  4275,  2005,  3019,  2653,  6364,  1012,   102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]])
token type Shape: torch.Size([2, 8])
Token type embeddings shape: torch.Size([2, 8, 768])
tensor([[[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         ...,
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],

        [[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         ...,
         [ 0.0004,  0.0110,  0.0037,  .

### (All Embedding Output)

### Manually sum all embeddings

In [8]:
# Manually sum all embeddings
embedding_sum = word_embeddings + position_embeddings + token_type_embeds

print("Embedding output shape:", embedding_sum.shape)
print(embedding_sum)

Embedding output shape: torch.Size([2, 8, 768])
tensor([[[ 0.0316, -0.0411, -0.0564,  ...,  0.0021,  0.0044,  0.0219],
         [ 0.0271, -0.0156, -0.0926,  ...,  0.0339,  0.0052,  0.0032],
         [-0.0243, -0.0045,  0.0171,  ...,  0.0096, -0.0029,  0.0068],
         ...,
         [-0.0128, -0.0695, -0.0417,  ..., -0.0191, -0.0204, -0.0150],
         [-0.0033, -0.0682, -0.0449,  ..., -0.0258, -0.0295, -0.0146],
         [-0.0091, -0.0666, -0.0304,  ..., -0.0211, -0.0247, -0.0166]],

        [[ 0.0316, -0.0411, -0.0564,  ...,  0.0021,  0.0044,  0.0219],
         [ 0.0015,  0.0414, -0.0586,  ..., -0.0409, -0.0140, -0.0050],
         [-0.0215,  0.0144, -0.0542,  ...,  0.0166,  0.0347,  0.0093],
         ...,
         [ 0.0010, -0.0419, -0.0666,  ..., -0.0432,  0.0039, -0.0420],
         [-0.0139, -0.0087, -0.0302,  ...,  0.0068,  0.0278,  0.0210],
         [-0.0135, -0.0151,  0.0021,  ..., -0.0263,  0.0172, -0.0084]]],
       grad_fn=<AddBackward0>)


In [9]:
embedding_ln = model.embeddings.LayerNorm(embedding_sum)
print("LayerNorm output shape:", embedding_ln.shape)
print(embedding_ln)

mean_per_token = embedding_ln.mean(dim=-1)  # over hidden_size
std_per_token = embedding_ln.std(dim=-1)

print("Mean per token (after LayerNorm):")
print(mean_per_token)

print("\nStd per token (after LayerNorm):")
print(std_per_token)

LayerNorm output shape: torch.Size([2, 8, 768])
tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 8.8409e-01,  2.0381e-01, -8.8640e-01,  ...,  8.3689e-01,
           5.5080e-01,  5.6793e-01],
         [-2.3302e-01,  1.1637e-01,  5.0871e-01,  ...,  3.0187e-01,
           1.8043e-01,  3.7445e-01],
         ...,
         [ 1.6776e-01, -8.9038e-01, -3.1798e-01,  ...,  3.1737e-02,
           6.4863e-02,  1.8418e-01],
         [ 3.5675e-01, -8.7076e-01, -3.7621e-01,  ..., -7.9391e-02,
          -1.0115e-01,  1.9312e-01],
         [ 2.4668e-01, -8.4544e-01, -1.1325e-01,  ...,  2.6934e-04,
          -1.1196e-02,  1.5729e-01]],

        [[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 3.4962e-01,  9.9650e-01, -5.8570e-01,  ..., -3.3159e-01,
           1.2399e-01,  2.9833e-01],
         [-1.9791e-01,  4.1476e-01, -6.6507e-01,  ...,  3.9064e-01,
           7.8234e-01,  4.00

### Auto Sumup with Embedding

In [10]:
embedding_output = model.embeddings(
    input_ids=input_ids,
)

print("Embedding output shape:", embedding_output.shape)
print(embedding_output)

Embedding output shape: torch.Size([2, 8, 768])
tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 8.8409e-01,  2.0381e-01, -8.8640e-01,  ...,  8.3689e-01,
           5.5080e-01,  5.6793e-01],
         [-2.3302e-01,  1.1637e-01,  5.0871e-01,  ...,  3.0187e-01,
           1.8043e-01,  3.7445e-01],
         ...,
         [ 1.6776e-01, -8.9038e-01, -3.1798e-01,  ...,  3.1737e-02,
           6.4863e-02,  1.8418e-01],
         [ 3.5675e-01, -8.7076e-01, -3.7621e-01,  ..., -7.9391e-02,
          -1.0115e-01,  1.9312e-01],
         [ 2.4668e-01, -8.4544e-01, -1.1325e-01,  ...,  2.6937e-04,
          -1.1196e-02,  1.5729e-01]],

        [[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 3.4962e-01,  9.9650e-01, -5.8570e-01,  ..., -3.3159e-01,
           1.2399e-01,  2.9833e-01],
         [-1.9791e-01,  4.1476e-01, -6.6507e-01,  ...,  3.9064e-01,
           7.8234e-01,  4.00