In [1]:
import torch
from transformers.models.bert import BertModel, BertTokenizer
# One checkpoint name drives both the tokenizer and the encoder so they stay in sync.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True is reflected in model.config and is what lets
# later cells read the per-layer hidden states from the forward output.
model = BertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Display the loaded configuration (hidden size, number of layers/heads, and the
# output_hidden_states flag set when the model was loaded).
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

- BertLayer
    - attention: BertAttention
        - self: BertSelfAttention
        - output: BertSelfOutput
    - intermediate: BertIntermediate
    - output: BertOutput

In [3]:
# Display the module tree to see how the embeddings and each BertLayer are composed.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
# Single example sentence; return_tensors='pt' yields PyTorch tensors that can be
# unpacked directly into the model call.
test_sentence = 'this is a test sentence'
model_input = tokenizer(
    test_sentence,
    return_tensors='pt',
)

In [5]:
# Put the model in evaluation mode, then run one forward pass with autograd
# tracking disabled; `output` is reused by the cells that follow.
model.eval()
with torch.no_grad():
    output = model(**model_input)

In [8]:
# embeddings
# Read hidden_states by name rather than by position: integer indexing into a
# ModelOutput skips fields that are None, so the position of hidden_states can
# shift when other optional outputs are enabled.
embeddings = output.hidden_states[0]

In [7]:
# first bert layer output
# Index 0 of hidden_states holds the embeddings, so index 1 is the first encoder
# layer. Look it up once by attribute name instead of twice by position.
first_layer_output = output.hidden_states[1]
print(first_layer_output)
print(first_layer_output.shape)

tensor([[[ 0.1556, -0.0080, -0.0707,  ...,  0.0786,  0.0213,  0.0616],
         [-0.5333,  0.5799,  0.1044,  ...,  0.0241,  0.4888,  0.0161],
         [-1.0609, -0.3058, -0.5043,  ...,  0.1874,  0.2874,  0.4032],
         ...,
         [ 0.8206, -0.6656, -0.7054,  ...,  0.1347,  0.1117, -1.9040],
         [ 1.1128,  0.6603, -0.1509,  ...,  0.3253, -1.0006, -1.9106],
         [-0.0736,  0.0346,  0.0376,  ..., -0.4506,  0.6585, -0.0502]]])
torch.Size([1, 7, 768])


In [10]:
# Take the first encoder layer and run only its self-attention sub-module on the
# embeddings. The call returns a tuple; element 0 is the attention output tensor.
layer = model.encoder.layer[0]
mha_output = layer.attention.self(hidden_states=embeddings)
mha_output

(tensor([[[ 0.2979,  0.0801, -0.0037,  ..., -0.0142,  0.1290,  0.0828],
          [ 0.3935,  0.1356, -0.0920,  ...,  0.0211,  0.1677,  0.0011],
          [ 0.1696,  0.1449, -0.1039,  ...,  0.1604,  0.2172,  0.0310],
          ...,
          [-0.0617,  0.1968, -0.0669,  ...,  0.1126,  0.1933, -0.0204],
          [-0.2835,  0.1495, -0.0021,  ...,  0.0973,  0.1865, -0.0636],
          [ 0.2575,  0.1120, -0.1008,  ...,  0.0175,  0.1508,  0.0878]]],
        grad_fn=<ViewBackward0>),)

In [12]:
# BertSelfOutput (dense, LayerNorm, dropout in the model repr) takes the attention
# result plus the layer's original input.
# NOTE(review): input_tensor is presumably the residual branch — confirm against
# BertSelfOutput.forward.
attn_output = layer.attention.output(
    hidden_states=mha_output[0],
    input_tensor=embeddings,
)
attn_output

tensor([[[ 0.6143, -0.2061, -0.5682,  ...,  0.2557,  0.1224,  0.3388],
         [-0.6751,  1.0598, -0.2210,  ...,  0.6263,  1.1682,  0.1998],
         [-0.5969,  0.1463, -0.5549,  ...,  0.5272,  0.5071,  0.6439],
         ...,
         [ 1.3291, -1.1190, -0.1960,  ...,  0.6164,  0.4293, -2.4704],
         [ 1.8784,  0.5158,  0.1902,  ...,  0.5298, -1.3812, -2.0257],
         [ 0.7376, -0.7000, -0.0992,  ..., -0.9356,  0.5831, -0.4610]]],
       grad_fn=<NativeLayerNormBackward0>)

In [13]:
# First half of the feed-forward block: widens the last dimension to
# intermediate_size (3072 in this config).
mlp1 = layer.intermediate(hidden_states=attn_output)
mlp1.shape

torch.Size([1, 7, 3072])

In [14]:
# Second half of the feed-forward block: takes the widened activations together
# with the attention-block output and returns to the hidden size.
mlp2 = layer.output(
    hidden_states=mlp1,
    input_tensor=attn_output,
)
mlp2.shape

torch.Size([1, 7, 768])

In [15]:
# Sanity check: walking the first BertLayer by hand should reproduce the
# first-layer hidden state from the full forward pass. Assert it numerically
# instead of relying on a visual comparison of truncated tensor prints.
assert torch.allclose(mlp2, output[2][1], atol=1e-5), \
    "manual BertLayer walk does not match output[2][1]"
mlp2

tensor([[[ 0.1556, -0.0080, -0.0707,  ...,  0.0786,  0.0213,  0.0616],
         [-0.5333,  0.5799,  0.1044,  ...,  0.0241,  0.4888,  0.0161],
         [-1.0609, -0.3058, -0.5043,  ...,  0.1874,  0.2874,  0.4032],
         ...,
         [ 0.8206, -0.6656, -0.7054,  ...,  0.1347,  0.1117, -1.9040],
         [ 1.1128,  0.6603, -0.1509,  ...,  0.3253, -1.0006, -1.9106],
         [-0.0736,  0.0346,  0.0376,  ..., -0.4506,  0.6585, -0.0502]]],
       grad_fn=<NativeLayerNormBackward0>)