In [1]:
from transformers import BertTokenizer, BertModel

In [2]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
model = BertModel.from_pretrained('bert-base-uncased')

In [6]:
# Get all the model's parameters as a list of tuples

named_params = list(model.named_parameters())

print('The BERT Model has {:} different named parameters.\n'.format(len(named_params)))

print("==== Embedding Layer ====\n")
for p in named_params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print("\n==== First Encoder ====\n")
for p in named_params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print("\n==== Output Layer ====\n")
for p in named_params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT Model has 199 different named parameters.

==== Embedding Layer ====

embeddings.word_embeddings.weight                       (30522, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)

==== First Encoder ====

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   (768,

In [7]:
# The pooler is a separate linear and tanh activated layer that acts on the [CLS] token's representation
# This pooled_output is often used as a representation for the entire sentence

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
tokenizer.encode('Saket loves a beautiful day')

[101, 8739, 2102, 7459, 1037, 3376, 2154, 102]

In [11]:
# run tokens through the model

#1 Turn tokens_with_unknown_words into a tensor (will be shape(8,))
#2 Unsqueeze a first dimension to simulate batches. Resulting shape is (1,8)

response = model(torch.tensor(tokenizer.encode('Saket loves a beautiful day')).unsqueeze(0))

In [13]:
response

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1778,  0.0946, -0.0306,  ..., -0.3201,  0.3875,  0.3578],
         [ 0.6455,  0.4590,  0.4842,  ..., -0.5112,  0.7038, -0.3384],
         [-0.0900, -0.4080,  0.7350,  ..., -1.0433, -0.1517, -0.2731],
         ...,
         [ 0.1477,  0.0171,  0.6067,  ..., -0.4519,  0.1851, -0.3535],
         [-0.2638, -0.2562,  0.0473,  ...,  0.2247,  0.3425, -0.3819],
         [ 0.5196, -0.0754, -0.2657,  ..., -0.0014, -0.4388, -0.2921]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.4523e-01, -4.1711e-01, -5.2523e-01,  7.2943e-01,  2.9464e-01,
         -5.5997e-02,  8.9018e-01,  2.6216e-01, -2.0878e-01, -9.9997e-01,
          2.2736e-02,  6.5921e-01,  9.9061e-01,  1.2573e-01,  9.5687e-01,
         -4.3377e-01,  1.8056e-01, -5.5232e-01,  3.9684e-01, -4.9880e-01,
          7.7684e-01,  9.9912e-01,  4.0945e-01,  2.0446e-01,  4.3359e-01,
          9.1557e-01, -6.1338e-01,  9.4577e-01,  9.6313e-01,  7.222

In [12]:
# Embedding for each token, the first one being the [CLS] token

response.last_hidden_state

tensor([[[-0.1778,  0.0946, -0.0306,  ..., -0.3201,  0.3875,  0.3578],
         [ 0.6455,  0.4590,  0.4842,  ..., -0.5112,  0.7038, -0.3384],
         [-0.0900, -0.4080,  0.7350,  ..., -1.0433, -0.1517, -0.2731],
         ...,
         [ 0.1477,  0.0171,  0.6067,  ..., -0.4519,  0.1851, -0.3535],
         [-0.2638, -0.2562,  0.0473,  ...,  0.2247,  0.3425, -0.3819],
         [ 0.5196, -0.0754, -0.2657,  ..., -0.0014, -0.4388, -0.2921]]],
       grad_fn=<NativeLayerNormBackward0>)

In [14]:
# This layer is trained on top of the embedding of [CLS] token

response.pooler_output.shape

torch.Size([1, 768])

In [15]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [18]:
CLS_embedding = response.last_hidden_state[:,0,:].unsqueeze(0)
CLS_embedding.shape

torch.Size([1, 1, 768])

In [19]:
model.pooler(CLS_embedding).shape

torch.Size([1, 768])

In [20]:
# Running the embedding for CLS through the pooler gives the same output as the 'pooler_output'

(model.pooler(CLS_embedding) == response.pooler_output).all()


tensor(True)

In [21]:
total_params = 0

for p in model.parameters():
    if len(p.shape) == 2:
        total_params += p.shape[0] * p.shape[1]
        
print(f'Total Parameters: {total_params:,}')

Total Parameters: 109,360,128
