In [27]:
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn

In [28]:
# One checkpoint name drives both the tokenizer and the encoder so that the
# vocabulary ids produced by the former match what the latter expects.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass additionally return the
# hidden states of the embedding layer and of every encoder layer.
model = BertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# Example sentence that uses the word "bank" in more than one sense.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)
# return_tensors='pt' gives PyTorch tensors with a leading batch dimension.
token_input = tokenizer(text, return_tensors='pt')
token_input

{'input_ids': tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
# (batch_size, seq_len): a single sentence encoded as 22 token ids.
token_input['input_ids'].shape

torch.Size([1, 22])

# model forward
- forward
    - embedding -> encoder -> pooler

In [31]:
# Put the module in evaluation mode before running inference.
model.eval()
# Inference only: skip autograd tracking so the returned tensors carry no grad_fn.
with torch.no_grad():
    # token_input unpacks to input_ids, token_type_ids and attention_mask.
    outputs = model(**token_input)

In [32]:
# Three entries: last_hidden_state, pooler_output and, because the model was
# loaded with output_hidden_states=True, the tuple of hidden_states.
len(outputs)

3

# output
- len(outputs) == 3
- outputs[0]
    - last_hidden_state, shape: batch_size * seq_len * hidden_size (1 * 22 * 768)
- outputs[1]
    - pooler_output, shape: batch_size * hidden_size (1 * 768)
    - Last-layer hidden state of the first token of the sequence (the classification token, `[CLS]`), further processed by the pooler (a linear layer followed by a tanh activation)
- outputs[2] (requires `model.config.output_hidden_states=True`)
    - hidden_states
    - type: tuple
    - one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer
    - shape: (1 + 12) \* (batch_size \* seq_len \* hidden_size) = 13 \* (1 \* 22 \* 768)
- outputs[0] = outputs[2][-1]
- outputs[1] = model.pooler(outputs[2][-1])
- outputs[2][0] = model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

In [33]:
# First element of hidden_states: the output of the embedding layer,
# i.e. the input to the first encoder layer.
outputs[2][0]

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.2329,  0.1390,  0.2979,  ..., -0.0655,  0.8885,  0.5109],
         [ 0.2257, -0.7165, -0.7255,  ...,  0.4844,  0.6030, -0.0957],
         ...,
         [-0.0374, -0.6155, -1.4419,  ...,  0.0793, -0.0811, -0.3802],
         [-0.0228,  0.4207, -0.3288,  ...,  0.4464,  0.5178,  0.5501],
         [-0.2350,  0.1566, -0.0462,  ..., -0.4206,  0.3074, -0.2288]]])

In [35]:
# Recompute the embedding-layer output directly to check that it matches outputs[2][0].
# NOTE: this call runs outside torch.no_grad(), so the result carries a grad_fn;
# the values themselves are the same as outputs[2][0].
model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.2329,  0.1390,  0.2979,  ..., -0.0655,  0.8885,  0.5109],
         [ 0.2257, -0.7165, -0.7255,  ...,  0.4844,  0.6030, -0.0957],
         ...,
         [-0.0374, -0.6155, -1.4419,  ...,  0.0793, -0.0811, -0.3802],
         [-0.0228,  0.4207, -0.3288,  ...,  0.4464,  0.5178,  0.5501],
         [-0.2350,  0.1566, -0.0462,  ..., -0.4206,  0.3074, -0.2288]]],
       grad_fn=<NativeLayerNormBackward0>)