In [None]:
!pip install transformers

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [None]:
# Load the tokenizer and the bare model from the same pretrained checkpoint.
checkpoint = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint)

In [4]:
# Example sentence that the following cells tokenize and encode.
text = (
    "Dark matter is completely invisible, emits no light or energy "
    "and thus cannot be detected by conventional sensors and detectors"
)

In [6]:
# Split the raw sentence into tokens, then show the token count followed by
# the tokens themselves (one item per output line).
tokens = bert_tokenizer.tokenize(text)
print(len(tokens), tokens, sep="\n")

22
['dark', 'matter', 'is', 'completely', 'invisible', ',', 'emi', '##ts', 'no', 'light', 'or', 'energy', 'and', 'thus', 'cannot', 'be', 'detected', 'by', 'conventional', 'sensors', 'and', 'detectors']


In [8]:
# Map every token string to its integer id in the tokenizer's vocabulary.
token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[2601, 3043, 2003, 3294, 8841, 1010, 12495, 3215, 2053, 2422, 2030, 2943, 1998, 2947, 3685, 2022, 11156, 2011, 7511, 13907, 1998, 25971]


In [9]:
# Encode the raw sentence with the BERT tokenizer in a single call.
# The result is requested as PyTorch tensors ('pt'); displaying it shows the
# `input_ids`, `token_type_ids` and `attention_mask` entries.
tokenizer_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
encoded_input = bert_tokenizer(text, **tokenizer_options)
encoded_input

{'input_ids': tensor([[  101,  2601,  3043,  2003,  3294,  8841,  1010, 12495,  3215,  2053,
          2422,  2030,  2943,  1998,  2947,  3685,  2022, 11156,  2011,  7511,
         13907,  1998, 25971,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

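The first and last token IDs in `input_ids` (101 and 102) were not part of the original token ID list: the tokenizer adds them at the start and at the end of the sequence. They correspond to the special `[CLS]` and `[SEP]` tokens, respectively.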

In [16]:
# Translate the two extra ids (101 and 102) back into their token strings.
special_token_ids = [101, 102]
print(bert_tokenizer.convert_ids_to_tokens(special_token_ids))

['[CLS]', '[SEP]']


In [17]:
# Compute token embeddings by running the encoded sentence through the model.

# Gradient tracking is switched off because this is inference only.
with torch.no_grad():
    # `**encoded_input` passes each entry of the tokenizer output as a keyword argument.
    model_output = bert_model(**encoded_input)

In [18]:
# Show which fields the model output exposes.
output_fields = model_output.keys()
print(output_fields)

odict_keys(['last_hidden_state', 'pooler_output'])


This is the raw output of the BERT model, without any task-specific head on top.

`last_hidden_state` is the hidden-state representation from the last layer for each token in the sequence, including the special tokens.

`pooler_output` is the last-layer hidden-state representation of the `[CLS]` token after it has been passed through a linear layer with a tanh activation function. The weights of this linear layer are learnt from the next sentence prediction task during pre-training.

More details on `BertModel` can be found in the documentation [here](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel).

In [20]:
# Shape of the per-token representations (one vector for each token in the sequence).
model_output.last_hidden_state.shape

torch.Size([1, 24, 768])

In [21]:
# Shape of the pooled representation (a single vector for the whole input).
model_output.pooler_output.shape

torch.Size([1, 768])

#### References
* https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
* [BERT Paper, 2018](https://arxiv.org/pdf/1810.04805.pdf)