In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the tokenizer that ships with the 'bert-base-uncased' checkpoint.
# The tokenizer files are fetched from the Hugging Face Hub when not cached.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Report how many entries the tokenizer's vocabulary mapping contains.
print(f'Length of BERT base vocabulary: {len(tokenizer.vocab)}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Length of BERT base vocabulary: 30522


In [3]:
text = "A simple sentence."

# encode() turns the string into a plain Python list of integer vocabulary ids.
tokenized_text = tokenizer.encode(text)
print(tokenized_text)

[101, 1037, 3722, 6251, 1012, 102]


In [4]:
# Map the ids back to a string. The round trip exposes what encode() did:
# the text comes back lowercased and wrapped in [CLS] ... [SEP] markers.
tokenizer.decode(tokenized_text)

'[CLS] a simple sentence. [SEP]'

In [5]:
# NOTE: this rebinds `text`; cells below refer to this longer sentence.
text = "My friend told me about this class and I love it so far! She was right."

# `tokens` is the list of vocabulary ids for the sentence above.
tokens = tokenizer.encode(text)
print(tokens)

[101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102]


In [6]:
# Summarise the sentence, then list every id next to the piece of text it
# stands for. Expects `tokens` to still be the id list returned by encode().
print(f'Text: {text}. Num tokens: {len(tokens)}')
for t in tokens:
  # Decoding a one-element list recovers the text for that single id.
  print(f'Token: {t}, subword: {tokenizer.decode([t])}')

Text: My friend told me about this class and I love it so far! She was right.. Num tokens: 20
Token: 101, subword: [CLS]
Token: 2026, subword: my
Token: 2767, subword: friend
Token: 2409, subword: told
Token: 2033, subword: me
Token: 2055, subword: about
Token: 2023, subword: this
Token: 2465, subword: class
Token: 1998, subword: and
Token: 1045, subword: i
Token: 2293, subword: love
Token: 2009, subword: it
Token: 2061, subword: so
Token: 2521, subword: far
Token: 999, subword: !
Token: 2016, subword: she
Token: 2001, subword: was
Token: 2157, subword: right
Token: 1012, subword: .
Token: 102, subword: [SEP]


In [7]:
# Check whether the lowercase string 'sinan' is a key in the vocabulary.
'sinan' in tokenizer.vocab

False

In [8]:
# Encode a sentence containing a word that is absent from the vocabulary to
# see how the tokenizer represents it.
text_with_unknown_words = 'Sinan loves a beautiful day'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)

# Print each id with its decoded piece; pieces prefixed with '##' continue
# the preceding piece rather than starting a new word.
for t in tokens_with_unknown_words:
  print(f'Token: {t}, subword: {tokenizer.decode([t])}')

Token: 101, subword: [CLS]
Token: 8254, subword: sin
Token: 2319, subword: ##an
Token: 7459, subword: loves
Token: 1037, subword: a
Token: 3376, subword: beautiful
Token: 2154, subword: day
Token: 102, subword: [SEP]


In [9]:
# Build the full set of model inputs for `text`: input_ids, token_type_ids
# and attention_mask. Calling the tokenizer directly replaces the deprecated
# encode_plus() and produces the same result for a single string.
# NOTE: this rebinds `tokens` from the id list created earlier to a mapping,
# so earlier cells that iterate over `tokens` must not be re-run after this.
tokens = tokenizer(text)
print(tokens)

{'input_ids': [101, 2026, 2767, 2409, 2033, 2055, 2023, 2465, 1998, 1045, 2293, 2009, 2061, 2521, 999, 2016, 2001, 2157, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
# Load the pretrained BERT weights for the same checkpoint as the tokenizer,
# so that token ids index the matching embedding rows.
model = BertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# Display the embedding module and its sub-layers: word, position and
# token-type lookup tables followed by LayerNorm and dropout.
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [12]:
example_phrase = "I am Sinan"

# return_tensors='pt' yields a PyTorch tensor with a leading batch dimension
# instead of a plain list, ready to be passed to the model's layers.
tokenizer.encode(example_phrase, return_tensors='pt')

tensor([[ 101, 1045, 2572, 8254, 2319,  102]])

In [13]:
# Look up the word-embedding row for every token id of the example phrase.
# This is a plain table lookup: one output row per input id, with no
# position or token-type information added.
model.embeddings.word_embeddings(tokenizer.encode(example_phrase, return_tensors='pt'))

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0022, -0.0876,  0.0143,  ...,  0.0232, -0.0024, -0.0213],
         [-0.0614, -0.0044, -0.0755,  ..., -0.0522, -0.0310, -0.0248],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [16]:
# Compare with the lookup for "I am Sinan": rows for tokens the two phrases
# share ([CLS], "i", "am" and the final [SEP]) are identical, because the
# word-embedding table returns the same vector for the same id regardless of
# the surrounding words. Only the rows for the name differ.
model.embeddings.word_embeddings(tokenizer.encode("I am Matt", return_tensors='pt'))

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0381, -0.0026,  0.0130,  ...,  0.0038, -0.0279, -0.0082],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [18]:
# Display the position-embedding lookup table (one row per position index).
model.embeddings.position_embeddings

Embedding(512, 768)

In [20]:
# Fetch the position vectors for indices 0..5, i.e. one per token of the
# six-id example phrase encoded earlier.
model.embeddings.position_embeddings(torch.arange(6))

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        [-4.1949e-03, -1.1852e-02, -2.1180e-02,  ...,  2.2455e-02,
          5.2826e-03, -1.9723e-03],
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03]], grad_fn=<EmbeddingBackward0>)

In [21]:
# Fetch the token-type vector for six tokens that all carry type id 0, so
# every returned row is the same vector.
model.embeddings.token_type_embeddings(torch.zeros(6, dtype=torch.long))

tensor([[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],
       grad_fn=<EmbeddingBackward0>)