# Install Necessary Packages

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
Col

# Importing Libraries

In [2]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

# Instantiate model and tokenizer

In [3]:
model_name = 'bert-base-cased'
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

# Peeking into a model

- The model has input shape of 28996, 768 in first layer (Word Embeddings)
  - In the sense the vocab size will be 28996 and the embeddings from the model will be 768 (fixed length)
- The model has input shape of 512, 768 in second layer (Position Embeddings)
- The model has input shape of 2, 768 in third layer (token_type embeddings)

In [4]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

# Peeking the Tokenizer

- [PAD] has a value of 0
- [UNK] has a value of 100
- [CLS] has a value of 101
- [SEP] has a value of 102
- [MASK] has a value of 103

In [5]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Input sentence and its tokenized output
- Sub word tokenization

In [6]:
sentence = "When life gives you lemons, don't make lemonade"

In [7]:
tokens = tokenizer.tokenize(sentence)
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade']

## Checking the vocabulary of BERT
- pre-built vocabulary

In [8]:
vocab = tokenizer.vocab
vocab

{'contracting': 26706,
 'extinct': 8256,
 'Crete': 21070,
 'Prior': 4602,
 'dictionary': 17085,
 'Oman': 17739,
 '##pert': 17786,
 'world': 1362,
 'concludes': 15382,
 '##hino': 21918,
 'Kern': 25682,
 'behavior': 4658,
 'crouched': 15062,
 'charts': 5896,
 'ừ': 762,
 'Layla': 21628,
 '##Ţ': 28247,
 '##els': 5999,
 'confined': 12597,
 'modeling': 13117,
 'themes': 6621,
 'breathless': 19305,
 'Archie': 13581,
 '##iq': 28101,
 '##lica': 9538,
 'Lakes': 10180,
 '##tick': 27252,
 'loose': 5768,
 '##fire': 7117,
 'Carl': 4804,
 '##action': 15022,
 'exception': 5856,
 'dismay': 22035,
 'parked': 8806,
 'scrap': 16720,
 '##oker': 26218,
 'Nordic': 14271,
 'Dunedin': 24377,
 'lectures': 9548,
 'majority': 2656,
 'definite': 16428,
 'respective': 7514,
 'coma': 19737,
 'forge': 26621,
 'Archaeological': 17249,
 'facilitate': 11000,
 'Dallas': 5043,
 '##ave': 8308,
 'Toledo': 13459,
 'Petty': 18468,
 'governmental': 11219,
 'ầ': 741,
 'indicated': 4668,
 'Allison': 10692,
 'realised': 11326,
 '

## Vocabulary to Dataframe

In [10]:
vocab_df = pd.DataFrame({
    'tokens': vocab.keys(),
    'token_id': vocab.values(),
})

vocab_df = vocab_df.sort_values(by='token_id').set_index('token_id')
vocab_df.head()

Unnamed: 0_level_0,tokens
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]


In [12]:
vocab_df.shape

(28996, 1)

## Tokenizing and peeking into every details

In [13]:
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 102]

In [14]:
len(token_ids), len(tokens), len(sentence.split())

(15, 13, 8)

In [15]:
tokenizer.decode(102)

'[SEP]'

In [16]:
tokenizer.decode(token_ids)

"[CLS] When life gives you lemons, don't make lemonade [SEP]"

In [17]:
tokenizer.decode(token_ids[1: -1])

"When life gives you lemons, don't make lemonade"

In [18]:
tokenizer_output = tokenizer(sentence)
tokenizer_output

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
tokenizer_output['input_ids']

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 102]

In [20]:
tokenizer_output['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [21]:
tokenizer_output['token_type_ids']

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

### If we pad the tokens to match to a certain length the padded values will have attention mask as 0 stating that those words needs no attention

In [23]:
sentence2 = sentence.replace("don't ", "")
sentence2

'When life gives you lemons, make lemonade'

In [24]:
tokenizer_output2 = tokenizer(
    [sentence, sentence2],
    padding=True
)

In [25]:
tokenizer_output2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [26]:
for i in tokenizer_output2:
  for j in tokenizer_output2[i]:
    print(j)

[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 102]
[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 102, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]


In [27]:
for i in tokenizer_output2['input_ids']:
  for j in i:
    print(j, tokenizer.decode(j))
  print()

101 [CLS]
1332 When
1297 life
3114 gives
1128 you
22782 lemon
1116 ##s
117 ,
1274 don
112 '
189 t
1294 make
22782 lemon
6397 ##ade
102 [SEP]

101 [CLS]
1332 When
1297 life
3114 gives
1128 you
22782 lemon
1116 ##s
117 ,
1294 make
22782 lemon
6397 ##ade
102 [SEP]
0 [PAD]
0 [PAD]
0 [PAD]

