# Transformer Model Basics

## Tokenization

In [1]:
from transformers import BertModel, AutoTokenizer, AutoModelForMaskedLM
import pandas as pd

In [2]:
model_name = "bert-base-cased"

model = BertModel.from_pretrained(model_name)

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [6]:
sentence = "When life gives you lemons, don't make lemonade."

* Tokenizer will tokenize the sentence here its sub-word level tokenization
* Subword tokenization involves breaking words into smaller, meaningful subword units
* Popular words dont get split, words rarely gets used usually gets breakdown 

In [7]:
tokens = tokenizer.tokenize(sentence)
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade',
 '.']

* Tokenizers break words based on the vocabulary its trained on
* In this case its around 28996 which is the (word_embeddings): Embedding(28996, 768, padding_idx=0)

In [8]:
vocab = tokenizer.vocab
vocab_df = pd.DataFrame({"tokens": vocab.keys(), "token_id": vocab.values()})
vocab_df

Unnamed: 0,tokens,token_id
0,##rica,15353
1,Bang,12926
2,Sales,15689
3,pathetic,18970
4,##bro,12725
...,...,...
28991,January,1356
28992,fraud,10258
28993,م,589
28994,ט,543


In [9]:
vocab_df = vocab_df.sort_values(by='token_id').set_index("token_id")
vocab_df.head()

Unnamed: 0_level_0,tokens
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]


* Now we encode this sentence to get the token_ids

In [10]:
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 119,
 102]

In [11]:
len(tokens), len(token_ids)

(14, 16)

* These first and last token_ids in a sentence is special tokens in BERT
* It denotes the start and the end of the sentences

In [12]:
vocab_df.iloc[101], vocab_df.iloc[102]

(tokens    [CLS]
 Name: 101, dtype: object,
 tokens    [SEP]
 Name: 102, dtype: object)

In [13]:
list(zip(tokens, token_ids[1:-1]))

[('When', 1332),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397),
 ('.', 119)]

In [14]:
tokenizer.decode(token_ids=token_ids)

"[CLS] When life gives you lemons, don ' t make lemonade. [SEP]"

* input_ids - token_ids which is the input for the model
* token_type_ids - used in pre training of the model i.e to find which is question and which is context in a Q/A input

In [15]:
token_out = tokenizer(sentence)
token_out

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
sentence2 = sentence.replace("don't ", "")
sentence2

'When life gives you lemons, make lemonade.'

* padding - used to add extra tokens if 2 or more sentence is given to match the shape of the input matrix but those extra tokens should not interfer in our final prediction
* attention_mask - it will tell the model which token is to be taken as input and which should not
* (position_embeddings): Embedding(512, 768) - 512 is the no of tokens(300 words) we can give as a input to the model at once, 768 is the size of the vector which input tokens will be converted 

In [17]:
token_out2 = tokenizer([sentence, sentence2], padding=True)
token_out2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [18]:
print(tokenizer.decode(token_out2['input_ids'][0]))
print(tokenizer.decode(token_out2['input_ids'][1]))

[CLS] When life gives you lemons, don ' t make lemonade. [SEP]
[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]


## Word Embbeding

#### Now we will encode a new sentence and give it to the model

In [19]:
text = "Tokenize me this please"

encoded_inputs = tokenizer(text=text, return_tensors='pt')
encoded_inputs

{'input_ids': tensor([[ 101, 1706, 6378, 3708, 1143, 1142, 4268,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

* Since model is pytorch model it will expect a tensor input rather than dict or list
* return_tensors - It will return the output as tensor

In [20]:
output = model(**encoded_inputs)

* last_hidden_state - Refers the final ouput token for main layer for each input token
* pooler_output - Refers the final ouput of the entire sentence not the individual tokens, its like a summarization of the entire input sentence

In [21]:
last_hidden_state = output.last_hidden_state
pooler_output = output.pooler_output

In [22]:
last_hidden_state.shape

torch.Size([1, 8, 768])

In [23]:
pooler_output.shape

torch.Size([1, 768])

In [24]:
tokenizer.tokenize(text)

['To', '##ken', '##ize', 'me', 'this', 'please']

In [25]:
def predict(text):
    encoded_inputs = tokenizer(text=text, return_tensors='pt')
    return model(**encoded_inputs)[0]

In [26]:
text1 = "There was a fly drinking from my soup"
text2 = "To become a commercial pilot, he had to fly for 1500 hours"

In [27]:
token1 = tokenizer.tokenize(text1)
token2 = tokenizer.tokenize(text2)

token1, token2

(['There', 'was', 'a', 'fly', 'drinking', 'from', 'my', 'soup'],
 ['To',
  'become',
  'a',
  'commercial',
  'pilot',
  ',',
  'he',
  'had',
  'to',
  'fly',
  'for',
  '1500',
  'hours'])

In [28]:
out1 = predict(text=text1)
out2 = predict(text=text2)

In [29]:
emb1 = out1[0:, token1.index("fly"), :].detach()
emb2 = out2[0:, token2.index("fly"), :].detach()

emb1.shape, emb2.shape

(torch.Size([1, 768]), torch.Size([1, 768]))

In [30]:
from scipy.spatial.distance import cosine

In [31]:
cosine(emb1[0], emb2[0])

np.float32(0.41355014)

## Masked Language Modeling

* MLM is the pre training objective of the models like BERT
* We have a seperate masking models and we import it 

In [32]:
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)
mlm_model

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

* Creating a mask from tokenizer and adding it into a sentence

In [33]:
# Creating the masked tokens

mask = tokenizer.mask_token
mask

'[MASK]'

In [34]:
mask_sentence = f"I want to {mask} pizza for tonight"
mask_sentence

'I want to [MASK] pizza for tonight'

* Tokenizing that sentence

In [35]:
mask_token = tokenizer.tokenize(mask_sentence)
mask_token

['I', 'want', 'to', '[MASK]', 'pizza', 'for', 'tonight']

In [36]:
mask_encoded_input = tokenizer(mask_sentence, return_tensors='pt')
mask_encoded_input

{'input_ids': tensor([[  101,   146,  1328,  1106,   103, 13473,  1111,  3568,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

* Feeding that sentence to the model
* It will return a multi dimensional vector, in that logits tensor will have the masked value details
* Logits represent raw, un-normalized scores that the model assigns to each potential words to fill in the masked positions
* Logits will contain index of the word embedding values and the values will be the probabilites

In [37]:
mask_output = mlm_model(**mask_encoded_input)
mask_output

MaskedLMOutput(loss=None, logits=tensor([[[ -7.4283,  -7.2895,  -7.4779,  ...,  -6.2929,  -5.9589,  -6.4331],
         [ -7.9286,  -8.2635,  -8.0442,  ...,  -6.6752,  -6.4446,  -6.8911],
         [-12.3447, -11.9961, -12.7443,  ...,  -8.4030,  -6.5324,  -8.1336],
         ...,
         [ -9.1358,  -8.8955,  -8.9833,  ...,  -7.8610,  -5.0709,  -8.3300],
         [ -9.4683,  -9.5075,  -9.0676,  ...,  -6.7674,  -6.1865,  -7.4156],
         [-14.4339, -14.6208, -14.9550,  ..., -11.6409, -11.3482, -13.2990]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [38]:
mask_logits = mask_output.logits.detach().numpy()[0]
mask_logits.shape

(9, 28996)

* Fetching the masked value using index
* These will contain the values(probability) of the word embbedings of length `28996` which BERT is trained on
* Converting those values into probabilities with the softmax function

In [39]:
masked_logits = mask_logits[mask_token.index(mask)+1]
masked_logits

array([-6.731374, -6.393911, -6.147725, ..., -5.651458, -3.668856,
       -4.999485], shape=(28996,), dtype=float32)

In [40]:
from scipy.special import softmax
import numpy as np

In [41]:
confidence_score = softmax(masked_logits)
confidence_score

array([3.6420100e-10, 5.1038607e-10, 6.5285466e-10, ..., 1.0723631e-09,
       7.7870954e-09, 2.0582136e-09], shape=(28996,), dtype=float32)

In [42]:
confidence_score.sum()

np.float32(0.99999994)

* Sorting it by index and fetching the top 5 values which has he highest probabilites
* Decoding those 5 values will give the masked word which the model predicted from it's vocab

In [43]:
np.argsort(confidence_score)[::-1][:5]

array([1138, 3940, 1243, 1294, 1546])

In [44]:
for i in np.argsort(confidence_score)[::-1][:5]:
    predict_tokens = tokenizer.decode(i)
    score = confidence_score[i]

    print(score, predict_tokens)
    print(mask_sentence.replace(mask, predict_tokens))

0.25416425 have
I want to have pizza for tonight
0.17271347 eat
I want to eat pizza for tonight
0.15204962 get
I want to get pizza for tonight
0.11082382 make
I want to make pizza for tonight
0.08149549 order
I want to order pizza for tonight
