In [1]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

import torch
import pandas as pd

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


## Tokenization process for GPT

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

'Niket' in tokenizer.get_vocab()

False

In [4]:
input_seq = "I am Niket Girdhar"
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [5]:
tokenizer(input_seq)['input_ids']

[40, 716, 11271, 316, 402, 1447, 9869]

In [6]:
tokenizer(" "+input_seq)['input_ids']

[314, 716, 11271, 316, 402, 1447, 9869]

Adding a space ahead changes the tokens

In [7]:
tokenizer.convert_ids_to_tokens(tokenizer.encode(input_seq))

['I', 'Ġam', 'ĠNik', 'et', 'ĠG', 'ird', 'har']

In [8]:
tokenizer.convert_ids_to_tokens(tokenizer.encode(" "+input_seq))

['ĠI', 'Ġam', 'ĠNik', 'et', 'ĠG', 'ird', 'har']

In [5]:
encoded = tokenizer.encode(input_seq, return_tensors='pt')

encoded

tensor([[   40,   716, 11271,   316,   402,  1447,  9869]])

The reason is that the space is also included in the tokens so it changes the token ids.

The character Ġ represents that there is space.

# Understaning GPT model

In [3]:
generator = pipeline('text-generation', model='gpt2')

Device set to use mps:0


In [10]:
generator("Hello, I am Niket Girdhar and I", max_length = 50, truncation=True, num_return_sequences=3)

Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I am Niket Girdhar and I am writing a article about how the world is changing because of the growth of the Bitcoin ATM.\n\nI have been thinking about the Bitcoin ATM ever since I was younger and now I am working'},
 {'generated_text': 'Hello, I am Niket Girdhar and I want to speak to you all about your latest development experience and I will be honest when I say it has not been well received.I want nothing more than to thank you for this experience as well'},
 {'generated_text': 'Hello, I am Niket Girdhar and I was born in Lahore but grew up in Kolkata."\n\n"In a way it is not my fault! My mother was a very strong teacher, but her power was very feeble'}]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation:
    - This means that in the backend of GPt it is setting the end token to a pad token so it eases to a more open generation of text. 

In [4]:
model = GPT2LMHeadModel.from_pretrained('gpt2',attn_implementation="eager")

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
emb_wte = model.transformer.wte(encoded)

emb_wte.shape

torch.Size([1, 7, 768])

In [13]:
emb_wpe = model.transformer.wpe(tensor([0,1,2,3,4,5,6]).reshape(1,7))

emb_wpe.shape

torch.Size([1, 7, 768])

In [19]:
initial_input = emb_wte + emb_wpe

initial_input.shape

torch.Size([1, 7, 768])

In [21]:
initial_input = model.transformer.drop(initial_input)

initial_input

tensor([[[ 0.1286, -0.2933,  0.1470,  ...,  0.0599, -0.0342, -0.0586],
         [ 0.1835, -0.1787,  0.0199,  ...,  0.2900,  0.0298,  0.0143],
         [-0.0972, -0.1913,  0.3132,  ..., -0.1427,  0.0854,  0.0905],
         ...,
         [-0.0218, -0.1556,  0.2022,  ..., -0.0822,  0.1376, -0.2072],
         [-0.1716, -0.1880,  0.3484,  ..., -0.0583, -0.0717,  0.3192],
         [-0.0535, -0.1615,  0.1910,  ...,  0.2329, -0.1999, -0.1838]]],
       grad_fn=<AddBackward0>)

In [22]:
model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [24]:
for module in model.transformer.h:
    initial_input = module(initial_input)[0]

initial_input = model.transformer.ln_f(initial_input)

In [25]:
(initial_input == model(encoded, output_hidden_states=True).hidden_states[-1]).all()

tensor(True)

In [28]:
# finding the total parameters in GPT2

total_params = 0

for param in model.parameters():
    total_params += numel(param)

print(f'Total parameters in GPT2: {total_params:,}')

Total parameters in GPT2: 124,439,808


# Masked Multi-Head Attention

In [8]:
phrase = "My friend was right about this lecture. It is so boring!"

encoded_phrase = tokenizer(phrase, return_tensors='pt')

response = model(**encoded_phrase, output_attentions = True, output_hidden_states = True)

len(response.attentions)

12

In [9]:
encoded_phrase

{'input_ids': tensor([[ 3666,  1545,   373,   826,   546,   428, 19143,    13,   632,   318,
           523, 14262,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
response.attentions[-1].shape # attention from final decoder

torch.Size([1, 12, 13, 13])

1 : batch size

12 : 12 heads in final decoder

13,13 : Input tokens number

In [12]:
encoded_phrase['input_ids'].shape

torch.Size([1, 13])

In [14]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])

tokens

['My',
 'Ġfriend',
 'Ġwas',
 'Ġright',
 'Ġabout',
 'Ġthis',
 'Ġlecture',
 '.',
 'ĠIt',
 'Ġis',
 'Ġso',
 'Ġboring',
 '!']

In [15]:
# Looking into the layer 9 head 0

arr = response.attentions[9][0][0] # layer 9 | head 0 | batch is 1 so index 0

n_digits = 3 # rondiong the numbers to 3 decimal points

attention_df = pd.DataFrame((torch.round(arr*10**n_digits)/(10**n_digits)).detach()).applymap(float)

attention_df.columns = tokens
attention_df.index = tokens

attention_df


  attention_df = pd.DataFrame((torch.round(arr*10**n_digits)/(10**n_digits)).detach()).applymap(float)


Unnamed: 0,My,Ġfriend,Ġwas,Ġright,Ġabout,Ġthis,Ġlecture,.,ĠIt,Ġis,Ġso,Ġboring,!
My,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġfriend,0.968,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġwas,0.824,0.145,0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġright,0.979,0.008,0.007,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġabout,0.979,0.008,0.004,0.005,0.005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġthis,0.924,0.031,0.007,0.006,0.016,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ġlecture,0.974,0.002,0.001,0.001,0.002,0.002,0.018,0.0,0.0,0.0,0.0,0.0,0.0
.,0.814,0.019,0.004,0.003,0.003,0.008,0.142,0.007,0.0,0.0,0.0,0.0,0.0
ĠIt,0.458,0.004,0.004,0.007,0.022,0.055,0.405,0.014,0.032,0.0,0.0,0.0,0.0
Ġis,0.492,0.01,0.004,0.006,0.01,0.06,0.309,0.01,0.071,0.028,0.0,0.0,0.0


sum of attention scores of each row is equals to 1

In [19]:
tokens = tokenizer.convert_ids_to_tokens(encoded_phrase['input_ids'][0])
model_view(response.attentions,tokens)

<IPython.core.display.Javascript object>

In [20]:
response.hidden_states[-1].shape

torch.Size([1, 13, 768])

In [22]:
response.logits.shape

torch.Size([1, 13, 50257])

Logits are the output of teh final language modelling layer.

Logits uses the hidden state in GPT in helping map each of the 13 tokens by applying a feed-forward layer to each of the 13 tokens.

Each of the 13 tokens is mapped to 1 of 50257 possible tokens in GPT-2

In [23]:
pd.DataFrame(
    zip(tokens, tokenizer.convert_ids_to_tokens(response.logits.argmax(2)[0])),
    columns = ["Sequence up until", "Next token with highest probability"]
)

Unnamed: 0,Sequence up until,Next token with highest probability
0,My,Ċ
1,Ġfriend,","
2,Ġwas,Ġa
3,Ġright,.
4,Ġabout,Ġthat
5,Ġthis,.
6,Ġlecture,.
7,.,ĠI
8,ĠIt,Ġwas
9,Ġis,Ġa


In [24]:
generator(phrase, max_length=20,num_return_sequences = 1, do_sample = False) # greedy search

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "My friend was right about this lecture. It is so boring! I'm not sure if I should"}]

In [25]:
generator(phrase, max_length=20,num_return_sequences = 1, do_sample = True) # greedy search with sampling

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'My friend was right about this lecture. It is so boring! I did hear about the great stuff'}]