In [2]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

# Fix the random seed once, up front, so the sampled generations further down
# can be reproduced on a fresh kernel.
SEED = 42
set_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## Tokenization process for GPT

In [3]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Is the whole name a single vocabulary entry? If not, it will be split
# into several sub-word tokens.
gpt2_vocab = tokenizer.get_vocab()
'Niket' in gpt2_vocab

False

In [4]:
# Example sentence reused by the tokenization and embedding cells.
input_seq = "I am Niket Girdhar"
# Display the tokenizer's repr (vocab size, max length, special tokens).
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [5]:
# Encode the sentence exactly as written and inspect the token ids.
ids_plain = tokenizer(input_seq)['input_ids']
ids_plain

[40, 716, 11271, 316, 402, 1447, 9869]

In [6]:
# Same sentence with one leading space, to compare against the ids above.
spaced_seq = " " + input_seq
ids_spaced = tokenizer(spaced_seq)['input_ids']
ids_spaced

[314, 716, 11271, 316, 402, 1447, 9869]

Adding a leading space changes the first token id (40 → 314); the remaining ids are unchanged.

In [7]:
# Map the ids back to their sub-word strings to see how the sentence was split.
plain_ids = tokenizer.encode(input_seq)
tokenizer.convert_ids_to_tokens(plain_ids)

['I', 'Ġam', 'ĠNik', 'et', 'ĠG', 'ird', 'har']

In [8]:
# Same lookup for the sentence with a leading space.
spaced_ids = tokenizer.encode(" " + input_seq)
tokenizer.convert_ids_to_tokens(spaced_ids)

['ĠI', 'Ġam', 'ĠNik', 'et', 'ĠG', 'ird', 'har']

In [9]:
# return_tensors='pt' asks for a PyTorch tensor instead of a plain list of ids;
# the result carries a leading batch dimension (1 x 7 for this sentence).
encoded = tokenizer.encode(input_seq, return_tensors='pt')

encoded

tensor([[   40,   716, 11271,   316,   402,  1447,  9869]])

The reason is that GPT-2's tokenizer folds a preceding space into the token itself, so `I` and ` I` are different vocabulary entries with different ids.

The character `Ġ` marks a token that begins with a space.

# Understanding the GPT model

In [29]:
# Build a text-generation pipeline around the pretrained GPT-2 checkpoint.
generator = pipeline(task='text-generation', model='gpt2')

Device set to use mps:0


In [10]:
# Sample three continuations of the prompt, each capped at 50 tokens in total.
prompt = "Hello, I am Niket Girdhar and I"
generator(
    prompt,
    max_length=50,
    truncation=True,
    num_return_sequences=3,
)

Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I am Niket Girdhar and I am writing a article about how the world is changing because of the growth of the Bitcoin ATM.\n\nI have been thinking about the Bitcoin ATM ever since I was younger and now I am working'},
 {'generated_text': 'Hello, I am Niket Girdhar and I want to speak to you all about your latest development experience and I will be honest when I say it has not been well received.I want nothing more than to thank you for this experience as well'},
 {'generated_text': 'Hello, I am Niket Girdhar and I was born in Lahore but grew up in Kolkata."\n\n"In a way it is not my fault! My mother was a very strong teacher, but her power was very feeble'}]

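Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation:
    - GPT-2 has no dedicated padding token (the tokenizer above only defines `<|endoftext|>`, id 50256). When no `pad_token_id` is supplied, generation falls back to the end-of-sequence token for padding and prints this informational message.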

In [11]:
# Load GPT-2 together with its language-modelling head so the layers can be inspected one by one.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# The repr lists the embeddings (wte, wpe), the 12 transformer blocks,
# the final LayerNorm (ln_f) and the output projection (lm_head).
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [17]:
# Token embeddings: look up one 768-dim vector per token id.
token_embedding = model.transformer.wte
emb_wte = token_embedding(encoded)

emb_wte.shape

torch.Size([1, 7, 768])

In [18]:
# Position embeddings: one 768-dim vector per position in the sequence.
# The position ids 0..seq_len-1 are derived from the encoded input instead of
# being hard-coded to 7, so this cell keeps working if input_seq is changed.
seq_len = encoded.shape[-1]
position_ids = tensor(list(range(seq_len))).reshape(1, seq_len)
emb_wpe = model.transformer.wpe(position_ids)

emb_wpe.shape

torch.Size([1, 7, 768])

In [19]:
# Input to the first transformer block: element-wise sum of token and
# position embeddings. Both operands are (1, 7, 768), so the shape is unchanged.
initial_input = emb_wte + emb_wpe

initial_input.shape

torch.Size([1, 7, 768])

In [21]:
# Pass the summed embeddings through the embedding dropout layer (p=0.1 per the model repr).
# NOTE(review): presumably a no-op here because from_pretrained leaves the model in eval mode — confirm.
initial_input = model.transformer.drop(initial_input)

initial_input

tensor([[[ 0.1286, -0.2933,  0.1470,  ...,  0.0599, -0.0342, -0.0586],
         [ 0.1835, -0.1787,  0.0199,  ...,  0.2900,  0.0298,  0.0143],
         [-0.0972, -0.1913,  0.3132,  ..., -0.1427,  0.0854,  0.0905],
         ...,
         [-0.0218, -0.1556,  0.2022,  ..., -0.0822,  0.1376, -0.2072],
         [-0.1716, -0.1880,  0.3484,  ..., -0.0583, -0.0717,  0.3192],
         [-0.0535, -0.1615,  0.1910,  ...,  0.2329, -0.1999, -0.1838]]],
       grad_fn=<AddBackward0>)

In [22]:
# Output projection: a bias-free linear layer from the 768-dim hidden state
# to 50257 outputs, one per vocabulary entry.
model.lm_head

Linear(in_features=768, out_features=50257, bias=False)

In [24]:
# Run the embeddings through all transformer blocks by hand.
#
# The activations are rebuilt from the embeddings into a local variable, so
# this cell gives the same result however many times it is re-run. Feeding
# `initial_input` back into itself would push already-transformed activations
# through the stack again on a second run.
hidden = model.transformer.drop(emb_wte + emb_wpe)
for block in model.transformer.h:
    # Each block returns a tuple; element 0 is the updated hidden state.
    hidden = block(hidden)[0]

# Final LayerNorm. The result is bound to `initial_input` because the
# comparison cell that follows reads that name.
initial_input = model.transformer.ln_f(hidden)

In [25]:
# Reference: last hidden state from the model's own forward pass.
reference_hidden = model(encoded, output_hidden_states=True).hidden_states[-1]
# The hand-computed activations should match it element for element.
# NOTE(review): exact float equality may be brittle across devices or attention
# backends; a tolerance-based comparison would be more robust — confirm.
(initial_input == reference_hidden).all()

tensor(True)

In [28]:
# Count GPT-2's parameters by summing the element count of every parameter tensor.
total_params = sum(numel(weights) for weights in model.parameters())

print(f'Total parameters in GPT2: {total_params:,}')

Total parameters in GPT2: 124,439,808
