In [2]:
import os; os.chdir('..')
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from transformers import GPT2LMHeadModel

In [3]:
from utils import *; from boring_utils.utils import *

init_graph()
device = get_device()

# Ref
huggingface/transformers PyTorch implementation:
- https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
- https://huggingface.co/distilbert/distilgpt2

In [4]:
config_args = {
    'distilgpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
}['distilgpt2']

config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
config_args['bias'] = True # always True for GPT model checkpoints

model_hf = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)
sd_hf = model_hf.state_dict()

In [5]:
mprint(model_hf, magic_methods=False, private_methods=False)


[93mPublic Methods:[0m
    T_destination
    active_adapter
    active_adapters
    add_adapter
    add_memory_hooks
    add_model_tags
    add_module
    apply
    assisted_decoding
    base_model
    base_model_prefix
    beam_sample
    beam_search
    bfloat16
    buffers
    call_super_init
    can_generate
    children
    compile
    compute_transition_scores
    config
    config_class
    constrained_beam_search
    contrastive_search
    cpu
    create_extended_attention_mask_for_decoder
    cuda
    deparallelize
    device
    device_map
    disable_adapters
    disable_input_require_grads
    double
    dtype
    dummy_inputs
    dump_patches
    enable_adapters
    enable_input_require_grads
    estimate_tokens
    eval
    extra_repr
    float
    floating_point_ops
    forward
    framework
    from_pretrained
    generate
    generation_config
    get_adapter_state_dict
    get_buffer
    get_extended_attention_mask
    get_extra_state
    get_head_mask
    get_inpu

In [6]:
from transformers import GPT2Tokenizer

# Load the tokenizer for the distilgpt2 model
tokenizer_hf = GPT2Tokenizer.from_pretrained('distilgpt2')

In [7]:
mprint(tokenizer_hf, magic_methods=False, private_methods=False)


[93mPublic Methods:[0m
    SPECIAL_TOKENS_ATTRIBUTES
    add_bos_token
    add_prefix_space
    add_special_tokens
    add_tokens
    added_tokens_decoder
    added_tokens_encoder
    additional_special_tokens
    additional_special_tokens_ids
    all_special_ids
    all_special_tokens
    all_special_tokens_extended
    apply_chat_template
    as_target_tokenizer
    batch_decode
    batch_encode_plus
    bos_token
    bos_token_id
    bpe
    bpe_ranks
    build_inputs_with_special_tokens
    byte_decoder
    byte_encoder
    cache
    chat_template
    clean_up_tokenization
    clean_up_tokenization_spaces
    cls_token
    cls_token_id
    convert_added_tokens
    convert_ids_to_tokens
    convert_tokens_to_ids
    convert_tokens_to_string
    create_token_type_ids_from_sequences
    decode
    decoder
    default_chat_template
    encode
    encode_plus
    encoder
    eos_token
    eos_token_id
    errors
    from_pretrained
    get_added_vocab
    get_special_tokens_mask
    

In [8]:
# Print the tokenizer
cprint(tokenizer_hf)

[93mtokenizer_hf[0m: 
GPT2Tokenizer(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}



# Preview the Vocab Dict

- `Ġ` means the whitespace before the word
- No explicit <PAD>, <CLS>, <SEP> tokens


In [9]:
# Access and print the vocabulary items
vocab_dict = tokenizer_hf.get_vocab()
vocab_items = vocab_dict.items()

sorted_vocab = sorted(vocab_items, key=lambda item: item[1])  # Sorting by token ID for readability

cprint(len(sorted_vocab))

for token, id in sorted_vocab[:20]:
    print(token, id)

print('=' * 20) 

for token, id in sorted_vocab[990:1010]:
    print(token, id)

print('=' * 20) 

for token, id in sorted_vocab[-20:]:
    print(token, id)

[93mlen(sorted_vocab)[0m: 
50257

! 0
" 1
# 2
$ 3
% 4
& 5
' 6
( 7
) 8
* 9
+ 10
, 11
- 12
. 13
/ 14
0 15
1 16
2 17
3 18
4 19
Ġprodu 990
Ġstill 991
led 992
ah 993
Ġhere 994
Ġworld 995
Ġthough 996
Ġnum 997
arch 998
imes 999
ale 1000
ĠSe 1001
ĠIf 1002
// 1003
ĠLe 1004
Ġret 1005
Ġref 1006
Ġtrans 1007
ner 1008
ution 1009
Revolution 50237
Ġsnipers 50238
Ġreverted 50239
Ġconglomerate 50240
Terry 50241
794 50242
Ġharsher 50243
Ġdesolate 50244
ĠHitman 50245
Commission 50246
Ġ(/ 50247
âĢ¦." 50248
Compar 50249
Ġamplification 50250
ominated 50251
Ġregress 50252
ĠCollider 50253
Ġinformants 50254
Ġgazed 50255
<|endoftext|> 50256


# Decode Some Random Strings

In [10]:
text_li = [
    "Hello, world!",
    " Hello",
    "hello",
    " hello",
    "56873+3184623=123456789-1000000000"  # it's broken
]

for text in text_li:
    token_ids = tokenizer_hf.encode(text)
    # cprint(token_ids)
    # tokens_text = [tokenizer_hf.convert_ids_to_tokens(id) for id in token_ids]

    # # Print tokens alongside their IDs
    # for token_id, token_text in zip(token_ids, tokens_text):
    #     print(f"{token_text} (ID: {token_id})")
    cprint(tokenizer_hf.convert_ids_to_tokens(token_ids))

    print('=' * 20)

[93mtokenizer_hf.convert_ids_to_tokens(token_ids)[0m: 
['Hello', ',', 'Ġworld', '!']

[93mtokenizer_hf.convert_ids_to_tokens(token_ids)[0m: 
['ĠHello']

[93mtokenizer_hf.convert_ids_to_tokens(token_ids)[0m: 
['hello']

[93mtokenizer_hf.convert_ids_to_tokens(token_ids)[0m: 
['Ġhello']

[93mtokenizer_hf.convert_ids_to_tokens(token_ids)[0m: 
['568', '73', '+', '318', '46', '23', '=', '123', '45', '67', '89', '-', '1', '000000', '000']



# Generation
- Tokenizing the Reference Text: Convert the input text into tokens that the model can understand.
- Running the Model with Cache: Generate predictions (logits) and cache the past states for subsequent tokens generation.
- Calculating Log Probabilities and Probabilities: For understanding the model's confidence across the vocabulary for the last token.
- Decoding Tokens: Convert token IDs back to strings for human-readable text.
- Generating the Next Token: Predict the next token based on the current input.
- Updating Input with the Next Token: Concatenate the predicted next token with the current input and rerun the model to see the updated predictions.

- Multi-step generation: [Generation](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/text_generation#transformers.GenerationMixin)
```python
outputs = model.generate(input_ids, max_length=50, num_beans=5, top_p=0.92)
```
num_beams=5 indicates beam search with 5 beams, and top_p=0.92 indicates nucleus sampling


In [11]:
# Ensure the model is in evaluation mode
model_hf.eval()

# reference_text = "I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the world!"
reference_text = "I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the"

# Tokenize the reference text
all_tokens = tokenizer_hf.encode(reference_text, return_tensors="pt").to(device)
decoded_tokens = tokenizer_hf.batch_decode(all_tokens, skip_special_tokens=True)

cprint(all_tokens)
cprint(decoded_tokens)

[93mall_tokens[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262]], device='mps:0')

[93mdecoded_tokens[0m: 
['I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the']



In [12]:
# Run the model with cache, generate new logits
with torch.no_grad():
    outputs_1 = model_hf(all_tokens, use_cache=True)  # important: use_cache=True
    logits_1 = outputs_1.logits

# Predict the next token
pred_token_1 = logits_1[:, -1, :].argmax(dim=-1)

# Update the input with the next token and generate new logits
all_tokens_1 = torch.cat([all_tokens, pred_token_1.unsqueeze(-1)], dim=1)
decoded_tokens_1 = tokenizer_hf.batch_decode(all_tokens_1, skip_special_tokens=True)

cprint(pred_token_1)
cprint(tokenizer_hf.decode(pred_token_1))
cprint(all_tokens_1)
cprint(decoded_tokens_1)

[93mpred_token_1[0m: 
tensor([995], device='mps:0')

[93mtokenizer_hf.decode(pred_token_1)[0m: 
 world

[93mall_tokens_1[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262,   995]], device='mps:0')

[93mdecoded_tokens_1[0m: 
['I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the world']



In [13]:
with torch.no_grad():
    outputs_2 = model_hf(all_tokens_1)
    logits_2 = outputs_2.logits

# Predict the next token
pred_token_2 = logits_2[:, -1, :].argmax(-1)

# Update the input with the next token and generate new logits
all_tokens_2 = torch.cat([all_tokens_1, pred_token_2.unsqueeze(-1)], dim=1)
decoded_tokens_2 = tokenizer_hf.batch_decode(all_tokens_2, skip_special_tokens=True)

cprint(pred_token_2)
cprint(tokenizer_hf.decode(pred_token_2))
cprint(all_tokens_2)
cprint(decoded_tokens_2)

[93mpred_token_2[0m: 
tensor([13], device='mps:0')

[93mtokenizer_hf.decode(pred_token_2)[0m: 
.

[93mall_tokens_2[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262,   995,    13]], device='mps:0')

[93mdecoded_tokens_2[0m: 
['I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the world.']



In [14]:
cprint(all_tokens)
cprint(all_tokens_1)
cprint(all_tokens_2)

[93mall_tokens[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262]], device='mps:0')

[93mall_tokens_1[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262,   995]], device='mps:0')

[93mall_tokens_2[0m: 
tensor([[   40,   716,   281,  4998,  1960,   382, 19741,    11,   875, 12342,
            12,  8807,    11,   402, 11571,    12,    17,  3918, 47385,    13,
          1881,  1110,   314,   481,  7074,  1692,  1241,  4430,   290,  1011,
           625,   262,   995,    13]], device='mps:0')



In [15]:
cprint(logits_1[0, -1, :])  # dim 1 = current text length
cprint(logits_1.shape)
log_probs_1 = torch.nn.functional.log_softmax(logits_1, dim=-1)  # for loss calc
probs_1 = torch.nn.functional.softmax(logits_1, dim=-1)
cprint(probs_1[0, -1, :])
cprint(probs_1.shape)

cprint(logits_2[0, -1, :])  # dim 1 = current text length
cprint(logits_2.shape)
log_probs_2 = torch.nn.functional.log_softmax(logits_2, dim=-1)
probs_2 = torch.nn.functional.softmax(logits_2, dim=-1)
cprint(probs_2[0, -1, :])
cprint(probs_2.shape)

[93mlogits_1[0, -1, :])  # dim 1 = current text lengt[0m: 
tensor([-74.8430, -75.9122, -78.9191,  ..., -84.0591, -80.5990, -76.4612],
       device='mps:0')

[93mlogits_1.shape[0m: 
torch.Size([1, 32, 50257])

[93mprobs_1[0, -1, :][0m: 
tensor([8.3072e-06, 2.8519e-06, 1.4101e-07,  ..., 8.2602e-10, 2.6282e-08,
        1.6470e-06], device='mps:0')

[93mprobs_1.shape[0m: 
torch.Size([1, 32, 50257])

[93mlogits_2[0, -1, :])  # dim 1 = current text lengt[0m: 
tensor([-60.4925, -65.5512, -70.5624,  ..., -80.5731, -75.2508, -66.8645],
       device='mps:0')

[93mlogits_2.shape[0m: 
torch.Size([1, 33, 50257])

[93mprobs_2[0, -1, :][0m: 
tensor([4.6065e-02, 2.9271e-04, 1.9502e-06,  ..., 8.7600e-11, 1.7944e-08,
        7.8714e-05], device='mps:0')

[93mprobs_2.shape[0m: 
torch.Size([1, 33, 50257])



# Play Around with Model

- Conv1D instead of FC
- wte (Word Token Embeddings): This refers to the embedding layer that converts input tokens (words or subwords) into vectors of a fixed size.
- wpe (Word Position Embeddings)


In [16]:
cprint(model_hf)

[93mmodel_hf[0m: 
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)



In [18]:
cprint(model_hf.transformer.h)

[93mmodel_hf.transformer.h[0m: 
ModuleList(
  (0-5): 6 x GPT2Block(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPT2Attention(
      (c_attn): Conv1D()
      (c_proj): Conv1D()
      (attn_dropout): Dropout(p=0.1, inplace=False)
      (resid_dropout): Dropout(p=0.1, inplace=False)
    )
    (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (mlp): GPT2MLP(
      (c_fc): Conv1D()
      (c_proj): Conv1D()
      (act): NewGELUActivation()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)



In [19]:
reference_text_2 = "This is an example."

# this tokenizes the text, and returns a dictionary with the input_ids and the attention_mask
inputs = tokenizer_hf(reference_text_2, return_tensors="pt").to(device)
input_ids = inputs['input_ids']


# Disable gradient calculations
with torch.no_grad():
    # Pass input through the embedding layer
    embeddings = model_hf.transformer.wte(input_ids)  # Word Token Embeddings
    position_ids = torch.arange(0, input_ids.size(-1), dtype=torch.long, device=input_ids.device)
    position_embeddings = model_hf.transformer.wpe(position_ids)  # Word Position Embeddings

    # Combine token and position embeddings
    hidden_states = embeddings + position_embeddings
    hidden_states = model_hf.transformer.drop(hidden_states)  # Apply dropout if it's part of the model

    # Pass through the first layer normalization if needed
    hidden_states = model_hf.transformer.h[0].ln_1(hidden_states)

    # Now, hidden_states is the input to the first attention layer
    input_to_att = hidden_states

In [20]:
cprint(inputs)

[93minputs[0m: 
{'input_ids': tensor([[1212,  318,  281, 1672,   13]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='mps:0')}



In [21]:
cprint(embeddings)
cprint(embeddings.shape)

cprint(position_ids)
cprint(position_embeddings)
cprint(position_embeddings.shape)

[93membeddings[0m: 
tensor([[[ 0.0254, -0.1193,  0.1040,  ...,  0.0850, -0.0361,  0.1535],
         [-0.0006,  0.0075,  0.0307,  ...,  0.1909, -0.0206,  0.0218],
         [-0.1129, -0.0073,  0.0532,  ...,  0.0279,  0.0783, -0.1056],
         [ 0.0488, -0.1293,  0.0764,  ..., -0.4437, -0.0842, -0.1168],
         [ 0.0400, -0.0202,  0.0025,  ..., -0.0923,  0.0308,  0.1553]]],
       device='mps:0')

[93membeddings.shape[0m: 
torch.Size([1, 5, 768])

[93mposition_ids[0m: 
tensor([0, 1, 2, 3, 4], device='mps:0')

[93mposition_embeddings[0m: 
tensor([[-1.8821e-02, -1.9742e-01,  4.0267e-03,  ..., -4.3044e-02,
          2.8267e-02,  5.4490e-02],
        [ 2.3959e-02, -5.3792e-02, -9.4879e-02,  ...,  3.4170e-02,
          1.0172e-02, -1.5573e-04],
        [ 4.2161e-03, -8.4764e-02,  5.4515e-02,  ...,  1.9745e-02,
          1.9325e-02, -2.1424e-02],
        [-2.8337e-04, -7.3803e-02,  1.0553e-01,  ...,  1.0157e-02,
          1.7659e-02, -7.0854e-03],
        [ 7.6374e-03, -2.5090e-02,  

In [22]:
with torch.no_grad():
    att_output = model_hf.transformer.h[0](input_to_att)

In [23]:
cprint(input_to_att)
cprint(input_to_att.shape)

cprint(att_output)
cprint(att_output[0].shape)

[93minput_to_att[0m: 
tensor([[[ 0.0137, -0.1418,  0.0306,  ...,  0.0233, -0.0181,  0.1000],
         [ 0.0328, -0.0246, -0.0642,  ...,  0.1969, -0.0240,  0.0149],
         [-0.1133, -0.0709,  0.0705,  ...,  0.0463,  0.0701, -0.1224],
         [ 0.0596, -0.1657,  0.1214,  ..., -0.3813, -0.0705, -0.1132],
         [ 0.0632, -0.0298,  0.0914,  ..., -0.0815,  0.0206,  0.1397]]],
       device='mps:0')

[93minput_to_att.shape[0m: 
torch.Size([1, 5, 768])

[93matt_output[0m: 
(tensor([[[ 1.5787e+00, -1.2246e+00,  1.9400e+00,  ..., -2.0778e+00,
           1.7360e-01,  5.4510e-01],
         [-1.4459e+00, -2.4165e+00,  1.9684e+00,  ..., -4.9854e-01,
           8.1772e-01,  1.1241e+00],
         [-6.5890e-01, -9.0903e-01,  3.4541e-01,  ..., -5.7806e-01,
           9.1351e-01, -5.6571e-01],
         [-9.1027e-01, -1.0106e+00, -2.3466e-01,  ..., -2.7518e+00,
           4.9475e-01,  9.5350e-04],
         [-4.1520e-01, -1.9565e+00, -2.6256e-01,  ..., -7.5900e-01,
           9.6207e-01,  3.839