# Playground for OpenAI weights
* Loading and trying out the OpenAI GPT-2 weights

## Imports and config

In [60]:
from importlib.metadata import version
print("TensorFlow version:", version("tensorflow"))
print("tqdm version:", version("tqdm"))

import torch
import tiktoken
import os
import json
import tensorflow as tf 
from tqdm import tqdm

# Keep printed tensors short: summarize anything larger than 10 elements,
# show 3 items at each edge, and print 2 digits of precision.
torch.set_printoptions(threshold=10, edgeitems=3, precision=2)
# Seed PyTorch's random number generator so repeated runs are reproducible.
torch.manual_seed(42)

# Execute the sibling notebooks; the model and helper names used further down
# (GPTModel, generate_tokens, text_to_token_ids, token_ids_to_text) are
# expected to come from them.
%run "06. GPTModel.ipynb"
%run "08. Training.ipynb"

# Configuration of the 124M-parameter GPT-2 model, one setting per line.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length (the full 1024, not shortened)
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": True,        # Query-key-value bias -- is true for real gpt
}

# The self-assignments below do not change any value; each one raises
# NameError right here if the name was not defined earlier.
# NOTE(review): presumably they also exist to make the origin of these names
# explicit and to quiet undefined-name warnings in editors -- confirm.

# From 06. GPTModel
GPTModel = GPTModel # From 06. GPTModel.ipynb
generate_tokens = generate_tokens # From 06. GPTModel.ipynb

# from 08. Training
text_to_token_ids = text_to_token_ids 
token_ids_to_text = token_ids_to_text

# Pick the compute device: CUDA first, then Apple's MPS backend, else the CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

TensorFlow version: 2.19.0
tqdm version: 4.67.1


## Load Pre-trained GPT-2 Model from HuggingFace
* Downloads pre-trained PyTorch weights (.pth files) if not present locally
* Available sizes: 124M (small), 355M (medium), 774M (large), 1558M (xl)
* Loads weights into model and generates sample text for verification

In [61]:

# file_name = "gpt2-small-124M.pth"
# file_name = "gpt2-medium-355M.pth"
# file_name = "gpt2-large-774M.pth"
# file_name = "gpt2-xl-1558M.pth"
def load_model_from_scratch(file_name = "gpt2-small-124M.pth", config = GPT_CONFIG_124M):
    """Load converted GPT-2 PyTorch weights and print a short sample generation.

    If ``file_name`` does not exist in the current working directory it is
    downloaded from the ``rasbt/gpt2-from-scratch-pytorch`` repository on
    huggingface.co. The state dict is then loaded into a fresh ``GPTModel``
    built from ``config``, the model is switched to eval mode and moved to
    ``device``, and 25 new tokens are generated and printed as a sanity check.

    Args:
        file_name: Name of the ``.pth`` file, used both as the remote file
            name and as the local path.
        config: Configuration dict passed to ``GPTModel``; its
            "context_length" entry is used as the generation context size.
            It has to describe the same architecture as the checkpoint,
            otherwise ``load_state_dict`` fails.

    Returns:
        None. The generated text is printed.
    """
    # Imported locally: this notebook's import cell does not import urllib,
    # so the download branch would otherwise depend on another notebook
    # having imported it.
    import urllib.request

    tokenizer = tiktoken.get_encoding("gpt2")
    url = f"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}"

    if not os.path.exists(file_name):
        # Download to a temporary name and rename afterwards, so that an
        # interrupted download never leaves a truncated file that a later
        # run would mistake for a complete checkpoint.
        partial_name = f"{file_name}.part"
        urllib.request.urlretrieve(url, partial_name)
        os.replace(partial_name, file_name)
        print(f"Downloaded to {file_name}")

    gpt = GPTModel(config)
    # Load the tensors onto the CPU regardless of the device they were saved
    # from; the model is moved to the target device below.
    state_dict = torch.load(file_name, weights_only=True, map_location="cpu")
    gpt.load_state_dict(state_dict)
    gpt.eval()
    gpt.to(device)

    # Re-seed so the sampled text is reproducible.
    torch.manual_seed(42)

    token_ids = generate_tokens(
        model=gpt,
        token_ids=text_to_token_ids("Every effort moves you", tokenizer).to(device),
        max_new_tokens=25,
        context_size=config["context_length"],
        top_k=50,
        temperature=1.5
    )

    print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

# Run the demo only when `__file__` is not defined in the current namespace.
# NOTE(review): presumably this distinguishes notebook execution from running
# the code as a regular script/module -- confirm.
if '__file__' not in dir():
    load_model_from_scratch()

Output text:
 Every effort moves you and my team around, so please donate today and consider donating before Labor Day. The money will go toward further research and development


## Manual Weight Loading from Original OpenAI GPT-2 Checkpoint
* Downloads 7 checkpoint files from OpenAI's official blob storage
* Extracts weights from TensorFlow checkpoint format
* Manual weight assignment to match custom architecture naming:
  - Token/position embeddings (wte/wpe)
  - Attention QKV matrices with bias splitting
  - Feed-forward layers (MLP)
  - Layer normalization parameters (scale/shift)
* Verifies loaded model with text generation
* The file-downloading code for GPT-2 lives in `gpt_download.py`. It contains no magic, so feel free to skip it.

In [62]:
import numpy as np
from gpt_download import download_and_load_gpt2

# Define model configurations in a dictionary for compactness
# The four sizes differ only in embedding width, depth and head count;
# every other setting is shared.
model_configs = {
    model_name: {
        "vocab_size": 50257,
        "emb_dim": emb_dim,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "context_length": 1024,
        "drop_rate": 0.1,
        "qkv_bias": True,
    }
    for model_name, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

# Inits model base structure
def init_model(model_name):
    """Build an untrained ``GPTModel`` for ``model_name`` in eval mode.

    Args:
        model_name: Key into ``model_configs``.

    Returns:
        The newly constructed model after ``eval()`` has been called on it.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_configs``.
    """
    model = GPTModel(model_configs[model_name])
    model.eval()
    return model

# Assign function for weight matrices
def assign(left, right):
    """Wrap ``right`` in a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing parameter; only its ``shape`` is read.
        right: Array holding the replacement values.

    Returns:
        A new ``torch.nn.Parameter`` created from ``torch.tensor(right)``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(torch.tensor(right))
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

# Manually load the weights into gpt
def load_weights_into_gpt(gpt, params):
    """Copy the arrays in ``params`` into the matching parameters of ``gpt``.

    Every target parameter is replaced by the result of ``assign``, which
    raises ``ValueError`` when source and target shapes differ.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Dict with the keys "wpe", "wte", "g", "b" and "blocks".
            Each entry of "blocks" provides "attn" ("c_attn", "c_proj"),
            "mlp" ("c_fc", "c_proj"), "ln_1" and "ln_2".
    """
    def load_linear(linear, source):
        # The source matrix is transposed before it is assigned.
        linear.weight = assign(linear.weight, source["w"].T)
        linear.bias = assign(linear.bias, source["b"])

    def load_norm(norm, source):
        norm.scale = assign(norm.scale, source["g"])
        norm.shift = assign(norm.shift, source["b"])

    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    qkv_names = ("W_query", "W_key", "W_value")
    for layer_idx in range(len(params["blocks"])):
        source = params["blocks"][layer_idx]
        target = gpt.trf_blocks[layer_idx]
        fused_qkv = source["attn"]["c_attn"]

        # "c_attn" holds query, key and value side by side on the last axis:
        # split it into three equal parts, weights first and then biases.
        for name, weight in zip(qkv_names, np.split(fused_qkv["w"], 3, axis=-1)):
            projection = getattr(target.att, name)
            projection.weight = assign(projection.weight, weight.T)
        for name, bias in zip(qkv_names, np.split(fused_qkv["b"], 3, axis=-1)):
            projection = getattr(target.att, name)
            projection.bias = assign(projection.bias, bias)

        load_linear(target.att.out_proj, source["attn"]["c_proj"])

        # Feed-forward: entries 0 and 2 of ``ff.layers`` receive weights.
        load_linear(target.ff.layers[0], source["mlp"]["c_fc"])
        load_linear(target.ff.layers[2], source["mlp"]["c_proj"])

        load_norm(target.norm1, source["ln_1"])
        load_norm(target.norm2, source["ln_2"])

    # The top-level "g"/"b" entries belong to the final normalization layer.
    load_norm(gpt.final_norm, params)
    # The output head receives the token-embedding matrix as well.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

# Run the demo only when `__file__` is not defined in the current namespace.
# NOTE(review): presumably this distinguishes notebook execution from running
# the code as a regular script/module -- confirm.
if '__file__' not in dir():
    # Helper imported from gpt_download.py (not shown here); it is called for
    # the "124M" model with "gpt2" as the models directory.
    settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
    print("\nSettings:", settings)
    print("\nParameter dictionary keys:", params.keys())

    # Inspect one entry of the parameter dict: its shape and its values.
    print("\nExample: wte - Token embedding weight tensor dimensions:", params["wte"].shape)
    print(params["wte"])
    
    # Build the model in eval mode, copy the downloaded weights into it and
    # move it to the selected device.
    gpt = init_model("gpt2-small (124M)")
    load_weights_into_gpt(gpt, params)
    gpt.to(device);

    # Re-seed before sampling so the generated text is reproducible.
    torch.manual_seed(42)
    tokenizer = tiktoken.get_encoding("gpt2")
    
    # context_size=1024 equals the "context_length" of the
    # "gpt2-small (124M)" entry in model_configs.
    token_ids = generate_tokens(
        model=gpt,
        token_ids=text_to_token_ids("Every effort moves you", tokenizer).to(device),
        max_new_tokens=25,
        context_size=1024,
        top_k=50,
        temperature=1.5
    )

    print("\nOutput text:\n", token_ids_to_text(token_ids, tokenizer))

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

Example: wte - Token embedding weight tensor dimensions: (50257, 768)
[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.0978326