In [36]:
import json
import tensorflow as tf
from gpt_download import load_gpt2_params_from_tf_ckpt

settings = json.load(open("gpt2/124M/hparams.json"))
params = load_gpt2_params_from_tf_ckpt(tf.train.latest_checkpoint("gpt2/124M"),settings)

print(f" Settings : {settings}")
print(f" Parameters dictionary keys : {params.keys()}")

 Settings : {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
 Parameters dictionary keys : dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [37]:
# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

from base_gpt import GPT2_config,GPT_model

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT2_config.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPT_model(NEW_CONFIG)
gpt.eval()

print(NEW_CONFIG)

{'vocab_size': 50257, 'n_heads': 12, 'n_layers': 12, 'qkv_bias': True, 'context_len': 1024, 'emb_dim': 768, 'drop_rate': 0.1, 'context_length': 1024}


In [38]:
import torch
def assign(left,right):
    if left.shape != right.shape:
        raise ValueError(f"ShapeMismatch -- Left : {left.shape} Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [42]:
import numpy as np



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_gpt2params_to_model(gpt,params):
    gpt.pos_embed.weight = assign(gpt.pos_embed.weight,params['wpe'])
    gpt.tok_embed.weight = assign(gpt.tok_embed.weight,params['wte'])
    for b in range(len(params['blocks'])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blks[b].MHA.w_q.weight = assign(
            gpt.trf_blks[b].MHA.w_q.weight, q_w.T)
        gpt.trf_blks[b].MHA.w_k.weight = assign(
            gpt.trf_blks[b].MHA.w_k.weight, k_w.T)
        gpt.trf_blks[b].MHA.w_v.weight = assign(
            gpt.trf_blks[b].MHA.w_v.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blks[b].MHA.w_q.bias = assign(
            gpt.trf_blks[b].MHA.w_q.bias, q_b)
        gpt.trf_blks[b].MHA.w_k.bias = assign(
            gpt.trf_blks[b].MHA.w_k.bias, k_b)
        gpt.trf_blks[b].MHA.w_v.bias = assign(
            gpt.trf_blks[b].MHA.w_v.bias, v_b)

        gpt.trf_blks[b].MHA.out_proj.weight = assign(
            gpt.trf_blks[b].MHA.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blks[b].MHA.out_proj.bias = assign(
            gpt.trf_blks[b].MHA.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blks[b].ff.layers[0].weight = assign(
            gpt.trf_blks[b].ff.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blks[b].ff.layers[0].bias = assign(
            gpt.trf_blks[b].ff.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blks[b].ff.layers[2].weight = assign(
            gpt.trf_blks[b].ff.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blks[b].ff.layers[2].bias = assign(
            gpt.trf_blks[b].ff.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blks[b].norm1.scale = assign(
            gpt.trf_blks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blks[b].norm1.shift = assign(
            gpt.trf_blks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blks[b].norm2.scale = assign(
            gpt.trf_blks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blks[b].norm2.shift = assign(
            gpt.trf_blks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

load_gpt2params_to_model(gpt, params)
gpt.to(device)


GPT_model(
  (dropout): Dropout(p=0.1, inplace=False)
  (tok_embed): Embedding(50257, 768)
  (pos_embed): Embedding(1024, 768)
  (trf_blks): Sequential(
    (0): Transformer_blk(
      (MHA): Multi_head_attention(
        (w_q): Linear(in_features=768, out_features=768, bias=True)
        (w_k): Linear(in_features=768, out_features=768, bias=True)
        (w_v): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): Feed_forward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): Layer_norm()
      (norm2): Layer_norm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (1): Transformer_blk(
      (MHA): Multi_head_attention(
        (w_q): Linear(in_features=768, out_features

In [43]:
from base_gpt import generate_text
import tiktoken
def text_to_token(text,tokenizer)->torch.tensor:
    encode = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
    encode_tensor = torch.tensor(encode).unsqueeze(0)   # adding batch dimension
    return encode_tensor

def token_to_text(token,tokenizer):
    decode = token.squeeze(0).tolist()
    return tokenizer.decode(decode)

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)
out_tokens = generate_text(model=gpt,
            inp_tokens=text_to_token(start_context,tokenizer),
            new_tokens=25,
            context_len=GPT2_config["context_len"],
            temp=1.5,
            top_k=50)
print(f"OUTPUT : {token_to_text(out_tokens,tokenizer)}")

OUTPUT : Every effort moves you toward finding an ideal new way to practice something!

What makes us want to be on top of that?




In [45]:
from importlib.metadata import version

pkgs = ["matplotlib",
        "numpy",
        "tiktoken",
        "torch",
        "tensorflow", # For OpenAI's pretrained weights
        "pandas"      # Dataset loading
       ]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.9.0
numpy version: 1.26.4
tiktoken version: 0.7.0
torch version: 2.3.0
tensorflow version: 2.16.1
pandas version: 2.2.2
