In [2]:
from gpt import TransformerBlock

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

block = TransformerBlock(GPT_CONFIG_124M)

# Parameters in feed forward network
ff_params = sum(p.numel() for p in block.ff.parameters())
print(f"Feed Forward Params: {ff_params:,}")

# Parameters in attention module
attn_params = sum(p.numel() for p in block.att.parameters())
print(f"Attention Params: {attn_params:,}")

from gpt import GPTModel

def get_config(_, model_name):
    cfg = {
        "vocab_size": 50257,
        "context_length": 1024,
        "drop_rate": 0.1,
        "qkv_bias": False
    }

    if model_name == "gpt2-small":
        cfg.update({"emb_dim": 768, "n_layers": 12, "n_heads": 12})
    elif model_name == "gpt2-medium":
        cfg.update({"emb_dim": 1024, "n_layers": 24, "n_heads": 16})
    elif model_name == "gpt2-large":
        cfg.update({"emb_dim": 1280, "n_layers": 36, "n_heads": 20})
    elif model_name == "gpt2-xl":
        cfg.update({"emb_dim": 1600, "n_layers": 48, "n_heads": 25})
    else:
        raise ValueError("Invalid model name")

    return cfg


def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    tied = total - sum(p.numel() for p in model.out_head.parameters())
    print(f"Total: {total:,} | Tied: {tied:,} | Size: {total * 4 / (1024**2):.2f} MB")

# Generate sizes
for name in ["gpt2-small", "gpt2-medium", "gpt2-large", "gpt2-xl"]:
    print(f"\nModel: {name}")
    config = get_config({}, name)  # Start from an empty config
    model = GPTModel(config)
    count_parameters(model)

GPT_CONFIG_DROPOUT = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate_emb": 0.1,
    "drop_rate_attn": 0.1,
    "drop_rate_shortcut": 0.1,
    "qkv_bias": False
}

import torch.nn as nn
from gpt import MultiHeadAttention, LayerNorm, FeedForward

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(cfg["emb_dim"], cfg["emb_dim"], cfg["context_length"], cfg["n_heads"], cfg["drop_rate_attn"], cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])

    def forward(self, x):
        x = x + self.drop_shortcut(self.att(self.norm1(x)))
        x = x + self.drop_shortcut(self.ff(self.norm2(x)))
        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate_emb"])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, idx):
        b, t = idx.shape
        tok = self.tok_emb(idx)
        pos = self.pos_emb(torch.arange(t, device=idx.device))
        x = self.drop_emb(tok + pos)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)

import torch
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_DROPOUT)

'''
Implement a transformer block that uses separate dropout rates for embeddings, attention, and shortcut connections.
'''

Feed Forward Params: 4,722,432
Attention Params: 2,360,064

Model: gpt2-small
Total: 163,009,536 | Tied: 124,412,160 | Size: 621.83 MB

Model: gpt2-medium
Total: 406,212,608 | Tied: 354,749,440 | Size: 1549.58 MB

Model: gpt2-large
Total: 838,220,800 | Tied: 773,891,840 | Size: 3197.56 MB

Model: gpt2-xl
Total: 1,637,792,000 | Tied: 1,557,380,800 | Size: 6247.68 MB


AssertionError: d_out must be divisible by num_heads