In [1]:
import torch
import torch.nn as nn
import tiktoken
from ch3 import MultiHeadAttention
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Hyperparameters shared by the model classes below (the "124M" GPT configuration).
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # size of the token vocabulary
    context_length=1024,  # number of positions in the positional embedding
    emb_dim=768,          # embedding width
    n_heads=12,           # attention heads per block
    n_layers=12,          # number of stacked transformer blocks
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # bias flag for the query/key/value projections
)

In [71]:
class DummyTransformerBlock(nn.Module):
    """Placeholder block: owns no parameters and passes its input through."""

    def __init__(self, cfg):
        # `cfg` is accepted but not read by this placeholder.
        super().__init__()

    def forward(self, x):
        """Return `x` unchanged (the very same object)."""
        return x

class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer: performs no normalization at all."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments are accepted but ignored by this placeholder.
        super().__init__()

    def forward(self, x):
        """Return `x` unchanged (the very same object)."""
        return x

In [72]:
class DummyGPTModel(nn.Module):
    """GPT-shaped skeleton built from placeholder blocks and a placeholder norm.

    Only the embeddings, the embedding dropout and the output projection do
    real work; the transformer blocks and the final norm are identity layers.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        placeholder_blocks = [
            DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits over the vocabulary."""
        _, seq_len = in_idx.shape
        # One position id per column, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [73]:
class LayerNorm(nn.Module):
    """Normalize the last dimension to zero mean / unit variance, then rescale.

    `gamma` (initialised to ones) and `beta` (initialised to zeros) are
    learnable per-feature scale and offset; `eps` is added to the variance
    before taking the square root.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(emb_dim))
        self.beta = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by N), the estimator nn.LayerNorm uses.
        sigma_sq = x.var(dim=-1, unbiased=False, keepdim=True)

        centered = x - mu
        scaled = self.gamma * centered
        return scaled / torch.sqrt(sigma_sq + self.eps) + self.beta

In [74]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation formula."""

    def forward(self, x):
        # sqrt(2 / pi), materialised as a tensor on the input's device.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi, device=x.device))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1.0 + torch.tanh(coeff * cubic_term))

In [75]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x `emb_dim`, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width

        expand = nn.Linear(width, hidden_width)    # e.g. 768 -> 3072
        contract = nn.Linear(hidden_width, width)  # e.g. 3072 -> 768
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Apply the MLP; the last dimension of `x` keeps its size."""
        return self.layers(x)

In [76]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear + GELU layers with optional shortcut connections.

    Args:
        layer_sizes: sequence of layer widths. Each consecutive pair
            ``(layer_sizes[i], layer_sizes[i + 1])`` defines one Linear layer,
            so six sizes yield the five layers this notebook uses.
        use_shortcut: when True, a layer's input is added to its output
            whenever the two have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # Derive the layers from consecutive size pairs instead of hard-coding
        # five indices, so the depth follows the length of `layer_sizes`.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(d_in, d_out), GELU())
            for d_in, d_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ])

    def forward(self, x):
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # A shortcut is only possible when the shapes agree (e.g. not for
            # a final layer that changes the feature width).
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

# A function to compute and print gradients
def print_gradients(model, x):
    """Run one forward/backward pass and print the mean absolute weight gradients.

    The loss is the MSE between ``model(x)`` and a single zero target.

    Args:
        model: an ``nn.Module``; gradients are reported for every parameter
            whose name contains ``'weight'``.
        x: input tensor passed to ``model``.
    """
    # Clear gradients left by earlier backward passes; otherwise they
    # accumulate and repeated calls would print inflated values.
    model.zero_grad()

    # Forward pass
    output = model(x)
    # Build the target on the output's device/dtype so the loss also works
    # when the model does not live on the CPU.
    target = torch.tensor([[0.]], device=output.device, dtype=output.dtype)

    # Calculate loss based on how close the target and output are
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)

    # Backward pass to calculate gradients
    loss.backward()

    for name, param in model.named_parameters():
        # Parameters that did not take part in the forward pass have no grad.
        if 'weight' in name and param.grad is not None:
            # Print the mean absolute gradient of the weights
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [77]:
class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``, i.e. the
    LayerNorm runs before the sub-layer and the shortcut adds the
    un-normalized input back.

    Args:
        cfg: config dict providing ``emb_dim``, ``context_length``,
            ``drop_rate`` and ``n_heads``; ``qkv_bias`` is forwarded to the
            attention module when present.
    """

    def __init__(self, cfg):
        super().__init__()
        # Forward `qkv_bias` only when it is configured, so that configs
        # without the key keep MultiHeadAttention's own default. Previously
        # the config entry was silently ignored.
        bias_kwargs = {"qkv_bias": cfg["qkv_bias"]} if "qkv_bias" in cfg else {}
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            **bias_kwargs,
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both sub-layers; the output has the same shape as `x`."""
        # Multi-head attention + residual
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward + residual
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

In [78]:
class TransformerBlockSeperateDropout(nn.Module):
    """Transformer block with separate attention and shortcut dropout rates.

    Args:
        cfg: config dict providing ``emb_dim``, ``context_length``,
            ``n_heads``, ``qkv_bias``, ``attn_drop_rate`` (dropout inside the
            attention module) and ``shortcut_drop_rate`` (dropout applied to
            each sub-layer output before the shortcut is added).
    """

    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["attn_drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["shortcut_drop_rate"])

    def forward(self, x):
        """Apply both sub-layers; the output has the same shape as `x`."""
        # Multi-head attention + residual
        shortcut = x
        x = self.norm1(x)
        x = self.attn(x)
        # The shortcut dropout was configured but never applied before.
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward + residual. This step used to call self.attn a second
        # time, leaving self.ff unused.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

In [79]:
class GPTModel(nn.Module):
    """GPT model: embeddings, a stack of TransformerBlocks, final norm, LM head."""

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg['vocab_size'], cfg['emb_dim']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Output projection to vocabulary logits; it has its own weight
        # matrix (no bias) and is not tied to tok_emb.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits over the vocabulary."""
        _, seq_len = in_idx.shape
        # One position id per column, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [80]:
def createModelAndCalculateSize(conf):
    """Build a GPTModel from `conf` and report how large it is.

    Returns:
        ``(total_params, size_mb)``: the number of parameters and the size in
        megabytes (1024 * 1024 bytes), counting 4 bytes per parameter.
    """
    gpt = GPTModel(conf)

    param_count = 0
    for tensor in gpt.parameters():
        param_count += tensor.numel()

    bytes_per_mb = 1024 * 1024
    size_mb = (param_count * 4) / bytes_per_mb
    return param_count, size_mb

In [81]:
def Generate_text(model, idx, max_new_tokens, context_size):
    """Greedily extend `idx` by `max_new_tokens` token ids.

    Args:
        model: callable mapping a (batch, n_tokens) id tensor to logits of
            shape (batch, n_tokens, vocab_size).
        idx: (batch, n_tokens) tensor holding the current token ids.
        max_new_tokens: number of ids to append.
        context_size: at most this many trailing ids are fed to the model.

    Returns:
        The id tensor with the generated ids appended along dim 1.
    """
    for _step in range(max_new_tokens):
        # Feed only the most recent `context_size` tokens to the model.
        window = idx[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction for the last position matters.
        last_logits = all_logits[:, -1, :]
        probs = torch.softmax(last_logits, dim=-1)
        next_id = probs.argmax(dim=-1, keepdim=True)

        idx = torch.cat([idx, next_id], dim=1)

    return idx

In [84]:
def main():
    """Run every experiment of this notebook in order, printing the results.

    Steps: tokenize and pad a two-prompt batch, run it through DummyGPTModel,
    demonstrate layer normalization by hand and via LayerNorm, check the
    FeedForward output shape, compare gradients with and without shortcut
    connections, run a TransformerBlock and the full GPTModel, report
    parameter counts and memory size (including the larger GPT-2 configs of
    exercise 4.2), generate text greedily, and build the separate-dropout
    model of exercise 4.3.
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    txt1 = "Everybody makes"
    txt2 = "Everyday hold a"

    # The prompts may tokenize to different lengths; pad_sequence right-pads
    # the shorter one with token id 0 so they stack into one 2-D tensor.
    batch = [torch.tensor(tokenizer.encode(txt)) for txt in [txt1, txt2]]
    batch = pad_sequence(batch, batch_first=True, padding_value=0)

    print(batch)

    torch.manual_seed(123)
    model = DummyGPTModel(GPT_CONFIG_124M)
    logits = model(batch)
    print("Output shape:", logits.shape)
    print(logits)

    # Layer Normalization
    torch.manual_seed(123)
    batch_example = torch.randn(2, 5)
    layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
    out = layer(batch_example)
    print(out)

    mean = out.mean(dim=-1, keepdim=True)
    var = out.var(dim=-1, unbiased=False, keepdim=True)
    print("Mean:", mean)
    print("Variance:", var)

    out_norm = (out - mean) / torch.sqrt(var + 1e-5)
    mean = out_norm.mean(dim=-1, keepdim=True)
    # Check with the same (biased) estimator that was used to normalize; the
    # default unbiased estimator would report n / (n - 1) instead of ~1.
    var = out_norm.var(dim=-1, unbiased=False, keepdim=True)
    print("Normalized layer outputs:\n", out_norm)
    print("Mean:\n", mean)
    print("Variance:\n", var)

    # Same values again, printed without scientific notation.
    torch.set_printoptions(sci_mode=False)
    print("Mean:\n", mean)
    print("Variance:\n", var)

    # Trying LayerNorm in practice
    ln = LayerNorm(emb_dim=5)
    out_ln = ln(batch_example)
    mean = out_ln.mean(dim=-1, keepdim=True)
    var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
    print("Mean:\n", mean)
    print("Variance:\n", var)

    # Feed-forward module: output shape should equal the input shape
    ffn = FeedForward(GPT_CONFIG_124M)
    x = torch.rand(2, 3, 768)
    out = ffn(x)
    print(out.shape)

    # First we implement a neural net without shortcut connections
    layer_sizes = [3, 3, 3, 3, 3, 1]
    sample_input = torch.tensor([[1.0, 0., -1.]])
    torch.manual_seed(123)
    model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)

    print("Model gradients without shortcut:")
    print_gradients(model_without_shortcut, sample_input)

    # Now compare with a model that has shortcut connections (same seed, so
    # both models start from identical weights)
    torch.manual_seed(123)
    model_with_shortcut = ExampleDeepNeuralNetwork(
        layer_sizes, use_shortcut=True
    )
    print("Model gradients with shortcuts:")
    print_gradients(model_with_shortcut, sample_input)

    # Instantiating transformer block and feeding it some sample data
    torch.manual_seed(123)
    x = torch.rand(2, 4, 768)
    block = TransformerBlock(GPT_CONFIG_124M)
    output = block(x)

    print("Transfrormer input shape:", x.shape)
    print("Transformer output shape:", output.shape)

    # Sample batch to our GPT model
    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)

    out = model(batch)
    print("Input batch:\n", batch)
    print("\nOutput shape:", out.shape)
    print(out)

    # Analyzing the size of the model we coded up earlier
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters: {total_params:,}")

    print("Token embedding layer shape:", model.tok_emb.weight.shape)
    print("Output layer shape:", model.out_head.weight.shape)

    # Subtract the output head to get the count a weight-tied model would have
    total_params_gpt2 = (
        total_params - sum(p.numel() for p in model.out_head.parameters())
    )
    print(f"Number of trainable parameters "
          f"considering weight tying: {total_params_gpt2:,}")

    # Feed forward module and multi-head attention module amount of parameters
    one_of_transformers = model.trf_blocks[0]
    feed_forward = one_of_transformers.ff
    attention = one_of_transformers.att

    feed_forward_params = sum(p.numel() for p in feed_forward.parameters())
    attention_params = sum(p.numel() for p in attention.parameters())

    print(f"Feed forward has {feed_forward_params:,} trainable weights")
    print(f"Attention has {attention_params:,} trainable weights")

    # Assessing memory requirements (4 bytes per parameter)
    total_size_bytes = total_params * 4
    total_size_mb = total_size_bytes / (1024 * 1024)  # Convert to megabytes
    print(f"Total size of the model: {total_size_mb:.2f} MB")

    # Exercise 4.2: number of parameters and required memory for
    # GPT-2 medium, GPT-2 large and GPT-2 XL
    GPT_CONFIGS = {
        "GPT-2 medium": {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1024,        # Embedding dimension
            "n_heads": 16,          # Number of attention heads
            "n_layers": 24,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        },
        "GPT-2 large": {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1280,        # Embedding dimension
            "n_heads": 20,          # Number of attention heads
            "n_layers": 36,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        },
        "GPT-2 XL": {
            "vocab_size": 50257,    # Vocabulary size
            "context_length": 1024, # Context length
            "emb_dim": 1600,        # Embedding dimension
            "n_heads": 25,          # Number of attention heads
            "n_layers": 48,         # Number of layers
            "drop_rate": 0.1,       # Dropout rate
            "qkv_bias": False       # Query-Key-Value bias
        },
    }

    # The configs above used to be defined but never evaluated. Build each
    # model under the "meta" device: tensors created there carry shapes but
    # no data, so the large variants can be sized without allocating their
    # weights in memory.
    for config_name, config in GPT_CONFIGS.items():
        with torch.device("meta"):
            n_params, size_mb = createModelAndCalculateSize(config)
        print(f"{config_name}: {n_params:,} parameters, {size_mb:.2f} MB")

    # Lets try it out
    start_context = "Everyone is "
    encoded = tokenizer.encode(start_context)
    print("encoded:", encoded)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # Adds batch dimension
    print("encoded_tensor.shape:", encoded_tensor.shape)

    model.eval()  # Eval mode disables random components such as dropout
    out = Generate_text(
        model=model,
        idx=encoded_tensor,
        max_new_tokens=10,
        context_size=GPT_CONFIG_124M["context_length"]
    )
    print("Output:", out)
    print("Output length:", len(out[0]))

    decoded_text = tokenizer.decode(out.squeeze(0).tolist())
    print(decoded_text)

    # Exercise 4.3 Using separate dropout parameters
    GPT_CONFIG_124M_SEPARATE_DROPOUTS = {
        "vocab_size": 50257,        # Vocabulary size
        "context_length": 1024,     # Context length
        "emb_dim": 768,             # Embedding dimension
        "n_heads": 12,              # Number of attention heads
        "n_layers": 12,             # Number of layers
        "emb_drop_rate": 0.1,       # Embeddings dropout rate
        "shortcut_drop_rate": 0.1,  # Shortcut dropout rate
        # Key must be "attn_drop_rate": that is the name
        # TransformerBlockSeperateDropout reads from the config.
        "attn_drop_rate": 0.1,      # Attention dropout rate
        "qkv_bias": False           # Query-Key-Value bias
    }

    class GPTModelSeparateDropoutParameters(nn.Module):
        """GPT model whose embedding, attention and shortcut dropouts are set separately."""

        def __init__(self, cfg):
            super().__init__()
            self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
            self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
            self.drop_emb = nn.Dropout(cfg["emb_drop_rate"])

            self.trf_blocks = nn.Sequential(
                *[TransformerBlockSeperateDropout(cfg) for _ in range(cfg["n_layers"])]
            )

            self.final_norm = LayerNorm(cfg["emb_dim"])
            self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

        def forward(self, in_idx):
            batch_size, seq_len = in_idx.shape

            # Token embeddings
            tok_embeds = self.tok_emb(in_idx)  # (batch, seq_len, emb_dim)

            # Positional embeddings (expand to batch)
            pos_idx = torch.arange(seq_len, device=in_idx.device).unsqueeze(0)  # (1, seq_len)
            pos_embeds = self.pos_emb(pos_idx)  # (1, seq_len, emb_dim)
            pos_embeds = pos_embeds.expand(batch_size, seq_len, -1)  # broadcast to batch

            x = tok_embeds + pos_embeds
            x = self.drop_emb(x)
            x = self.trf_blocks(x)  # pass through all transformer blocks
            x = self.final_norm(x)
            logits = self.out_head(x)  # (batch, seq_len, vocab_size)

            return logits

    # Build the model once so a missing or misspelled config key is caught;
    # the meta device avoids allocating a second full set of weights.
    with torch.device("meta"):
        model_separate_dropout = GPTModelSeparateDropoutParameters(
            GPT_CONFIG_124M_SEPARATE_DROPOUTS
        )
    separate_dropout_params = sum(
        p.numel() for p in model_separate_dropout.parameters()
    )
    print(f"Separate-dropout model parameters: {separate_dropout_params:,}")

In [85]:
# Run the full walkthrough only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__=='__main__':
    main()

tensor([[28172,  1838,     0,     0],
        [ 6109,   820,  1745,   257]])
Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.4662, -0.4357, -0.5804,  ..., -0.4822,  0.1343, -0.6500],
         [-0.9326, -0.2487, -1.8642,  ...,  0.0958, -0.1139,  0.9603],
         [ 0.1749,  1.1411,  0.9494,  ...,  0.5431,  0.5032, -0.5057],
         [-0.1238,  0.7459,  0.9899,  ...,  1.9023, -0.0482, -0.3757]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-1.1791,  0.1868, -0.5684,  ..., -0.5918,  0.4412,  0.5210],
         [ 0.5525,  0.4192, -0.1347,  ...,  0.3754,  0.2626, -0.5079],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
Mean: tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance: tensor([[0.0192],
        [0.0332]], grad_fn=<

In [None]:
import os

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Hugging Face token: read it from the environment instead of hard-coding it
# in the notebook. None means anonymous access, which is sufficient for the
# "gpt2" checkpoint (the original run downloaded it with an empty token).
token = os.environ.get("HF_TOKEN") or None

# Load pretrained tokenizer and model. `token=` replaces the deprecated
# `use_auth_token=` keyword.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", token=token)
model = GPT2LMHeadModel.from_pretrained("gpt2", token=token)

model.eval()  # eval mode disables dropout

# Example prompt
prompt = "Everyone is "
encoded = tokenizer.encode(prompt, return_tensors="pt")  # shape: [1, seq_len]

# Generate text using greedy decoding: at every step append the single most
# likely next token and feed the extended sequence back into the model.
max_new_tokens = 20
generated = encoded
with torch.no_grad():
    for _ in range(max_new_tokens):
        outputs = model(generated)
        next_token_logits = outputs.logits[:, -1, :]  # logits of the last position
        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
        generated = torch.cat((generated, next_token), dim=1)

# Decode to readable text
decoded_text = tokenizer.decode(generated[0].tolist())
print("Generated text:\n", decoded_text)




tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generated text:
 Everyone is  a little bit of a jerk, but I'm not a jerk. I'm a jerk.
