# 4. Implementing a GPT model from scratch to generate text

In [16]:
from dataclasses import dataclass, asdict

import tiktoken
import torch
import torch.nn as nn

## 4.1 Coding an LLM architecture

In [17]:
@dataclass
class GPTConfig:
    """Hyperparameters shared by the GPT model components in this notebook."""

    # Number of rows in the token embedding table and width of the output logits.
    vocabulary_size: int
    # Number of rows in the positional embedding table.
    context_length: int
    # Width of the token and positional embedding vectors.
    embedding_dim: int
    # Attention head count handed to every transformer block.
    n_heads: int
    # Number of transformer blocks stacked inside the model.
    n_transformers: int
    # Probability given to the nn.Dropout layers.
    dropout_rate: float
    # Bias flag for the query/key/value projections -- presumably consumed by
    # the attention implementation; confirm against that module.
    qkv_bias: bool


# Configuration instance used by every model built in this notebook. The name
# refers to the 124M-parameter GPT size.
GPT_CONFIG_124M = GPTConfig(
    vocabulary_size=50257,  # presumably the size of the "gpt2" tiktoken vocabulary -- confirm
    context_length=1024,
    embedding_dim=768,
    n_heads=12,
    n_transformers=12,
    dropout_rate=0.1,
    qkv_bias=False
)

In [13]:
class DummyGPTModel(nn.Module):
    """Skeleton GPT model: real embeddings and output head, placeholder internals.

    Token and positional embeddings, embedding dropout and the final linear
    head are real layers. The transformer blocks and the final normalisation
    are the ``Dummy*`` placeholder modules, which return their input unchanged.

    Args:
        vocabulary_size: Rows of the token embedding and width of the logits.
        embedding_dim: Width of the embedding vectors.
        context_length: Rows of the positional embedding table.
        n_heads: Forwarded to each placeholder block.
        qkv_bias: Forwarded to each placeholder block.
        n_transformers: Number of placeholder blocks to stack.
        dropout_rate: Probability for the embedding dropout; also forwarded
            to each placeholder block.
    """

    def __init__(
        self,
        vocabulary_size,
        embedding_dim,
        context_length,
        n_heads,
        qkv_bias,
        n_transformers,
        dropout_rate
    ):
        super().__init__()
        self.token_embedding_layer = nn.Embedding(vocabulary_size, embedding_dim)
        self.position_embedding_layer = nn.Embedding(context_length, embedding_dim)
        self.embedding_dropout_layer = nn.Dropout(dropout_rate)

        blocks = []
        for _ in range(n_transformers):
            blocks.append(
                DummyTransformerBlock(
                    n_heads=n_heads,
                    embedding_dim=embedding_dim,
                    context_dim=embedding_dim,
                    context_length=context_length,
                    dropout_rate=dropout_rate,
                    qkv_bias=qkv_bias,
                )
            )
        self.transformer_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(embedding_dim)
        self.out_head = nn.Linear(embedding_dim, vocabulary_size, bias=False)

    def forward(self, token_ids):
        """Map a 2-D tensor of token ids to logits over the vocabulary."""
        # Unpacking (rather than indexing) keeps the requirement that the
        # input has exactly two dimensions.
        _, sequence_length = token_ids.shape

        # Position ids are created on the same device as the input ids.
        positions = torch.arange(sequence_length, device=token_ids.device)
        embeddings = self.embedding_dropout_layer(
            self.token_embedding_layer(token_ids)
            + self.position_embedding_layer(positions)
        )

        hidden = self.final_norm(self.transformer_blocks(embeddings))
        return self.out_head(hidden)


class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back untouched.

    The constructor accepts a set of hyperparameters but ignores every one
    of them; no submodules or parameters are created.
    """

    def __init__(
        self, n_heads, embedding_dim, context_dim, context_length, dropout_rate, qkv_bias
    ):
        super().__init__()

    def forward(self, x):
        """Return ``x`` as-is (identity)."""
        return x
    

class DummyLayerNorm(nn.Module):
    """Placeholder normalisation layer that performs no normalisation.

    ``normalized_shape`` and ``eps`` are accepted but ignored.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()

    def forward(self, x):
        """Return ``x`` as-is (identity)."""
        return x

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
# Encode each prompt and stack the id tensors along a new leading dimension;
# stacking requires both prompts to encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [19]:
# Seed before building the model so the randomly initialised weights are
# reproducible. The model is left in training mode, so the embedding dropout
# in forward also draws from this seeded generator.
torch.manual_seed(123)
# asdict() turns the dataclass into keyword arguments for DummyGPTModel.__init__.
dummy_gpt_model = DummyGPTModel(**asdict(GPT_CONFIG_124M))
logits = dummy_gpt_model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)


## 4.2 Normalizing activations with layer normalization

In [35]:
# Two random 5-feature examples pushed through a small Linear + ReLU layer.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
output = layer(batch_example)
print(output)

# Per-example statistics over the feature (last) dimension; unbiased=False
# divides by N rather than N - 1.
mean = torch.mean(output, dim=-1, keepdim=True)
var = torch.var(output, dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0192],
        [0.0332]], grad_fn=<VarBackward0>)


In [36]:
# Standardise each row: subtract its mean and divide by its standard deviation.
normalized_output = (output - mean) / var.sqrt()
normalized_mean = torch.mean(normalized_output, dim=-1, keepdim=True)
normalized_var = torch.var(normalized_output, dim=-1, unbiased=False, keepdim=True)

# Turn off scientific notation so near-zero means print as 0.0000.
torch.set_printoptions(sci_mode=False)
print("Normalized layer outputs:\n", normalized_output)
print("Mean:\n", normalized_mean)
print("Variance:\n", normalized_var)

Normalized layer outputs:
 tensor([[ 0.6746,  1.5474, -0.9551,  0.6433, -0.9551, -0.9551],
        [-0.0207,  0.1228, -1.1915,  1.6621,  0.6186, -1.1915]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [38]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        embedding_dim: Size of the last dimension of the inputs; also the
            length of the ``scale`` and ``shift`` parameters.
        eps: Small constant added to the variance before the square root so
            the division stays finite when a row has zero variance. Defaults
            to 1e-5, the value that was previously hard-coded.
    """

    def __init__(self, embedding_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale of ones, shift of zeros.
        self.scale = nn.Parameter(torch.ones(embedding_dim))
        self.shift = nn.Parameter(torch.zeros(embedding_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * normalized_x + self.shift

In [39]:
# Apply the LayerNorm module to the raw example batch and confirm that every
# row comes out with (approximately) zero mean and unit variance.
layer_norm = LayerNorm(embedding_dim=5)
layer_norm_output = layer_norm(batch_example)
layer_norm_mean = torch.mean(layer_norm_output, dim=-1, keepdim=True)
layer_norm_var = torch.var(layer_norm_output, dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", layer_norm_mean)
print("Variance:\n", layer_norm_var)

Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


## 4.3 Implementing a feed forward network with GELU activations

In [7]:
class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation.

    Evaluates ``0.5 * x * (1 + tanh(c * (x + a * x**3)))`` element-wise.
    """

    # Constants of the tanh approximation; 0.79788456 is sqrt(2 / pi) rounded.
    _TANH_SCALE = 0.79788456
    _CUBIC_COEFF = 0.044715

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        inner = x + self._CUBIC_COEFF * x**3
        gate = 1 + torch.tanh(self._TANH_SCALE * inner)
        return 0.5 * x * gate

In [8]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: expand to 4x width, GELU, project back.

    The linear layers act on the last dimension, so the output has the same
    shape as the input.

    Args:
        cfg: Configuration object exposing ``embedding_dim`` as an attribute,
            such as the ``GPTConfig`` dataclass. The dataclass is not
            subscriptable, so the field is read with attribute access.
    """

    def __init__(self, cfg):
        super().__init__()
        embedding_dim = cfg.embedding_dim
        hidden_dim = 4 * embedding_dim
        self.layers = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, embedding_dim),
        )

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.layers(x)

## 4.4 Adding shortcut connections

In [9]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear + GELU layers with optional shortcut additions.

    Args:
        layer_sizes: Sequence of widths; consecutive pairs give the input and
            output size of each Linear layer.
        use_shortcut: When true, a layer's input is added to its output
            whenever the two have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        stages = []
        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
            stages.append(
                nn.Sequential(nn.Linear(in_features, out_features), GELU())
            )
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        """Pass ``x`` through every layer, adding shortcuts where shapes allow."""
        for layer in self.layers:
            layer_output = layer(x)
            # A shortcut is only possible when the layer preserves the shape.
            add_shortcut = self.use_shortcut and x.shape == layer_output.shape
            x = x + layer_output if add_shortcut else layer_output
        return x

In [10]:

def print_gradients(model, x):
    """Run one forward/backward pass and print the mean absolute weight gradients.

    The model output is compared against a fixed ``[[0.]]`` target with mean
    squared error. After backpropagation, one line is printed for every
    parameter whose name contains ``'weight'``. Gradients are accumulated
    into the model's parameters as a side effect.
    """
    prediction = model(x)
    zero_target = torch.tensor([[0.]])

    criterion = nn.MSELoss()
    criterion(prediction, zero_target).backward()

    weight_params = (
        (name, param)
        for name, param in model.named_parameters()
        if 'weight' in name
    )
    for name, param in weight_params:
        print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

# Five Linear + GELU layers: four 3 -> 3 layers (shape-preserving, so a
# shortcut can be added) followed by a 3 -> 1 layer (shape changes, so it cannot).
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])

# Baseline without shortcut connections, currently disabled; uncomment to
# print its gradients for comparison.
# torch.manual_seed(123)
# model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=False)
# print_gradients(model_without_shortcut, sample_input)

# Same seed as the disabled baseline so both variants would start from the
# same initial weights.
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, use_shortcut=True)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.22169798612594604
layers.1.0.weight has gradient mean of 0.2069411277770996
layers.2.0.weight has gradient mean of 0.3289700150489807
layers.3.0.weight has gradient mean of 0.26657330989837646
layers.4.0.weight has gradient mean of 1.3258544206619263


## 4.5 Connecting attention and linear layers in a transformer block

In [None]:
from gpt import MultiHeadAttention

class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers, each with a shortcut.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``, so the
    layer normalisation runs before the sub-layer.

    Args:
        cfg: Configuration object exposing ``embedding_dim``,
            ``context_length``, ``n_heads``, ``dropout_rate`` and ``qkv_bias``
            as attributes, such as the ``GPTConfig`` dataclass. The dataclass
            is not subscriptable, so fields are read with attribute access.
            The same object is passed on to ``FeedForward``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiHeadAttention(
            d_in=cfg.embedding_dim,
            d_out=cfg.embedding_dim,
            context_length=cfg.context_length,
            num_heads=cfg.n_heads,
            dropout=cfg.dropout_rate,
            qkv_bias=cfg.qkv_bias
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg.embedding_dim)
        self.norm2 = LayerNorm(cfg.embedding_dim)
        # One dropout module is reused on both shortcut branches.
        self.drop_shortcut = nn.Dropout(cfg.dropout_rate)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with shortcut additions."""
        # Attention sub-layer.
        shortcut = x
        x = self.norm1(x)
        x = self.attn(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-layer.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

In [12]:
# Random input whose last dimension (768) has to equal
# GPT_CONFIG_124M.embedding_dim. No seed is set here, so the values differ
# between runs; only the shapes are inspected.
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

# The block is expected to preserve the input shape.
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


## 4.6 Coding the GPT model

In [None]:
class GPTModel(nn.Module):
    """GPT model: embeddings, a stack of transformer blocks, final norm and output head.

    Args:
        cfg: Configuration object exposing ``vocabulary_size``,
            ``context_length``, ``embedding_dim``, ``dropout_rate`` and
            ``n_transformers`` as attributes, such as the ``GPTConfig``
            dataclass. The dataclass is not subscriptable, so fields are read
            with attribute access. The same object is passed on to every
            ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg.vocabulary_size, cfg.embedding_dim)
        self.pos_emb = nn.Embedding(cfg.context_length, cfg.embedding_dim)
        self.drop_emb = nn.Dropout(cfg.dropout_rate)

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg.n_transformers)]
        )

        self.final_norm = LayerNorm(cfg.embedding_dim)
        # Separate (untied) projection from hidden states to vocabulary logits.
        self.out_head = nn.Linear(cfg.embedding_dim, cfg.vocabulary_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits over the vocabulary."""
        # Unpacking requires the input to have exactly two dimensions.
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        # Position ids are created on the same device as the input ids.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [None]:
# Seed before construction so the randomly initialised weights are
# reproducible. The model is left in training mode here, so dropout is active
# during this forward pass.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# `batch` is the stacked token-id tensor built from the two example prompts.
output = model(batch)
print(f"Input batch:\n {batch}")
print(f"Output shape: {output.shape}")
print(output)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


In [17]:
# Count every element of every parameter tensor. out_head is a separate
# nn.Linear from tok_emb, so its vocabulary_size * embedding_dim weights are
# counted in addition to the token embedding table.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


## 4.7 Generating text

In [18]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable returning logits indexed as ``[batch, position, vocab]``
            for a 2-D tensor of token ids.
        idx: 2-D tensor of token ids to continue.
        max_new_tokens: Number of tokens to append to each row.
        context_size: Only the last ``context_size`` columns of ``idx`` are
            fed to the model at each step.

    Returns:
        ``idx`` with the generated token ids concatenated along dimension 1.
    """
    for _ in range(max_new_tokens):
        # Crop to the most recent tokens the model is allowed to see.
        window = idx[:, -context_size:]
        with torch.no_grad():
            # Only the logits at the final position predict the next token.
            last_logits = model(window)[:, -1, :]

        next_probs = torch.softmax(last_logits, dim=-1)
        next_token = next_probs.argmax(dim=-1, keepdim=True)
        idx = torch.cat((idx, next_token), dim=1)

    return idx

In [19]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
# Wrap the id list in an outer list to add the leading batch dimension.
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


In [None]:
# Switch to evaluation mode so the dropout layers are disabled while generating.
model.eval()
output = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    # GPTConfig is a dataclass, so the field is read as an attribute;
    # subscripting it raises TypeError.
    context_size=GPT_CONFIG_124M.context_length
)
print("Output:", output)
print("Output length:", len(output[0]))

Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10


In [None]:
# Drop the batch dimension and convert the ids to a plain Python list
# before handing them to the tokenizer.
generated_ids = output.squeeze(0).tolist()
decoded_text = tokenizer.decode(generated_ids)
print(decoded_text)

Hello, I am Featureiman Byeswickattribute argue
