#### **Tranaformer Block**

In [2]:
import sys
import os

# Add the parent directory (where 'transformer_components' folder lives)
sys.path.append("/Users/umesh/Desktop/LLM from Scratch")

In [3]:
from transformer_components.multihead_attention import MultiheadAttention
from transformer_components.positional_encodings import PositionalEncoding
from transformer_components.embeddings import GetEmbeddings
from transformer_components.feed_forward import FeedForward
from transformer_components.activation import GeLU
from transformer_components.layer_norm import LayerNorm 

In [4]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias (W*x + b)
}

In [6]:
import torch 
import torch.nn as nn

#### **Multihead Attention**

In [58]:
class MultiheadAttention(nn.Module):
    def __init__(self, configs):
        super().__init__()
        self.n_heads = configs['n_heads']
        self.head_dim = int(configs['emb_dim'] / configs['n_heads'])
        self.Wq = nn.Linear(configs['emb_dim'], configs['emb_dim'])
        self.Wk = nn.Linear(configs['emb_dim'], configs['emb_dim'])
        self.Wv = nn.Linear(configs['emb_dim'], configs['emb_dim'])
        self.proj = nn.Linear(configs['emb_dim'], configs['emb_dim'])
        self.register_buffer( 'mask',torch.triu(torch.ones(configs['context_length'], configs['context_length']), diagonal=1))
        self.dropout = torch.nn.Dropout(configs['drop_rate'])

    def forward(self, x):
        # shape of x (B, T (context_len), D) 
        B, T, D = x.shape
        query = self.Wq(x)    # (B, T, D)
        key = self.Wk(x)
        value = self.Wv(x)

        # unrolling these weight Q/K/V metrices from (B, T, D) -> (B, T, n_heads, head_dim)
        query = query.view(B, T, self.n_heads, self.head_dim)
        key = key.view(B, T, self.n_heads, self.head_dim)
        value = value.view(B, T, self.n_heads, self.head_dim)

        # transpose (batch, context_len, n_heads, head_dim) -> (batch, n_heads, context_len, head_dim)
        keys = key.transpose(1, 2)
        queries = query.transpose(1, 2)
        values = value.transpose(1, 2)

        # time to calculate attention weights of shape (B, n_heads, context_len, context_len)
        attn_scores = queries @ keys.transpose(2, 3)
        attn_scores = attn_scores.masked_fill_(self.mask.bool()[:T, :T], -torch.inf)
        attn_scores = attn_scores / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        scaled_attn_weights = torch.softmax(attn_scores, dim=-1)
        scaled_attn_weights = self.dropout(scaled_attn_weights)
        outputs = scaled_attn_weights @ values

        # reformat context vectors 
        # (batch, heads, context_len, head_dim) -> (batch, context_len, heads, head_dim)
        outputs = outputs.transpose(1, 2)
        outputs = outputs.contiguous().view(B, T, self.n_heads * self.head_dim)

        context_vector = self.proj(outputs)
        return context_vector

#### **Layer Normalization**

In [59]:
class LayerNorm(nn.Module):
    def __init__(self, configs):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(configs['emb_dim']))
        self.shift = nn.Parameter(torch.zeros(configs['emb_dim']))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

#### **GeLU**

In [60]:
class GeLU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

#### **Feed-Forward**

In [61]:
class FeedForward(nn.Module):
    def __init__(self, configs):
        super().__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(configs['emb_dim'], 4*configs['emb_dim']),
            GeLU(),
            nn.Linear(4*configs['emb_dim'], configs['emb_dim'])
        )

    def forward(self, x):
        return self.feedforward(x)
        

#### **Transformer Block**

In [62]:
import torch 
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, configs):
        super().__init__()
        self.attn = MultiheadAttention(configs)
        self.ff = FeedForward(configs)
        self.norm1 = LayerNorm(configs)
        self.norm2 = LayerNorm(configs)
        self.drop_shortcut = nn.Dropout(configs["drop_rate"])

    def forward(self, x):
        # shape of x is (B, T, D)
        shortcut = x 
        x = self.norm1(x)
        x = self.attn(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x        

#### **GPT-2 Class**

In [63]:
import torch 
import torch.nn as nn

class GPTModel(nn.Module):
    def __init__(self, configs):
        super().__init__()
        # token embeddings
        self.tok_emb = nn.Embedding(configs['vocab_size'], configs['emb_dim'])
        # positionl encodings 
        self.pos_emb = torch.nn.Embedding(configs['context_length'], configs['emb_dim'])
        # dropout
        self.drop_emb = nn.Dropout(configs['drop_rate'])

        # transformer block - encoder only in case of GPT
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(configs) for _ in range(configs['n_layers'])]
        )

        self.final_norm = LayerNorm(configs)
        self.out_head = nn.Linear(
            configs["emb_dim"], configs["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

        

In [64]:
# Initialize the GPT-2 Model 
torch.manual_seed(42) # for reproducibility
model = GPTModel(GPT_CONFIG_124M)
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiheadAttention(
        (Wq): Linear(in_features=768, out_features=768, bias=True)
        (Wk): Linear(in_features=768, out_features=768, bias=True)
        (Wv): Linear(in_features=768, out_features=768, bias=True)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (feedforward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiheadAttention(
        (Wq): Linear(in_features=768, out_feature

In [75]:
input_batch = torch.randint(
    low=0,
    high=GPT_CONFIG_124M["vocab_size"],
    size=(2, 6)          # batch=2, seq_len=6 for testing
).long()

output = model(input_batch)

print(f"Input size : {input_batch.shape}")
print(f"Input tokens : {input_batch}")
print(f"Output shape : {output.shape}")

Input size : torch.Size([2, 6])
Input tokens : tensor([[33985, 38192, 47025, 29388,  8881, 12568],
        [39361,  2038, 22992, 42787, 29012, 30838]])
Output shape : torch.Size([2, 6, 50257])


In [76]:
output.argmax(dim=-1)

tensor([[49714, 46056, 10703, 19770, 34785, 22253],
        [12765, 34706, 50092, 36805,   744, 14868]])

In [71]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,037,184


In [72]:
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Number of trainable parameters considering weight tying: 124,439,808


As we can see, the model is now only 124 million parameters large, matching the original size of the GPT-2 model.

1 KB = 1024 bytes <br>
1 MB = 1024 ร 1024 bytes = 1,048,576 bytes

In [74]:
## Lastly, let us compute the memory requirements of the 163 million parameters in our GPTModel object:

# Each parameter in a PyTorch model (weights, biases) is stored as a 32-bit float (float32) by default.

# float32 uses 4 bytes (because: 32 bits รท 8 = 4 bytes)

total_size_bytes = total_params * 4 #A
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.94 MB


#### **Function to predict next token**

In [84]:
import tiktoken
max_new_tokens = 6

tokenizer = tiktoken.get_encoding('gpt2')

In [96]:
token_ids = torch.randint(0, 50527, (1, 4))
print(token_ids)
token_ids[:, -2:]

tensor([[ 2063, 29761, 27639,  8761]])


tensor([[27639,  8761]])

In [108]:
def predict_next_token(token_ids):
    print(token_ids)
    context_len = 5
    num_iter = 6 
    print(f"Input Sentence : {tokenizer.decode(token_ids.squeeze(dim=0).tolist())}")

    for _ in range(num_iter):
        if len(token_ids) >= context_len:
            input_ids = token_ids[:, -5:]
        else:
            input_ids = token_ids 

        model.eval()
        outputs = model(input_ids)
        # shape of output (B, T, D)
        outputs = outputs[:, -1, :]
        outputs = torch.softmax(outputs, dim=-1)
        outputs = torch.argmax(outputs, dim=-1, keepdim=True)
        token_ids = torch.cat((token_ids, outputs), dim=-1)

    print(f"Output Sentences : {tokenizer.decode(token_ids.squeeze(dim=0).tolist())}")

In [109]:
predict_next_token(torch.randint(0, 50527, (1, 5)))

tensor([[33898, 19318, 20158, 40148, 42755]])
Input Sentence :  acron octgenerationcodes Macro
Output Sentences :  acron octgenerationcodes Macro jourfiction oils Tomato ambul developed
