In [52]:
# make the full gpt module now 
import torch
import torch.nn as nn
import torch.nn.functional as F
from attention_blocks import *
# add the loss to calculate cross entropy in this case 
class GPT(nn.Module): 
    def __init__(self, embed_size, max_seq_len, vocab_size, num_heads, num_layers ):
        super().__init__()
        self.embed_size = embed_size
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers 
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.Embedding needs a int 64 inputs Because nn.Embedding is a lookup table, not a linear layer.
        self.positional_embedding = nn.Embedding(self.max_seq_len, self.embed_size) #lets make this learnable as well
        self.attention_blocks = nn.ModuleList([DecoderOnlyBlock(embedding_dim=embed_size, num_heads=num_heads, 
                                                                dff = 4*embed_size, dropout=0.9) 
                                                                for _ in range(self.num_layers)])
        self.lm_head  = nn.Linear(self.embed_size, self.vocab_size)
        self.layernorm = nn.LayerNorm(self.embed_size)
        
    def forward(self, x, targets=None):
        ## FORWARD SHOULD RETURN THE LOSS HERE SO THAT WE CAN DO BACKPROP 
        # expect x to eb the pretraining data wiht batch size, seq_len 
        batch_size, seq_len = x.shape
        # call the embedding and ocmibine with positional embedding 
        token_embedding = self.embedding(x) # batch_size, seq_len, embed_size
    
        pos_indices = torch.arange(seq_len)
        positional_embedding = self.positional_embedding(pos_indices)
        x = token_embedding+ positional_embedding
        
        # shape batch_size, seq_len,embed_size
        # now we can pass this through the attention modules with the mask 
        mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
         
        # 1, 1, max_seq, max_seq
        # make the causal maask 
        for layer in self.attention_blocks:
            x = layer(x, mask)
            # x same as output batch_size, max_seq, embed_size
        output = self.layernorm(x)
        # otuput shape batch, seq
        logits = self.lm_head(output)

        if targets is None: 
            ce_loss = None
        else:
            # lets say the targets are next tokens ie the tokens shifted by 1 
            # batch, seq_len 
            # and we get the logits as batch, seq_len, vocab 
            targets = targets.view(-1)
            predictions = logits.view(-1,self.vocab_size) 
           
            loss = nn.CrossEntropyLoss()
            ce_loss = loss(predictions, targets)
            # this goes in the backward loss.backward()

        return logits,ce_loss# here y is the last token prob
    # lets add a generate call for this 
    # this is the decoding step which calls the froward and samples the token from this recursively 
    # this is the inference code 
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature = 1.0):
        self.eval()
        #idx shape is previous tokens processsed so (batch_size x seq_len )
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.max_seq_len:]
            logits ,_ = self.forward(idx_cond) # batch_size, seq_len, vocab_size
            
            # get logits from the last time step last seq token basically 
            last_token_logits = logits[:, -1, :] # b , vocab size 
            
            last_token_logits = last_token_logits / temperature
            last_token_probs = torch.softmax(last_token_logits, dim= -1) # the softmax is over all the vocab 
            # sampple the next token torch.multinomial samples an index based on the weights (probabilities)
            last_token = torch.multinomial(last_token_probs, num_samples=1) # B x1 
            # print("logits", logits.shape)
            # print(last_token.shape)
            idx = torch.cat((idx, last_token), dim = 1)
            # B, S+1
        return idx
# Character tokenizer 
import torch 
# This is not a torch.nn.Module because it doesn't have any learnable parameters.
class CharacterTokenizer: 
    def __init__(self, corpus):
        # find all the unique characters in the corpus 
        # map them to a dictionary 
        # reverse map it to the tokens 
        self.vocab = sorted(list(set(corpus.strip())))
        self.vocab_size = len(self.vocab)
        self.char_to_index = {char:i for i,char in enumerate(self.vocab)}
        self.index_to_char  = {i:char for i,char in enumerate(self.vocab)}
    def encode(self, text):
        indices = [self.char_to_index[char] for char in text]
        return torch.tensor(indices)
    def decode(self, indices):
        text = [self.index_to_char[idx.item()] for idx in indices]
        return "".join(text)


corpus = "piebfrhfbrfhchellols374t3842/,',v" 
tokenizer = CharacterTokenizer(corpus)
input_ids = tokenizer.encode("ello")
tokenizer.decode(input_ids)
# It's good practice to specify dtype=torch.long when creating index tensors. 
# PyTorch's nn.Embedding layer, which you'd use next, expects its input to be of type LongTensor.

'ello'

In [53]:
# data loader 
# --- 1. The Dataset ---

corpus = """
Friends, Romans, countrymen, lend me your ears;
I come to bury Caesar, not to praise him.
The evil that men do lives after them;
The good is oft interred with their bones;
So let it be with Caesar.
"""
# --- 2. Data Preparation ---
tokenizer = CharacterTokenizer(corpus)
data = tokenizer.encode(corpus)
# Hyperparameters
VOCAB_SIZE = tokenizer.vocab_size
EMBED_SIZE = 32
MAX_SEQ_LEN = 64
NUM_HEADS = 4
NUM_LAYERS = 3
LEARNING_RATE = 1e-3
TRAINING_STEPS = 20
model = GPT(VOCAB_SIZE, EMBED_SIZE, MAX_SEQ_LEN, NUM_HEADS, NUM_LAYERS)


In [58]:
def get_batch(data, batch_size, block_size):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x ,y 
optimizer = torch.optim.AdamW(model.parameters(), lr= LEARNING_RATE)
loss = torch.nn.CrossEntropyLoss()

# integrate the training loop with this 
for steps in range(100):
    batch_data, batch_targets = get_batch(data, 4, 8)
    # train data = (4, 8 ) 
    predictions, loss = model.forward(batch_data,batch_targets )
    optimizer.zero_grad(set_to_none=True)
    # shape of predictions is same as batch_data 
    #(4, 8)
    loss.backward()
    optimizer.step()
    print(loss )



tensor(3.1098, grad_fn=<NllLossBackward0>)
tensor(3.0527, grad_fn=<NllLossBackward0>)
tensor(3.2153, grad_fn=<NllLossBackward0>)
tensor(3.2053, grad_fn=<NllLossBackward0>)
tensor(3.0135, grad_fn=<NllLossBackward0>)
tensor(3.1087, grad_fn=<NllLossBackward0>)
tensor(3.1025, grad_fn=<NllLossBackward0>)
tensor(2.9334, grad_fn=<NllLossBackward0>)
tensor(3.2142, grad_fn=<NllLossBackward0>)
tensor(3.0939, grad_fn=<NllLossBackward0>)
tensor(3.1510, grad_fn=<NllLossBackward0>)
tensor(3.1032, grad_fn=<NllLossBackward0>)
tensor(3.0563, grad_fn=<NllLossBackward0>)
tensor(3.1462, grad_fn=<NllLossBackward0>)
tensor(3.2239, grad_fn=<NllLossBackward0>)
tensor(3.0631, grad_fn=<NllLossBackward0>)
tensor(2.9085, grad_fn=<NllLossBackward0>)
tensor(3.2700, grad_fn=<NllLossBackward0>)
tensor(2.9816, grad_fn=<NllLossBackward0>)
tensor(2.9221, grad_fn=<NllLossBackward0>)
tensor(3.2798, grad_fn=<NllLossBackward0>)
tensor(3.1600, grad_fn=<NllLossBackward0>)
tensor(3.0454, grad_fn=<NllLossBackward0>)
tensor(3.18

In [56]:
# just to show that the loss works with predictions batchs_size, ) and (batch_Size, probs ) tensors as well

import torch
import torch.nn as nn

# 1. Initialize the loss function
loss_fn = nn.CrossEntropyLoss()

# 2. Create tensors with the exact shapes you described
predictions = torch.randn(32, 64) # 32 items, 64 classes
targets = torch.randint(0, 64, (32,)) # 32 correct answers (indices from 0 to 63)

# 3. Calculate the loss
loss = loss_fn(predictions, targets)

print(f"Predictions shape: {predictions.shape}")
print(f"Targets shape: {targets.shape}")
print(f"Calculated Loss: {loss.item()}") # This will run without any errors

Predictions shape: torch.Size([32, 64])
Targets shape: torch.Size([32])
Calculated Loss: 4.50061559677124


In [9]:
# byte pair encoding 
#BPE is a tokenization algorithm that creates a vocabulary of subwords (parts of words) instead of just
#  individual characters. It starts with a vocabulary of single characters and iteratively merges the most 
# frequently occurring adjacent pairs of tokens into a new, single token.
# Compared to character tokenizer, this is betetr because we increase the vocabulary size and decrease the seq length and this makes the trainign and infernece d\fatser, 
from  collections import Counter
initial_splits = [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
pair = ('e', 's')
def merge_pair(initial_splits, pair):
    new_word_splits = []
    for word in initial_splits:
        i = 0 
        new_word = []
        while i < len(word):
            # check if we meet the condition 
            if i<len(word)-1 and (word[i], word[i+1]) == pair:
                new_word.append("".join([word[i], word[i+1]]))
                i+=2
            else:
                new_word.append(word[i])
                i+=1
        new_word_splits.append(new_word)

    return new_word_splits
import collections# to do this iteratively we will get the best freq so far 
def get_stats(initial_splits):
    c = collections.defaultdict(int)
    for word in initial_splits:
        i = 0 
        while i<len(word)-1:
            pair = (word[i], word[i+1])
            c[pair] +=1
            i+=1
    #sorted_pairs = sorted(c.items(), key = lambda x:x[1], reverse = True)
    return c
splits = initial_splits
for i in range(7):
    print(f"--- Iteration {i+1} ---")
    print(f"Current splits: {splits}")
    pair_stats = get_stats(initial_splits)
    best_pair = max(pair_stats, key=pair_stats.get)
    initial_splits = merge_pair(initial_splits,best_pair)
    print(f"Most frequent pair: {best_pair} (count: {pair_stats[best_pair]})")
    
print(initial_splits)

--- Iteration 1 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('e', 's') (count: 3)
--- Iteration 2 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('es', 't') (count: 2)
--- Iteration 3 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('t', 'est') (count: 1)
--- Iteration 4 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('b', 'est') (count: 1)
--- Iteration 5 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('l', 'es') (count: 1)
--- Iteration 6 ---
Current splits: [['t', 'e', 's', 't'], ['b', 'e', 's', 't'], ['l', 'e', 's', 's', 'o', 'n']]
Most frequent pair: ('les', 's') (count: 1)
--- Iteration 7 ---
Current splits: [['t', 'e', 's', 't'], ['b