In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers

from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import json

# Step 1: Load first sample from the .jsonl file
with open("/kaggle/input/the-natural-questions-dataset/simplified-nq-train.jsonl", "r") as f:
    first_line = f.readline()
    sample = json.loads(first_line)

# Step 2: Extract fields
question = sample['question_text']
context_tokens = sample['document_text'].split()  # space-tokenized
annotations = sample['annotations'][0] if sample['annotations'] else None

# Step 3: Extract short answer using start/end tokens
if annotations and annotations['short_answers']:
    start = annotations['short_answers'][0]['start_token']
    end = annotations['short_answers'][0]['end_token']
    answer_tokens = context_tokens[start:end]
    answer = ' '.join(answer_tokens)
else:
    answer = "[NO SHORT ANSWER FOUND]"

# Step 4: Join full context (you may truncate later for max length)
context = ' '.join(context_tokens)

# Step 5: Construct prompt format
prompt = f"Question: {question}\nContext: {context}\nAnswer:"

# Step 6: Combine prompt + answer
full_input = f"{prompt} {answer}"

# Step 7: Print final result
print("===== GPT Training Sample =====")
print(full_input[:1000])  # Truncated for readability


===== GPT Training Sample =====
Question: which is the most common use of opt-in e-mail marketing
Context: Email marketing - Wikipedia <H1> Email marketing </H1> Jump to : navigation , search <Table> <Tr> <Td> </Td> <Td> ( hide ) This article has multiple issues . Please help improve it or discuss these issues on the talk page . ( Learn how and when to remove these template messages ) <Table> <Tr> <Td> </Td> <Td> This article needs additional citations for verification . Please help improve this article by adding citations to reliable sources . Unsourced material may be challenged and removed . ( September 2014 ) ( Learn how and when to remove this template message ) </Td> </Tr> </Table> <Table> <Tr> <Td> </Td> <Td> This article possibly contains original research . Please improve it by verifying the claims made and adding inline citations . Statements consisting only of original research should be removed . ( January 2015 ) ( Learn how and when to remove this template message ) </Td> 

In [3]:
import os
import random
from tqdm import tqdm


In [4]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": True       # Query-Key-Value bias
}

## Normalization, GELU and Feed Forward

In [5]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), ## Expansion
            GELU(), ## Activation
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), ## Contraction
        )

    def forward(self, x):
        return self.layers(x)

## Multi-Head Attention

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=True):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

## Transformer Block

In [7]:
  class TransformerBlock(nn.Module):
      def __init__(self, cfg):
          super().__init__()
          self.att = MultiHeadAttention(
              d_in=cfg["emb_dim"],
              d_out=cfg["emb_dim"],
              context_length=cfg["context_length"],
              num_heads=cfg["n_heads"],
              dropout=cfg["drop_rate"],
              qkv_bias=cfg["qkv_bias"])
          self.ff = FeedForward(cfg)
          self.norm1 = LayerNorm(cfg["emb_dim"])
          self.norm2 = LayerNorm(cfg["emb_dim"])
          self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

      def forward(self, x):
          # Shortcut connection for attention block
          shortcut = x
          x = self.norm1(x)
          x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
          x = self.drop_shortcut(x)
          x = x + shortcut  # Add the original input back

          # Shortcut connection for feed forward block
          shortcut = x
          x = self.norm2(x)
          x = self.ff(x)
          # 2*4*768
          x = self.drop_shortcut(x)
          x = x + shortcut  # Add the original input back

          return x
          # 2*4*768

## GPT Class

In [8]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [9]:
model = GPTModel(GPT_CONFIG_124M)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,037,184


### Loading Weights

In [10]:
import os
import json
import numpy as np
import tensorflow as tf

def load_gpt2_from_kaggle(model_size, kaggle_dataset_path="/kaggle/input/openai-gpt2-weights/124M"):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Construct model directory path inside Kaggle dataset
    model_dir = os.path.join(kaggle_dataset_path, model_size)

    # Check if directory exists
    if not os.path.isdir(model_dir):
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    # Load hyperparameters
    hparams_path = os.path.join(model_dir, "hparams.json")
    if not os.path.exists(hparams_path):
        raise FileNotFoundError(f"hparams.json not found in {model_dir}")
    settings = json.load(open(hparams_path))

    # Load TensorFlow checkpoint
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if tf_ckpt_path is None:
        raise FileNotFoundError(f"No checkpoint found in {model_dir}")
    
    # Extract parameters from checkpoint
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params

def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))
        variable_name_parts = name.split("/")[1:]  # Skip 'model/' prefix

        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})
        
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params


2025-07-25 09:46:36.823720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753436797.168342      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753436797.260983      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:

settings, params = load_gpt2_from_kaggle(model_size="124M")
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [12]:
import numpy as np

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))
    
def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [13]:
load_weights_into_gpt(model,params)

Weights loaded finally hehe!!!!!!!

## Create Formatted Sample from QA Dataset

In [14]:
import json
import re

def clean_text(text):
    """Removes HTML tags and common entities from a text string."""
    text = re.sub(r"<.*?>", "", text)          # Remove HTML tags
    text = re.sub(r"&nbsp;|&amp;", " ", text)  # Replace common HTML entities
    return text.strip()

def extract_context_with_answer(tokens, start, end, window=256):
    """Extract a window of context centered around the short answer span."""
    context_len = len(tokens)
    mid = (start + end) // 2
    half_window = window // 2
    start_idx = max(0, mid - half_window)
    end_idx = min(context_len, start_idx + window)
    start_idx = max(0, end_idx - window)
    return tokens[start_idx:end_idx]

samples = []
N=600
with open("/kaggle/input/the-natural-questions-dataset/simplified-nq-train.jsonl", "r") as f:
    for i, line in enumerate(f):
        if len(samples) >= N: break
        example = json.loads(line)

        # Skip examples without short answers
        if not example['annotations'] or not example['annotations'][0]['short_answers']:
            continue

        # Raw, uncleaned tokens (needed for indexing)
        raw_tokens = example['document_text'].split()
        question = example['question_text']
        sa = example['annotations'][0]['short_answers'][0]
        start, end = sa['start_token'], sa['end_token']

        # Extract answer from raw tokens (MUST be done before cleaning)
        answer_tokens = raw_tokens[start:end]
        answer = " ".join(answer_tokens)

        # Now clean both answer and context
        cleaned_answer = clean_text(answer)
        context_tokens = extract_context_with_answer(raw_tokens, start, end, window=256)
        cleaned_context = clean_text(" ".join(context_tokens))

        samples.append({
            "question": question,
            "context": cleaned_context,
            "answer": cleaned_answer
        })


In [15]:
samples[10]

{'question': 'when do the eclipse supposed to take place',
 'context': 'Solar eclipse of August 21 , 2017 - Wikipedia  Solar eclipse of August 21 , 2017  Jump to : navigation , search    Solar eclipse of August 21 , 2017     Totality as seen from Simpsonville , South Carolina     Map     Type of eclipse     Nature   Total     Gamma   0.4367     Magnitude   1.0306     Maximum eclipse     Duration   160 sec ( 2 m 40 s )       37 ° 00 ′ N 87 ° 42 ′ W \ufeff / \ufeff 37 ° N 87.7 ° W \ufeff / 37 ; - 87.7     Max . width of band   115 km ( 71 mi )     Times ( UTC )     ( P1 ) Partial begin   15 : 46 : 48     ( U1 ) Total begin   16 : 48 : 32     Greatest eclipse   18 : 26 : 40     ( U4 ) Total end   20 : 01 : 35     ( P4 ) Partial end   21 : 04 : 19     References     Saros   145 ( 22 of 77 )',
 'answer': 'August 21 , 2017'}

## Data Loader

In [16]:
class GPTQADataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=512):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        prompt = f"Question: {sample['question']}\nContext: {sample['context']}\nAnswer:"
        answer = sample["answer"]
        full = prompt + " " + answer

        encodings = self.tokenizer(full,
                                   max_length=self.max_length,
                                   truncation=True,
                                   padding="max_length",
                                   return_tensors="pt")

        labels = encodings.input_ids.clone()
        prompt_len = len(self.tokenizer(prompt, truncation=True)["input_ids"])
        labels[0][:prompt_len] = -100  # Ignore loss on prompt
        valid_label_count = (labels != -100).sum().item()
        if valid_label_count == 0:
            print("⚠️ WARNING: All labels are -100 (ignored in loss). Check prompt length.")
            print(f"Prompt: {prompt}")
            print(f"Answer: {answer}")
            print(f"Label shape: {labels.shape}")
            print(f"Labels: {labels}")


        return encodings.input_ids.squeeze(), labels.squeeze()


from torch.utils.data import DataLoader


def create_qa_dataloader(samples, tokenizer, batch_size=4, shuffle=True):
    dataset = GPTQADataset(samples, tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [17]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Ensure compatibility


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [18]:
from sklearn.model_selection import train_test_split

train_samples, val_samples = train_test_split(samples, test_size=0.1, random_state=42)

train_loader = create_qa_dataloader(train_samples, tokenizer, batch_size=4)
val_loader = create_qa_dataloader(val_samples, tokenizer, batch_size=4, shuffle=False)


## Training and Evaluation

In [19]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [20]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context

    ###Input batch:
 ###tensor([[6109, 3626, 6100,  345],
        ##[6109, 1110, 6622,  257]])

    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond) ### batch, n_tokens, vocab_size

        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())


In [None]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in the input_batch.
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]  # max sequence length (block size)

    # Encode the prompt
    encoded = tokenizer.encode(start_context, return_tensors="pt").to(device)

    # Generate continuation
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model,
            idx=encoded,
            max_new_tokens=50,
            context_size=context_size
        )

    # Decode and print
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print("📌 Sample Output:")
    print(decoded_text.replace("\n", " "))  # compact view

    model.train()


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
device

In [None]:
###### Note:
# Uncomment the following code to calculate the execution time
import time
start_time = time.time()

torch.manual_seed(123)
# model = GPTModel(GPT_CONFIG_124M)
# model.to(device)›
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=10,        # evaluate every N steps
    eval_iter=10,        # how many batches to use for quick val
    start_context="Question: What is the capital of France?\nContext: France is in Europe. Its capital is Paris.\nAnswer:",
    tokenizer=tokenizer
)

# Note:
# Uncomment the following code to show the execution time
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000090): Train loss 0.005, Val loss 0.000
Ep 1 (Step 000100): Train loss 0.002, Val loss 0.001
Ep 1 (Step 000110): Train loss 0.000, Val loss 0.001
Ep 1 (Step 000120): Train loss 0.002, Val loss 0.002
Ep 1 (Step 000130): Train loss 0.001, Val loss 0.001
📌 Sample Output:
Question: What is the capital of France? Context: France is in Europe. Its capital is Paris. Answer:::::::::::::::::::::::::::::::::::::::::::::::::::
Ep 2 (Step 000140): Train loss 0.009, Val loss 0.001
Ep 2 (Step 000150): Train loss 0.001, Val loss 0.000
Ep 2 (Step 000160): Train loss 0.000, Val loss 0.001
Ep 2 (Step 000170): Train loss 0.000, Val loss 0.000
Ep 2 (Step 000180): Train loss 0.027, Val loss 0.000
Ep 2 (Step 000190): Train loss 0.000, Val loss 0.000
Ep 2 (Step 000200): Train loss 0.000, Val loss 0.000
Ep 2 (Step 000210): Train loss 0.004, Val loss 0.000
Ep 2 (Step 000220): Train loss 0.000, Val loss 0.000
Ep 2 (Step 000230): Train loss 0.021, Val loss 0.000
Ep 2 (Step 000240): Train loss 0.001,

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # only show integer labels on x-axis

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig("loss-plot.pdf")
    plt.show()

epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Question: Who is the founder of Microsoft?\nContext: Bill Gates founded Microsoft.\nAnswer:", 
    tokenizer=tokenizer
)
