### Orion Peeters

#### 208409565

In [1]:
# Colab-only step: mount Google Drive at /content/drive so that later cells
# can read files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
import tensorflow
import tqdm
import json
import os
import sys
import requests
import time
from functools import partial

In the next chunk, we automatically select the fastest available hardware for running the project: we check for an NVIDIA GPU first, then for Apple Silicon, and fall back to the CPU if no accelerator is found.

In [3]:
# Choose the compute device in order of preference: CUDA GPU, Apple MPS, CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif not torch.backends.mps.is_available():
    # No accelerator present at all
    device = torch.device("cpu")
else:
    # MPS is only selected for PyTorch 2.9 or newer; older releases stay on the CPU
    major, minor = (int(part) for part in torch.__version__.split(".")[:2])
    device = torch.device("mps" if (major, minor) >= (2, 9) else "cpu")

print("Device:", device)

Device: cuda


This code converts raw text into a training format using a "sliding window" approach.

In [4]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token chunks.

    The text is tokenized once. Every window of ``max_length`` tokens becomes
    an input tensor, and the same window shifted one token to the right
    becomes the matching target tensor.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer

        # Tokenize the whole text in a single call
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window start offsets, `stride` tokens apart. Stopping before
        # len - max_length leaves room for the shifted target of the last window.
        starts = range(0, len(token_ids) - max_length, stride)

        # What the model sees ...
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in starts
        ]
        # ... and what it has to predict (same window, one position later)
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

# Helper function to easily create the data loader
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window chunks of ``txt``.

    The text is tokenized with the GPT-2 encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.
    """
    windows = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )

    # The DataLoader takes care of batching
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

This class implements the core "intelligence" mechanism of the Transformer, splitting the input into multiple "heads" to attend to different parts of the sentence simultaneously, using an optimized PyTorch function to speed up the calculations.

In [5]:
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are projected from the input, split into
    ``num_heads`` heads, passed through PyTorch's
    ``scaled_dot_product_attention`` with a causal mask, merged again and sent
    through an output projection. ``context_length`` is accepted but not used
    by this implementation.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Width of a single head (e.g., 768 / 12 = 64)
        self.head_dim = d_out // num_heads

        # Projections that produce queries, keys and values
        self.W_query = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = torch.nn.Linear(d_in, d_out, bias=qkv_bias)

        # Projection applied after the heads are merged back together
        self.out_proj = torch.nn.Linear(d_out, d_out)
        # Only the rate (`.p`) of this module is read in forward()
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        batch, seq_len, _ = x.shape
        per_head_shape = (batch, seq_len, self.num_heads, self.head_dim)

        def split_heads(projected):
            # (batch, tokens, d_out) -> (batch, heads, tokens, head_dim)
            return projected.view(per_head_shape).transpose(1, 2)

        k = split_heads(self.W_key(x))
        q = split_heads(self.W_query(x))
        v = split_heads(self.W_value(x))

        # Attention dropout is active only while training
        drop_p = self.dropout.p if self.training else 0

        # Replaces the manual dot-product, masking and softmax
        attended = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, is_causal=True, dropout_p=drop_p
        )

        # (batch, heads, tokens, head_dim) -> (batch, tokens, d_out)
        merged = attended.transpose(1, 2).reshape(batch, seq_len, self.d_out)
        return self.out_proj(merged)

This layer normalizes the numbers inside each token's embedding to have a stable average and spread, which prevents the math from crashing (exploding/vanishing gradients) during deep network training.

In [6]:
class LayerNorm(torch.nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        # Per-vector statistics along the last axis (population variance)
        mu = x.mean(dim=-1, keepdim=True)
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)

        # Center, then divide by the eps-stabilized standard deviation
        std = torch.sqrt(sigma_sq + self.eps)
        normalized = (x - mu) / std

        # Learnable affine transform
        return self.scale * normalized + self.shift

This is the activation function, which allows the model to learn complex patterns rather than just simple linear relationships.

In [7]:
class GELU(torch.nn.Module):
    """Gaussian Error Linear Unit, computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

This sub-network processes every word independently after the attention layer, expanding the information into a larger space to extract features and then compressing it back down.

In [8]:
class FeedForward(torch.nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(emb_dim, hidden_dim),  # expansion
            GELU(),                                # activation
            torch.nn.Linear(hidden_dim, emb_dim),  # back to the original width
        )

    def forward(self, x):
        return self.layers(x)

This is the main modular building block of GPT, combining the Attention layer and the FeedForward layer together with skip connections to preserve information flow.

In [9]:
class TransformerBlock(torch.nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual connection."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Attention: lets tokens exchange information with each other
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )

        # Feed-forward network: processes every token on its own
        self.ff = FeedForward(cfg)

        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_resid = torch.nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Attention sub-layer: normalize -> attend -> dropout, then add the
        # untouched input back (skip connection)
        attn_out = self.drop_resid(self.att(self.norm1(x)))
        x = attn_out + x

        # Feed-forward sub-layer, same pattern
        ff_out = self.drop_resid(self.ff(self.norm2(x)))
        return ff_out + x

This is the final wrapper that assembles all the classes and functions above: it converts tokens to embeddings, runs them through a stack of Transformer blocks, and finally produces the logits used to predict the next token.

In [10]:
class GPTModel(torch.nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, final norm and output head."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]

        # Token IDs -> vectors
        self.tok_emb = torch.nn.Embedding(cfg["vocab_size"], emb_dim)
        # Position index -> vector (carries word-order information)
        self.pos_emb = torch.nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = torch.nn.Dropout(cfg["drop_rate"])

        # Stack of n_layers transformer blocks applied one after another
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = torch.nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Vectors -> one logit per vocabulary entry
        self.out_head = torch.nn.Linear(emb_dim, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # Token content + position information
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))


This is a basic loop for text generation that looks at the current context, picks the word with the highest probability, and appends it to the sequence one by one.

In [11]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    At each step the model sees at most the last ``context_size`` tokens, and
    the highest-scoring token at the final position is appended to the
    sequence. Returns the extended tensor of token IDs.
    """
    for _ in range(max_new_tokens):
        # Never feed more tokens than the model's context window holds
        window = idx[:, -context_size:]

        with torch.no_grad():
            # Only the prediction at the very last position is needed
            last_logits = model(window)[:, -1, :]

        # Single most likely token per sequence
        next_token = last_logits.argmax(dim=-1, keepdim=True)

        # Grow the running sequence and repeat
        idx = torch.cat([idx, next_token], dim=1)

    return idx

This next part gives the essential training and inference utilities for the GPT model, which include the main training loop ("train_model_simple"), loss tracking and a generate function that implements top-k sampling and temperature scaling.

It also includes the critical "load_weights_into_gpt" function, which maps pre-trained parameters (from OpenAI's GPT-2) into a custom model architecture, ensuring the shapes match for transfer learning.

In [12]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` with optional top-k filtering and temperature sampling.

    Args:
        model: Callable mapping a ``(batch, tokens)`` tensor of token IDs to
            logits of shape ``(batch, tokens, vocab)``.
        idx: ``(batch, tokens)`` tensor holding the prompt token IDs.
        max_new_tokens: Upper bound on the number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.
        temperature: If greater than 0, logits are divided by it and the next
            token is sampled; otherwise the arg-max token is taken.
        top_k: If given, only the ``top_k`` highest logits per row remain
            eligible (clamped to the vocabulary size).
        eos_id: If given, generation stops (without appending) as soon as
            every sequence in the batch produces this token in the same step.

    Returns:
        The input tensor extended with the generated token IDs.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # Filter logits with top_k sampling
        if top_k is not None:
            # Asking for more entries than the vocabulary holds would make topk raise
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension so that every row is compared with
            # its OWN threshold; a (batch,)-shaped threshold mis-broadcasts
            # against (batch, vocab) for batch sizes above 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Apply temperature scaling
            logits = logits / temperature

            # Shift by the row maximum for numerical stability
            logits = logits - logits.max(dim=-1, keepdim=True).values

            # Turn logits into probabilities and sample from them
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            # Greedy decoding
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Stop early on end-of-sequence. `.all()` reduces the per-row
        # comparison to one boolean, so batches with several rows no longer
        # raise "Boolean value of Tensor ... is ambiguous".
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        # Append the chosen token to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx


def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimizer steps the train/validation losses are
    estimated over ``eval_iter`` batches, recorded and printed. After each
    epoch a sample continuation of ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)`` - three parallel
        lists with one entry per evaluation.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = -1  # becomes 0 on the first batch, so step 0 is evaluated

    for epoch in range(num_epochs):
        model.train()  # make sure the model is in training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # drop gradients left over from the previous batch
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()  # compute gradients of the loss
            optimizer.step()  # update the weights from those gradients

            tokens_seen += input_batch.numel()
            global_step += 1

            # Only evaluate on every eval_freq-th step
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Qualitative check at the end of every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over at most ``eval_iter`` batches each.

    The model is put into eval mode for the measurement and switched back to
    training mode afterwards.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token greedy continuation of ``start_context`` on a single line.

    The model is put into eval mode for generation and switched back to
    training mode afterwards.
    """
    model.eval()
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    # The number of rows of the positional-embedding table is used as the context size
    max_context = model.pos_emb.weight.shape[0]
    with torch.no_grad():
        generated = generate_text_simple(
            model=model, idx=prompt_ids,
            max_new_tokens=50, context_size=max_context
        )
        sample = token_ids_to_text(generated, tokenizer)
        print(sample.replace("\n", " "))  # compact print format
    model.train()


def assign(left, right):
    """Build a new Parameter holding ``right``, checked against ``left``'s shape.

    Args:
        left: The existing parameter (or tensor) that is being replaced; its
            shape, dtype and device are the reference.
        right: The new values, as a NumPy array or a tensor.

    Returns:
        A fresh ``torch.nn.Parameter`` with a copy of ``right``'s values, using
        ``left``'s dtype and device so the model stays homogeneous.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a copy-construct warning; clone instead
        values = right.detach().clone()
    else:
        # torch.tensor copies array data, so the source array stays independent
        values = torch.tensor(right)

    # Match the replaced parameter so e.g. float64 arrays do not silently
    # produce a float64 weight inside a float32 model
    values = values.to(device=left.device, dtype=left.dtype)
    return torch.nn.Parameter(values)


def load_weights_into_gpt(gpt, params):
    """Copy the GPT-2 weights in ``params`` into the matching parameters of ``gpt``.

    ``params`` is read as a nested dict with the keys ``wpe``, ``wte``,
    ``blocks`` (one entry per transformer block), ``g`` and ``b``. Linear
    weight matrices are transposed before assignment, and the fused
    ``c_attn`` weight/bias is split into three equal parts for the query, key
    and value projections. Every assignment goes through ``assign``, which
    checks shapes.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_blocks[b]
        att = block.att

        # --- attention: fused qkv weight and bias, split along the last axis ---
        fused_attn = block_params["attn"]["c_attn"]
        q_w, k_w, v_w = np.split(fused_attn["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(fused_attn["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # --- attention output projection ---
        attn_proj = block_params["attn"]["c_proj"]
        att.out_proj.weight = assign(att.out_proj.weight, attn_proj["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, attn_proj["b"])

        # --- feed-forward: layers[0] is the expansion, layers[2] the projection ---
        mlp = block_params["mlp"]
        expand, project = block.ff.layers[0], block.ff.layers[2]
        expand.weight = assign(expand.weight, mlp["c_fc"]["w"].T)
        expand.bias = assign(expand.bias, mlp["c_fc"]["b"])
        project.weight = assign(project.weight, mlp["c_proj"]["w"].T)
        project.bias = assign(project.bias, mlp["c_proj"]["b"])

        # --- the two layer norms of the block ---
        ln_1, ln_2 = block_params["ln_1"], block_params["ln_2"]
        block.norm1.scale = assign(block.norm1.scale, ln_1["g"])
        block.norm1.shift = assign(block.norm1.shift, ln_1["b"])
        block.norm2.scale = assign(block.norm2.scale, ln_2["g"])
        block.norm2.shift = assign(block.norm2.shift, ln_2["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is loaded from the token-embedding matrix
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


def text_to_token_ids(text, tokenizer):
    """Tokenize ``text`` and return the IDs as a tensor with a leading batch dimension."""
    ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.unsqueeze(torch.tensor(ids), dim=0)  # add batch dimension


def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dimension of ``token_ids`` and decode the IDs to text."""
    id_list = token_ids.squeeze(0).tolist()  # remove batch dimension
    return tokenizer.decode(id_list)


def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both tensors are moved to ``device``; logits and targets are flattened
    over the batch and token dimensions before the loss is computed.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    logits = model(inputs)
    return torch.nn.functional.cross_entropy(
        torch.flatten(logits, 0, 1), torch.flatten(targets)
    )


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean batch loss over the first ``num_batches`` batches of ``data_loader``.

    ``num_batches=None`` means the whole loader; a larger value is capped at
    the loader length. An empty loader yields ``nan``.
    """
    loader_len = len(data_loader)
    if loader_len == 0:
        return float("nan")

    num_batches = loader_len if num_batches is None else min(num_batches, loader_len)

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot train/validation loss against epochs, with a second x-axis for tokens seen.

    The figure is saved as ``loss-plot.pdf`` and then shown.
    """
    fig, ax1 = plt.subplots(figsize=(5, 3))

    # Training and validation loss against epochs
    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, plot_kwargs in curves:
        ax1.plot(epochs_seen, losses, **plot_kwargs)
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Second x-axis sharing the y-axis; the fully transparent plot only
    # serves to give that axis its tick range
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()

This part downloads a remote JSON dataset and partitions it into training, testing, and validation subsets for model development.

In [13]:
def download_and_load_file(file_path, url):
    """Return the parsed JSON stored at ``file_path``, downloading it from ``url`` first if absent.

    An existing file is reused as-is; otherwise the response body is written
    to ``file_path`` (a failed HTTP status raises) before it is parsed.
    """
    file_missing = not os.path.exists(file_path)
    if file_missing:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(file_path, "w", encoding="utf-8") as out_file:
            out_file.write(response.text)

    with open(file_path, "r", encoding="utf-8") as in_file:
        return json.load(in_file)

file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/"
    "ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)

# Split sizes: 85% training, 10% test, whatever remains is validation
train_portion = int(0.85 * len(data))
test_portion = int(0.1 * len(data))
val_portion = len(data) - (train_portion + test_portion)

# Contiguous slices in the order train -> test -> validation
test_start = train_portion
val_start = train_portion + test_portion
train_data = data[:test_start]
test_data = data[test_start:val_start]
val_data = data[val_start:]

This function converts raw dataset entries into a standardized prompt template to prepare the model for instruction-based fine-tuning.

In [14]:
def format_input(entry):
    """Render ``entry`` as the instruction prompt used for fine-tuning.

    The prompt consists of a fixed preamble, an ``### Instruction:`` section
    and - only when ``entry["input"]`` is non-empty - an ``### Input:`` section.
    """
    preamble = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    )
    sections = [preamble, f"\n\n### Instruction:\n{entry['instruction']}"]

    # The input section is optional context for the instruction
    if entry["input"]:
        sections.append(f"\n\n### Input:\n{entry['input']}")

    return "".join(sections)

This part defines a custom dataset class that formats and tokenizes instruction-response pairs, and a collation function that pads each batch to a common length and builds the shifted input-target pairs for training.

In [15]:
class InstructionDataset(Dataset):
    """Instruction fine-tuning dataset; every entry is formatted and tokenized up front."""

    def __init__(self, data, tokenizer):
        self.data = data
        # Prompt (instruction + optional input) followed by the response
        # section, encoded once per entry
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __getitem__(self, index):
        """Return the token-ID list of entry ``index``."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries."""
        return len(self.data)

def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Pad a batch of token-ID lists and build shifted input/target tensors.

    Each sequence gets one ``pad_token_id`` appended and is then padded to
    the length of the longest sequence in the batch (plus one). Inputs drop
    the last token, targets drop the first. In the targets every padding
    token except the first is replaced by ``ignore_index``. If
    ``allowed_max_length`` is set, both tensors are cut to that length.

    Returns:
        ``(inputs, targets)`` stacked into two tensors on ``device``.
    """
    # Longest sequence plus the one extra token appended to every item
    padded_len = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []
    for item in batch:
        # One appended pad token plus as many fillers as needed
        padded = item + [pad_token_id] * (padded_len - len(item))

        # Targets are the inputs shifted by one position
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Keep the first padding token in the targets, mask out the rest
        pad_positions = torch.nonzero(targets == pad_token_id).flatten()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack into batch tensors and move them to the requested device
    return torch.stack(inputs_lst).to(device), torch.stack(targets_lst).to(device)

In [16]:
# Pre-bind the target device and the 1024-token length cap so that the
# DataLoaders can call the collate function with just a batch.
customized_collate_fn = partial(
    custom_collate_fn,
    allowed_max_length=1024,
    device=device,
)

This next block sets up the data pipeline.

In [17]:
# GPT-2 byte-pair tokenizer: turns raw text into numerical token IDs
tokenizer = tiktoken.get_encoding("gpt2")

# One dataset per split; each formats and tokenizes its entries on construction
train_dataset = InstructionDataset(train_data, tokenizer)
val_dataset = InstructionDataset(val_data, tokenizer)
test_dataset = InstructionDataset(test_data, tokenizer)

# DataLoaders handle batching and shuffling; padding happens in the collate function
num_workers = 0
batch_size = 8

# Training: shuffled, incomplete final batch dropped
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Validation: fixed order, every sample kept
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Test: fixed order, every sample kept
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

A small helper function that searches for the file "gpt_download.py", starting from the current folder or from a given root directory, and imports it. The root passed in the usage line below must be adjusted to the user's own environment.

In [18]:
def find_and_import(module_name, start_dir="."):
    """Locate ``<module_name>.py`` under ``start_dir`` and import it.

    The directory tree is walked with ``os.walk``; the first directory that
    contains the file is added to ``sys.path`` (unless it is already there,
    so re-running the cell does not pile up duplicate entries) and the module
    is imported from it.

    Args:
        module_name: Module name without the ``.py`` suffix.
        start_dir: Root directory of the search.

    Returns:
        The imported module object.

    Raises:
        ImportError: If no matching file exists under ``start_dir``.
    """
    import importlib

    target_file = f"{module_name}.py"
    for root, _dirs, files in os.walk(start_dir):
        if target_file not in files:
            continue
        if root not in sys.path:
            sys.path.append(root)
        print(f"Found {module_name} in {root}")
        # Make sure the import system notices files created after start-up
        importlib.invalidate_caches()
        return importlib.import_module(module_name)
    raise ImportError(f"Could not find {module_name} in {start_dir} or subdirectories")

# Usage: search the mounted Drive for gpt_download.py, import it, and keep a
# module-level handle to its download_and_load_gpt2 function.
# NOTE: the start directory is specific to this Colab setup; adjust as needed.
gpt_download = find_and_import("gpt_download", start_dir="/content/drive/MyDrive")
download_and_load_gpt2 = gpt_download.download_and_load_gpt2

Found gpt_download in /content/drive/MyDrive/data_llm_course/appendix-E/01_main-chapter-code


This block of code is the model initialization and loading phase, where the architecture is defined and pre-trained weights are downloaded.

In [19]:
# Settings shared by every model size.
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True
}

# Size-specific hyperparameters, keyed by a display name of the form "<name> (<size>)".
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Select one size and merge its dimensions into BASE_CONFIG (mutated in place).
CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Extract the size string from the display name, e.g. "gpt2-medium (355M)" -> "355M",
# and pass it to the external helper, which returns the settings and the parameter dict.
# NOTE(review): presumably the helper downloads the checkpoint into ./gpt2 only when
# it is missing - confirm against gpt_download.py.
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Build the GPTModel architecture and copy the loaded weights into its layers.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

# Move the model to the selected device, wrap it with torch.compile, and switch
# it to evaluation mode. From here on `model` refers to the compiled wrapper.
model.to(device)
model = torch.compile(model)
model.eval();

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


This block calculates the initial loss to see how the model performs with its pre-trained weights before any fine-tuning begins.

In [20]:
# Fix the RNG seed so the measurement is reproducible.
torch.manual_seed(123)

# Baseline before fine-tuning: average loss over the first 5 batches of each
# loader, computed without tracking gradients.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.82590970993042
Validation loss: 3.761934185028076


Here the optimizer adjusts the model's weights to minimize the error on the specific instruction dataset.

In [21]:
# Wall-clock start of the training run (reported in minutes at the end).
start_time = time.time()

torch.manual_seed(123)

# AdamW optimizer over all model parameters with learning rate 5e-5 and weight decay 0.1.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

# Number of full passes through the training dataset.
num_epochs = 2

# Run the training loop: evaluates every 5 steps on 5 batches per loader and
# prints a sample continuation of the first validation prompt after each epoch.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

# Compute and display the total time elapsed during the training session.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.637, Val loss 2.626
Ep 1 (Step 000005): Train loss 1.174, Val loss 1.103
Ep 1 (Step 000010): Train loss 0.872, Val loss 0.944
Ep 1 (Step 000015): Train loss 0.857, Val loss 0.906
Ep 1 (Step 000020): Train loss 0.776, Val loss 0.881
Ep 1 (Step 000025): Train loss 0.754, Val loss 0.859
Ep 1 (Step 000030): Train loss 0.799, Val loss 0.836
Ep 1 (Step 000035): Train loss 0.714, Val loss 0.808
Ep 1 (Step 000040): Train loss 0.672, Val loss 0.806
Ep 1 (Step 000045): Train loss 0.633, Val loss 0.789
Ep 1 (Step 000050): Train loss 0.663, Val loss 0.783
Ep 1 (Step 000055): Train loss 0.760, Val loss 0.763
Ep 1 (Step 000060): Train loss 0.719, Val loss 0.743
Ep 1 (Step 000065): Train loss 0.653, Val loss 0.735
Ep 1 (Step 000070): Train loss 0.533, Val loss 0.729
Ep 1 (Step 000075): Train loss 0.568, Val loss 0.729
Ep 1 (Step 000080): Train loss 0.604, Val loss 0.725
Ep 1 (Step 000085): Train loss 0.509, Val loss 0.710
Ep 1 (Step 000090): Train loss 0.563, Val loss

This block is the moment of truth: it performs qualitative testing by having the model generate actual responses to instructions it hasn't seen before, giving the ability to compare its performance against the ground truth.

In [22]:
torch.manual_seed(123)

# The training loop leaves the model in training mode (the per-epoch sample
# generation ends with model.train()), so switch to inference mode explicitly
# before generating.
model.eval()

for entry in test_data[3:5]:

    # Apply the instruction template to the raw entry.
    input_text = format_input(entry)

    # Tokenize the prompt and run the generation loop; with the default
    # temperature this is greedy decoding that stops at the end-of-text token.
    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )

    # Decode the resulting token IDs back into text.
    generated_text = token_ids_to_text(token_ids, tokenizer)

    # Keep only the model's answer: cut off the prompt, drop the response
    # header and trim surrounding whitespace.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    # Print the prompt, the reference answer and the model's actual output.
    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text}")
    print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is the periodic symbol for chlorine?

Correct response:
>> The periodic symbol for chlorine is Cl.

Model response:
>> The periodic symbol for chlorine is C.
-------------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Correct the punctuation in the sentence.

### Input:
Its time to go home.

Correct response:
>> The corrected sentence should be: 'It's time to go home.'

Model response:
>> It's time to go home.
-------------------------------------


Lastly - This function that I added is the user-facing interface of the project. It wraps all the complex technical steps (tokenization, generation, and slicing) into a simple, reusable tool that feels like a real chatbot.

In [23]:
def chat_with_model(instruction, input_text="", max_new_tokens=150):
    """Answer a single instruction with the module-level fine-tuned model.

    Uses the globals ``model``, ``tokenizer``, ``device`` and ``BASE_CONFIG``.

    Args:
        instruction: The task description placed in the prompt.
        input_text: Optional extra context for the instruction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer with the prompt and the response header removed.
    """
    # Format the prompt with the same template used during fine-tuning
    prompt = format_input({"instruction": instruction, "input": input_text})

    # Tokenize the prompt
    input_ids = text_to_token_ids(prompt, tokenizer).to(device)

    # Generate in inference mode, then restore whatever mode the model was in,
    # so calling this in the middle of training does not disturb it
    was_training = model.training
    model.eval()
    try:
        out_ids = generate(
            model=model,
            idx=input_ids,
            max_new_tokens=max_new_tokens,
            context_size=BASE_CONFIG["context_length"],
            eos_id=50256
        )
    finally:
        model.train(was_training)

    # Decode and keep only the response part
    full_response = token_ids_to_text(out_ids, tokenizer)
    return full_response[len(prompt):].replace("### Response:", "").strip()

# Example usage: ask one question without additional input and print the answer.
user_query = "What Are the planets in our solar system?"
print(f"Chatbot: {chat_with_model(user_query)}")

Chatbot: The planets in our solar system are Jupiter, Saturn, Uranus, and Neptune.


Saving the model for future usage

In [24]:
import re

# Build the file name from the model label with spaces and parentheses removed,
# e.g. "gpt2-medium (355M)" -> "gpt2-medium355M-sft.pth".
file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL) }-sft.pth"

# `model` is the torch.compile wrapper, whose state_dict keys carry an
# "_orig_mod." prefix. Save the wrapped module's weights instead so the
# checkpoint loads straight into a plain GPTModel; fall back to `model`
# itself when it was not compiled.
model_to_save = getattr(model, "_orig_mod", model)
torch.save(model_to_save.state_dict(), file_name)
print(f"Model saved as {file_name}")

Model saved as gpt2-medium355M-sft.pth
