In [None]:
# 🔧 Setup: Run this cell first!
# Selects the compute device, reports the Python / PyTorch versions, and seeds
# every random number generator so later cells are reproducible.
# NOTE: nothing is installed here -- torch and numpy must already be importable.

# All imports sit at the top of the cell: standard library first, then third-party.
import random
import sys

import numpy as np
import torch

# One seed shared by Python's `random`, NumPy and PyTorch.
SEED = 42

# Query CUDA once; everything below (and later cells) reuses `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report the hardware this session is running on.
if device.type == 'cuda':
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility: each library keeps its own RNG state,
# so each one has to be seeded explicitly.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    # Seed the generator of every visible GPU, not only device 0.
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Building a GPT from Scratch -- Notebook Series

*A Vizuara learning path: from token embeddings to a trained autoregressive language model*

## Welcome!

This series of 3 notebooks will take you on a complete journey through building a **GPT-style autoregressive language model** entirely from scratch in PyTorch.

You will implement every component by hand: token and positional embeddings, causal self-attention, the Transformer decoder block, cross-entropy loss, and a full training loop with backpropagation.

### What You Will Learn

| # | Notebook | What You Build | Time |
|---|---------|----------------|------|
| **01** | Embeddings and the GPT Architecture | Character tokenizer, token embeddings, positional embeddings, GPT input pipeline | 45 min |
| **02** | Self-Attention and the Forward Pass | Scaled dot-product attention, causal masking, multi-head attention, Transformer block, full GPT forward pass | 60 min |
| **03** | Loss and Backpropagation | Cross-entropy loss, gradient computation, AdamW optimizer, full training loop, text generation | 50 min |

**Total estimated time: ~2.5 hours**

### Prerequisites

- **Python**: Comfortable with classes, functions, and list comprehensions
- **PyTorch**: Basic familiarity with tensors, `nn.Module`, and training loops
- **Linear Algebra**: Matrix multiplication, dot products, softmax
- **Machine Learning**: Loss functions, gradient descent, train/test split

### Setup

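All notebooks are designed to run on **Google Colab with a T4 GPU** (free tier). Each notebook begins with a setup cell that checks GPU availability and seeds the random number generators. The setup cell relies on packages that Colab already provides (PyTorch, NumPy), so no extra installation is required.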

### Learning Path

```
01: Embeddings & Architecture       02: Self-Attention & Forward Pass
    -- Character tokenizer               -- Scaled dot-product attention
    -- Token embedding table              -- Causal masking
    -- Positional embeddings              -- Multi-head attention
    -- Combined input pipeline            -- Transformer block + residuals
            |                                       |
            +-----------------+---------------------+
                              |
                  03: Loss & Backpropagation
                      -- Cross-entropy loss
                      -- Gradient flow through Transformer
                      -- AdamW optimizer
                      -- Full training loop
                      -- Text generation from trained model
```

### Open the Notebooks

In [None]:
# Index of the notebooks in this series: file stem (without ".ipynb") -> title.
# Only file names are printed below; no Colab URLs are built.
notebooks = {
    "01_embeddings_and_gpt_architecture": "Embeddings and the GPT Architecture",
    "02_self_attention_and_forward_pass": "Self-Attention and the Forward Pass",
    "03_loss_and_backpropagation": "Loss and Backpropagation",
}

# Collect every output line first, then emit them with a single print call.
# Strings that end in "\n" produce the blank separator lines.
summary_lines = ["Building a GPT from Scratch -- Vizuara Notebook Series\n"]

for filename, title in notebooks.items():
    summary_lines.append(f"  {title}")
    summary_lines.append(f"     File: {filename}.ipynb\n")

summary_lines.extend(
    [
        "Start with Notebook 01 and work through them in order.",
        "Each notebook builds on concepts from the previous one.",
        "\nHappy learning!",
    ]
)

print("\n".join(summary_lines))