# Day 10: Embeddings, Learning Rate Schedules & Hyperparameters

**Building LLMs from Scratch** — Following Andrej Karpathy's makemore lectures.

---

## 1. Introduction

Finding the right learning rate is one of the most impactful hyperparameter choices. Too low and training crawls; too high and the optimizer diverges. Today we use a **learning rate finder** — sweeping LR in log-space and plotting loss — to identify the sweet spot before committing to full training.

We also explore **embedding spaces**: the learned character vectors in `C` organize themselves during training. Similar characters (e.g., vowels) cluster together. Visualizing these embeddings reveals what the network has learned.

## 2. Dataset Setup

Same names list, stoi/itos, and (X, Y) pairs with block_size=3 — reused from Day 9.

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Toy corpus of 20 lowercase names.
words = ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'mia', 'charlotte', 'amelia', 'harper', 'evelyn',
         'abigail', 'emily', 'ella', 'elizabeth', 'camila', 'luna', 'sofia', 'avery', 'mila', 'aria']

# Character <-> index maps. Index 0 is reserved for the '.' boundary token;
# the distinct letters of the corpus take indices 1..len(chars) in sorted order.
chars = sorted(set(''.join(words)))
stoi = {'.': 0}
stoi.update((ch, idx) for idx, ch in enumerate(chars, start=1))
itos = {idx: ch for ch, idx in stoi.items()}

# Slide a window of block_size tokens over ".<name>."; each window is a
# context and the token immediately after it is the prediction target.
# NOTE(review): only one leading '.' is prepended, so the first
# block_size - 1 letters of every name never appear as targets.
block_size = 3
contexts, targets = [], []
for name in words:
    ids = [stoi[ch] for ch in '.' + name + '.']
    for start in range(len(ids) - block_size):
        stop = start + block_size
        contexts.append(ids[start:stop])
        targets.append(ids[stop])

X = torch.tensor(contexts)  # one row of block_size token ids per example
Y = torch.tensor(targets)   # the token id that follows each context

print(f"Dataset: {len(words)} names")
print(f"Vocabulary size: {len(stoi)}")
print(f"Context pairs: {len(X)}")
print(f"X shape: {X.shape}, Y shape: {Y.shape}")

In [None]:
# Train/val split (80/20): shuffle the example indices with a fixed seed,
# then give the first 80% to training and the remainder to validation.
torch.manual_seed(42)
perm = torch.randperm(len(X))
n_train = int(0.8 * len(X))
train_idx, val_idx = perm[:n_train], perm[n_train:]
Xtr, Ytr = X[train_idx], Y[train_idx]
Xval, Yval = X[val_idx], Y[val_idx]
print(f"Train: {len(Xtr)}, Val: {len(Xval)}")

## 3. Model Setup

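Same MLP architecture as Day 9: C(27,10), W1(30,200), b1(200), W2(200,27), b2(27). The vocabulary dimension is fixed at 27 ('.' plus a–z); this small name list only uses 21 of those tokens, so the extra embedding rows are never looked up and the extra output classes never occur as targets.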

In [None]:
def build_model(seed=42, vocab_size=27, emb_dim=10, block_size=3, hidden_size=200):
    """Create freshly initialised parameters for the character-level MLP.

    Args:
        seed: Value passed to torch.manual_seed so every call with the same
            arguments yields identical initial weights.
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        emb_dim: Width of each character embedding.
        block_size: Number of context tokens; the hidden layer's input width
            is block_size * emb_dim.
        hidden_size: Number of tanh hidden units.

    Returns:
        The list [C, W1, b1, W2, b2], each with requires_grad set to True.
        With the defaults the shapes are C(27,10), W1(30,200), b1(200),
        W2(200,27), b2(27).
    """
    torch.manual_seed(seed)
    C = torch.randn(vocab_size, emb_dim) * 0.1
    W1 = torch.randn(block_size * emb_dim, hidden_size) * 0.2
    b1 = torch.randn(hidden_size) * 0.01
    W2 = torch.randn(hidden_size, vocab_size) * 0.01
    # Drawn with randn and scaled by 0 (instead of torch.zeros) so the global
    # RNG is advanced by the same draws as before; later minibatch sampling
    # therefore sees the same random stream.
    b2 = torch.randn(vocab_size) * 0
    params = [C, W1, b1, W2, b2]
    for p in params:
        p.requires_grad = True
    return params

def forward(X, params):
    """Compute output logits of the MLP for a batch of integer contexts.

    Args:
        X: Integer tensor of token ids used to index the embedding table C.
        params: The sequence [C, W1, b1, W2, b2].

    Returns:
        Logits tensor with one row per context and one column per column
        of W2.
    """
    C, W1, b1, W2, b2 = params
    emb = C[X]  # (N, block_size, emb_dim)
    # Flatten each context's embeddings to the input width W1 expects, rather
    # than a hard-coded 30, so any block_size * emb_dim combination works.
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    return logits

# Build the default model and report each tensor's actual shape, so the
# printed summary cannot drift from what build_model() really creates.
params = build_model()
param_names = ['C', 'W1', 'b1', 'W2', 'b2']
shape_summary = ", ".join(
    f"{name}({','.join(str(d) for d in p.shape)})"
    for name, p in zip(param_names, params)
)
print(f"Params: {shape_summary}")

## 4. Learning Rate Finder

Sweep learning rate from 1e-4 to 10 in log-space over 1000 steps. Record loss at each LR. Plot loss vs log10(lr) to find the sweet spot.

In [None]:
# Start the sweep from freshly initialised weights.
params = build_model()
lre = torch.linspace(-4, 1, 1000)  # exponents: log10(lr) from -4 to 1
lrs = 10 ** lre                    # one learning rate per sweep step
lossi = []

# One SGD step per candidate learning rate, recording the minibatch loss
# measured *before* that step's update is applied.
for lr_t in lrs:
    ix = torch.randint(0, len(Xtr), (32,))
    loss = F.cross_entropy(forward(Xtr[ix], params), Ytr[ix])
    for p in params:
        p.grad = None
    loss.backward()
    lr = lr_t.item()
    for p in params:
        p.data -= lr * p.grad
    lossi.append(loss.item())

plt.figure(figsize=(8, 4))
plt.plot(lre.tolist(), lossi, 'b-', linewidth=0.8)
plt.xlabel('log10(learning rate)')
plt.ylabel('Loss')
plt.title('Learning Rate Finder: Loss vs LR (log scale)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Report the step with the lowest recorded minibatch loss. The usual sweet
# spot is where loss drops fastest, before it starts rising.
min_idx = lossi.index(min(lossi))
print(f"Lowest loss at step {min_idx}: lr={lrs[min_idx].item():.4f}, loss={lossi[min_idx]:.4f}")

## 5. Step Decay Schedule

Train with a step-decay schedule: start at lr=0.1, then drop to 0.01 after 5000 steps. Compare against a fixed lr=0.01 for the whole run.

In [None]:
def train_with_schedule(params, steps, decay_step=5000, lr_high=0.1, lr_low=0.01, use_decay=True,
                        eval_every=500):
    """Train the MLP with minibatch SGD under a step-decay or fixed learning rate.

    Reads the module-level Xtr, Ytr, Xval, Yval tensors and the forward()
    function. The tensors in params are updated in place.

    Args:
        params: The list [C, W1, b1, W2, b2]; each tensor must require grad.
        steps: Number of SGD steps to run.
        decay_step: Step index from which lr_low is used when use_decay is True.
        lr_high: Learning rate used before decay_step when use_decay is True.
        lr_low: Learning rate after decay_step, or for every step when
            use_decay is False.
        use_decay: If False, train with lr_low throughout.
        eval_every: Record full train/val loss whenever step % eval_every == 0.

    Returns:
        (train_losses, val_losses): two lists of floats with one entry per
        recorded step (0, eval_every, 2 * eval_every, ... below steps), each
        measured after that step's update.

    Raises:
        ValueError: If eval_every is less than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    train_losses, val_losses = [], []
    for step in range(steps):
        # Minibatch of 32 examples sampled with replacement.
        ix = torch.randint(0, len(Xtr), (32,))
        logits = forward(Xtr[ix], params)
        loss = F.cross_entropy(logits, Ytr[ix])
        for p in params:
            p.grad = None
        loss.backward()

        lr = lr_high if (use_decay and step < decay_step) else lr_low

        # In-place SGD update; no_grad keeps the update out of the autograd
        # graph without going through the discouraged .data attribute.
        with torch.no_grad():
            for p in params:
                p -= lr * p.grad

        if step % eval_every == 0:
            with torch.no_grad():
                tloss = F.cross_entropy(forward(Xtr, params), Ytr).item()
                vloss = F.cross_entropy(forward(Xval, params), Yval).item()
            train_losses.append(tloss)
            val_losses.append(vloss)
    return train_losses, val_losses

In [None]:
steps = 10000
decay_step = 5000
eval_every = 500  # train_with_schedule records losses every 500 steps, starting at step 0

# With step decay (0.1 -> 0.01 at 5000)
params_decay = build_model()
tr_decay, val_decay = train_with_schedule(params_decay, steps, decay_step, use_decay=True)

# Fixed LR = 0.01
params_fixed = build_model()
tr_fixed, val_fixed = train_with_schedule(params_fixed, steps, decay_step, use_decay=False)

# Derive the x-axis from the number of recorded points. Losses are recorded
# at steps 0, 500, ..., i.e. only for step < steps, so range(0, steps + 1, 500)
# has one entry too many and plt.plot rejects x/y of different lengths.
steps_plot = [k * eval_every for k in range(len(tr_decay))]
plt.figure(figsize=(9, 4))
plt.plot(steps_plot, tr_decay, 'b-', label='Train (decay)')
plt.plot(steps_plot, val_decay, 'b--', label='Val (decay)')
plt.plot(steps_plot, tr_fixed, 'g-', alpha=0.7, label='Train (fixed 0.01)')
plt.plot(steps_plot, val_fixed, 'g--', alpha=0.7, label='Val (fixed 0.01)')
plt.axvline(decay_step, color='gray', linestyle=':', alpha=0.7, label='Decay step')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Step Decay (0.1→0.01) vs Fixed LR')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# The last recorded entry is the most recent evaluation checkpoint.
print(f"With decay  - Final train: {tr_decay[-1]:.4f}, val: {val_decay[-1]:.4f}")
print(f"Fixed 0.01 - Final train: {tr_fixed[-1]:.4f}, val: {val_fixed[-1]:.4f}")

## 6. Visualizing Embeddings

After training, reduce the embedding matrix C to 2D — either by taking its first two dimensions or by a PCA projection — then plot each character in that space and label its point.

In [None]:
# Embedding table of the model trained with step decay, detached from
# autograd so it can be converted to NumPy.
C = params_decay[0].detach()

# Quick 2D view: take the first two embedding dimensions when available
# (see the PCA cell for a projection that uses all dimensions).
if C.shape[1] >= 2:
    emb_2d = C[:, :2].numpy()
else:
    # NOTE(review): with a single column the [i, 1] lookups below cannot
    # work; this branch is only reachable if the embedding width is < 2.
    emb_2d = C.numpy()

plt.figure(figsize=(8, 6))
# Iterate over the characters that actually exist in itos rather than a
# hard-coded range(27): the embedding table can have more rows than the
# corpus has tokens, and itos[i] raises KeyError for an index with no character.
for i, ch in itos.items():
    x, y = emb_2d[i, 0], emb_2d[i, 1]
    plt.scatter(x, y, s=80, alpha=0.8)
    plt.annotate(ch, (x, y), fontsize=10, ha='center', va='bottom')
plt.xlabel('Embedding dim 0')
plt.ylabel('Embedding dim 1')
plt.title('Character Embeddings (first 2 dims of C)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Alternative: PCA for better 2D projection when emb_dim > 2
def pca_2d(X, n_components=2):
    """Project the rows of X onto their leading principal components.

    Args:
        X: 2-D tensor with one sample per row; it must be convertible with
            .numpy() (i.e. not require grad).
        n_components: Number of principal components to keep.

    Returns:
        NumPy array with one row per sample and n_components columns,
        ordered by decreasing explained variance.
    """
    centered = X - X.mean(0)
    n_samples = X.shape[0]
    # Sample covariance of the features (unbiased, divides by n - 1).
    cov = centered.T @ centered / (n_samples - 1)
    eigvals, eigvecs = torch.linalg.eigh(cov)
    # Pick the eigenvector columns with the largest eigenvalues.
    top = eigvals.argsort(descending=True)[:n_components]
    basis = eigvecs[:, top]
    projected = centered @ basis
    return projected.numpy()

# Project only the rows of C that belong to real characters. itos maps the
# contiguous indices 0..len(itos)-1; any further rows of C are never looked
# up during training and would only add initialisation noise to the PCA.
n_chars = len(itos)
emb_pca = pca_2d(C[:n_chars])
plt.figure(figsize=(8, 6))
# Iterate over itos instead of a hard-coded range(27), which raises KeyError
# once the index passes the last character present in the corpus.
for i, ch in itos.items():
    x, y = emb_pca[i, 0], emb_pca[i, 1]
    plt.scatter(x, y, s=80, alpha=0.8)
    plt.annotate(ch, (x, y), fontsize=10, ha='center', va='bottom')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Character Embeddings (PCA projection)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Hyperparameter Table

Print a summary: embedding_dim, hidden_size, block_size, total parameters, final train loss, final val loss.

In [None]:
def count_params(params):
    """Return the total number of scalar elements across all tensors in params."""
    total = 0
    for tensor in params:
        total += tensor.numel()
    return total

# Evaluate the step-decay model on the full train and validation sets.
with torch.no_grad():
    final_train = F.cross_entropy(forward(Xtr, params_decay), Ytr).item()
    final_val = F.cross_entropy(forward(Xval, params_decay), Yval).item()

# Read the sizes off the trained tensors instead of repeating literals, so
# the table always describes the model that was actually trained:
# C is (vocab, embedding_dim) and W1 is (block_size * embedding_dim, hidden_size).
embedding_dim = params_decay[0].shape[1]
hidden_size = params_decay[1].shape[1]
total_params = count_params(params_decay)

print("=" * 50)
print("HYPERPARAMETER SUMMARY")
print("=" * 50)
print(f"embedding_dim:  {embedding_dim}")
print(f"hidden_size:    {hidden_size}")
print(f"block_size:    {block_size}")
print(f"total_params:  {total_params:,}")
print(f"final_train_loss: {final_train:.4f}")
print(f"final_val_loss:   {final_val:.4f}")
print("=" * 50)

---

**Blog:** [Day 10 — Embeddings, LR Schedules & Hyperparameters](https://omkarray.com/llm-day10.html)

**Prev:** [Day 9 — MLP Language Model](llm_day09_mlp.ipynb) · **Next:** [Day 11 — Activations & Gradients](llm_day11_activations.ipynb)