In [None]:
# 🔧 Setup: Run this cell first!
# Detect the compute device, report library versions, and seed every RNG.

import random
import sys

import numpy as np
import torch

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')
if has_gpu:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   Memory: {gpu_memory_gb:.1f} GB")
else:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

python_version = sys.version.split()[0]
print(f"\n📦 Python {python_version}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed Python's, NumPy's and PyTorch's generators with the same value so
# repeated runs draw the same random numbers.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if has_gpu:
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Training a Reasoning Model End-to-End -- Vizuara

In this notebook, we combine SFT and GRPO to train a language model that reasons through math problems. This is the full pipeline from base model to reasoning model.

**What you will build:** A model fine-tuned with SFT then GRPO on math problems, producing chain-of-thought reasoning.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
import numpy as np
import re
import copy
%matplotlib inline

# Select the compute device (GPU when available) and fix PyTorch's RNG seed.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"PyTorch: {torch.__version__}, Device: {device}")
torch.manual_seed(42)

In [None]:
!pip install -q transformers datasets

## 1. Why Does This Matter?

In Notebooks 1 and 2, we built individual components: SFT and GRPO. Now we combine them into the full pipeline. By the end, you will see a small model develop step-by-step reasoning ability.

**Teaser output from our trained model:**
```
Q: If a box has 3 rows of 4 apples, remove 5. How many left?
<think> 3 rows of 4 = 3*4 = 12. Remove 5: 12-5 = 7. </think>
The answer is 7.
```

## 2. Building Intuition

Think of two-phase training like teaching a student:
- **Phase 1 (SFT):** Show worked examples. Student learns the *format*.
- **Phase 2 (GRPO):** Practice exams with grading. Student learns *what works*.

For a small model like ours, neither phase alone suffices. SFT without RL yields plausible-looking but often wrong reasoning. RL without SFT leaves the model with no idea how to structure its reasoning.

### Think About This
- Why must SFT come before RL?
- What if SFT were very long but RL very short?

## 3. The Mathematics

**SFT loss:** $\mathcal{L}_{\text{SFT}} = -\sum_{t} \log p_\theta(y_t | y_{<t}, x)$

**GRPO objective:**
$$\mathcal{L}_{\text{GRPO}} = -\frac{1}{G}\sum_{i=1}^{G} \min\left(r_i \hat{A}_i, \text{clip}(r_i, 1-\epsilon, 1+\epsilon) \hat{A}_i\right)$$

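where $\hat{A}_i = \frac{R_i - \bar{R}}{\sigma_R}$ is the group-normalized advantage ($\bar{R}$ and $\sigma_R$ are the mean and standard deviation of the $G$ rewards sampled for the same prompt), and $r_i = \frac{\pi_\theta(y_i|x)}{\pi_{\text{old}}(y_i|x)}$ is the probability ratio between the current policy and the policy that generated the samples.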

**Computationally:** SFT minimizes prediction error on worked examples. GRPO maximizes probability of correct completions relative to the group.

## 4. Let's Build It -- Component by Component

### 4.1 Setup: Model and Data

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the pretrained checkpoint and its tokenizer, then move the model to `device`.
MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Reuse the EOS token as the padding token, on both the tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Register the reasoning delimiters as special tokens, then resize the model's
# token embeddings to the enlarged vocabulary.
special_tokens = {"additional_special_tokens": ["<think>", "</think>"]}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

num_params = sum(p.numel() for p in model.parameters())
print(f"Model: {MODEL_NAME} ({num_params:,} params)")

In [None]:
# SFT training data: (question, worked solution) pairs. Every solution wraps
# its reasoning in <think>...</think> and ends with "The answer is N.".
sft_data = [
    (question, f"<think>\n{reasoning}\n</think>\nThe answer is {answer}.")
    for question, reasoning, answer in [
        ("What is 3 * 7?", "3 * 7 = 21.", "21"),
        ("What is 15 + 28?", "15 + 28 = 43.", "43"),
        ("What is 100 - 37?", "100 - 37 = 63.", "63"),
        ("What is 8 * 6?", "8 * 6 = 48.", "48"),
        ("What is 50% of 80?", "50% = 0.5. 0.5 * 80 = 40.", "40"),
        ("What is 9 * 9?", "9 * 9 = 81.", "81"),
        ("What is 200 - 85?", "200 - 85 = 115.", "115"),
        ("If 24 cookies shared among 6, how many each?", "24 / 6 = 4.", "4"),
    ]
]

# RL problems: (question, ground-truth answer string) pairs used for the reward.
rl_problems = [
    ("What is 4 * 8?", "32"),
    ("What is 17 + 25?", "42"),
    ("What is 90 - 34?", "56"),
    ("What is 6 * 7?", "42"),
    ("What is 20% of 50?", "10"),
    ("3 boxes of 5 items, total?", "15"),
    ("What is 12 * 3?", "36"),
    ("What is 150 - 67?", "83"),
]
print(f"SFT examples: {len(sft_data)}, RL problems: {len(rl_problems)}")

In [None]:
# Number allowed right after "The answer is": digits with optional thousands
# commas and an optional decimal part, or a bare leading-dot decimal (".5").
# Unlike a plain character class, this always contains at least one digit.
_ANSWER_NUMBER = r'-?(?:\d[\d,]*(?:\.\d+)?|\.\d+)'
# Fallback numbers carry no commas, so a list such as "1,2,3" is not glued
# together into one value.
_LOOSE_NUMBER = r'-?\d+(?:\.\d+)?'


def extract_answer(text):
    """Extract the numerical answer from model output.

    Looks first for the phrase "The answer is <number>" (either capitalisation
    of "The"); if that phrase is absent, falls back to the last number that
    appears anywhere in ``text``.

    Args:
        text: Completion text produced by the model.

    Returns:
        The number as a string with thousands commas removed (for example
        ``"1000"``, ``"-3"``, ``"3.5"``), or ``None`` when ``text`` contains
        no number at all.
    """
    match = re.search(r'[Tt]he answer is[:\s]*(' + _ANSWER_NUMBER + r')', text)
    if match:
        # The pattern stops before sentence punctuation, so only the
        # thousands separators have to be removed.
        return match.group(1).replace(',', '')
    numbers = re.findall(_LOOSE_NUMBER, text)
    return numbers[-1] if numbers else None


def compute_reward(completion, ground_truth):
    """Binary verifiable reward.

    Args:
        completion: Model output to grade.
        ground_truth: Expected answer as a string.

    Returns:
        ``1.0`` when the extracted answer equals ``ground_truth`` and ``0.0``
        otherwise. The two are compared as numbers when both parse as floats
        (so ``"7.0"`` matches ``"7"``) and as plain strings when they do not.
    """
    predicted = extract_answer(completion)
    if predicted is None:
        return 0.0
    expected = ground_truth.strip().replace(',', '')
    try:
        is_correct = float(predicted) == float(expected)
    except ValueError:
        # Non-numeric ground truth: fall back to exact text comparison.
        is_correct = predicted == expected
    return 1.0 if is_correct else 0.0

# Test: one completion with the right answer and one with a wrong answer.
# The values are printed for the reader and asserted so that Run All stops
# here if the reward function regresses.
reward_when_correct = compute_reward("<think>\n32.\n</think>\nThe answer is 32.", "32")
reward_when_wrong = compute_reward("<think>\n30.\n</think>\nThe answer is 30.", "32")
print(reward_when_correct)  # Should be 1.0
print(reward_when_wrong)  # Should be 0.0
assert reward_when_correct == 1.0, "a matching answer must earn reward 1.0"
assert reward_when_wrong == 0.0, "a mismatched answer must earn reward 0.0"

### 4.2 Phase 1: SFT Training

In [None]:
def run_sft(model, tokenizer, data, epochs=40, lr=5e-5):
    """Run SFT warmup phase.

    Takes one optimizer step per ``(prompt, completion)`` pair, training the
    model with the causal-LM loss on the text ``"Question: <prompt>\\n<completion>"``
    followed by the tokenizer's EOS token.

    Args:
        model: Causal language model, updated in place.
        tokenizer: Tokenizer matching ``model``.
        data: Non-empty sequence of ``(prompt, completion)`` string pairs.
        epochs: Number of passes over ``data``.
        lr: AdamW learning rate.

    Returns:
        List with the mean training loss of each epoch.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if not data:
        raise ValueError("data must contain at least one (prompt, completion) pair")

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    # End every example with EOS so the model is trained to stop after the
    # answer instead of generating until the token limit.
    eos = tokenizer.eos_token or ""
    losses = []
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for prompt, completion in data:
            text = f"Question: {prompt}\n{completion}{eos}"
            # Each step sees a single example, so there is nothing to pad
            # against; padding to max_length would only add masked positions.
            tokens = tokenizer(text, return_tensors="pt", truncation=True,
                               max_length=256)
            tokens = {k: v.to(device) for k, v in tokens.items()}
            # No padding means every position is a real target.
            labels = tokens["input_ids"].clone()
            loss = model(**tokens, labels=labels).loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        losses.append(epoch_loss / len(data))
        if (epoch+1) % 10 == 0:
            print(f"SFT Epoch {epoch+1}, Loss: {losses[-1]:.4f}")
    return losses

print("Phase 1: SFT Training")
sft_losses = run_sft(model, tokenizer, sft_data)

### Visualization Checkpoint: SFT Loss

In [None]:
# Plot the mean SFT loss recorded for each epoch.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sft_losses, color='#2196F3', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('SFT Training Loss')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### 4.3 Save Reference Model

In [None]:
# Snapshot the SFT model as the reference policy: an independent deep copy,
# switched to eval mode, with gradients disabled on every parameter.
ref_model = copy.deepcopy(model).eval()
ref_model.requires_grad_(False)
print("Reference model saved (frozen)")

### 4.4 Phase 2: GRPO Training

In [None]:
def generate(model, tokenizer, prompt, max_new=128):
    """Generate completion from model.

    Samples one continuation of ``"Question: <prompt>\\n"`` at temperature 0.8.
    The model is put in eval mode and left there.

    Args:
        model: Causal language model to sample from.
        tokenizer: Tokenizer matching ``model``.
        prompt: Question text, without the "Question: " prefix.
        max_new: Maximum number of new tokens to sample.

    Returns:
        Tuple ``(completion, token_ids)``. ``completion`` is the decoded text
        of the newly generated tokens only, with special tokens kept.
        ``token_ids`` is the first sequence returned by ``model.generate``,
        i.e. the prompt tokens followed by the generated tokens.
    """
    text = f"Question: {prompt}\n"
    inputs = tokenizer(text, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]
    model.eval()
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new,
                             do_sample=True, temperature=0.8,
                             pad_token_id=tokenizer.eos_token_id)
    sequence = out[0]
    # Cut the prompt off at the token level. Slicing the decoded string by
    # len(text) is only correct when decoding reproduces the prompt
    # character-for-character, which the tokenizer does not guarantee.
    completion = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=False)
    return completion, sequence

def log_prob(model, toks):
    """Score a token sequence under ``model``.

    Puts the model in eval mode, runs it once without gradient tracking, and
    adds up the log-probability assigned to each token given the tokens before
    it (the first token has no prediction and is not scored).

    Args:
        model: Causal language model used for scoring.
        toks: 1-D tensor of token ids.

    Returns:
        The summed log-probability as a Python float.
    """
    model.eval()
    token_ids = toks.to(device)
    with torch.no_grad():
        all_logits = model(token_ids.unsqueeze(0)).logits
        # The logits at position t score the token at position t + 1, so the
        # last position has no target and the first token has no prediction.
        next_token_logits = all_logits[0, :-1]
        next_tokens = token_ids[1:]
        log_probs = F.log_softmax(next_token_logits, dim=-1)
        picked = torch.gather(log_probs, 1, next_tokens.unsqueeze(1))
    return picked.sum().item()

## 5. Your Turn -- TODO Exercises

### TODO 1: Complete the GRPO Training Loop

In [None]:
def grpo_train(model, ref_model, tokenizer, problems,
               steps=50, G=4, beta=0.05, lr=1e-6):
    """GRPO training loop."""
    optimizer = AdamW(model.parameters(), lr=lr)
    rewards_hist, losses_hist = [], []

    for step in range(steps):
        idx = np.random.randint(len(problems))
        prompt, truth = problems[idx]

        # Generate G completions
        comps, toks_list = [], []
        for _ in range(G):
            c, t = generate(model, tokenizer, prompt)
            comps.append(c); toks_list.append(t)

        # Compute rewards
        rewards = torch.tensor([compute_reward(c, truth) for c in comps])

        # ============ TODO ============
        # Compute group-relative advantages
        if rewards.std() < 1e-8:
            advantages = torch.zeros_like(rewards)
        else:
            advantages = ???  # YOUR CODE: (rewards - mean) / std

        # Compute log probs
        lp_curr = torch.tensor([log_prob(model, t) for t in toks_list])
        lp_ref = torch.tensor([log_prob(ref_model, t) for t in toks_list])

        # KL penalty
        kl = (lp_curr - lp_ref)
        adj_rewards = rewards - beta * kl
        if adj_rewards.std() > 1e-8:
            advantages = (adj_rewards - adj_rewards.mean()) / (adj_rewards.std() + 1e-8)

        # Policy gradient update (simplified)
        loss = ???  # YOUR CODE: -(lp_curr * advantages).mean()
        # ==============================

        if hasattr(loss, 'backward'):
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        rewards_hist.append(rewards.mean().item())
        if (step+1) % 10 == 0:
            print(f"Step {step+1}, Avg Reward: {np.mean(rewards_hist[-10:]):.3f}")

    return rewards_hist

print("Phase 2: GRPO Training")
rl_rewards = grpo_train(model, ref_model, tokenizer, rl_problems)

In [None]:
# Verification: GRPO must have logged at least one step; report the mean
# reward over the last (up to) ten steps.
assert len(rl_rewards) > 0, "No rewards recorded"
final_avg_reward = np.mean(rl_rewards[-10:])
print(f"Training complete! Final avg reward: {final_avg_reward:.3f}")

### TODO 2: Compare Pre-RL vs Post-RL Outputs

In [None]:
# ============ TODO ============
# Generate from both ref_model and model on rl_problems[:4]
# Compare reasoning quality
# for prompt, truth in rl_problems[:4]:
#     before, _ = generate(ref_model, tokenizer, prompt)
#     after, _ = generate(model, tokenizer, prompt)
#     print(f"Q: {prompt} (Truth: {truth})")
#     print(f"Before: {before[:150]}")
#     print(f"After:  {after[:150]}\n")
# ==============================

## 6. Putting It All Together

In [None]:
# Sample one completion per RL problem from the trained model and grade it.
print("=== Final Evaluation ===\n")
correct = 0
total = len(rl_problems)
for question, expected in rl_problems:
    completion, _ = generate(model, tokenizer, question)
    reward = compute_reward(completion, expected)
    verdict = 'CORRECT' if reward else 'WRONG'
    correct += int(reward)
    print(f"Q: {question} | Truth: {expected} | {verdict}")
    # Show at most the first 200 characters of the completion.
    print(f"  {completion[:200]}\n")
accuracy_pct = 100 * correct / total
print(f"Accuracy: {correct}/{total} ({accuracy_pct:.0f}%)")

## 7. Training and Results

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: SFT loss per epoch.
ax1.plot(sft_losses, color='#2196F3', linewidth=2)
ax1.set(xlabel='Epoch', ylabel='Loss', title='Phase 1: SFT')
ax1.grid(True, alpha=0.3)

# Right panel: trailing mean of the GRPO reward. Each point averages the
# current step together with up to `w` earlier steps.
w = 5
sm = [
    np.mean(rl_rewards[max(0, i - w):i + 1])
    for i in range(len(rl_rewards))
]
ax2.plot(sm, color='#4CAF50', linewidth=2)
ax2.set(xlabel='Step', ylabel='Avg Reward', title='Phase 2: GRPO',
        ylim=(-0.1, 1.1))
ax2.grid(True, alpha=0.3)

fig.suptitle('Complete Training Pipeline', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()

## 8. Final Output

We trained a reasoning model through the full two-phase pipeline:
1. **SFT:** Learned the `<think>` format from worked examples
2. **GRPO:** Learned which reasoning strategies lead to correct answers

Even with GPT-2 (124M params) and 8 toy problems, we observe the model developing reasoning behavior. At DeepSeek-R1 scale (a 671B-parameter model trained with RL on large sets of verifiable math and coding problems), the same recipe produces remarkable emergent reasoning.

## 9. Reflection and Next Steps

### Think About This
1. How would a larger model and dataset change the results?
2. DeepSeek-R1-Zero showed that RL alone (no SFT) can work. Why?
3. What happens if we increase the group size G from 4 to 64?

### What Comes Next
Notebook 4 covers emergent behaviors (self-verification, backtracking) and distillation from large to small models.

### Key Takeaway
SFT provides scaffolding (format). RL fills it with substance (correctness). Together they produce genuine reasoning.