In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the random number generators

import torch
import sys

# Choose the compute device first, then report what was found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cpu':
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {gpu_props.total_memory / 1e9:.1f} GB")

# Interpreter and framework versions, useful when comparing runs.
print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

# Seed every random number generator the notebook draws from.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)
# Also seed the CUDA generators when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Training a Reasoning Model with GRPO -- Vizuara

## 1. Why Does This Matter?

DeepSeek-R1 demonstrated something remarkable: a language model can learn to reason step by step purely through reinforcement learning, without being explicitly taught chain-of-thought reasoning. The key ingredient? **GRPO with verifiable rewards**.

In this notebook, we will replicate this idea at a small scale. We will train a tiny transformer model to solve simple arithmetic problems (addition and multiplication) using GRPO. The reward is binary -- 1 if the answer is correct, 0 if wrong. No reward model needed. No critic network needed.

By the end, you will have a working GRPO training pipeline that you can scale up.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import re
import time

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2. Building Intuition

### The Simplest RL Setup

Imagine teaching a child arithmetic. You give them a problem: "What is 7 + 5?"

- If they answer "12" -- correct! Reward = 1.
- If they answer "11" -- wrong. Reward = 0.

Now imagine you give them the SAME problem 8 times, and they give different answers each time (because they are still learning and sometimes guess):

| Attempt | Answer | Correct? | Reward |
|---------|--------|----------|--------|
| 1       | 12     | Yes      | 1      |
| 2       | 11     | No       | 0      |
| 3       | 12     | Yes      | 1      |
| 4       | 13     | No       | 0      |
| 5       | 12     | Yes      | 1      |
| 6       | 10     | No       | 0      |
| 7       | 12     | Yes      | 1      |
| 8       | 11     | No       | 0      |

GRPO normalizes these rewards: mean = 0.5, std = 0.5 (population standard deviation).
- Correct answers get advantage = +1.0
- Wrong answers get advantage = -1.0

The model learns: "increase the probability of generating '12', decrease everything else." This is exactly what we want.

In [None]:
# Demonstrate the binary reward setup
rewards = torch.tensor([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0])
mean_r = rewards.mean()
# Use the population standard deviation (unbiased=False). torch's default
# applies Bessel's correction, which would give 0.535 here and advantages of
# about +/-0.94 instead of the std = 0.5 and +/-1.0 quoted in the text.
std_r = rewards.std(unbiased=False)
advantages = (rewards - mean_r) / std_r

print("Binary rewards:", rewards.numpy())
print(f"Mean: {mean_r:.1f}, Std: {std_r:.3f}")
print(f"Advantages: {advantages.numpy().round(2)}")
print(f"\nCorrect answers: advantage = {advantages[0]:.2f}")
print(f"Wrong answers:   advantage = {advantages[1]:.2f}")

## 3. The Mathematics

### GRPO with Verifiable Rewards

The full GRPO objective with binary verifiable rewards:

$$\mathcal{J}(\theta) = \mathbb{E}\left[\frac{1}{G}\sum_{i=1}^G \frac{1}{|o_i|}\sum_{t=1}^{|o_i|} \left(\min(r_{i,t}\hat{A}_i, \text{clip}(r_{i,t}, 1-\epsilon, 1+\epsilon)\hat{A}_i) - \beta D_\text{KL}\right)\right]$$

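where $\hat{A}_i = \frac{r_i - \mu}{\sigma}$ is the group-normalized advantage, $r_i \in \{0, 1\}$ is the reward of response $o_i$ (correct or not), and $\mu$, $\sigma$ are the mean and standard deviation of the $G$ rewards in the group. Do not confuse the reward $r_i$ with $r_{i,t}$, the token-level probability ratio between the current and the old policy: $r_{i,t} = \pi_\theta(o_{i,t} \mid q, o_{i,<t}) \,/\, \pi_{\theta_\text{old}}(o_{i,t} \mid q, o_{i,<t})$.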

Let us plug in numbers for G=4 with rewards $\{1, 0, 1, 0\}$:
- $\mu = 0.5$, $\sigma = 0.5$
- Advantages: $\{(1-0.5)/0.5, (0-0.5)/0.5, ...\} = \{+1, -1, +1, -1\}$

With ratio $r_{i,t} = 1.1$ for a correct response:
- Unclipped: $1.1 \times 1.0 = 1.1$
- Clipped: $\text{clip}(1.1, 0.8, 1.2) \times 1.0 = 1.1$
- No clipping needed (ratio within bounds). The correct response gets reinforced.

In [None]:
# Numerical sanity check of the clipped surrogate for one token
ratio, A, epsilon = 1.1, 1.0, 0.2

lower_bound = 1 - epsilon
upper_bound = 1 + epsilon

unclipped = ratio * A
# Clamp the ratio into [1 - epsilon, 1 + epsilon] before weighting by the advantage.
clipped = min(max(ratio, lower_bound), upper_bound) * A
# The surrogate keeps the smaller (more pessimistic) of the two terms.
objective = clipped if clipped < unclipped else unclipped

print(f"Ratio: {ratio}")
print(f"Advantage: {A}")
print(f"Unclipped: {unclipped}")
print(f"Clipped:   {clipped}")
print(f"Objective: {objective}")
print(f"\nThis positive objective reinforces the correct response.")

## 4. Let's Build It -- Component by Component

### Component 1: The Arithmetic Dataset

In [None]:
class ArithmeticDataset:
    """
    Generate simple arithmetic problems for GRPO training.
    Each problem has a verifiable answer.

    A problem is rendered as ``"{a}{op}{b}="`` where both operands are drawn
    uniformly from ``[1, max_num]`` and ``op`` is picked uniformly from
    ``operations``. The answer is returned as a decimal string.

    Args:
        max_num: Largest operand value (inclusive). Must be at least 1.
        operations: Iterable of operator symbols to sample from. Only the
            symbols in ``SUPPORTED_OPERATIONS`` are accepted. Defaults to
            addition only.

    Raises:
        ValueError: If ``max_num`` is below 1, ``operations`` is empty, or it
            contains an unsupported operator.
    """

    SUPPORTED_OPERATIONS = ('+', '*')

    def __init__(self, max_num=20, operations=None):
        # A None sentinel plus a copy avoids sharing one mutable default list
        # between every instance (and with the caller's own list).
        operations = ['+'] if operations is None else list(operations)
        if not operations:
            raise ValueError("operations must contain at least one operator")
        unsupported = [op for op in operations if op not in self.SUPPORTED_OPERATIONS]
        if unsupported:
            raise ValueError(
                f"Unsupported operations {unsupported}; "
                f"supported: {list(self.SUPPORTED_OPERATIONS)}"
            )
        if max_num < 1:
            raise ValueError("max_num must be at least 1")

        self.max_num = max_num
        self.operations = operations

    def generate_problem(self):
        """Generate a random arithmetic problem.

        Returns:
            (problem, answer): the prompt string ending in '=' and the correct
            result as a string.

        Raises:
            ValueError: If the sampled operator is not supported (only
                possible when ``self.operations`` was modified after
                construction).
        """
        # Draw order (a, b, op) is kept stable so seeded runs stay reproducible.
        a = np.random.randint(1, self.max_num + 1)
        b = np.random.randint(1, self.max_num + 1)
        op = str(np.random.choice(self.operations))

        if op == '+':
            answer = a + b
        elif op == '*':
            answer = a * b
        else:
            # Never label an unknown operator with a made-up answer: a wrong
            # "verifiable" label would silently corrupt the reward signal.
            raise ValueError(f"Unsupported operation: {op!r}")

        problem = f"{a}{op}{b}="
        return problem, str(answer)

    def generate_batch(self, batch_size=16):
        """Generate a batch of problems.

        Returns:
            (problems, answers): two lists of length ``batch_size``, aligned
            by index.
        """
        pairs = [self.generate_problem() for _ in range(batch_size)]
        problems = [problem for problem, _ in pairs]
        answers = [answer for _, answer in pairs]
        return problems, answers

# Smoke-test the dataset: print five sampled problems with their answers
dataset = ArithmeticDataset(max_num=15, operations=['+'])
for problem, answer in zip(*dataset.generate_batch(5)):
    print(f"  {problem}{answer}")

### Component 2: Character-Level Tokenizer

In [None]:
class CharTokenizer:
    """Character-level tokenizer for arithmetic expressions.

    The vocabulary is the ten digits, '+', '=', '*', and the markers '<' and
    '>'. One extra id, equal to ``vocab_size``, serves as the padding id and
    is also what ``encode`` emits for any character outside the vocabulary.
    """

    def __init__(self):
        self.chars = [*"0123456789+=*<>"]  # '<' marks start, '>' marks end
        self.char_to_id = {}
        self.id_to_char = {}
        for index, symbol in enumerate(self.chars):
            self.char_to_id[symbol] = index
            self.id_to_char[index] = symbol
        self.vocab_size = len(self.chars)
        self.pad_id = self.vocab_size  # first id past the real vocabulary
        self.start_id = self.char_to_id['<']
        self.end_id = self.char_to_id['>']

    def encode(self, text):
        """Map each character of ``text`` to its id (unknown characters -> pad_id)."""
        table = self.char_to_id
        unknown = self.pad_id
        return [table[ch] if ch in table else unknown for ch in text]

    def decode(self, ids):
        """Map ids back to text, silently dropping ids outside the vocabulary."""
        pieces = []
        for token in ids:
            if token in self.id_to_char:
                pieces.append(self.id_to_char[token])
        return ''.join(pieces)

tokenizer = CharTokenizer()
sample_ids = tokenizer.encode('7+5=12')  # encode once, reuse for the round trip
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Encode '7+5=12': {sample_ids}")
print(f"Decode back: {tokenizer.decode(sample_ids)}")

### Component 3: Tiny Transformer Model

In [None]:
class TinyTransformer(nn.Module):
    """
    A small transformer for character-level arithmetic.

    Token embeddings plus learned positional embeddings feed a stack of
    ``nn.TransformerEncoderLayer`` blocks run under a causal mask, followed by
    a linear head over the vocabulary. With the default sizes this is roughly
    70K parameters -- trainable in minutes on a CPU.

    Args:
        vocab_size: Number of token ids, including any padding id.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per layer.
        n_layers: Number of encoder layers.
        max_len: Longest sequence the positional embedding can represent.
    """

    def __init__(self, vocab_size=16, d_model=64, n_heads=4, n_layers=2, max_len=20):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=128,
            dropout=0.1, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Forward pass.

        Args:
            x: (batch, seq_len) tensor of token ids, with seq_len <= max_len.
        Returns:
            (batch, seq_len, vocab_size) next-token logits.
        Raises:
            ValueError: If seq_len exceeds ``max_len``.
        """
        _, T = x.shape
        if T > self.max_len:
            # Fail with a readable message instead of an out-of-range
            # positional-embedding lookup.
            raise ValueError(
                f"Sequence length {T} exceeds max_len={self.max_len}"
            )
        pos = torch.arange(T, device=x.device).unsqueeze(0)

        # Causal mask: position t may only attend to positions <= t
        mask = nn.Transformer.generate_square_subsequent_mask(T).to(x.device)

        h = self.embedding(x) + self.pos_embedding(pos)
        h = self.transformer(h, mask=mask, is_causal=True)
        logits = self.head(h)
        return logits

    def generate(self, prompt_ids, max_new_tokens=5, temperature=1.0):
        """Generate tokens autoregressively.

        Args:
            prompt_ids: (batch, prompt_len) tensor of token ids.
            max_new_tokens: Upper bound on appended tokens; generation also
                stops once the sequence reaches ``max_len``.
            temperature: Softmax temperature for sampling. A value <= 0
                selects greedy (argmax) decoding instead of dividing by zero.
        Returns:
            (batch, prompt_len + n_generated) tensor: the prompt followed by
            the generated tokens.
        """
        generated = prompt_ids.clone()

        for _ in range(max_new_tokens):
            if generated.shape[1] >= self.max_len:
                break
            next_logits = self.forward(generated)[:, -1, :]
            if temperature <= 0:
                # Greedy decoding is the temperature -> 0 limit of sampling.
                next_token = next_logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(next_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)

        return generated

# Create model
model = TinyTransformer(vocab_size=tokenizer.vocab_size + 1, d_model=64, n_heads=4, n_layers=2).to(device)
ref_model = TinyTransformer(vocab_size=tokenizer.vocab_size + 1, d_model=64, n_heads=4, n_layers=2).to(device)
ref_model.load_state_dict(model.state_dict())  # Reference = copy of initial model

# Make the reference genuinely frozen: eval() turns its dropout off so the
# reference log-probs are deterministic, and requires_grad_(False) keeps its
# weights out of any optimizer or backward pass.
ref_model.eval()
ref_model.requires_grad_(False)

param_count = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {param_count:,}")
print(f"Reference model: frozen copy")

### Component 4: Verifiable Reward Function

In [None]:
def compute_reward(problem: str, generated: str, correct_answer: str) -> float:
    """
    Binary verifiable reward: 1.0 when the digits of ``generated`` equal the
    digits of ``correct_answer``, otherwise 0.0. No reward model is involved.

    Every non-digit character is discarded from both strings before the
    comparison, so ``"12>"`` matches ``"12"`` -- and so does ``"1+2"``.
    ``problem`` is accepted for interface symmetry but is not inspected.
    """
    def digits_only(text):
        return ''.join(filter(str.isdigit, text))

    return 1.0 if digits_only(generated) == digits_only(correct_answer) else 0.0

# Test
for candidate in ("12", "11", "12>"):  # Should be 1.0, 0.0, 1.0
    print(compute_reward("7+5=", candidate, "12"))

### Component 5: GRPO Training Functions

In [None]:
def get_log_probs(model, input_ids, target_ids):
    """
    Score each next token of ``target_ids`` under ``model``.

    The shift is done here: the model sees ``input_ids`` without its last
    position, and position ``t`` of the output is the log-probability of
    ``target_ids[:, t + 1]``.

    Args:
        model: Callable mapping (batch, T) token ids to (batch, T, vocab) logits
        input_ids: (batch, seq_len) token ids fed to the model
        target_ids: (batch, seq_len) token ids to score (unshifted)
    Returns:
        (batch, seq_len-1) log-probabilities, one per predicted token
    """
    context = input_ids[:, :-1]
    next_tokens = target_ids[:, 1:]

    vocab_log_probs = model(context).log_softmax(dim=-1)  # (batch, seq_len-1, vocab)

    # Pick out, at every position, the entry belonging to the actual next token
    picked = vocab_log_probs.gather(2, next_tokens[..., None])
    return picked[..., 0]


def compute_grpo_advantages(rewards):
    """Group-relative advantage computation.

    Normalizes the rewards of one group to zero mean and unit scale,
    ``(r - mean) / std``, using the population standard deviation so that the
    result matches the worked examples in this notebook (rewards {1, 0, 1, 0}
    give advantages {+1, -1, +1, -1}).

    Args:
        rewards: 1-D tensor with one reward per sampled completion.
    Returns:
        Tensor of the same shape. All zeros when the rewards carry no signal
        (identical values, or a group of one).
    """
    mean_r = rewards.mean()
    # unbiased=False: torch's default (Bessel-corrected) std is NaN for a
    # single-element group and slightly shrinks the advantages for small G.
    std_r = rewards.std(unbiased=False)
    if std_r < 1e-8:
        return torch.zeros_like(rewards)
    return (rewards - mean_r) / std_r


def grpo_loss(log_probs, old_log_probs, ref_log_probs, advantages, mask, epsilon=0.2, beta=0.04):
    """Compute the GRPO loss.

    Combines the clipped policy-gradient surrogate with a KL penalty towards
    the reference policy, averages over the unmasked tokens of each response,
    then over responses, and negates the result so it can be minimized.

    Args:
        log_probs: Per-token log-probs under the current policy (carries gradients).
        old_log_probs: Per-token log-probs under the sampling policy.
        ref_log_probs: Per-token log-probs under the reference policy.
        advantages: One advantage per response; broadcast across its tokens.
        mask: Same shape as ``log_probs``; 1.0 for tokens that count, 0.0 otherwise.
        epsilon: Clipping range for the probability ratio.
        beta: Weight of the KL penalty.
    Returns:
        Scalar loss tensor.
    """
    ratio = torch.exp(log_probs - old_log_probs)
    adv = advantages.unsqueeze(1)

    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * adv
    policy_obj = torch.min(surr1, surr2)

    # KL estimate exp(d) - d - 1 with d = log(pi_ref / pi_theta). Use d
    # directly rather than log(exp(d)): when exp(d) underflows to 0 the
    # round trip yields -inf and turns a finite penalty into inf.
    log_kl_ratio = ref_log_probs - log_probs
    kl = torch.exp(log_kl_ratio) - log_kl_ratio - 1.0

    per_token = policy_obj - beta * kl
    # Mean over counted tokens; clamp guards responses with an empty mask
    per_response = (per_token * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return -per_response.mean()

# Confirmation printed once the helper definitions in this cell have run.
print("All GRPO training components ready!")

## 5. Your Turn

### TODO 1: Implement the Full GRPO Training Step

In [None]:
def grpo_training_step(model, ref_model, tokenizer, dataset, optimizer,
                        G=8, max_new_tokens=4, epsilon=0.2, beta=0.04):
    """
    Run one complete GRPO training step on a single freshly sampled problem.

    Steps:
    1. Generate a problem from the dataset
    2. Encode the problem as prompt tokens
    3. Generate G completions from the model
    4. Compute rewards (binary: correct/incorrect)
    5. Compute group-relative advantages
    6. Compute log probs under current, old, and ref policies
    7. Compute GRPO loss and backpropagate

    Args:
        model: Policy being trained; put into train mode by this call.
        ref_model: Reference policy for the KL penalty; put into eval mode
            by this call so its log-probs are not perturbed by dropout.
        tokenizer: Provides ``encode`` / ``decode``.
        dataset: Provides ``generate_problem() -> (problem, answer)``.
        optimizer: Optimizer over ``model``'s parameters.
        G: Number of completions sampled for the problem.
        max_new_tokens: Tokens generated per completion.
        epsilon: Clipping range passed to ``grpo_loss``.
        beta: KL weight passed to ``grpo_loss``.

    Returns:
        (loss, mean_reward, accuracy) as Python floats. If every completion
        receives the same reward there is no learning signal: the update is
        skipped and the loss is reported as 0.0, while mean_reward and
        accuracy still describe the sampled group.
    """
    model.train()
    # The reference must be deterministic; in train mode its dropout layers
    # would inject noise into the KL term on every step.
    ref_model.eval()
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device

    # Step 1: Generate problem
    problem, correct_answer = dataset.generate_problem()

    # Step 2: Encode prompt
    prompt_ids = torch.tensor([tokenizer.encode(problem)], device=device)
    prompt_len = prompt_ids.shape[1]

    # Step 3: Generate G completions (sampling only, no gradients needed)
    completions = []
    with torch.no_grad():
        for _ in range(G):
            comp = model.generate(prompt_ids, max_new_tokens=max_new_tokens, temperature=1.0)
            completions.append(comp)

    # Step 4: Compute rewards
    rewards = []
    for comp in completions:
        generated_text = tokenizer.decode(comp[0, prompt_len:].tolist())
        rewards.append(compute_reward(problem, generated_text, correct_answer))
    rewards = torch.tensor(rewards, dtype=torch.float32, device=device)

    mean_reward = rewards.mean().item()
    accuracy = rewards.sum().item() / G

    # Step 5: Compute advantages
    advantages = compute_grpo_advantages(rewards)

    # Skip if all rewards are the same (nothing to learn) or the advantages
    # are not finite. Accuracy is still reported truthfully: an all-correct
    # group must not be logged as 0% accurate.
    if not torch.isfinite(advantages).all() or advantages.abs().sum() < 1e-8:
        return 0.0, mean_reward, accuracy

    # Step 6: Compute log probs
    # Pad completions to same length
    max_len = max(c.shape[1] for c in completions)
    padded = torch.zeros(G, max_len, dtype=torch.long, device=device)
    mask = torch.zeros(G, max_len - 1, device=device)

    for i, comp in enumerate(completions):
        seq_len = comp.shape[1]
        padded[i, :seq_len] = comp[0]
        # Log-prob index t scores token t + 1, so the completion tokens at
        # positions prompt_len .. seq_len-1 live at indices
        # prompt_len-1 .. seq_len-2. Starting at prompt_len would drop the
        # first generated token from the loss.
        mask[i, prompt_len - 1:seq_len - 1] = 1.0

    log_probs = get_log_probs(model, padded, padded)
    with torch.no_grad():
        # Single on-policy update: the "old" policy is the current one, detached
        old_log_probs = log_probs.detach()
        ref_log_probs = get_log_probs(ref_model, padded, padded)

    # Step 7: Compute loss and backprop
    loss = grpo_loss(log_probs, old_log_probs, ref_log_probs, advantages, mask, epsilon, beta)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    return loss.item(), mean_reward, accuracy

# Run a single GRPO step as a smoke test and report its metrics
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
step_result = grpo_training_step(model, ref_model, tokenizer, dataset, optimizer)
loss, mean_reward, accuracy = step_result
print(f"Loss: {loss:.4f}, Mean reward: {mean_reward:.3f}, Accuracy: {accuracy:.1%}")

### TODO 2: Add Format Reward

DeepSeek-R1 uses a format reward in addition to the correctness reward. Implement a combined reward.

In [None]:
def compute_combined_reward(problem, generated, correct_answer, format_weight=0.1):
    """
    Correctness reward plus a small bonus for cleanly formatted output.

    Components:
    1. Correctness (0 or 1, weight 1.0): the value of compute_reward().
    2. Format (0 or 1, weight ``format_weight``): 1 when ``generated`` is
       non-empty and consists only of digits and the end marker '>'.

    Returns:
        correctness + format_weight * format_score
    """
    correctness = compute_reward(problem, generated, correct_answer)

    # Clean output: at least one character, and nothing but digits or '>'
    is_clean = bool(generated) and all(ch.isdigit() or ch == '>' for ch in generated)
    format_score = 1.0 if is_clean else 0.0

    return correctness + format_weight * format_score

# Test
for sample in ("12", "12???", "11"):  # correct + clean, correct but messy, wrong but clean
    print(compute_combined_reward("7+5=", sample, "12"))

## 6. Putting It All Together

In [None]:
# Full GRPO training loop
def train_grpo(model, ref_model, tokenizer, dataset, n_steps=500, G=8, log_every=50):
    """Train ``model`` with GRPO for ``n_steps`` steps.

    Creates an AdamW optimizer (lr 5e-4), calls grpo_training_step() once per
    step, and every ``log_every`` steps prints the averages over the most
    recent ``log_every`` steps.

    Returns:
        history: dict of per-step lists under 'loss', 'reward' and 'accuracy'.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
    history = defaultdict(list)
    metric_names = ('loss', 'reward', 'accuracy')

    for banner_line in ("Starting GRPO training...",
                        f"Group size G={G}, Steps={n_steps}",
                        "-" * 60):
        print(banner_line)

    start_time = time.time()

    for steps_done in range(1, n_steps + 1):
        step_metrics = grpo_training_step(
            model, ref_model, tokenizer, dataset, optimizer, G=G
        )
        for name, value in zip(metric_names, step_metrics):
            history[name].append(value)

        # Only report on every log_every-th completed step
        if steps_done % log_every != 0:
            continue

        avg_loss, avg_reward, avg_acc = (
            np.mean(history[name][-log_every:]) for name in metric_names
        )
        elapsed = time.time() - start_time

        print(f"Step {steps_done:4d} | Loss: {avg_loss:7.4f} | "
              f"Reward: {avg_reward:.3f} | Acc: {avg_acc:.1%} | "
              f"Time: {elapsed:.0f}s")

    return history

## 7. Training and Results

In [None]:
# Reset model
torch.manual_seed(42)
model = TinyTransformer(vocab_size=tokenizer.vocab_size + 1, d_model=64, n_heads=4, n_layers=2).to(device)
ref_model = TinyTransformer(vocab_size=tokenizer.vocab_size + 1, d_model=64, n_heads=4, n_layers=2).to(device)
ref_model.load_state_dict(model.state_dict())
# Freeze the reference policy: eval() disables its dropout so the KL term is
# measured against a deterministic reference, and requires_grad_(False)
# guarantees its weights are never updated.
ref_model.eval()
ref_model.requires_grad_(False)

# Train
dataset = ArithmeticDataset(max_num=10, operations=['+'])
history = train_grpo(model, ref_model, tokenizer, dataset, n_steps=500, G=8, log_every=50)

In [None]:
# Plot training curves
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

window = 30
box_filter = np.ones(window) / window  # moving-average kernel shared by all curves

loss_smooth = np.convolve(history['loss'], box_filter, mode='valid')
reward_smooth = np.convolve(history['reward'], box_filter, mode='valid')
acc_smooth = np.convolve(history['accuracy'], box_filter, mode='valid')

# One (curve, title, y-label, colour) entry per panel, left to right
panels = [
    (loss_smooth, "Training Loss", "Loss", 'red'),
    (reward_smooth, "Mean Reward", "Reward", 'blue'),
    (acc_smooth, "Accuracy", "Accuracy", 'green'),
]

for ax, (curve, title, y_label, colour) in zip(axes, panels):
    ax.plot(curve, linewidth=2, color=colour)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel("Step")
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.suptitle("GRPO Training Results", fontsize=15, fontweight='bold')
plt.tight_layout()
plt.savefig("grpo_training_results.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Evaluate the trained model
print("=" * 50)
print("Evaluation: Testing the GRPO-trained model")
print("=" * 50)

# Training leaves the model in train mode; switch to eval mode so dropout
# does not randomly perturb the predictions being scored.
model.eval()

correct = 0  # integer count, so the summary prints e.g. "17/20"
total = 20

for _ in range(total):
    problem, answer = dataset.generate_problem()
    prompt_ids = torch.tensor([tokenizer.encode(problem)]).to(device)

    with torch.no_grad():
        # Low temperature: near-greedy decoding
        output = model.generate(prompt_ids, max_new_tokens=4, temperature=0.1)

    generated = tokenizer.decode(output[0, prompt_ids.shape[1]:].tolist())
    is_correct = bool(compute_reward(problem, generated, answer))
    correct += int(is_correct)

    status = "CORRECT" if is_correct else "WRONG"
    print(f"  {problem}{answer}  |  Model: {generated:<6}  [{status}]")

print(f"\nAccuracy: {correct}/{total} = {correct/total:.0%}")

## 8. Final Output

In [None]:
# Closing summary of the pipeline built in this notebook.
print("=" * 60)
print("GRPO Training Pipeline -- Complete!")
print("=" * 60)
print()
print("What we built:")
print("  1. Character-level tokenizer for arithmetic")
# The configuration used here (vocab 16, d_model 64, 2 layers, FFN 128)
# has 70,288 parameters, so the summary says ~70K.
print("  2. Tiny transformer model (~70K params)")
print("  3. Binary verifiable reward function")
print("  4. Full GRPO training loop with:")
print("     - Group sampling (G completions per prompt)")
print("     - Group-relative advantage normalization")
print("     - Clipped surrogate objective")
print("     - KL divergence penalty")
print()
print("This is the SAME approach DeepSeek used for R1,")
print("just at a much smaller scale.")
print()
print("To scale up:")
print("  - Replace TinyTransformer with a real LLM")
print("  - Use harder math problems")
print("  - Increase G to 32-64")
print("  - Train for thousands of steps on GPUs")
print("=" * 60)

## 9. Reflection and Next Steps

**Key takeaways from this notebook:**

1. GRPO with verifiable rewards requires NO reward model and NO critic -- just a function that checks correctness.
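2. Binary rewards (correct/incorrect) produce clean, symmetric advantages after group normalization -- exactly +1/-1 when half of the group is correct; for other splits the magnitudes depend on the fraction of correct answers.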
3. Even a tiny transformer can learn simple arithmetic through GRPO.
4. The same pipeline scales to state-of-the-art reasoning models (DeepSeek-R1).

**Reflection questions:**
- How would you modify this pipeline to train on multi-step reasoning problems (where intermediate steps matter)?
- What happens if the model's accuracy is very low (e.g., 1 out of 8 correct)? Does GRPO still learn?
- What if accuracy is very high (e.g., 7 out of 8 correct)? Is there still a learning signal?
- How would you add a "format reward" to encourage the model to show its work?

**Further reading:**
- [DeepSeek-R1 paper](https://arxiv.org/abs/2501.12948)
- [DeepSeekMath paper (GRPO)](https://arxiv.org/abs/2402.03300)