In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability and seed the random number generators

import torch
import sys

# Pick the compute device once, then report what the runtime offers.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cpu':
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    print("✅ GPU available: {}".format(torch.cuda.get_device_name(0)))
    print("   Memory: {:.1f} GB".format(torch.cuda.get_device_properties(0).total_memory / 1e9))

# Version banner: interpreter first, then the framework.
print("\n📦 Python {}".format(sys.version.split()[0]))
print("🔥 PyTorch {}".format(torch.__version__))

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42

# Seed every RNG source the notebook draws from (Python, NumPy, PyTorch CPU).
for _seed_with in (random.seed, np.random.seed, torch.manual_seed):
    _seed_with(SEED)

# CUDA generators are seeded separately, and only when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("🎲 Random seed set to {}".format(SEED))

%matplotlib inline

# REINFORCE from Scratch: Variance, Baselines, and Convergence

*Part 2 of the Vizuara series on Policy Gradient Methods*
*Estimated time: 50 minutes*

## 1. Why Does This Matter?

In the previous notebook, we built a policy gradient agent. It worked — but you may have noticed that the training was noisy. The reward jumped around wildly between episodes. Sometimes the agent would seem to learn, then suddenly forget everything.

This is the **variance problem** — the central challenge in policy gradient methods. It is the reason why vanilla REINFORCE took over 100,000 steps to solve CartPole, while value-based methods can do it in under 10,000.

In this notebook, we will:
- Diagnose the variance problem with visualizations
- Build the REINFORCE algorithm from scratch
- Implement variance reduction through baselines
- Compare convergence speed with and without baselines
- See Q-value variance drop by more than 3x

The techniques we build here — baselines and advantage estimation — are the foundation of every modern RL algorithm including PPO.

## 2. Building Intuition

Imagine you want to estimate the average age of people in your country. You go out and sample 100 people.

If you happen to sample mostly from a retirement community, you will estimate the average age is 70. If you happen to sample mostly from a university, you will estimate it is 22. Each sample gives a wildly different answer — this is high variance.

Now imagine subtracting the national average (say, 35) from each person's age before computing your estimate. The values you work with are now much smaller in magnitude: +35 for the 70-year-old, -13 for the 22-year-old. The estimate is more stable.

This is exactly what a baseline does in REINFORCE. Instead of weighting actions by raw returns (which can be very large and noisy), we subtract a baseline so that the weights are centered around zero. The expected gradient does not change, but the variance drops dramatically.

### Think About This

If every trajectory in your batch has a positive return (say, returns of 90, 95, 100, 105), the gradient says "increase the probability of ALL actions." But some actions were clearly better than others. How does subtracting the mean (97.5) help differentiate between them?

## 3. The Mathematics

### 3.1 The REINFORCE Update Rule

The REINFORCE update is:

$$\theta \leftarrow \theta + \alpha \sum_{t=0}^{T-1} \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot G_t$$

This says: for each action taken, adjust the policy parameters proportionally to the log-probability gradient times the return from that timestep forward.

Let us plug in numbers. Suppose $\alpha = 0.01$, $\nabla_\theta \log \pi = 0.8$, and $G_t = 100$:

$$\Delta\theta = 0.01 \times 0.8 \times 100 = 0.8$$

That is a large update! Now suppose a different trajectory has $G_t = 90$ and $\nabla_\theta \log \pi = -0.3$:

$$\Delta\theta = 0.01 \times (-0.3) \times 90 = -0.27$$

The updates swing between +0.8 and -0.27. High variance.

### 3.2 REINFORCE with Baseline

With a baseline $b(s_t)$:

$$\theta \leftarrow \theta + \alpha \sum_{t=0}^{T-1} \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot (G_t - b(s_t))$$

Using $b = 95$ (the mean return):

$$\Delta\theta_1 = 0.01 \times 0.8 \times (100 - 95) = 0.04$$
$$\Delta\theta_2 = 0.01 \times (-0.3) \times (90 - 95) = 0.015$$

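Now the updates are +0.04 and +0.015 — far smaller in magnitude than the +0.8 and -0.27 we had before, and much more stable. The signs are also more informative: the first trajectory beat the baseline, so $\theta$ moves along its log-probability gradient (making that action more likely); the second fell short of the baseline, so $\theta$ moves *against* its gradient ($-0.3 \times -5 > 0$), making that action less likely. This is exactly what we want.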

### 3.3 The Advantage Function

The best baseline is the value function $V(s)$. The quantity $G_t - V(s_t)$ is called the **advantage**:

$$A(s_t, a_t) = G_t - V(s_t)$$

The advantage tells us: "How much better was this action compared to what we expected?" Positive advantage means better than average, negative means worse.

## 4. Let's Build It — Component by Component

### 4.1 Computing Discounted Returns

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym

# Re-seed at the start of the build section so this part can be re-run on its own.
torch.manual_seed(42)
np.random.seed(42)

# Discount factor used as the default for compute_returns.
GAMMA = 0.99

def compute_returns(rewards, gamma=GAMMA):
    """
    Compute the discounted return G_t for every timestep of one episode.

    G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...

    Args:
        rewards: sequence of per-step rewards, in time order.
        gamma: per-step discount factor (defaults to GAMMA).

    Returns:
        A list with one entry per reward, where entry t is G_t.
        An empty `rewards` yields an empty list.
    """
    returns = []
    G = 0
    # Walk backwards so each return reuses the next one: G_t = r_t + gamma * G_{t+1}.
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    # Append-then-reverse is linear; inserting at index 0 on every step
    # would shift the whole list each time.
    returns.reverse()
    return returns

# Worked example: five steps of reward 1 under the default discount (gamma = 0.99)
example_rewards = [1.0] * 5
example_returns = compute_returns(example_rewards)

print("Rewards:", example_rewards)
print("Returns:", [format(g, ".4f") for g in example_returns])
print(
    "\nNotice: earlier timesteps have higher returns because they",
    "receive more future discounted rewards.",
    sep="\n",
)

### 4.2 The Policy Network

In [None]:
class PolicyNetwork(nn.Module):
    """Policy network that maps states to action logits.

    Architecture: Linear(state_dim -> hidden_dim), ReLU,
    Linear(hidden_dim -> n_actions). The outputs are unnormalized logits;
    they are turned into a categorical distribution in `sample_action`.
    """
    def __init__(self, state_dim, n_actions, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions)
        )

    def forward(self, state):
        """Return the action logits for `state`."""
        return self.net(state)

    def sample_action(self, state):
        """Sample one action for a single state.

        Args:
            state: tensor holding one state (the sampled action is converted
                with `.item()`, so a batch of states is not supported).

        Returns:
            (action, log_prob): the sampled action as a Python number and the
            log-probability of that action as a tensor that remains attached
            to the autograd graph.
        """
        logits = self.forward(state)
        # Build the distribution straight from logits: log_prob is then a
        # log-softmax of the network output, rather than the log of an
        # already-exponentiated (and possibly underflowed) probability.
        dist = torch.distributions.Categorical(logits=logits)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

### 4.3 Episode Collection

In [None]:
def collect_episode(env, policy):
    """Roll out a single episode with `policy` and record what happened.

    Returns four parallel lists, one entry per step taken: the state the
    action was chosen in, the action, the reward received, and the
    log-probability the policy assigned to that action.
    """
    visited, chosen, earned, logp_trace = [], [], [], []
    observation, _info = env.reset()

    while True:
        obs_tensor = torch.as_tensor(observation, dtype=torch.float32)
        act, logp = policy.sample_action(obs_tensor)
        following, rew, terminated, truncated, _ = env.step(act)

        # Record the transition against the state the action was taken in.
        visited.append(observation)
        chosen.append(act)
        earned.append(rew)
        logp_trace.append(logp)

        # The episode ends on either a terminal state or a time-limit cut-off.
        if terminated or truncated:
            break
        observation = following

    return visited, chosen, earned, logp_trace

In [None]:
# Visualization: Show what a trajectory looks like
env = gym.make("CartPole-v1")
policy = PolicyNetwork(state_dim=4, n_actions=2)

# One rollout with an untrained policy is enough to illustrate the shapes.
states, actions, rewards, log_probs = collect_episode(env, policy)
# Release the environment once the episode is collected, as the training cell does for its envs.
env.close()
returns = compute_returns(rewards)

fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

# Top panel: the raw per-step reward signal.
axes[0].plot(rewards, 'g-', alpha=0.7, label='Reward per step')
axes[0].set_ylabel('Reward')
axes[0].set_title('Trajectory Anatomy')
axes[0].legend()

# Bottom panel: the discounted return from each step onward.
axes[1].plot(returns, 'b-', linewidth=2, label='Discounted Return G_t')
axes[1].set_xlabel('Time Step')
axes[1].set_ylabel('Return')
axes[1].legend()

plt.tight_layout()
plt.show()
# returns[0] is the discounted return from the first step of the episode.
print(f"Episode length: {len(rewards)} | Total return: {returns[0]:.2f}")

### 4.4 REINFORCE Training Loop

In [None]:
def train_reinforce(env, policy, optimizer, num_episodes=500, use_baseline=False):
    """
    Train using REINFORCE, optionally with a mean-return baseline.

    Each iteration collects one episode, computes its discounted returns,
    and takes one optimizer step on the policy-gradient loss.

    Args:
        env: environment passed to `collect_episode`.
        policy: policy whose log-probabilities are collected per step.
        optimizer: optimizer stepped once per episode.
        num_episodes: number of episodes (and therefore updates) to run.
        use_baseline: if True, subtract the episode's mean return from every
            return before weighting the log-probabilities.

    Returns:
        (reward_history, return_variance_history): two lists with one entry
        per episode — the undiscounted sum of rewards, and the variance of
        that episode's discounted returns.
    """
    reward_history = []
    return_variance_history = []

    for episode in range(num_episodes):
        states, actions, rewards, log_probs = collect_episode(env, policy)
        returns = compute_returns(rewards)

        returns_t = torch.tensor(returns, dtype=torch.float32)

        # Apply baseline: subtract mean return
        if use_baseline:
            baseline = returns_t.mean()
            advantages = returns_t - baseline
        else:
            advantages = returns_t

        # Track variance of the raw returns (not of `advantages`). The sample
        # variance of a single value is NaN, which would poison the running
        # averages below, so a one-step episode is recorded as 0.0.
        if returns_t.numel() > 1:
            episode_variance = returns_t.var().item()
        else:
            episode_variance = 0.0
        return_variance_history.append(episode_variance)

        # Policy gradient loss: negated because the optimizer minimizes.
        log_probs_t = torch.stack(log_probs)
        loss = -(log_probs_t * advantages).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        reward_history.append(sum(rewards))

        # Progress report every 100 episodes, averaged over the last 100.
        if (episode + 1) % 100 == 0:
            avg = np.mean(reward_history[-100:])
            var = np.mean(return_variance_history[-100:])
            print(f"Ep {episode+1:4d} | Avg Reward: {avg:.1f} | Return Var: {var:.1f}")

    return reward_history, return_variance_history

## 5. Your Turn

### TODO: Implement REINFORCE with a Learned Value Baseline

In [None]:
class ValueNetwork(nn.Module):
    """
    A neural network that estimates V(s) — the expected return from state s.
    This serves as the baseline (the critic).

    Exercise: the `???` placeholder in `__init__` must be replaced with your
    own network before this cell will run.
    """
    def __init__(self, state_dim, hidden_dim=128):
        super().__init__()
        # ============ TODO ============
        # Create a neural network with:
        # - Linear layer: state_dim -> hidden_dim
        # - ReLU activation
        # - Linear layer: hidden_dim -> 1 (single value output)
        # ==============================
        self.net = ???  # YOUR CODE HERE

    def forward(self, state):
        """Return the network output with its trailing dimension squeezed away."""
        # squeeze(-1) drops the final size-1 axis produced by the last layer,
        # so one state gives a scalar and a batch gives one value per row.
        return self.net(state).squeeze(-1)

In [None]:
# Verification: one state must map to a scalar, a batch to one value per row
value_net = ValueNetwork(state_dim=4)

# Single state -> zero-dimensional tensor
test_state = torch.randn(4)
value = value_net(test_state)
assert value.dim() == 0, f"Expected scalar output, got {value.shape}"

# Batch of ten states -> vector of ten values
test_batch = torch.randn(10, 4)
values = value_net(test_batch)
assert tuple(values.shape) == (10,), f"Expected batch output, got {values.shape}"

print("Correct! ValueNetwork produces scalar state-value estimates.")

### TODO: Compute Advantage with the Value Baseline

In [None]:
def compute_advantages(returns, states, value_net):
    """
    Compute advantages A_t = G_t - V(s_t) using the value network.

    Exercise: the two `???` placeholders below must be replaced with your
    own code before this cell will run.

    Args:
        returns: list of discounted returns
        states: list of state observations
        value_net: the value network

    Returns:
        advantages: torch tensor of advantage values
        value_loss: MSE loss for training the value network
    """
    # Convert the inputs to float32 tensors; the list of states is stacked
    # into a single array first.
    returns_t = torch.tensor(returns, dtype=torch.float32)
    states_t = torch.tensor(np.array(states), dtype=torch.float32)

    # ============ TODO ============
    # Step 1: Get value predictions V(s) from value_net (detach for advantage, not for loss)
    # Step 2: Compute advantages: A_t = G_t - V(s_t)   [detach V for this]
    # Step 3: Compute value loss: MSE between returns and value predictions
    # ==============================

    advantages = ???  # YOUR CODE HERE
    value_loss = ???  # YOUR CODE HERE

    return advantages, value_loss

In [None]:
# Verification: five dummy transitions in, five advantages and one scalar loss out
dummy_returns = [5.0, 4.0, 3.0, 2.0, 1.0]
dummy_states = [np.random.randn(4) for _ in range(5)]
v_net = ValueNetwork(state_dim=4)

advs, v_loss = compute_advantages(dummy_returns, dummy_states, v_net)

assert tuple(advs.shape) == (5,), f"Expected 5 advantages, got {advs.shape}"
assert v_loss.dim() == 0, f"Expected scalar loss, got {v_loss.shape}"

print("Correct! Advantage computation works.")
print("Advantages: {}".format(advs.detach().numpy().round(2)))
print("Value loss: {:.4f}".format(v_loss.item()))

## 6. Putting It All Together

Let us train both methods and compare them head to head.

In [None]:
# Both runs are seeded identically (policy initialisation, action sampling,
# and the environment's own RNG) so the comparison is reproducible and the
# two policies start from the same weights.

# Train REINFORCE without baseline
print("=" * 60)
print("Training: REINFORCE (no baseline)")
print("=" * 60)

torch.manual_seed(SEED)
env1 = gym.make("CartPole-v1")
# Seeding one reset seeds the environment's RNG; the unseeded resets inside
# collect_episode then continue from that generator.
env1.reset(seed=SEED)
policy1 = PolicyNetwork(state_dim=4, n_actions=2)
opt1 = torch.optim.Adam(policy1.parameters(), lr=0.01)

rewards_no_baseline, var_no_baseline = train_reinforce(
    env1, policy1, opt1, num_episodes=500, use_baseline=False
)
env1.close()

# Train REINFORCE with baseline
print("\n" + "=" * 60)
print("Training: REINFORCE with Baseline")
print("=" * 60)

torch.manual_seed(SEED)
env2 = gym.make("CartPole-v1")
env2.reset(seed=SEED)
policy2 = PolicyNetwork(state_dim=4, n_actions=2)
opt2 = torch.optim.Adam(policy2.parameters(), lr=0.01)

rewards_with_baseline, var_with_baseline = train_reinforce(
    env2, policy2, opt2, num_episodes=500, use_baseline=True
)
env2.close()

## 7. Training and Results

In [None]:
# Compare training curves
def _plot_smoothed(ax, series, window):
    """Plot a `window`-episode moving average of each (data, label, color) entry on `ax`."""
    kernel = np.ones(window) / window
    for data, label, color in series:
        # Series shorter than the window cannot be smoothed and are skipped.
        if len(data) >= window:
            smoothed = np.convolve(data, kernel, mode='valid')
            # The first smoothed point summarises episodes 0..window-1.
            ax.plot(range(window-1, len(data)), smoothed, label=label, color=color, linewidth=2)


fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# Reward comparison
window = 20
_plot_smoothed(axes[0], [
    (rewards_no_baseline, 'REINFORCE', '#ef4444'),
    (rewards_with_baseline, 'REINFORCE + Baseline', '#3b82f6')
], window)

axes[0].axhline(y=500, color='gray', linestyle='--', alpha=0.5, label='Max Reward')
axes[0].set_ylabel('Episode Reward', fontsize=12)
axes[0].set_title('REINFORCE vs REINFORCE with Baseline', fontsize=14)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Variance comparison
_plot_smoothed(axes[1], [
    (var_no_baseline, 'REINFORCE', '#ef4444'),
    (var_with_baseline, 'REINFORCE + Baseline', '#3b82f6')
], window)

axes[1].set_xlabel('Episode', fontsize=12)
axes[1].set_ylabel('Return Variance', fontsize=12)
axes[1].set_title('Return Variance Comparison', fontsize=14)
axes[1].legend(fontsize=11)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistics
print("\n--- Convergence Statistics ---")
for name, data in [("REINFORCE", rewards_no_baseline), ("+ Baseline", rewards_with_baseline)]:
    # First episode whose trailing average (up to 21 episodes, fewer at the
    # start) exceeds 450. None means the threshold was never reached, which
    # must not be reported as if it happened at the last episode.
    first_hit = next(
        (i for i in range(len(data)) if np.mean(data[max(0, i-20):i+1]) > 450),
        None,
    )
    avg_final = np.mean(data[-50:]) if len(data) >= 50 else np.mean(data)
    if first_hit is None:
        hit_text = "First 450+ avg: not reached"
    else:
        hit_text = f"First 450+ avg at ep {first_hit:4d}"
    print(f"  {name:12s}: {hit_text} | Final 50-ep avg: {avg_final:.1f}")

## 8. Final Output

In [None]:
# Final comparison summary
print("=" * 60)
print("EXPERIMENT SUMMARY: REINFORCE vs Baseline")
print("=" * 60)

# Key metrics, each averaged over the last 100 episodes of a run.
avg_no_bl = np.mean(rewards_no_baseline[-100:])
avg_bl = np.mean(rewards_with_baseline[-100:])
var_no_bl = np.mean(var_no_baseline[-100:])
var_bl = np.mean(var_with_baseline[-100:])

print("\nFinal 100-episode average reward:")
print(f"  REINFORCE:            {avg_no_bl:.1f}")
print(f"  REINFORCE + Baseline: {avg_bl:.1f}")
print("\nFinal 100-episode average return variance:")
print(f"  REINFORCE:            {var_no_bl:.1f}")
print(f"  REINFORCE + Baseline: {var_bl:.1f}")
# max(..., 1) keeps the ratio finite when the no-baseline variance is near zero.
print(f"  Variance reduction:   {(1 - var_bl/max(var_no_bl, 1))*100:.1f}%")

# State the conclusion the numbers actually support instead of asserting it
# unconditionally.
if var_bl < var_no_bl and avg_bl > avg_no_bl:
    print("\nConclusion: Baseline reduces variance and improves convergence.")
else:
    print("\nConclusion: In this run the baseline did not both lower return variance "
          "and raise average reward. Try more episodes or several seeds before "
          "drawing conclusions.")
print("Congratulations! You have built REINFORCE with baseline from scratch!")

## 9. Reflection and Next Steps

### Reflection Questions
1. Why does subtracting a baseline NOT change the expected gradient? (Hint: the baseline does not depend on the action, so its gradient contribution is zero in expectation.)
2. If you used a very bad baseline (e.g., a constant of 1,000,000), would it help or hurt? Why?
3. The advantage function $A(s,a) = G_t - V(s)$ can be negative. What does a negative advantage tell us about the action taken?

### Optional Challenges
1. Implement a learned value baseline (use a separate neural network to estimate $V(s)$) instead of the mean return baseline.
2. Try normalizing the advantages to have zero mean and unit variance before computing the gradient. Does this help?
3. Experiment with different discount factors (gamma = 0.9, 0.99, 0.999) and compare convergence.