In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the random number generators

import torch
import sys

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')

if not has_gpu:
    # Point the reader at the runtime setting that enables a GPU.
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✅ GPU available: {gpu_name}")
    print(f"   Memory: {gpu_memory_gb:.1f} GB")

# Report the interpreter and framework versions in use.
python_version = sys.version.split()[0]
print(f"\n📦 Python {python_version}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42

# Seed every RNG this notebook draws from (Python, NumPy, PyTorch CPU) with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# The CUDA generators are seeded only when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Your First RL Agent with Gymnasium -- Vizuara

## 1. Why Does This Matter?

Theory without practice is incomplete. In the previous notebooks, we built MDPs from scratch and learned the mathematical framework. Now it is time to use **Gymnasium** -- the maintained successor to OpenAI Gym and the standard library that provides hundreds of pre-built RL environments with a unified interface.

By the end of this notebook, you will:
- Understand the Gymnasium API (make, reset, step, render)
- Explore the CartPole environment hands-on, and see how Lunar Lander's reward is shaped
- See why a random agent fails miserably
- Build a simple heuristic agent that uses domain knowledge
- Implement a basic Q-learning agent that learns from scratch

## 2. Building Intuition

Imagine you are given the controls to a lunar lander. You have four buttons: do nothing, fire left thruster, fire main engine, fire right thruster. You can see your position, velocity, angle, and whether your legs are touching the ground. Your goal: land safely on the pad.

If you press buttons randomly, you will crash. If you follow some hand-coded rules ("if tilting left, fire right thruster"), you might do okay. But the best approach? Let the agent learn the right strategy from experience.

This is exactly what Gymnasium lets us do.

In [None]:
# Install Gymnasium (run this once)
# pip is invoked through sys.executable so the package is installed for the same
# interpreter that is running this cell; check_call raises if the install fails.
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gymnasium[classic_control]', '-q'])

# These imports must stay below the install step: gymnasium may not exist before it runs.
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

print("Gymnasium installed and imported.")
print(f"Gymnasium version: {gym.__version__}")

In [None]:
# Let us explore what Gymnasium offers
# List some available environments
env_categories = {
    'Classic Control': ['CartPole-v1', 'MountainCar-v0', 'Acrobot-v1', 'Pendulum-v1'],
    'Toy Text': ['FrozenLake-v1', 'Taxi-v3', 'CliffWalking-v0'],
}

print("Sample Gymnasium Environments:")
print("=" * 50)
for category, envs in env_categories.items():
    print(f"\n{category}:")
    for env_name in envs:
        # Any failure (unknown id, missing optional dependency, ...) is reported
        # inline so that one unavailable environment does not stop the listing.
        try:
            env = gym.make(env_name)
            try:
                print(f"  {env_name}")
                print(f"    Action space:      {env.action_space}")
                print(f"    Observation space: {env.observation_space}")
            finally:
                # Close the environment even if describing it raised, so it is never leaked.
                env.close()
        except Exception as e:
            print(f"  {env_name}: (not available: {e})")

## 3. The Mathematics

The Gymnasium API implements the MDP interface we built in Notebook 2:

At each time step, `env.step(action)` returns:
- **observation** ($s_{t+1}$) -- the new state
- **reward** ($r_{t+1}$) -- the immediate reward
- **terminated** -- whether a terminal state was reached
- **truncated** -- whether the episode was cut short (for example by a time limit) without reaching a terminal state
- **info** -- diagnostic information

The agent's goal remains:

$$\max_{\pi} \; \mathbb{E}_{\pi}\left[\sum_{t=0}^{T-1} \gamma^t r_{t+1}\right]$$

For **CartPole**: $r_t = +1$ for every step the pole stays upright (episodic, $\gamma = 1$).

For **Lunar Lander**: $r_t$ is shaped -- you get partial credit for orientation, velocity, and leg contact, plus +100 for landing safely and -100 for crashing.

In [None]:
# Let us examine CartPole in detail
env = gym.make('CartPole-v1')
obs_space = env.observation_space

# One entry per observation component, in index order:
# (label padded to a common width, float format used for its bounds).
obs_components = (
    ("Cart position", ".1f"),
    ("Cart velocity", ".1f"),
    ("Pole angle (rad)", ".4f"),
    ("Pole angular vel", ".1f"),
)

print("CartPole-v1 Environment Details")
print("=" * 50)
print(f"Action space: {env.action_space}")
for action_id, direction in enumerate(("LEFT", "RIGHT")):
    print(f"  - {action_id}: Push cart {direction}")
print()
print(f"Observation space: {obs_space}")
for idx, (label, fmt) in enumerate(obs_components):
    lower, upper = obs_space.low[idx], obs_space.high[idx]
    print(f"  - obs[{idx}]: {label:<18}(range: {lower:{fmt}} to {upper:{fmt}})")
print()
print("Reward: +1 for every step the pole stays upright")
print("Episode ends when: pole angle > 12 deg, cart off screen, or 500 steps")
print("Max possible return: 500")

env.close()

## 4. Let's Build It -- Component by Component

### Component 1: The Random Agent

In [None]:
def run_random_agent(env_name, n_episodes=20, max_steps=500, seed=0):
    """Run a uniformly random policy and record the total reward of each episode.

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        n_episodes: Number of episodes to play.
        max_steps: Upper bound on steps per episode (the environment may end it sooner).
        seed: Base seed. Episode ``ep`` resets the environment with ``seed + ep`` and
            the action sampler is seeded once with ``seed``, so repeated calls with
            the same arguments give the same rewards.

    Returns:
        A list with one summed reward per episode.
    """
    env = gym.make(env_name)
    episode_rewards = []

    try:
        # env.reset(seed=...) only seeds the environment; action_space.sample()
        # draws from the space's own RNG, which must be seeded separately for
        # the sampled actions to be reproducible.
        env.action_space.seed(seed)

        for ep in range(n_episodes):
            obs, info = env.reset(seed=seed + ep)
            total_reward = 0

            for _ in range(max_steps):
                action = env.action_space.sample()  # random action
                obs, reward, terminated, truncated, info = env.step(action)
                total_reward += reward

                if terminated or truncated:
                    break

            episode_rewards.append(total_reward)
    finally:
        # Release the environment even when a step raises.
        env.close()

    return episode_rewards

# Test random agent on CartPole
random_rewards_cartpole = run_random_agent('CartPole-v1', n_episodes=100)

# Summary statistics, printed with labels padded to a common column.
random_stats = (
    ("Mean reward:", np.mean),
    ("Std reward:", np.std),
    ("Max reward:", np.max),
    ("Min reward:", np.min),
)

print("Random Agent on CartPole-v1:")
for stat_label, stat_fn in random_stats:
    print(f"  {stat_label:<15}{stat_fn(random_rewards_cartpole):.1f}")
print(f"  {'Max possible:':<15}500")
print()
print("The random agent averages about 20 steps -- terrible!")
print("The pole falls almost immediately with no intelligent control.")

### Component 2: A Heuristic Agent

In [None]:
def heuristic_cartpole(obs):
    """
    Hand-coded CartPole controller: push the cart toward the side the pole leans.

    The lean is estimated as the pole angle plus 0.1 times the pole's angular
    velocity, so a pole that is rotating quickly counts as tilting even when
    its current angle is small.

    Returns 1 (push right) when that estimate is positive, otherwise 0 (push left).
    """
    _, _, pole_angle, pole_angular_vel = obs

    projected_tilt = pole_angle + 0.1 * pole_angular_vel
    return 1 if projected_tilt > 0 else 0

def run_heuristic_agent(env_name, heuristic_fn, n_episodes=20, max_steps=500):
    """Run a rule-based policy and record the total reward of each episode.

    Args:
        env_name: Gymnasium environment id passed to ``gym.make``.
        heuristic_fn: Callable mapping the current observation to an action.
        n_episodes: Number of episodes to play; episode ``ep`` is reset with ``seed=ep``.
        max_steps: Upper bound on steps per episode (the environment may end it sooner).

    Returns:
        A list with one summed reward per episode.

    Any exception raised by ``heuristic_fn`` or the environment propagates to the
    caller after the environment has been closed.
    """
    env = gym.make(env_name)
    episode_rewards = []

    try:
        for ep in range(n_episodes):
            obs, info = env.reset(seed=ep)
            total_reward = 0

            for _ in range(max_steps):
                action = heuristic_fn(obs)
                obs, reward, terminated, truncated, info = env.step(action)
                total_reward += reward

                if terminated or truncated:
                    break

            episode_rewards.append(total_reward)
    finally:
        # heuristic_fn is user-supplied and may raise; never leak the environment.
        env.close()

    return episode_rewards

heuristic_rewards = run_heuristic_agent('CartPole-v1', heuristic_cartpole, n_episodes=100)

# Summary statistics, printed with labels padded to a common column.
heuristic_stats = (
    ("Mean reward:", np.mean),
    ("Std reward:", np.std),
    ("Max reward:", np.max),
    ("Min reward:", np.min),
)

print("Heuristic Agent on CartPole-v1:")
for stat_label, stat_fn in heuristic_stats:
    print(f"  {stat_label:<15}{stat_fn(heuristic_rewards):.1f}")
print()
print("Much better! But the heuristic was hand-designed.")
print("Can an agent LEARN this behavior on its own?")

### Component 3: Visualize the Difference

In [None]:
# Compare random vs heuristic
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, bar_ax = axes

# Left panel: overlaid histograms of per-episode reward, with the 500-step cap marked.
hist_ax.hist(random_rewards_cartpole, bins=20, alpha=0.7, label='Random', color='red')
hist_ax.hist(heuristic_rewards, bins=20, alpha=0.7, label='Heuristic', color='green')
hist_ax.axvline(x=500, color='gold', linestyle='--', label='Max possible', linewidth=2)
hist_ax.set_xlabel('Episode Reward (steps survived)', fontsize=12)
hist_ax.set_ylabel('Count', fontsize=12)
hist_ax.set_title('Distribution of Episode Rewards', fontsize=14, fontweight='bold')
# Build the legend once, after every labelled artist (including the cap line) exists.
hist_ax.legend(fontsize=11)

# Right panel: mean reward per agent, with one standard deviation as the error bar.
means = [np.mean(random_rewards_cartpole), np.mean(heuristic_rewards)]
stds = [np.std(random_rewards_cartpole), np.std(heuristic_rewards)]
bars = bar_ax.bar(['Random Agent', 'Heuristic Agent'], means, yerr=stds, capsize=5,
                  color=['red', 'green'], alpha=0.7)
bar_ax.set_ylabel('Mean Episode Reward', fontsize=12)
bar_ax.set_title('Random vs Heuristic Agent', fontsize=14, fontweight='bold')
bar_ax.axhline(y=500, color='gold', linestyle='--', label='Max possible')
bar_ax.legend(fontsize=11)

# Write each mean just above its bar.
for bar, mean in zip(bars, means):
    bar_ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                f'{mean:.0f}', ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

print("The heuristic agent is dramatically better, but it required human insight.")
print("Next: let us build an agent that LEARNS the optimal behavior from scratch.")

## 5. Your Turn

### Exercise 1: Explore the Environment

Run a single CartPole episode step-by-step and observe how the observation changes with each action.

In [None]:
# TODO: Run a single episode of CartPole and print each step

env = gym.make('CartPole-v1')
obs, info = env.reset(seed=42)

print("Step-by-step CartPole episode:")
print(f"{'Step':>4} | {'Cart Pos':>9} | {'Cart Vel':>9} | {'Pole Angle':>11} | {'Angular Vel':>11} | {'Action':>6} | {'Reward':>6}")
print("-" * 80)

for step in range(20):
    # TODO: Choose an action (try different strategies)
    # Option A: random
    # Option B: always push right (action=1)
    # Option C: push opposite to pole tilt
    action = env.action_space.sample()  # MODIFY THIS

    # Apply the action, then log the observation it was chosen from together
    # with the reward that the action earned.
    next_obs, reward, terminated, truncated, info = env.step(action)

    cart_pos, cart_vel, pole_angle, pole_ang_vel = obs[0], obs[1], obs[2], obs[3]
    action_name = 'RIGHT' if action != 0 else 'LEFT'
    print(f"{step:>4} | {cart_pos:>9.4f} | {cart_vel:>9.4f} | {pole_angle:>11.6f} | {pole_ang_vel:>11.6f} | {action_name:>6} | {reward:>6.1f}")

    obs = next_obs
    episode_over = terminated or truncated
    if episode_over:
        print(f"\nEpisode ended at step {step+1}")
        break

env.close()

### Exercise 2: Improve the Heuristic

The heuristic above only considers the pole angle and angular velocity. Modify it to also consider cart position (keep the cart near the center).

In [None]:
# TODO: Implement an improved heuristic that also considers cart position

def improved_heuristic(obs):
    """Exercise stub: choose a CartPole action from the full observation.

    Args:
        obs: Four-value observation, unpacked as
            (cart_pos, cart_vel, pole_angle, pole_angular_vel).

    Returns:
        The action index. Until the TODO below is completed, this placeholder
        always returns 1 regardless of the observation.
    """
    cart_pos, cart_vel, pole_angle, pole_angular_vel = obs

    # TODO: Combine pole angle, angular velocity, AND cart position
    # into a better decision rule.
    # Hint: if the cart is too far right, you might want to push left
    # even if the pole is tilting right.

    # YOUR CODE HERE
    return 1  # REPLACE THIS

# Test your improved heuristic
improved_rewards = run_heuristic_agent('CartPole-v1', improved_heuristic, n_episodes=100)

# Print the new heuristic next to the original one for a side-by-side comparison.
for variant, variant_rewards in (("Improved", improved_rewards), ("Original", heuristic_rewards)):
    print(f"{variant} Heuristic: mean={np.mean(variant_rewards):.1f}, max={np.max(variant_rewards):.1f}")

## 6. Putting It All Together

Now let us build a **Q-learning agent** that learns entirely from interaction with the environment. No hand-coded rules -- pure reinforcement learning.

In [None]:
class QLearningAgent:
    """
    Tabular Q-learning agent for CartPole-style observations.

    Each continuous observation is mapped to a tuple of bin indices, and that
    tuple keys a table of per-action value estimates. Estimates are learned
    with the one-step Q-learning rule:

        Q(s, a) <- Q(s, a) + alpha * [r + gamma * max_a' Q(s', a') - Q(s, a)]

    Actions are chosen epsilon-greedily; ``decay_epsilon`` shrinks epsilon
    multiplicatively down to ``epsilon_min``.
    """

    def __init__(self, n_actions, n_bins=10, alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        # Problem size and discretization resolution.
        self.n_actions = n_actions
        self.n_bins = n_bins

        # Learning rate and discount factor.
        self.alpha = alpha
        self.gamma = gamma

        # Exploration schedule.
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Q-table: discretized state -> array of action values; unseen states start at zero.
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

        # (low, high) span of the bin edges for each observation component, in
        # CartPole order: cart position, cart velocity, pole angle, angular velocity.
        edge_ranges = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.0, 3.0)]
        self.bins = [np.linspace(low, high, n_bins) for low, high in edge_ranges]

    def discretize(self, obs):
        """Map a continuous observation to a tuple of bin indices, one per component."""
        return tuple(np.digitize(value, self.bins[i]) for i, value in enumerate(obs))

    def choose_action(self, state):
        """Pick a uniformly random action with probability epsilon, else the greedy one."""
        explore = np.random.random() < self.epsilon
        if not explore:
            return np.argmax(self.q_table[state])
        return np.random.randint(self.n_actions)

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning step; when ``done`` is true the target is the reward alone."""
        target = reward if done else reward + self.gamma * np.max(self.q_table[next_state])

        action_values = self.q_table[state]
        action_values[action] += self.alpha * (target - action_values[action])

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, never letting it drop below epsilon_min."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.epsilon_min, decayed)

# Confirmation banner restating the update rule implemented by QLearningAgent.update.
q_agent_banner = (
    "Q-Learning Agent defined.",
    "",
    "Key formula:",
    "  Q(s,a) <- Q(s,a) + alpha * [r + gamma * max Q(s',a') - Q(s,a)]",
    "",
    "This is the fundamental update rule of Q-learning.",
    "The agent learns the value of each (state, action) pair from experience.",
)
print("\n".join(q_agent_banner))

In [None]:
def train_q_agent(n_episodes=5000, max_steps=500):
    """Train a tabular Q-learning agent on CartPole-v1.

    Args:
        n_episodes: Number of training episodes; episode ``ep`` is reset with
            ``seed=ep % 100``.
        max_steps: Upper bound on steps per episode.

    Returns:
        ``(agent, episode_rewards)``: the trained ``QLearningAgent`` and a list
        with the number of steps survived in each episode.
    """
    env = gym.make('CartPole-v1')

    try:
        agent = QLearningAgent(
            n_actions=env.action_space.n,
            n_bins=12,
            alpha=0.1,
            gamma=0.99,
            epsilon=1.0,
            epsilon_decay=0.998,
            epsilon_min=0.01,
        )

        episode_rewards = []
        best_reward = 0

        for ep in range(n_episodes):
            obs, info = env.reset(seed=ep % 100)
            state = agent.discretize(obs)
            total_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_obs, reward, terminated, truncated, info = env.step(action)
                next_state = agent.discretize(next_obs)

                # Penalize falling (but not a termination on the very last allowed step).
                if terminated and step < max_steps - 1:
                    reward = -10

                # Only a true termination is terminal for the value update. A
                # time-limit truncation ends the rollout, yet the state reached
                # still has future value, so the target must keep bootstrapping
                # from it. Passing `terminated or truncated` would teach the
                # agent that surviving to the limit is worth nothing further.
                agent.update(state, action, reward, next_state, terminated)
                state = next_state
                total_reward += 1  # count steps survived

                if terminated or truncated:
                    break

            episode_rewards.append(total_reward)
            agent.decay_epsilon()

            best_reward = max(best_reward, total_reward)

            # Progress report
            if (ep + 1) % 500 == 0:
                recent = np.mean(episode_rewards[-100:])
                print(f"  Episode {ep+1:>5} | Recent avg: {recent:>6.1f} | Epsilon: {agent.epsilon:.3f} | Best: {best_reward}")
    finally:
        # Release the environment even if training is interrupted or raises.
        env.close()

    return agent, episode_rewards

# Announce the run, then train; the agent and its per-episode rewards are used by later cells.
print(
    "Training Q-learning agent on CartPole-v1...",
    "This learns from scratch -- no human knowledge built in.\n",
    sep="\n",
)
q_agent, q_rewards = train_q_agent(5000)

## 7. Training and Results

In [None]:
# Comprehensive results visualization
# 2x2 figure: learning curve, reward histograms, mean-reward bars, and a Q-table size curve.
# Reads q_rewards / q_agent from the training cell and the random / heuristic
# reward lists from the earlier baseline cells.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Learning curve
# Trailing mean: entry i averages episodes max(0, i-window) .. i inclusive,
# i.e. up to window + 1 episodes once enough history exists.
window = 50
smoothed = [np.mean(q_rewards[max(0,i-window):i+1]) for i in range(len(q_rewards))]
axes[0, 0].plot(smoothed, color='blue', alpha=0.8)
axes[0, 0].axhline(y=500, color='gold', linestyle='--', label='Max possible (500)')
axes[0, 0].axhline(y=np.mean(random_rewards_cartpole), color='red', linestyle='--', label=f'Random ({np.mean(random_rewards_cartpole):.0f})')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Reward (smoothed)')
axes[0, 0].set_title('Q-Learning: Training Progress', fontsize=13, fontweight='bold')
axes[0, 0].legend()

# Plot 2: Histogram comparison (all three agents)
# density=True draws normalized histograms; the Q-learning sample is the last
# 100 training episodes, which still include epsilon-greedy exploration.
axes[0, 1].hist(random_rewards_cartpole, bins=20, alpha=0.5, label='Random', color='red', density=True)
axes[0, 1].hist(heuristic_rewards, bins=20, alpha=0.5, label='Heuristic', color='green', density=True)
axes[0, 1].hist(q_rewards[-100:], bins=20, alpha=0.5, label='Q-Learning (last 100)', color='blue', density=True)
axes[0, 1].set_xlabel('Episode Reward')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Reward Distribution Comparison', fontsize=13, fontweight='bold')
axes[0, 1].legend()

# Plot 3: Bar chart of final performance
# NOTE: `means` is reassigned here; it previously held the two-agent means
# from the random-vs-heuristic comparison cell.
final_q_mean = np.mean(q_rewards[-100:])
means = [np.mean(random_rewards_cartpole), np.mean(heuristic_rewards), final_q_mean]
labels = ['Random', 'Heuristic', 'Q-Learning']
colors = ['red', 'green', 'blue']
bars = axes[1, 0].bar(labels, means, color=colors, alpha=0.7)
axes[1, 0].set_ylabel('Mean Reward')
axes[1, 0].set_title('Final Performance Comparison', fontsize=13, fontweight='bold')
axes[1, 0].axhline(y=500, color='gold', linestyle='--')
# Write each mean just above its bar.
for bar, mean in zip(bars, means):
    axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                     f'{mean:.0f}', ha='center', fontsize=12, fontweight='bold')

# Plot 4: Q-table size over training
# NOTE(review): this curve is NOT measured. The table size was not recorded
# during training, so each point is min(2 * episode, final table size): a
# straight ramp clipped at the final size. Record len(q_table) per episode
# during training if a real exploration curve is wanted.
q_sizes = []
check_points = list(range(0, len(q_rewards), 100))
for i in check_points:
    q_sizes.append(min(i * 2, len(q_agent.q_table)))  # approximate
axes[1, 1].plot(check_points, q_sizes, 'purple', linewidth=2)
axes[1, 1].set_xlabel('Episode')
axes[1, 1].set_ylabel('Approx. States Explored')
axes[1, 1].set_title('State Space Exploration', fontsize=13, fontweight='bold')

plt.tight_layout()
plt.show()

# Only the Q-learning figure below is a last-100-episodes average; the random
# and heuristic numbers are means over their complete runs.
print(f"\nFinal Results (last 100 episodes):")
print(f"  Random Agent:    {np.mean(random_rewards_cartpole):>6.1f} avg reward")
print(f"  Heuristic Agent: {np.mean(heuristic_rewards):>6.1f} avg reward")
print(f"  Q-Learning Agent: {final_q_mean:>6.1f} avg reward")
print(f"\n  Q-table entries: {len(q_agent.q_table)}")

In [None]:
# Evaluate the trained Q-learning agent (greedy, no exploration)
def evaluate_agent(agent, env_name='CartPole-v1', n_episodes=50, max_steps=500):
    """Evaluate a trained agent with a purely greedy policy (no exploration).

    The agent is treated as read-only: states missing from ``agent.q_table`` are
    looked up without being inserted, so evaluation does not grow the table.

    Args:
        agent: Object exposing ``discretize(obs)`` and a dict-like ``q_table``
            mapping discretized states to arrays of action values.
        env_name: Gymnasium environment id passed to ``gym.make``.
        n_episodes: Number of evaluation episodes; episode ``ep`` is reset with
            ``seed=ep + 1000``.
        max_steps: Upper bound on steps per episode.

    Returns:
        A list with the number of steps survived in each episode.
    """
    env = gym.make(env_name)
    rewards = []

    try:
        for ep in range(n_episodes):
            obs, info = env.reset(seed=ep + 1000)
            total_reward = 0

            for _ in range(max_steps):
                state = agent.discretize(obs)
                # .get() bypasses the defaultdict factory, so unseen states are
                # not added to the table. An unseen state falls back to action 0,
                # which is what argmax over a fresh all-zero row would pick.
                q_values = agent.q_table.get(state)
                action = 0 if q_values is None else np.argmax(q_values)  # greedy

                obs, reward, terminated, truncated, info = env.step(action)
                total_reward += 1

                if terminated or truncated:
                    break

            rewards.append(total_reward)
    finally:
        # Release the environment even when a step raises.
        env.close()

    return rewards

eval_rewards = evaluate_agent(q_agent)

# The 475 average is the bar this notebook uses to call CartPole-v1 solved.
eval_mean = np.mean(eval_rewards)
solved_verdict = 'YES' if eval_mean >= 475 else 'Not yet -- try more training'

print("Evaluation (50 episodes, greedy policy):")
print(f"  Mean: {eval_mean:.1f}")
print(f"  Std:  {np.std(eval_rewards):.1f}")
print(f"  Min:  {np.min(eval_rewards)}")
print(f"  Max:  {np.max(eval_rewards)}")
print(f"  Solved (>= 475 avg): {solved_verdict}")

## 8. Final Output

In [None]:
# Final interactive demo: step through one episode showing the learned Q-values

HEAD_ROWS = 15  # rows shown from the start of the episode
TAIL_ROWS = 5   # rows shown from the end of the episode

env = gym.make('CartPole-v1')
step_rows = []
episode_ended = False

# Play the whole episode first and collect one formatted row per step. The rows
# are trimmed afterwards, because the last TAIL_ROWS steps are only known once
# the episode is over.
try:
    obs, info = env.reset(seed=99)
    for step in range(500):
        state = q_agent.discretize(obs)
        # .get() avoids inserting an all-zero row into the Q-table for unseen states.
        q_values = q_agent.q_table.get(state)
        if q_values is None:
            q_values = np.zeros(q_agent.n_actions)
        action = np.argmax(q_values)
        action_name = 'LEFT' if action == 0 else 'RIGHT'

        step_rows.append(
            f"{step:>4} | {obs[2]:>11.6f} | {action_name:>7} | {q_values[0]:>8.2f} | {q_values[1]:>9.2f} | {'yes':>5}"
        )

        obs, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            episode_ended = True
            break
finally:
    env.close()

steps_survived = len(step_rows)

print("Watching the trained Q-learning agent play one episode:")
print("=" * 80)
print(f"{'Step':>4} | {'Pole Angle':>11} | {'Action':>7} | {'Q(left)':>8} | {'Q(right)':>9} | {'Alive':>5}")
print("-" * 80)

# Only show the first HEAD_ROWS and last TAIL_ROWS steps to avoid clutter.
if steps_survived > HEAD_ROWS + TAIL_ROWS:
    shown_rows = (
        step_rows[:HEAD_ROWS]
        + ["  ... (skipping middle steps for brevity) ..."]
        + step_rows[-TAIL_ROWS:]
    )
else:
    shown_rows = step_rows
for row in shown_rows:
    print(row)

if episode_ended:
    print(f"\nEpisode ended at step {steps_survived}")

print(f"\nThe agent survived {steps_survived} steps by choosing actions based on learned Q-values.")
print("Notice how Q(left) and Q(right) change based on the pole angle --")
print("the agent learned which action is better in each state, purely from experience.")

In [None]:
# Closing recap of the notebook, printed as a single block of text.
summary_rule = "=" * 60
summary_lines = (
    summary_rule,
    "SUMMARY: Your First RL Agent with Gymnasium",
    summary_rule,
    "",
    "What we built:",
    "  - Explored the Gymnasium API (make, reset, step)",
    "  - Tested a random agent (terrible: ~20 steps)",
    "  - Built a heuristic agent (good but hand-designed)",
    "  - Trained a Q-learning agent from scratch",
    "  - Compared all three approaches quantitatively",
    "",
    "Key takeaway:",
    "  The Q-learning agent learned to balance the pole",
    "  WITHOUT any human knowledge of physics or control theory.",
    "  It discovered the right behavior purely through trial and error.",
    "",
    "The Q-learning update rule:",
    "  Q(s,a) <- Q(s,a) + alpha * [r + gamma * max Q(s',a') - Q(s,a)]",
    "",
    "This is the foundation of modern RL. Deep Q-Networks (DQN)",
    "replace the Q-table with a neural network to handle",
    "high-dimensional state spaces like Atari game pixels.",
)
print("\n".join(summary_lines))

## 9. Reflection and Next Steps

**Questions to think about:**
1. What happens if you increase n_bins (finer discretization)? Does the agent learn faster or slower? Why?
2. Why did we penalize the agent with -10 when it fails, instead of keeping the environment's default +1 reward for that step? What happens without this penalty?
3. The Q-learning agent uses a table with finite entries. How would you handle an environment with continuous states and continuous actions?

**What comes next:**
You now have the complete foundation of RL: the problem definition, the mathematical framework (MDPs), and a working learning algorithm (Q-learning). In the next pod, we will go deeper into value functions, the Bellman equation, and build towards Deep Q-Networks (DQN) that can play Atari games directly from pixels.