In [None]:
# 🔧 Setup: run this cell first!
# Selects the compute device, reports library versions, and seeds every RNG.
# Nothing is installed here; torch and numpy must already be importable.

import random
import sys

import numpy as np
import torch

SEED = 42

# Choose the device once, then report what was found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Seed Python, NumPy and PyTorch (CPU always, GPUs only when one is in use)
# so that the stochastic cells below are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# MDPs, Rewards, and the Markov Property -- Vizuara

## 1. Why Does This Matter?

In the previous notebook, we built a Tic-Tac-Toe agent that learns from experience. But we did not have a formal mathematical framework to describe the environment. We just hard-coded the rules.

In real RL problems -- controlling robots, optimizing chemical reactors, playing Atari -- we need a precise language for describing states, actions, transitions, and rewards. That language is the **Markov Decision Process** (MDP).

By the end of this notebook, you will:
- Implement a complete MDP from scratch (the recycling robot)
- Compute returns with and without discounting
- Verify the Markov property computationally
- Solve a small MDP to find optimal behavior

## 2. Building Intuition

Think about a recycling robot. At each moment, it has a battery level (High or Low) and must choose between searching for cans, waiting, or recharging. Each choice has consequences: searching finds more cans but drains the battery. If the battery dies, the robot must be rescued (a large penalty).

This is a classic MDP: the next state depends only on the current state and action, not on how you got there. The battery does not "remember" what the robot did three steps ago -- it only knows its current level.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# Visualize the recycling robot decision problem.
# The diagram is hand-placed on a hidden-axis canvas: states are circles or
# text boxes, and transitions are arrows with text labels next to them.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.set_xlim(-1, 11)
ax.set_ylim(-1, 7)
ax.axis('off')
ax.set_title('The Recycling Robot MDP', fontsize=16, fontweight='bold')

# Draw the two battery states as unfilled circles with centred labels
circle_high = plt.Circle((3, 4), 1.2, fill=False, linewidth=2, color='green')
circle_low = plt.Circle((8, 4), 1.2, fill=False, linewidth=2, color='orange')
ax.add_patch(circle_high)
ax.add_patch(circle_low)
ax.text(3, 4, 'HIGH', ha='center', va='center', fontsize=14, fontweight='bold', color='green')
ax.text(8, 4, 'LOW', ha='center', va='center', fontsize=14, fontweight='bold', color='orange')

# Boxed labels above and below HIGH for its two actions
# (annotate() is given no xytext/arrowprops here, so only the text box is drawn)
ax.annotate('search (alpha)\nreward = r_search', xy=(3, 5.5), fontsize=9, ha='center',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow'))
ax.annotate('wait\nreward = r_wait', xy=(3, 2.2), fontsize=9, ha='center',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='lightcyan'))

# Arrow from High to Low: empty annotation text, arrow runs from xytext to xy
ax.annotate('', xy=(6.8, 4.3), xytext=(4.2, 4.3),
            arrowprops=dict(arrowstyle='->', color='red', lw=1.5))
ax.text(5.5, 4.8, 'search\n(1-alpha)', fontsize=8, ha='center', color='red')

# Arrow from Low to High (recharge)
ax.annotate('', xy=(4.2, 3.7), xytext=(6.8, 3.7),
            arrowprops=dict(arrowstyle='->', color='blue', lw=1.5))
ax.text(5.5, 3.0, 'recharge\nreward = 0', fontsize=8, ha='center', color='blue')

# Dead state: drawn as a boxed text label below LOW, with an arrow down from LOW
ax.text(8, 1, 'DEAD\n(rescue: -3)', ha='center', fontsize=10, color='red',
        bbox=dict(boxstyle='round,pad=0.3', facecolor='mistyrose'))
ax.annotate('', xy=(8, 1.6), xytext=(8, 2.8),
            arrowprops=dict(arrowstyle='->', color='red', lw=1.5))
ax.text(8.8, 2.2, 'search\n(1-beta)', fontsize=8, color='red')

plt.tight_layout()
plt.show()

print("The robot must balance short-term reward (searching) against long-term survival (recharging).")

## 3. The Mathematics

### The Agent-Environment Interface

At each time step $t$:
1. Agent observes state $s_t$
2. Agent takes action $a_t$
3. Environment returns reward $r_{t+1}$ and next state $s_{t+1}$

### Formal MDP Definition

An MDP is defined by the tuple:

$$\text{MDP} = (S, A, P, R, \gamma)$$

where:
- $S$ = set of states
- $A$ = set of actions
- $P(s' | s, a)$ = transition probability
- $R(s, a)$ = reward function
- $\gamma$ = discount factor

### Returns and Discounting

**Episodic return** (no discounting):
$$G_t = r_{t+1} + r_{t+2} + r_{t+3} + \cdots + r_T$$

**Discounted return**:
$$G_t = r_{t+1} + \gamma \, r_{t+2} + \gamma^2 \, r_{t+3} + \cdots = \sum_{k=0}^{\infty} \gamma^k \, r_{t+k+1}$$

### The Markov Property

A state is Markov if:
$$P(s_{t+1} | s_t, a_t) = P(s_{t+1} | s_0, a_0, \ldots, s_t, a_t)$$

The future depends only on the present, not the past.

In [None]:
# Check the return formulas numerically.

# 1. Score one fixed reward sequence under several discount factors
rewards = [1, 2, 3, 4, 5]
gammas = [0.0, 0.5, 0.9, 0.99, 1.0]

print("Reward sequence:", rewards)
print()
print(f"{'gamma':>6} | {'G_0':>8} | Interpretation")
print("-" * 55)

for gamma in gammas:
    # G_0 = sum over k of gamma^k * (k-th reward in the list)
    G = sum(gamma**k * r for k, r in enumerate(rewards))

    # The two endpoints get a fixed description; everything else is generic.
    if gamma == 1.0:
        interp = "All rewards equally weighted"
    elif gamma == 0.0:
        interp = "Only immediate reward matters"
    else:
        interp = f"Future rewards discounted by {gamma} per step"

    print(f"{gamma:>6.2f} | {G:>8.3f} | {interp}")

print("\nKey insight: gamma controls the agent's 'patience' for future rewards.")

In [None]:
# 2. Visualize how discounting affects the weight of future rewards.
# One curve per discount factor: the weight gamma^k given to a reward k steps ahead.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

timesteps = np.arange(20)  # k = 0 .. 19
for gamma in [0.5, 0.8, 0.9, 0.95, 0.99]:
    weights = gamma ** timesteps  # elementwise gamma^k over the timesteps array
    ax.plot(timesteps, weights, 'o-', markersize=4, label=f'gamma={gamma}')

ax.set_xlabel('Time steps into the future', fontsize=12)
ax.set_ylabel('Weight (gamma^k)', fontsize=12)
ax.set_title('How Discount Factor Affects Future Reward Weights', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()

# The figures quoted below are fixed text, not computed from the curves:
# 0.5**10 is about 0.001 and 0.99**10 is about 0.90.
print("With gamma=0.5, rewards 10 steps away are worth only 0.001 of immediate reward.")
print("With gamma=0.99, rewards 10 steps away are still worth 0.90 of immediate reward.")

## 4. Let's Build It -- Component by Component

### Component 1: Building an MDP Class

In [None]:
class MDP:
    """A finite Markov Decision Process: states, actions, transitions, rewards.

    Every (state, action) pair has a default immediate reward. An optional
    (state, action, next_state) entry in the same rewards dict overrides that
    default for one specific outcome, so a penalty that applies only when a
    transition ends in a particular state can live in the model itself.
    """

    def __init__(self, states, actions, transitions, rewards, gamma=0.9):
        """
        Args:
            states: list of state names
            actions: dict mapping state -> list of available actions; a state
                with no entry has no actions (get_actions returns [])
            transitions: dict mapping (state, action) -> list of (next_state, probability)
            rewards: dict mapping (state, action) -> immediate reward. It may
                additionally contain (state, action, next_state) keys, which
                take precedence over the (state, action) entry for that outcome.
            gamma: discount factor
        """
        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma

    def step(self, state, action):
        """Sample one transition and return (next_state, reward).

        Raises:
            KeyError: if (state, action) has no transition entry or no
                default reward entry.
        """
        outcomes = self.transitions[(state, action)]
        next_states = [s for s, _ in outcomes]
        probs = [p for _, p in outcomes]

        # Exactly one random draw per step, weighted by the transition probabilities
        idx = np.random.choice(len(outcomes), p=probs)
        next_state = next_states[idx]

        # Outcome-specific reward wins; otherwise fall back to the per-action default
        default_reward = self.rewards[(state, action)]
        reward = self.rewards.get((state, action, next_state), default_reward)

        return next_state, reward

    def get_actions(self, state):
        """Get available actions in a state ([] when the state has none)."""
        return self.actions.get(state, [])

    def print_model(self):
        """Display the full MDP model, one block per state that has actions."""
        print("MDP Model")
        print("=" * 50)
        print(f"States: {self.states}")
        print(f"Gamma: {self.gamma}")
        print()
        for state in self.states:
            if state not in self.actions:
                continue  # states without actions have nothing to list
            print(f"State: {state}")
            for action in self.actions[state]:
                reward = self.rewards[(state, action)]
                print(f"  Action: {action} (reward = {reward})")
                for next_state, prob in self.transitions[(state, action)]:
                    line = f"    -> {next_state} with probability {prob:.2f}"
                    outcome_key = (state, action, next_state)
                    if outcome_key in self.rewards:
                        # Only shown when the model defines an outcome-specific reward
                        line += f" (reward = {self.rewards[outcome_key]})"
                    print(line)
            print()

print("MDP class defined. Let us create the recycling robot MDP.")

### Component 2: The Recycling Robot MDP

In [None]:
# Define the recycling robot MDP
alpha = 0.7  # P(High -> High | search): prob of staying high when searching from high
beta = 0.4   # P(Low -> Low | search): prob of staying low when searching from low
r_search = 2  # immediate reward for the 'search' action
r_wait = 1    # immediate reward for the 'wait' action

recycling_robot = MDP(
    states=['High', 'Low', 'Dead'],
    # 'Dead' has no entry here, so it has no available actions
    actions={
        'High': ['search', 'wait'],
        'Low': ['search', 'wait', 'recharge'],
    },
    # Each (state, action) maps to its possible outcomes and their probabilities
    transitions={
        ('High', 'search'):   [('High', alpha), ('Low', 1-alpha)],
        ('High', 'wait'):     [('High', 1.0)],
        ('Low', 'search'):    [('Low', beta), ('Dead', 1-beta)],
        ('Low', 'wait'):      [('Low', 1.0)],
        ('Low', 'recharge'):  [('High', 1.0)],
    },
    rewards={
        ('High', 'search'):   r_search,
        ('High', 'wait'):     r_wait,
        # Reward stored for the action itself. The -3 rescue penalty for a
        # search that ends in 'Dead' is not stored here; the simulation code
        # that calls step() substitutes it.
        ('Low', 'search'):    r_search,
        ('Low', 'wait'):      r_wait,
        ('Low', 'recharge'):  0,
    },
    gamma=0.9
)

recycling_robot.print_model()

### Component 3: Simulating Episodes

In [None]:
def simulate_episode(mdp, policy, start_state, max_steps=20, rescue_penalty=-3):
    """
    Run one episode following a deterministic policy.

    Args:
        mdp: model exposing `actions` (dict of state -> actions), `gamma`, and
            `step(state, action) -> (next_state, reward)`
        policy: dict mapping state -> action
        start_state: initial state
        max_steps: maximum steps before truncation
        rescue_penalty: reward recorded when a 'search' from 'Low' ends in
            'Dead', replacing the reward returned by mdp.step. Pass None to
            keep the reward from mdp.step unchanged.

    Returns:
        (trajectory, discounted_return) where trajectory is a list of
        (t, state, action, reward, next_state) tuples and discounted_return
        is the sum of gamma**t * reward over the steps taken.

    Raises:
        KeyError: if the policy has no action for a visited state.
    """
    state = start_state
    trajectory = []
    discounted_return = 0

    for t in range(max_steps):
        # Stop at the terminal state or at any state that has no actions
        if state == 'Dead' or state not in mdp.actions:
            break

        action = policy[state]
        next_state, reward = mdp.step(state, action)

        # Special case: the battery died during a search from Low
        died_searching = state == 'Low' and action == 'search' and next_state == 'Dead'
        if died_searching and rescue_penalty is not None:
            reward = rescue_penalty

        trajectory.append((t, state, action, reward, next_state))
        discounted_return += (mdp.gamma ** t) * reward
        state = next_state

    return trajectory, discounted_return

# Define two different policies (each maps every non-terminal state to one action)
policy_aggressive = {'High': 'search', 'Low': 'search'}   # always search
policy_cautious = {'High': 'search', 'Low': 'recharge'}    # recharge when low

# Simulate both, one episode each, starting from 'High'.
# NumPy is re-seeded with the same value before each run so both episodes
# start from the same random stream.
np.random.seed(42)
print("AGGRESSIVE POLICY (always search):")
print("-" * 60)
traj_agg, reward_agg = simulate_episode(recycling_robot, policy_aggressive, 'High')
# Each trajectory entry is (t, state, action, reward, next_state)
for t, s, a, r, ns in traj_agg:
    print(f"  t={t}: state={s:5s} action={a:10s} reward={r:+.0f} -> {ns}")
print(f"  Discounted return: {reward_agg:.2f}")

print()

np.random.seed(42)
print("CAUTIOUS POLICY (recharge when low):")
print("-" * 60)
traj_cau, reward_cau = simulate_episode(recycling_robot, policy_cautious, 'High')
for t, s, a, r, ns in traj_cau:
    print(f"  t={t}: state={s:5s} action={a:10s} reward={r:+.0f} -> {ns}")
print(f"  Discounted return: {reward_cau:.2f}")

## 5. Your Turn

### Exercise 1: Verify the Markov Property

Run the cell below and fill in the TODOs to verify that the recycling robot is indeed Markov. The key test: does the transition probability from a state depend on how you arrived there?

In [None]:
# TODO: Verify the Markov property empirically

# Run many episodes and track:
# 1. P(next_state | current_state=Low, action=search) regardless of history
# 2. P(next_state | current_state=Low, action=search, previous_state=High)
# 3. These should be equal if the Markov property holds

n_trials = 10000
# Count transitions from Low with search action
count_total = {'Low': 0, 'Dead': 0}        # overall counts
count_from_high = {'Low': 0, 'Dead': 0}    # counts when previous state was High

for trial in range(n_trials):
    state = 'High'
    prev_state = None

    for step in range(50):
        if state == 'Dead' or state not in recycling_robot.actions:
            break

        # Always search (to collect more data)
        action = 'search' if state in ['High', 'Low'] else 'recharge'

        # step() already samples next_state from the model's transition
        # probabilities. Do not resample the outcome here: a second draw on
        # top of a 'Dead' result would lower the observed P(Dead | Low, search)
        # to (1-beta)**2 instead of 1-beta.
        next_state, _ = recycling_robot.step(state, action)

        # TODO: If current state is Low and action is search,
        # record the transition in count_total
        # Also, if prev_state was High, record in count_from_high

        # YOUR CODE HERE
        pass

        prev_state = state
        state = next_state

# TODO: Compute and compare the probabilities
# P(Dead | Low, search) overall vs P(Dead | Low, search, came from High)
# They should be approximately equal (both close to 1-beta = 0.6)

total_sum = count_total['Low'] + count_total['Dead']
from_high_sum = count_from_high['Low'] + count_from_high['Dead']

if total_sum == 0:
    # Counters stay at zero until the TODO inside the loop is filled in
    print("No Low/search transitions recorded yet -- complete the TODO inside the loop.")
if total_sum > 0:
    print(f"P(Dead | Low, search) overall:         {count_total['Dead']/total_sum:.3f}")
if from_high_sum > 0:
    print(f"P(Dead | Low, search, came from High): {count_from_high['Dead']/from_high_sum:.3f}")
print(f"Theoretical:                            {1-beta:.3f}")
print()
print("If these are approximately equal, the Markov property holds!")

### Exercise 2: Design Your Own MDP

Create an MDP for a simple scenario: a student deciding between studying, partying, or sleeping. Define states, actions, transitions, and rewards.

In [None]:
# TODO: Define a Student MDP
# States: 'Rested', 'Tired', 'Exhausted'
# Actions: 'study' (reward but tiring), 'party' (fun but very tiring), 'sleep' (no reward but restoring)
# Transitions: studying from Rested -> 60% stay Rested, 40% become Tired, etc.
# The values below are a filled-in starting point; adjust them as part of the exercise.

student_mdp = MDP(
    states=['Rested', 'Tired', 'Exhausted'],
    actions={
        'Rested':    ['study', 'party', 'sleep'],
        'Tired':     ['study', 'party', 'sleep'],
        'Exhausted': ['sleep'],  # can only sleep when exhausted
    },
    transitions={
        # TODO: Fill in realistic transition probabilities
        # (the probabilities listed for each (state, action) must sum to 1)
        ('Rested', 'study'):     [('Rested', 0.6), ('Tired', 0.4)],
        ('Rested', 'party'):     [('Tired', 0.7), ('Exhausted', 0.3)],
        ('Rested', 'sleep'):     [('Rested', 1.0)],
        ('Tired', 'study'):      [('Tired', 0.5), ('Exhausted', 0.5)],
        ('Tired', 'party'):      [('Exhausted', 1.0)],
        ('Tired', 'sleep'):      [('Rested', 0.8), ('Tired', 0.2)],
        ('Exhausted', 'sleep'):  [('Tired', 0.7), ('Exhausted', 0.3)],
    },
    rewards={
        # TODO: Fill in rewards that capture studying > partying > sleeping
        # (here both study and party pay less when Tired than when Rested)
        ('Rested', 'study'):     3,
        ('Rested', 'party'):     2,
        ('Rested', 'sleep'):     0,
        ('Tired', 'study'):      2,
        ('Tired', 'party'):      1,
        ('Tired', 'sleep'):      0,
        ('Exhausted', 'sleep'):  0,
    },
    gamma=0.9
)

student_mdp.print_model()

# TODO: Test different policies and compare discounted returns
# Which is better: always study, or study when rested + sleep when tired?
# A policy for this MDP needs an action for every state, including 'Exhausted'.

## 6. Putting It All Together

Now let us compute the **value of each state** under a given policy. This is the expected discounted return starting from that state.

In [None]:
def estimate_state_values(mdp, policy, start_state, n_episodes=5000, max_steps=50,
                          rescue_penalty=-3):
    """Estimate V(s) for each state by averaging sampled discounted returns.

    Every visit to a state contributes one return sample (every-visit Monte
    Carlo). All episodes start in `start_state` and follow `policy`.

    Args:
        mdp: model exposing `states`, `actions`, `gamma` and
            `step(state, action) -> (next_state, reward)`
        policy: dict mapping state -> action
        start_state: state every episode starts in
        n_episodes: number of episodes to sample
        max_steps: maximum steps per episode before truncation
        rescue_penalty: reward recorded when a 'search' from 'Low' ends in
            'Dead', replacing the reward returned by mdp.step. Pass None to
            keep the reward from mdp.step unchanged.

    Returns:
        dict mapping every state in mdp.states to its estimated value.

    Notes:
        - A state that is never visited (terminal, or unreachable from
          `start_state` under `policy`) is reported as 0.0, not as missing.
        - Episodes are cut off after `max_steps`; the return recorded for a
          visit close to that cutoff omits all rewards beyond it.
    """
    returns = defaultdict(list)

    for _ in range(n_episodes):
        state = start_state
        episode = []  # (state, reward) for each step taken

        for _step in range(max_steps):
            if state == 'Dead' or state not in mdp.actions:
                break
            action = policy[state]
            next_state, reward = mdp.step(state, action)
            died_searching = state == 'Low' and action == 'search' and next_state == 'Dead'
            if died_searching and rescue_penalty is not None:
                reward = rescue_penalty
            episode.append((state, reward))
            state = next_state

        # Walk the episode backwards so each return reuses the one after it:
        # G_t = reward_t + gamma * G_{t+1}
        G = 0
        for visited_state, reward in reversed(episode):
            G = reward + mdp.gamma * G
            returns[visited_state].append(G)

    # Average the samples for each state; states without samples default to 0.0
    values = {}
    for s in mdp.states:
        samples = returns.get(s)
        values[s] = np.mean(samples) if samples else 0.0

    return values

# Compare policies
policy_aggressive = {'High': 'search', 'Low': 'search'}
policy_cautious = {'High': 'search', 'Low': 'recharge'}
policy_lazy = {'High': 'wait', 'Low': 'wait'}

policies = {
    'Aggressive (always search)': policy_aggressive,
    'Cautious (search if high, recharge if low)': policy_cautious,
    'Lazy (always wait)': policy_lazy,
}

print("State Values Under Different Policies")
print("=" * 65)

all_values = {}
for name, policy in policies.items():
    values = estimate_state_values(recycling_robot, policy, 'High', n_episodes=10000)

    # Episodes that start in 'High' never reach 'Low' under the lazy policy
    # ('wait' keeps the battery where it is), and the estimator reports 0.0
    # for states it never visits. Estimate V(Low) from episodes that start in
    # 'Low' so the number is meaningful for every policy.
    values['Low'] = estimate_state_values(
        recycling_robot, policy, 'Low', n_episodes=10000
    )['Low']

    all_values[name] = values
    print(f"\n{name}:")
    for state in ['High', 'Low']:
        print(f"  V({state}) = {values[state]:.2f}")

In [None]:
# Visualize state values comparison: grouped bars, one group per policy,
# with V(High) and V(Low) side by side.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

policy_names = list(all_values.keys())
x = np.arange(len(policy_names))  # one group position per policy
width = 0.35                      # width of each bar within a group

high_vals = [all_values[name]['High'] for name in policy_names]
low_vals = [all_values[name]['Low'] for name in policy_names]

# Offset the two bars of each group half a bar-width either side of its centre
bars1 = ax.bar(x - width/2, high_vals, width, label='V(High)', color='green', alpha=0.7)
bars2 = ax.bar(x + width/2, low_vals, width, label='V(Low)', color='orange', alpha=0.7)

ax.set_xlabel('Policy', fontsize=12)
ax.set_ylabel('State Value V(s)', fontsize=12)
ax.set_title('State Values Under Different Policies\n(Recycling Robot MDP)', fontsize=14, fontweight='bold')
ax.set_xticks(x)
# NOTE(review): these labels are hard-coded; they assume all_values holds exactly
# the aggressive, cautious and lazy policies, in that order.
ax.set_xticklabels(['Aggressive', 'Cautious', 'Lazy'], fontsize=11)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars (placed 0.3 above each bar's height value)
for bar in bars1:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
            f'{bar.get_height():.1f}', ha='center', fontsize=10)
for bar in bars2:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
            f'{bar.get_height():.1f}', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

# This sentence is fixed text; it is not derived from the values plotted above.
print("The cautious policy yields the highest state values -- it balances reward and risk.")

## 7. Training and Results

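Let us now find the optimal policy by **exhaustive search**: this MDP has only six deterministic policies (2 actions in High × 3 actions in Low), so we can enumerate every one of them and score each with Monte Carlo policy evaluation.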

In [None]:
def find_best_policy(mdp, n_iterations=20000, max_steps=50, start_state='High',
                     n_episodes=5000):
    """Find the best deterministic policy by exhaustive enumeration.

    Every combination of one action per state (for states that have actions)
    is scored by its Monte Carlo estimate of V(start_state). This is only
    practical for small MDPs, because the number of policies is the product
    of the per-state action counts.

    Args:
        mdp: model exposing `states` and `actions`, accepted by
            estimate_state_values
        n_iterations: unused; kept only so existing positional and keyword
            callers keep working. Use `n_episodes` to control the sample size.
        max_steps: maximum steps per evaluation episode before truncation
        start_state: state whose estimated value ranks the policies
        n_episodes: number of evaluation episodes per policy

    Returns:
        (best_policy, best_value, all_policies_results) where
        all_policies_results is a list of (policy, value) pairs in
        enumeration order. best_policy is None if there is nothing to
        enumerate.
    """
    # Imported here so the function does not depend on another cell's imports
    from itertools import product

    # Only states with at least an actions entry need a decision
    active_states = [s for s in mdp.states if s in mdp.actions]
    action_lists = [mdp.actions[s] for s in active_states]

    best_policy = None
    best_value = -float('inf')
    all_policies_results = []

    # One policy per element of the Cartesian product of the action lists
    for combo in product(*action_lists):
        policy = dict(zip(active_states, combo))

        # Evaluate this policy from the chosen start state
        values = estimate_state_values(
            mdp, policy, start_state, n_episodes=n_episodes, max_steps=max_steps
        )
        total_value = values.get(start_state, 0)

        all_policies_results.append((policy.copy(), total_value))

        # Strict comparison: on a tie the earlier policy is kept
        if total_value > best_value:
            best_value = total_value
            best_policy = policy.copy()

    return best_policy, best_value, all_policies_results

# Run the exhaustive search with its default settings and report the winner.
print("Searching for optimal policy (evaluating all possible policies)...")
best_policy, best_value, all_results = find_best_policy(recycling_robot)

print(f"\nOptimal Policy Found:")
for state, action in best_policy.items():
    print(f"  State: {state:5s} -> Action: {action}")
# best_value is a sampled estimate, so it varies slightly between runs
print(f"\n  V(High) under optimal policy: {best_value:.2f}")

# Show all policies ranked (sorted in place, highest estimated value first)
print("\nAll Policies Ranked:")
all_results.sort(key=lambda x: x[1], reverse=True)
for i, (policy, value) in enumerate(all_results):
    actions_str = ', '.join(f"{s}:{a}" for s, a in policy.items())
    marker = " <-- BEST" if i == 0 else ""  # first entry after sorting
    print(f"  {i+1}. V(High)={value:>7.2f}  |  {actions_str}{marker}")

## 8. Final Output

In [None]:
# Final demonstration: simulate the optimal policy for 100 steps
# (or fewer, if the robot reaches 'Dead' first).
np.random.seed(0)
state = 'High'
total_cans = 0
steps_alive = 0

state_history = []   # state occupied at the start of each step
reward_history = []  # reward received for each step

for t in range(100):
    if state == 'Dead' or state not in recycling_robot.actions:
        break

    action = best_policy[state]
    next_state, reward = recycling_robot.step(state, action)
    # Same rescue rule as the simulation helpers: dying on a search from Low costs -3
    if state == 'Low' and action == 'search' and next_state == 'Dead':
        reward = -3

    state_history.append(state)
    reward_history.append(reward)
    total_cans += max(0, reward)  # only non-negative rewards count as cans; the penalty does not
    steps_alive += 1
    state = next_state

# Plot the simulation: battery state on top, cumulative reward below, shared time axis
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

# State over time ('High' -> 1, anything else -> 0; 'Dead' is never recorded in state_history)
state_numeric = [1 if s == 'High' else 0 for s in state_history]
ax1.step(range(len(state_history)), state_numeric, 'b-', linewidth=2)
ax1.set_yticks([0, 1])
ax1.set_yticklabels(['Low', 'High'])
ax1.set_ylabel('Battery State', fontsize=12)
ax1.set_title(f'Optimal Policy Simulation ({steps_alive} steps, {total_cans:.0f} cans collected)', fontsize=14, fontweight='bold')
ax1.fill_between(range(len(state_history)), state_numeric, alpha=0.2)

# Cumulative reward (undiscounted running sum, penalties included)
cumulative = np.cumsum(reward_history)
ax2.plot(cumulative, 'g-', linewidth=2)
ax2.set_xlabel('Time step', fontsize=12)
ax2.set_ylabel('Cumulative Reward', fontsize=12)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# cumulative[-1] assumes at least one step was taken, which holds when starting from 'High'
print(f"The optimal policy kept the robot alive for {steps_alive} steps")
print(f"and collected {total_cans:.0f} cans (cumulative reward: {cumulative[-1]:.1f}).")

In [None]:
# Recap of the notebook. The equations follow the convention used in the
# agent-environment interface: the reward for acting at time t is r_{t+1}.
print("=" * 60)
print("SUMMARY: MDPs, Rewards, and the Markov Property")
print("=" * 60)
print()
print("What we built:")
print("  - A complete MDP class with states, actions, transitions, rewards")
print("  - The recycling robot MDP from Sutton & Barto")
print("  - Episodic and discounted return computation")
print("  - Monte Carlo state value estimation")
print("  - Exhaustive policy search for the optimal policy")
print()
print("Key equations:")
print("  MDP = (S, A, P, R, gamma)")
print("  G_t = r_{t+1} + gamma * r_{t+2} + gamma^2 * r_{t+3} + ...")
# P(s' | s, a) on its own is the transition model; the Markov property is the
# statement that conditioning on the full history changes nothing.
print("  P(s_{t+1} | s_t, a_t) = P(s_{t+1} | s_0, a_0, ..., s_t, a_t) -- the Markov property")
print()
print("Key insight: The optimal policy balances immediate reward")
print("against long-term consequences. In our recycling robot,")
print("this means knowing when to recharge instead of searching.")

## 9. Reflection and Next Steps

**Questions to think about:**
1. What happens to the optimal policy if we increase the rescue penalty from -3 to -10? Does the robot become more cautious?
2. How does changing gamma affect the optimal policy? Try gamma = 0.5 vs 0.99.
3. Can you think of a real-world problem where the Markov property does NOT hold? What would you do in that case?

**What comes next:**
We now have the mathematical framework. In the next notebook, we will use Gymnasium (the maintained successor to OpenAI Gym) to interact with pre-built environments, and you will see how these MDP concepts map to real simulations. We will also explore why a random policy fails miserably, motivating the need for proper learning algorithms.