In [None]:
# 🔧 Setup: Run this cell first!
# Detect GPU availability, report Python/PyTorch versions, and seed the RNGs.
# Note: nothing is installed here -- torch and numpy must already be importable.

import torch
import sys

# Check GPU: choose the torch device and print what was found
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; dividing by 1e9 prints it in GB
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

# sys.version starts with the version number; keep only that first token
print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

# Seed every RNG used here: Python's `random`, NumPy's global RNG, and torch (CPU and, if present, all GPUs)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Reinforcement Learning Foundations -- Vizuara

## 1. Why Does This Matter?

Supervised learning needs labels. Unsupervised learning needs data. But what if your agent has neither -- and must learn entirely from interacting with its environment?

This is the core idea behind **reinforcement learning** (RL). Of all the machine learning paradigms, it is the one closest to how humans and animals actually learn: try something, observe the outcome, adjust, repeat.

In this notebook, we will build your intuition for what RL is, how it differs from other ML paradigms, and what the four foundational elements of every RL system are. By the end, you will implement a complete Tic-Tac-Toe agent that learns to play by updating a value function through self-play -- no labeled data required.

## 2. Building Intuition

Think about learning to ride a bicycle. Nobody hands you a dataset of "correct pedaling patterns." You get on, wobble, fall (negative reward), adjust, and eventually learn to balance.

Now contrast this with:
- **Supervised learning**: "Here are 1,000 images labeled cat or dog. Learn the mapping."
- **Unsupervised learning**: "Here is unlabeled data. Find clusters."
- **Reinforcement learning**: "There is no dataset. Interact with the world and maximize reward."

Let us make this concrete with code. We will simulate all three paradigms side by side.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# -------------------------------------------------------
# Supervised Learning: learn from labeled examples
# -------------------------------------------------------
# Simple linear regression: y = 2x + 1
# Re-seed NumPy so this cell produces the same samples each time it is run
np.random.seed(42)
X_supervised = np.random.rand(50)
# Labels are the true line plus Gaussian noise with standard deviation 0.1
y_supervised = 2 * X_supervised + 1 + np.random.randn(50) * 0.1

# -------------------------------------------------------
# Unsupervised Learning: find structure without labels
# -------------------------------------------------------
from sklearn.datasets import make_blobs
# The cluster labels returned by make_blobs are discarded (`_`): only the points are kept
X_unsupervised, _ = make_blobs(n_samples=100, centers=3, cluster_std=0.6, random_state=42)

# -------------------------------------------------------
# Reinforcement Learning: learn by interaction
# -------------------------------------------------------
# Simple bandit: agent picks arms, receives rewards
n_arms = 3
true_means = [1.0, 2.0, 1.5]  # arm 0, 1, 2
rl_rewards = []
rl_choices = []

# Random agent -- no learning yet
for step in range(50):
    arm = np.random.randint(n_arms)
    # Reward = the chosen arm's true mean plus standard-normal noise
    reward = np.random.randn() + true_means[arm]
    rl_choices.append(arm)
    rl_rewards.append(reward)

# Visualize all three paradigms
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Panel 1: Supervised
axes[0].scatter(X_supervised, y_supervised, c='steelblue', alpha=0.7, s=30)
# NOTE: this draws the generating line y = 2x + 1 between x=0 and x=1;
# no model is actually fitted here, despite the 'Learned mapping' label.
axes[0].plot([0, 1], [1, 3], 'r-', linewidth=2, label='Learned mapping')
axes[0].set_title('Supervised Learning\n(labeled data -> model)', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Input X')
axes[0].set_ylabel('Label y')
axes[0].legend()

# Panel 2: Unsupervised (all points drawn in one color because no labels are used)
axes[1].scatter(X_unsupervised[:, 0], X_unsupervised[:, 1], c='gray', alpha=0.5, s=30)
axes[1].set_title('Unsupervised Learning\n(unlabeled data -> clusters)', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')

# Panel 3: RL (reward received at each step; dashed line marks the best arm's true mean)
axes[2].plot(rl_rewards, 'o-', markersize=3, alpha=0.7, color='green')
axes[2].axhline(y=max(true_means), color='red', linestyle='--', label='Best arm mean')
axes[2].set_title('Reinforcement Learning\n(no data -- learn by interaction)', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Time step')
axes[2].set_ylabel('Reward received')
axes[2].legend()

plt.tight_layout()
plt.show()

print("Key insight: In RL, there is no dataset. The agent generates its own data through interaction.")

## 3. The Mathematics

The goal of an RL agent is to find a **policy** $\pi$ (a strategy for choosing actions) that maximizes the **expected cumulative reward**:

$$\max_{\pi} \; \mathbb{E}\left[\sum_{t=0}^{T} r_t\right]$$

Here:
- $\pi$ is the policy (maps states to actions)
- $r_t$ is the reward at time step $t$
- $T$ is the final time step of the episode

Let us compute this by hand for a simple example.

In [None]:
# A robot collects cans over 5 time steps
rewards = [1, 0, 1, 1, 0]

# Tabulate the undiscounted return (sum of the remaining rewards) from every step
print("Time step | Reward | Return from here onward")
print("-" * 50)
for t, r_t in enumerate(rewards):
    remaining = rewards[t:]
    G_t = sum(remaining)
    expansion = ' + '.join(str(r) for r in remaining)
    print(f"    t={t}    |   {r_t}    |   G_{t} = {expansion} = {G_t}")

# The return from the very first step is simply the sum of all rewards
total = sum(rewards)
print(f"\nTotal return G_0 = {total}")
print("The agent wants to find a policy that makes G_0 as large as possible.")

## 4. Let's Build It -- Component by Component

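Every RL system is built from up to **four elements**: a policy, a reward signal, a value function, and (optionally) a model of the environment. Let us understand each one and build it in code.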

### Element 1: Policy

The policy defines the agent's behavior. It maps states to actions.

In [None]:
# A simple deterministic policy for a grid world
# States: positions on a 4x4 grid (0-15)
# Actions: 0=up, 1=right, 2=down, 3=left

def simple_policy(state):
    """Hand-coded policy for the 4x4 grid: move right until the last column, then down.

    Args:
        state: grid cell index 0-15, laid out row-major (column = state % 4).

    Returns:
        The action code: 1 (right) while the column is less than 3,
        otherwise 2 (down).
    """
    # Only the column matters for this policy; the row is never consulted.
    col = state % 4
    if col < 3:
        return 1  # go right
    return 2  # go down

# Exercise the policy on every cell of the 4x4 grid and print its decision
print("Policy decisions:")
for state in range(16):
    row, col = divmod(state, 4)
    action = simple_policy(state)
    action_name = ('up', 'right', 'down', 'left')[action]
    print(f"  State {state:2d} (row={row}, col={col}) -> action: {action_name}")

### Element 2: Reward Signal

The reward is the immediate feedback signal -- a single number at each step.

In [None]:
# Three example problems, each with its own reward structure
print("Example reward structures:\n")

# Chess: one number per game outcome
print("Chess:")
for outcome_line in ("  Win:  +1", "  Lose: -1", "  Draw:  0"):
    print(outcome_line)
print()

# Robot in maze: every step costs 1, reaching the exit costs nothing
print("Robot in maze:")
maze_rewards = [-1] * 5 + [0]  # -1 per step, 0 at exit
maze_total = sum(maze_rewards)
print(f"  Rewards per step: {maze_rewards}")
print(f"  Total: {maze_total} (faster escape = less negative total)")
print()

# Robot collecting cans: 1 for each step on which a can is collected
print("Robot collecting cans:")
can_rewards = [1, 0, 0, 1, 1, 0, 1]
cans_collected = sum(can_rewards)
print(f"  Rewards per step: {can_rewards}")
print(f"  Total cans collected: {cans_collected}")

### Element 3: Value Function

The value function tells us the **long-term desirability** of a state -- not just the immediate reward, but the total expected future reward.

In [None]:
# State values for a 5-state chain: [0] -> [1] -> [2] -> [3] -> [4=goal]
# The goal state is worth +1; every other state yields no reward of its own.

# Deterministic case (optimal policy: always move right), no discounting:
# the goal is always reached, so every state has the same value
#   V(0) = V(1) = V(2) = V(3) = V(4) = 1.0

# Stochastic case: each move succeeds with probability 0.5, otherwise the agent stays put
gamma = 0.9
V_stochastic = [0.0, 0.0, 0.0, 0.0, 1.0]  # last entry is the goal

# Bellman equation per state: V(s) = 0.5 * gamma * V(s) + 0.5 * gamma * V(s+1)
# Solved for V(s):            V(s) = (0.5 * gamma * V(s+1)) / (1 - 0.5 * gamma)
# Sweep from the state next to the goal back to the start.
for s in reversed(range(4)):
    successor_value = V_stochastic[s + 1]
    V_stochastic[s] = (0.5 * gamma * successor_value) / (1 - 0.5 * gamma)

print("State values (stochastic transitions, gamma=0.9):")
for s, value in enumerate(V_stochastic):
    bar = "#" * int(value * 40)  # text bar chart: 40 characters correspond to a value of 1.0
    print(f"  State {s}: V = {value:.4f}  {bar}")

print("\nKey insight: States closer to the goal have higher value.")
print("The value function captures LONG-TERM desirability, not just immediate reward.")

### Element 4: Model of the Environment

The model predicts what happens next: given a state and action, what is the next state and reward?

In [None]:
# A simple environment model (transition table)
# States: High battery, Low battery (plus a 'Dead' outcome reachable via Low/Search)
# Actions: Search, Wait, and Recharge (Recharge is only listed for the Low state)
# Each entry maps (state, action) -> list of (next_state, probability, reward)

model = {
    ('High', 'Search'): [('High', 0.7, 2), ('Low', 0.3, 2)],
    ('High', 'Wait'):   [('High', 1.0, 1)],
    ('Low', 'Search'):  [('Low', 0.4, 2), ('Dead', 0.6, -3)],
    ('Low', 'Wait'):    [('Low', 1.0, 1)],
    ('Low', 'Recharge'):[('High', 1.0, 0)],
}

# Pretty-print every (state, action) pair with its possible outcomes
print("Environment Model (Recycling Robot):")
print("=" * 60)
for (state, action), transitions in model.items():
    print(f"\n  State: {state}, Action: {action}")
    for next_state, prob, reward in transitions:
        print(f"    -> {next_state} with prob {prob:.1f}, reward = {reward}")

print("\nModel-based: agent can PLAN by simulating the model.")
print("Model-free: agent learns directly from experience.")

## 5. Your Turn

### Exercise 1: Modify the Bandit

The multi-armed bandit below uses a random strategy. Modify it to use an **epsilon-greedy** strategy:
- With probability epsilon, choose a random arm (explore)
- With probability 1-epsilon, choose the arm with the highest estimated mean (exploit)

In [None]:
# TODO: Implement epsilon-greedy bandit
# Exercise scaffold: as written the agent picks arms uniformly at random and
# never updates Q or N, so both stay all-zero until the TODOs are completed.

n_arms = 4
true_means = [1.0, 2.5, 1.8, 0.5]
n_steps = 1000
epsilon = 0.1

# Track estimates and counts
Q = np.zeros(n_arms)       # estimated value of each arm
N = np.zeros(n_arms)       # number of times each arm was pulled
rewards_history = []

for step in range(n_steps):
    # TODO: Implement epsilon-greedy action selection
    # Hint: Use np.random.random() < epsilon for exploration
    # Hint: Use np.argmax(Q) for exploitation
    arm = np.random.randint(n_arms)  # REPLACE THIS LINE

    # Get reward (noisy version of true mean)
    reward = np.random.randn() + true_means[arm]

    # TODO: Update Q[arm] using incremental mean formula:
    # Q[arm] = Q[arm] + (1 / N[arm]) * (reward - Q[arm])
    # Don't forget to increment N[arm] first!
    pass  # REPLACE THIS LINE

    rewards_history.append(reward)

# Plot results
plt.figure(figsize=(10, 4))
window = 50
# Trailing moving average: each point averages the current reward and up to `window` previous ones
smoothed = [np.mean(rewards_history[max(0,i-window):i+1]) for i in range(len(rewards_history))]
plt.plot(smoothed, alpha=0.8)
plt.axhline(y=max(true_means), color='red', linestyle='--', label=f'Best arm mean = {max(true_means)}')
plt.xlabel('Step')
plt.ylabel('Average Reward (smoothed)')
plt.title('Epsilon-Greedy Bandit')
plt.legend()
plt.show()

# Compare what the agent estimated with the ground truth
print(f"Estimated values: {Q}")
print(f"True means:       {true_means}")
print(f"Arm pull counts:  {N}")

### Exercise 2: Compute Returns by Hand

Given the following reward sequences, compute the return $G_t$ at each time step.

In [None]:
# TODO: Compute returns for each reward sequence
# Exercise scaffold: until the TODOs are filled in, every line prints "G_t = None".

# Sequence A: episodic (no discounting)
rewards_a = [0, 0, 0, 1]
print("Sequence A (episodic, no discounting):")
for t in range(len(rewards_a)):
    G_t = None  # TODO: compute sum of rewards from t to end
    print(f"  G_{t} = {G_t}")

print()

# Sequence B: discounted with gamma = 0.9
rewards_b = [1, 2, 3, 4]
gamma = 0.9  # rebinds the notebook-level name `gamma` (same value as in the chain-value cell)
print(f"Sequence B (discounted, gamma={gamma}):")
for t in range(len(rewards_b)):
    G_t = None  # TODO: compute discounted return from t
    # Hint: G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    print(f"  G_{t} = {G_t}")

## 6. Putting It All Together

Now let us build something real: a **Tic-Tac-Toe agent** that learns to play by updating its value function through self-play. This brings together all four elements.

In [None]:
import numpy as np
from collections import defaultdict

class TicTacToeEnv:
    """Simple Tic-Tac-Toe environment.

    The board is a flat list of 9 cells indexed row-major (0-8), where
    0 = empty, 1 = X and -1 = O. X always moves first, and
    `current_player` holds the mark (1 or -1) of the side to move.
    """

    # The eight index triples that form a winning line.
    _LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # cols
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self):
        self.board = [0] * 9  # 0=empty, 1=X, -1=O
        self.current_player = 1  # X goes first

    def reset(self):
        """Clear the board, give the move back to X, and return the empty state."""
        self.board = [0] * 9
        self.current_player = 1
        return tuple(self.board)

    def get_state(self):
        """Return the current board as an immutable (hashable) tuple."""
        return tuple(self.board)

    def available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i in range(9) if self.board[i] == 0]

    def step(self, action):
        """Place the current player's mark and report the outcome.

        Args:
            action: index (0-8) of an empty cell.

        Returns:
            (state, reward, done): the board as a tuple, the winner's mark
            as reward (1 if X won, -1 if O won, 0 otherwise), and whether
            the game is over (win or full board).

        Raises:
            ValueError: if `action` is outside 0-8 or the cell is occupied.
        """
        # Reject illegal moves explicitly: a negative index would otherwise
        # wrap around, and an occupied cell would be silently overwritten.
        if not 0 <= action < 9:
            raise ValueError(f"action must be in 0..8, got {action}")
        if self.board[action] != 0:
            raise ValueError(f"cell {action} is already occupied")

        self.board[action] = self.current_player
        winner = self._check_winner()

        if winner != 0:
            return tuple(self.board), winner, True
        if 0 not in self.board:
            return tuple(self.board), 0, True  # draw
        # Game continues: hand the move to the other player.
        self.current_player *= -1
        return tuple(self.board), 0, False

    def _check_winner(self):
        """Return 1 if X has a complete line, -1 if O has one, else 0."""
        for line in self._LINES:
            # Marks are +1/-1, so a line sums to +/-3 only when one player owns all three cells.
            s = sum(self.board[i] for i in line)
            if s == 3:
                return 1    # X wins
            if s == -3:
                return -1  # O wins
        return 0

    def render(self):
        """Print the board as three rows of '.', 'X', 'O', followed by a blank line."""
        symbols = {0: '.', 1: 'X', -1: 'O'}
        for i in range(0, 9, 3):
            print(' '.join(symbols[self.board[j]] for j in range(i, i+3)))
        print()

In [None]:
class ValueFunctionAgent:
    """An RL agent that learns to play Tic-Tac-Toe using a value function.

    The agent keeps a table mapping board states (9-tuples) to an estimated
    value between 0 (loss) and 1 (win) from its own perspective. States it
    has never updated are treated as having value `INITIAL_VALUE`.
    """

    # Value assumed for any state that has not been updated yet.
    INITIAL_VALUE = 0.5

    def __init__(self, player=1, epsilon=0.2, alpha=0.1):
        self.player = player         # 1 for X, -1 for O
        self.epsilon = epsilon       # exploration rate
        self.alpha = alpha           # learning rate (step size)
        initial = self.INITIAL_VALUE
        self.values = defaultdict(lambda: initial)  # initial value = 0.5

    def choose_action(self, env, training=True):
        """Epsilon-greedy action selection.

        Args:
            env: environment exposing `board`, `current_player` and
                `available_actions()`.
            training: when True, a random legal move is played with
                probability `epsilon`; when False the choice is always greedy.

        Returns:
            The chosen cell index as a Python int. Ties between equally
            valued moves go to the lowest-numbered cell.
        """
        actions = env.available_actions()

        if training and np.random.random() < self.epsilon:
            # EXPLORE: random action (converted so both branches return a plain int)
            return int(np.random.choice(actions))

        # EXPLOIT: choose the action leading to the highest-value state
        best_value = -float('inf')
        best_action = actions[0]

        for action in actions:
            # Simulate the action on a copy so the real board is untouched
            test_board = env.board.copy()
            test_board[action] = env.current_player
            state = tuple(test_board)

            # Read with .get() rather than indexing: indexing a defaultdict
            # would insert every inspected state into the table, inflating it
            # with states the agent has never actually learned anything about.
            value = self.values.get(state, self.INITIAL_VALUE)
            if value > best_value:
                best_value = value
                best_action = action

        return best_action

    def update(self, state_history, reward):
        """Back the game outcome up through the visited states.

        Args:
            state_history: states of one game in the order they occurred.
            reward: outcome from X's point of view (1 = X won, -1 = O won,
                0 = draw); it is negated for an agent playing O.
        """
        # Convert reward to this player's perspective
        if self.player == -1:
            reward = -reward

        # Terminal target: win -> 1.0, loss -> 0.0, draw -> 0.5
        target = 1.0 if reward > 0 else (0.0 if reward < 0 else 0.5)

        # Walk backwards: each state moves a fraction `alpha` towards the
        # (already updated) value of the state that followed it.
        for state in reversed(state_history):
            self.values[state] += self.alpha * (target - self.values[state])
            target = self.values[state]  # bootstrap

# Confirm that the definitions loaded and recap how the agent works
for message in (
    "TicTacToe environment and ValueFunctionAgent defined.",
    "The agent uses epsilon-greedy for exploration/exploitation balance.",
    "It updates its value function by 'backing up' from the game outcome.",
):
    print(message)

## 7. Training and Results

In [None]:
def train_agents(n_episodes=50000):
    """Train an X agent and an O agent against each other by self-play.

    Args:
        n_episodes: number of complete games to play.

    Returns:
        (agent_x, agent_o, stats, win_rates), where `stats` counts X wins,
        O wins and draws over all games, and `win_rates` holds the
        *cumulative draw rate* (draws so far / games so far), recorded once
        every 1000 episodes.
    """
    env = TicTacToeEnv()
    agent_x = ValueFunctionAgent(player=1, epsilon=0.2, alpha=0.1)
    agent_o = ValueFunctionAgent(player=-1, epsilon=0.2, alpha=0.1)

    stats = {'X_wins': 0, 'O_wins': 0, 'draws': 0}
    win_rates = []  # cumulative draw rate, for plotting

    for episode in range(n_episodes):
        state = env.reset()
        x_states = [state]
        o_states = []
        done = False

        while not done:
            if env.current_player == 1:
                action = agent_x.choose_action(env)
            else:
                action = agent_o.choose_action(env)

            state, reward, done = env.step(action)

            # After a non-terminal step the turn has already passed to the
            # other side, so `current_player == 1` means O just moved (and
            # vice versa). Each agent records the boards produced by its own
            # moves; both record the final board.
            if env.current_player == 1 or done:
                o_states.append(state)
            if env.current_player == -1 or done:
                x_states.append(state)

        # Update both agents with the final outcome (reward is from X's view)
        agent_x.update(x_states, reward)
        agent_o.update(o_states, reward)

        # Track stats
        if reward == 1:
            stats['X_wins'] += 1
        elif reward == -1:
            stats['O_wins'] += 1
        else:
            stats['draws'] += 1

        # Record the cumulative draw rate every 1000 episodes
        if (episode + 1) % 1000 == 0:
            win_rates.append(stats['draws'] / (episode + 1))

    return agent_x, agent_o, stats, win_rates

# Number of self-play games. Every figure below is derived from this value,
# so changing it keeps the printed percentages and the plot axis correct.
N_EPISODES = 50000

print(f"Training two agents through self-play ({N_EPISODES:,} games)...")
agent_x, agent_o, stats, win_rates = train_agents(N_EPISODES)

print(f"\nResults after {N_EPISODES:,} games:")
print(f"  X wins: {stats['X_wins']} ({100 * stats['X_wins'] / N_EPISODES:.1f}%)")
print(f"  O wins: {stats['O_wins']} ({100 * stats['O_wins'] / N_EPISODES:.1f}%)")
print(f"  Draws:  {stats['draws']} ({100 * stats['draws'] / N_EPISODES:.1f}%)")

# Plot learning curve
# train_agents records one cumulative draw rate per 1000 episodes, so the
# i-th recorded point belongs to episode 1000 * (i + 1).
episode_marks = [1000 * (i + 1) for i in range(len(win_rates))]
plt.figure(figsize=(10, 4))
plt.plot(episode_marks, win_rates, 'b-o', markersize=4)
plt.xlabel('Episodes')
plt.ylabel('Cumulative Draw Rate')
plt.title('Tic-Tac-Toe Self-Play: Draw Rate Over Training')
plt.grid(True, alpha=0.3)
plt.show()
print("As training progresses, both agents learn to play optimally, leading to more draws.")

In [None]:
# Inspect the value the trained X agent assigns to a few hand-picked boards
print("Learned Value Function (sample states):\n")

env = TicTacToeEnv()
env.reset()

# Opening position: the empty board
empty_state = tuple([0] * 9)
print("Empty board value:", f"{agent_x.values[empty_state]:.3f}")
print()

# Position 1: X has taken the center
test_board = [0] * 9
test_board[4] = 1  # X in center
print("After X plays center:")
env.board = list(test_board)
env.render()
center_value = agent_x.values[tuple(test_board)]
print(f"  Value (X perspective): {center_value:.3f}")
print()

# Position 2: same board, with O answering in the top-left corner
test_board[0] = -1  # O in top-left
print("After X center, O corner:")
env.board = list(test_board)
env.render()
corner_reply_value = agent_x.values[tuple(test_board)]
print(f"  Value (X perspective): {corner_reply_value:.3f}")
print()

# Position 3: a finished game in which X owns the top row
test_board2 = [1, 1, 1, -1, -1, 0, 0, 0, 0]
print("X has three in a row (winning):")
env.board = list(test_board2)
env.render()
winning_value = agent_x.values[tuple(test_board2)]
print(f"  Value (X perspective): {winning_value:.3f}")

In [None]:
# Watch the trained agent play against a random opponent
def play_demo(agent, agent_player=1, n_games=5):
    """Watch a trained agent play against a uniformly random opponent.

    Every move and the resulting board are printed, followed by the result
    of each game and a final tally of the trained agent's wins.

    Args:
        agent: object with `choose_action(env, training=False)`.
        agent_player: the side the trained agent plays, 1 for X or -1 for O.
        n_games: number of games to play.

    Raises:
        ValueError: if `agent_player` is neither 1 nor -1.
    """
    if agent_player not in (1, -1):
        raise ValueError(f"agent_player must be 1 (X) or -1 (O), got {agent_player}")

    # Label each side by the mark it actually plays instead of assuming the
    # trained agent is always X.
    marks = {1: 'X', -1: 'O'}
    agent_mark = marks[agent_player]
    random_mark = marks[-agent_player]

    env = TicTacToeEnv()
    wins = 0

    for game in range(n_games):
        env.reset()
        done = False
        print(f"--- Game {game+1} ---")

        while not done:
            if env.current_player == agent_player:
                action = agent.choose_action(env, training=False)
                print(f"Trained agent ({agent_mark}) plays position {action}")
            else:
                actions = env.available_actions()
                action = np.random.choice(actions)
                print(f"Random agent ({random_mark}) plays position {action}")

            _, reward, done = env.step(action)
            env.render()

        # The environment reports the winner's mark as the reward (1 = X, -1 = O)
        if reward == 1:
            print("Result: X WINS\n")
        elif reward == -1:
            print("Result: O WINS\n")
        else:
            print("Result: DRAW\n")

        # Count a win only when the winning mark is the trained agent's own
        if reward == agent_player:
            wins += 1

    print(f"Trained agent won {wins}/{n_games} games against random opponent.")

# Demo: the trained X agent (agent_x) plays 3 games against a random O opponent
play_demo(agent_x, agent_player=1, n_games=3)

## 8. Final Output

You have built a complete RL agent from scratch. The Tic-Tac-Toe agent demonstrates all four elements:
- **Policy**: Epsilon-greedy (explore vs exploit)
- **Reward**: +1 for win, -1 for loss, 0 for draw
- **Value Function**: Learned probability of winning from each board state
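- **Model**: Only a minimal one -- the agent uses the rules of the game to preview the board that each legal move would produce (one-step lookahead), but it never models how the opponent will reply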

In [None]:
# Closing recap, assembled as a list of lines and printed in one go
banner = "=" * 60
summary_lines = [
    banner,
    "SUMMARY: Reinforcement Learning Foundations",
    banner,
    "",
    "What we built:",
    "  - Compared RL to supervised and unsupervised learning",
    "  - Understood the four elements of every RL system",
    "  - Implemented a Tic-Tac-Toe agent that learns from self-play",
    "  - Observed exploration vs exploitation in action",
    "",
    "Key takeaway:",
    "  RL agents learn from INTERACTION, not from DATA.",
    "  The value function captures long-term desirability.",
    "  Balancing exploration and exploitation is fundamental.",
    "",
]
print("\n".join(summary_lines))

# Size of the X agent's value table
n_states = len(agent_x.values)
print(f"Unique states the agent learned: {n_states}")

## 9. Reflection and Next Steps

**Questions to think about:**
1. What happens if you set epsilon to 0 (pure exploitation) during training? Does the agent still learn well?
2. Why does the draw rate increase as training progresses?
3. Could you design a reward that accidentally teaches the wrong behavior? (Hint: what if you rewarded capturing pieces instead of winning?)

**What comes next:**
In the next notebook, we will formalize the mathematical framework behind RL: Markov Decision Processes, the Bellman equation, and how to compute optimal value functions. These are the tools that power algorithms like Q-learning and policy gradients.