In [1]:
# CSCN8020 Assignment 2 – Q-Learning on Taxi-v3
# Produces:
#  - final_metrics.csv  (per-episode returns/lengths for the baseline run)
#  - summary_df.csv     (summary across all requested hyperparameter variants)
#
# Matches reference schemas you provided:
# final_metrics.csv columns: episode_returns, episode_lengths, total_episodes
# summary_df.csv   columns: run, alpha, epsilon, gamma, avg_return, avg_length
#
# Requirements:
#   pip install gymnasium numpy pandas
# (Taxi-v3 is part of gymnasium.toy_text and ships with gymnasium)

import numpy as np
import pandas as pd
import gymnasium as gym
from collections import defaultdict
from dataclasses import dataclass, asdict
from typing import Tuple, Dict, List, Optional
import random
import os

# ----------------------------
# Config & Utilities
# ----------------------------

@dataclass
class RunConfig:
    name: str
    alpha: float = 0.1       # learning rate
    epsilon: float = 0.1     # exploration rate
    gamma: float = 0.9       # discount factor
    episodes: int = 5000
    max_steps: int = 200
    seed: int = 42           # for reproducibility

def set_global_seed(seed: int):
    np.random.seed(seed)
    random.seed(seed)

def epsilon_greedy_action(q_row: np.ndarray, epsilon: float, nA: int) -> int:
    if np.random.rand() < epsilon:
        return np.random.randint(nA)
    return int(np.argmax(q_row))

def q_learning(
    env_id: str,
    cfg: RunConfig
) -> Tuple[np.ndarray, List[float], List[int]]:
    """
    Basic tabular Q-learning for discrete envs like Taxi-v3.
    Returns:
      Q-table, list of per-episode returns, list of per-episode lengths
    """
    set_global_seed(cfg.seed)
    env = gym.make(env_id)
    # seed env & action-space for reproducibility (Gymnasium style)
    try:
        env.reset(seed=cfg.seed)
        env.action_space.seed(cfg.seed)
        env.observation_space.seed(cfg.seed)
    except Exception:
        pass

    nS = env.observation_space.n
    nA = env.action_space.n
    Q  = np.zeros((nS, nA), dtype=np.float32)

    ep_returns: List[float] = []
    ep_lengths: List[int] = []

    for ep in range(cfg.episodes):
        state, info = env.reset()
        total_r = 0.0
        steps = 0

        for t in range(cfg.max_steps):
            a = epsilon_greedy_action(Q[state], cfg.epsilon, nA)
            next_state, reward, terminated, truncated, info = env.step(a)

            # Q-learning update
            best_next = np.max(Q[next_state])
            td_target = reward + cfg.gamma * best_next * (0 if (terminated or truncated) else 1)
            td_error  = td_target - Q[state, a]
            Q[state, a] += cfg.alpha * td_error

            total_r += reward
            steps += 1
            state = next_state

            if terminated or truncated:
                break

        ep_returns.append(total_r)
        ep_lengths.append(steps)

    env.close()
    return Q, ep_returns, ep_lengths

def summarize_returns_lengths(returns: List[float], lengths: List[int]) -> Tuple[float, float]:
    return float(np.mean(returns)), float(np.mean(lengths))

# ----------------------------
# Experiment Grid
# ----------------------------

# Assignment baseline (α=0.1, ε=0.1, γ=0.9) and variations.
# NOTE: The assignment PDF’s “Exploration Factor γ = [0.2, 0.3]” appears to be a typo.
# We vary epsilon in {0.2, 0.3}, as in your reference CSVs.
baseline = RunConfig(name="baseline", alpha=0.1, epsilon=0.1, gamma=0.9, episodes=5000, max_steps=200)
alpha_variants = [
    RunConfig(name="alpha_0.01", alpha=0.01, epsilon=0.1, gamma=0.9, episodes=5000, max_steps=200),
    RunConfig(name="alpha_0.001", alpha=0.001, epsilon=0.1, gamma=0.9, episodes=5000, max_steps=200),
    RunConfig(name="alpha_0.2", alpha=0.2, epsilon=0.1, gamma=0.9, episodes=5000, max_steps=200),
]
epsilon_variants = [
    RunConfig(name="epsilon_0.2", alpha=0.1, epsilon=0.2, gamma=0.9, episodes=5000, max_steps=200),
    RunConfig(name="epsilon_0.3", alpha=0.1, epsilon=0.3, gamma=0.9, episodes=5000, max_steps=200),
]
all_runs: List[RunConfig] = [baseline] + alpha_variants + epsilon_variants

# ----------------------------
# Run all experiments
# ----------------------------

summary_rows = []
baseline_returns = None
baseline_lengths = None

for cfg in all_runs:
    print(f"Training: {cfg.name} | alpha={cfg.alpha} epsilon={cfg.epsilon} gamma={cfg.gamma} "
          f"| episodes={cfg.episodes} max_steps={cfg.max_steps}")
    Q, ep_returns, ep_lengths = q_learning("Taxi-v3", cfg)
    avg_ret, avg_len = summarize_returns_lengths(ep_returns, ep_lengths)

    summary_rows.append({
        "run": cfg.name,
        "alpha": cfg.alpha,
        "epsilon": cfg.epsilon,
        "gamma": cfg.gamma,
        "avg_return": round(avg_ret, 4),
        "avg_length": round(avg_len, 4),
    })

    if cfg.name == "baseline":
        baseline_returns = ep_returns
        baseline_lengths = ep_lengths

# ----------------------------
# Save outputs (matching your reference schemas)
# ----------------------------

# 1) summary_df.csv
summary_df = pd.DataFrame(summary_rows, columns=["run", "alpha", "epsilon", "gamma", "avg_return", "avg_length"])
summary_df.to_csv("summary_df.csv", index=False)
print("\nWrote summary_df.csv")
display(summary_df)

# 2) final_metrics.csv (store ONLY the baseline’s per-episode series)
if baseline_returns is None or baseline_lengths is None:
    raise RuntimeError("Baseline run missing; cannot produce final_metrics.csv.")

final_metrics = pd.DataFrame({
    "episode_returns": baseline_returns,
    "episode_lengths": baseline_lengths,
})
final_metrics["total_episodes"] = len(final_metrics)
final_metrics.to_csv("final_metrics.csv", index=False)
print("\nWrote final_metrics.csv")
display(final_metrics.head())


Training: baseline | alpha=0.1 epsilon=0.1 gamma=0.9 | episodes=5000 max_steps=200
Training: alpha_0.01 | alpha=0.01 epsilon=0.1 gamma=0.9 | episodes=5000 max_steps=200
Training: alpha_0.001 | alpha=0.001 epsilon=0.1 gamma=0.9 | episodes=5000 max_steps=200
Training: alpha_0.2 | alpha=0.2 epsilon=0.1 gamma=0.9 | episodes=5000 max_steps=200
Training: epsilon_0.2 | alpha=0.1 epsilon=0.2 gamma=0.9 | episodes=5000 max_steps=200
Training: epsilon_0.3 | alpha=0.1 epsilon=0.3 gamma=0.9 | episodes=5000 max_steps=200

Wrote summary_df.csv


Unnamed: 0,run,alpha,epsilon,gamma,avg_return,avg_length
0,baseline,0.1,0.1,0.9,-21.373,30.3664
1,alpha_0.01,0.01,0.1,0.9,-161.0448,127.6254
2,alpha_0.001,0.001,0.1,0.9,-258.9354,185.6976
3,alpha_0.2,0.2,0.1,0.9,-11.3424,23.4384
4,epsilon_0.2,0.1,0.2,0.9,-32.3808,32.7762
5,epsilon_0.3,0.1,0.3,0.9,-47.9292,36.255



Wrote final_metrics.csv


Unnamed: 0,episode_returns,episode_lengths,total_episodes
0,-605.0,200,5000
1,-596.0,200,5000
2,-596.0,200,5000
3,-542.0,200,5000
4,-506.0,200,5000
