<a href="https://colab.research.google.com/github/Vyshnavijulapelly/Reinforcement-Learning/blob/main/RL_Lab_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import sys
import math
import numpy as np
import random
import argparse  # Import argparse

# Try Gymnasium first; fallback to Gym.
# Either package is bound to the name `gym`, so the rest of the file is
# written against a single module name. GYMN records which backend was
# actually imported (main() prints it).
try:
    import gymnasium as gym
    GYMN = "gymnasium"
except Exception:
    # Any failure while importing gymnasium (not only ImportError) lands here;
    # if `gym` is missing as well, the ImportError propagates to the caller.
    import gym
    GYMN = "gym"


def make_env(env_id: str,
             is_slippery: bool | None = None,
             render_mode: str | None = None,
             seed: int | None = 42):
    """Build an environment via ``gym.make`` and seed it on a best-effort basis.

    Args:
        env_id: Environment id handed to ``gym.make``.
        is_slippery: Forwarded (as a ``bool``) only when it is not ``None``
            and ``env_id`` starts with ``"FrozenLake"``.
        render_mode: Forwarded to ``gym.make`` only when truthy.
        seed: Passed to ``env.reset`` and to the ``seed`` method of the
            action/observation spaces when those exist.

    Returns:
        The created environment (already reset once if seeding succeeded).
    """
    make_options = {}
    if render_mode:
        make_options["render_mode"] = render_mode
    if env_id.startswith("FrozenLake") and is_slippery is not None:
        make_options["is_slippery"] = bool(is_slippery)

    env = gym.make(env_id, **make_options)

    # Seeding is best-effort: a TypeError raised anywhere in this step
    # (e.g. a reset() that does not accept `seed`) is deliberately ignored.
    try:
        env.reset(seed=seed)
        for space_name in ("action_space", "observation_space"):
            space = getattr(env, space_name, None)
            if hasattr(space, "seed"):
                space.seed(seed)
    except TypeError:
        pass
    return env


def epsilon_greedy_action(q_table: np.ndarray, state: int, epsilon: float, n_actions: int) -> int:
    """Pick an action for ``state`` using an epsilon-greedy rule.

    A uniform draw from the global ``random`` module is compared with
    ``epsilon``: below it, a uniformly random action index in
    ``range(n_actions)`` is returned; otherwise the index of the largest
    Q-value in ``q_table[state]`` (first index on ties, per ``np.argmax``).
    """
    explore = random.random() < epsilon
    if not explore:
        greedy_index = np.argmax(q_table[state])
        return int(greedy_index)
    return random.randrange(n_actions)


def train_q_learning(env_id: str = "FrozenLake-v1",
                     episodes: int = 20000,
                     max_steps: int = 200,
                     alpha: float = 0.8,
                     gamma: float = 0.95,
                     epsilon: float = 1.0,
                     epsilon_min: float = 0.01,
                     epsilon_decay: float = 0.9995,
                     is_slippery: bool | None = None,
                     seed: int = 42,
                     verbose: bool = True):
    """Train a tabular Q-learning agent with epsilon-greedy exploration.

    Args:
        env_id: Environment id passed to ``make_env``.
        episodes: Number of training episodes.
        max_steps: Upper bound on steps taken per episode.
        alpha: Learning rate of the Q-update.
        gamma: Discount factor.
        epsilon: Initial exploration rate.
        epsilon_min: Floor for the exploration rate.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        is_slippery: Forwarded to ``make_env``.
        seed: Base seed. Episode ``ep`` resets the environment with
            ``seed + ep``; ``None`` leaves every reset unseeded.
        verbose: Print a progress line roughly every 10% of the episodes.

    Returns:
        Tuple ``(q_table, returns, epsilons)``: the float32 Q-table of shape
        ``(n_states, n_actions)``, the total reward of each episode, and the
        epsilon that was in effect during each episode.

    Side effects:
        Calls ``random.seed(seed)`` on the global ``random`` module, because
        ``epsilon_greedy_action`` draws its exploration noise from it; without
        this, two runs with the same ``seed`` would explore differently.
    """
    env = make_env(env_id, is_slippery=is_slippery, seed=seed)
    try:
        assert hasattr(env.observation_space, 'n') and hasattr(env.action_space, 'n'), \
            "This Q-learning implementation expects discrete state and action spaces."

        n_states = env.observation_space.n
        n_actions = env.action_space.n

        q_table = np.zeros((n_states, n_actions), dtype=np.float32)

        returns = np.zeros(episodes, dtype=np.float32)
        epsilons = np.zeros(episodes, dtype=np.float32)

        # Make the exploration stream reproducible for a given seed.
        random.seed(seed)

        # Loop-invariant logging parameters.
        log_every = max(1, episodes // 10)
        window = 200 if episodes >= 200 else max(1, episodes // 5)

        for ep in range(episodes):
            episode_seed = None if seed is None else seed + ep
            # newer Gymnasium returns (obs, info); old Gym returns obs only
            reset_out = env.reset(seed=episode_seed)
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out

            total_reward = 0.0
            for _ in range(max_steps):
                action = epsilon_greedy_action(q_table, state, epsilon, n_actions)
                step_out = env.step(action)
                if len(step_out) == 5:
                    # Gymnasium: obs, reward, terminated, truncated, info
                    next_state, reward, terminated, truncated, _info = step_out
                    done = terminated or truncated
                else:
                    # old Gym: obs, reward, done, info. The TimeLimit wrapper
                    # marks time-outs in info["TimeLimit.truncated"].
                    next_state, reward, done, info = step_out
                    timed_out = isinstance(info, dict) and bool(info.get("TimeLimit.truncated", False))
                    terminated = bool(done) and not timed_out

                # Q-learning update. Only a true terminal state has zero future
                # value; a time-limit truncation ends the episode but the next
                # state is still an ordinary state, so we keep bootstrapping.
                best_next = np.max(q_table[next_state])
                td_target = reward + gamma * best_next * (0 if terminated else 1)
                td_error = td_target - q_table[state, action]
                q_table[state, action] += alpha * td_error

                state = next_state
                total_reward += reward
                if done:
                    break

            # Book-keeping (epsilon is recorded before it is decayed)
            returns[ep] = total_reward
            epsilons[ep] = epsilon

            # Decay epsilon
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

            # Progress log
            if verbose and (ep + 1) % log_every == 0:
                recent_avg = float(np.mean(returns[max(0, ep - window + 1):ep + 1]))
                print(f"[{ep+1:6d}/{episodes}] avg_return(last {window}) = {recent_avg:.3f}, epsilon={epsilon:.3f}")
    finally:
        # Release the environment even if training raises part-way through.
        env.close()

    return q_table, returns, epsilons


def evaluate_policy(env_id: str,
                    q_table: np.ndarray,
                    episodes: int = 100,
                    max_steps: int = 200,
                    is_slippery: bool | None = None,
                    seed: int = 9999):
    """Run the greedy policy derived from ``q_table`` and report averages.

    Args:
        env_id: Environment id passed to ``make_env``.
        q_table: Q-values indexed as ``q_table[state]``; the action with the
            largest value is taken at every step.
        episodes: Number of evaluation episodes.
        max_steps: Upper bound on steps per episode; an episode that does not
            finish within it is counted as ``max_steps`` steps.
        is_slippery: Forwarded to ``make_env``.
        seed: Base seed. Episode ``ep`` resets the environment with
            ``seed + ep``; ``None`` leaves every reset unseeded.

    Returns:
        Tuple ``(mean_total_reward, mean_steps)`` as Python floats.
    """
    env = make_env(env_id, is_slippery=is_slippery, render_mode=None, seed=seed)

    total_rewards = []
    steps_taken = []

    try:
        for ep in range(episodes):
            episode_seed = None if seed is None else seed + ep
            reset_out = env.reset(seed=episode_seed)
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            ep_reward = 0.0
            for t in range(max_steps):
                action = int(np.argmax(q_table[state]))
                step_out = env.step(action)
                if len(step_out) == 5:
                    # Gymnasium: obs, reward, terminated, truncated, info
                    state, reward, terminated, truncated, _ = step_out
                    done = terminated or truncated
                else:
                    # old Gym: obs, reward, done, info
                    state, reward, done, _ = step_out

                ep_reward += reward
                if done:
                    steps_taken.append(t + 1)
                    break
            else:
                # Step budget exhausted without the episode ending.
                steps_taken.append(max_steps)
            total_rewards.append(ep_reward)
    finally:
        # Release the environment even if an episode raises.
        env.close()

    return float(np.mean(total_rewards)), float(np.mean(steps_taken))


def main():
    """Command-line entry point: train, evaluate, print tips, save artifacts.

    Parses hyper-parameters from the command line (unknown arguments are
    ignored so the script also runs inside Jupyter/Colab), trains with
    ``train_q_learning``, evaluates the greedy policy with
    ``evaluate_policy``, prints environment-specific tips, and writes
    ``q_table.npy``, ``returns.npy`` and ``epsilons.npy`` to the current
    working directory.
    """
    cli = argparse.ArgumentParser(description="Tabular Q-Learning for discrete-action Gym environments")

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ('--env', dict(type=str, default='FrozenLake-v1', help='Env id, e.g., FrozenLake-v1 or Taxi-v3')),
        ('--episodes', dict(type=int, default=20000)),
        ('--max_steps', dict(type=int, default=200)),
        ('--alpha', dict(type=float, default=0.8, help='Learning rate')),
        ('--gamma', dict(type=float, default=0.95, help='Discount factor')),
        ('--epsilon', dict(type=float, default=1.0, help='Initial exploration rate')),
        ('--epsilon_min', dict(type=float, default=0.01)),
        ('--epsilon_decay', dict(type=float, default=0.9995)),
        ('--is_slippery', dict(type=int, default=None, choices=[0, 1], help='Only for FrozenLake: 1 or 0')),
        ('--seed', dict(type=int, default=42)),
        ('--no_verbose', dict(action='store_true', help='Disable training logs')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # Jupyter/Colab inject their own argv entries; tolerate anything unknown.
    opts, _unknown = cli.parse_known_args()

    # The CLI takes 0/1; the training/evaluation functions expect bool or None.
    slippery = None if opts.is_slippery is None else bool(opts.is_slippery)

    print(f"Using backend: {GYMN}")
    print(f"Training {opts.env} | episodes={opts.episodes}, alpha={opts.alpha}, gamma={opts.gamma}")

    q_table, returns, eps_history = train_q_learning(
        env_id=opts.env,
        episodes=opts.episodes,
        max_steps=opts.max_steps,
        alpha=opts.alpha,
        gamma=opts.gamma,
        epsilon=opts.epsilon,
        epsilon_min=opts.epsilon_min,
        epsilon_decay=opts.epsilon_decay,
        is_slippery=slippery,
        seed=opts.seed,
        verbose=not opts.no_verbose,
    )

    # Evaluation (uses evaluate_policy's default episode count and seed)
    avg_reward, avg_steps = evaluate_policy(
        env_id=opts.env,
        q_table=q_table,
        max_steps=opts.max_steps,
        is_slippery=slippery,
    )

    print("\nEvaluation (greedy policy):")
    print(f"Average reward over 100 episodes: {avg_reward:.3f}")
    print(f"Average steps to termination: {avg_steps:.1f}")

    # Quick tips for typical setups; only the first matching prefix is shown.
    tip_table = (
        ("FrozenLake", "\nTips for FrozenLake:", (
            "- For is_slippery=1 (stochastic), increase episodes (>=20k) and use slow epsilon decay (e.g., 0.9997).",
            "- For is_slippery=0 (deterministic), 3k–10k episodes often suffice.",
            "- Consider gamma around 0.95–0.99 and alpha 0.5–0.9.",
        )),
        ("Taxi", "\nTips for Taxi:", (
            "- Learns faster: 5k–10k episodes often work well.",
            "- Smaller epsilon decay (e.g., 0.995) can speed up convergence.",
        )),
    )
    for prefix, header, tips in tip_table:
        if opts.env.startswith(prefix):
            print(header)
            for tip in tips:
                print(tip)
            break

    # Save artifacts for later use
    artifacts = (
        ("q_table.npy", q_table),
        ("returns.npy", returns),
        ("epsilons.npy", eps_history),
    )
    for filename, array in artifacts:
        np.save(filename, array)
    print("\nSaved: q_table.npy, returns.npy, epsilons.npy")


# Script entry point: run main() and turn Ctrl-C into a short message
# instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("Interrupted by user.")


Using backend: gymnasium
Training FrozenLake-v1 | episodes=20000, alpha=0.8, gamma=0.95
[  2000/20000] avg_return(last 200) = 0.075, epsilon=0.368
[  4000/20000] avg_return(last 200) = 0.075, epsilon=0.135
[  6000/20000] avg_return(last 200) = 0.165, epsilon=0.050
[  8000/20000] avg_return(last 200) = 0.330, epsilon=0.018
[ 10000/20000] avg_return(last 200) = 0.445, epsilon=0.010
[ 12000/20000] avg_return(last 200) = 0.515, epsilon=0.010
[ 14000/20000] avg_return(last 200) = 0.525, epsilon=0.010
[ 16000/20000] avg_return(last 200) = 0.510, epsilon=0.010
[ 18000/20000] avg_return(last 200) = 0.490, epsilon=0.010
[ 20000/20000] avg_return(last 200) = 0.440, epsilon=0.010

Evaluation (greedy policy):
Average reward over 100 episodes: 0.660
Average steps to termination: 45.8

Tips for FrozenLake:
- For is_slippery=1 (stochastic), increase episodes (>=20k) and use slow epsilon decay (e.g., 0.9997).
- For is_slippery=0 (deterministic), 3k–10k episodes often suffice.
- Consider gamma around 0