Imports + env

In [1]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

In [2]:
def epsilon_greedy(Q, state, n_actions, epsilon, rng):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform sample is drawn from ``rng``; if it is below ``epsilon`` a
    random action index in ``[0, n_actions)`` is drawn, otherwise the index
    of the largest entry of ``Q[state]`` is used.

    Args:
        Q: Action-value table indexed as ``Q[state]``.
        state: Index of the current state.
        n_actions: Number of available actions.
        epsilon: Exploration probability.
        rng: Generator exposing ``random()`` and ``integers()``.

    Returns:
        The selected action as a built-in ``int``.
    """
    explore = rng.random() < epsilon
    if not explore:
        # Greedy branch: np.argmax returns the first index on ties.
        greedy_action = np.argmax(Q[state])
        return int(greedy_action)
    random_action = rng.integers(n_actions)
    return int(random_action)

def moving_average(x, window=200):
    """Smooth a sequence with a sliding mean of length ``window``.

    The input is converted to a float32 array. When it holds at least
    ``window`` samples, the result is the "valid" convolution with a uniform
    kernel (length ``len(x) - window + 1``). Shorter inputs are returned
    as the converted array, without smoothing.

    Args:
        x: Sequence of numeric values.
        window: Number of consecutive samples averaged together.

    Returns:
        A NumPy array with the smoothed (or, for short inputs, converted)
        values.
    """
    values = np.asarray(x, dtype=np.float32)
    if len(values) >= window:
        # Uniform weights summing to 1 turn the convolution into a mean.
        kernel = np.ones(window) / window
        return np.convolve(values, kernel, mode="valid")
    # Not enough samples for a single full window.
    return values

In [3]:
def train_sarsa(
    env_id="Taxi-v3",
    episodes=20_000,
    alpha=0.1,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.05,
    epsilon_decay=0.9995,
    seed=0,
    max_steps=200,
):
    """Train a tabular SARSA agent with an epsilon-greedy policy.

    A Q-table of shape ``(n_states, n_actions)`` is built from the ``.n``
    attributes of the environment's observation and action spaces, so both
    spaces must expose ``.n``. Epsilon is multiplied by ``epsilon_decay``
    after every episode and floored at ``epsilon_end``. Progress is printed
    every 1000 episodes.

    Args:
        env_id: Environment id passed to ``gym.make``.
        episodes: Number of training episodes.
        alpha: Learning rate of the TD update.
        gamma: Discount factor.
        epsilon_start: Exploration probability for the first episode.
        epsilon_end: Lower bound on the exploration probability.
        epsilon_decay: Per-episode multiplicative decay applied to epsilon.
        seed: Seed for the action-selection RNG; episode ``ep`` resets the
            environment with ``seed + ep``.
        max_steps: Maximum number of environment steps per episode.

    Returns:
        A tuple ``(Q, rewards_history)``: the learned float32 Q-table and a
        list with the total (undiscounted) reward of each episode.
    """
    env = gym.make(env_id)
    rng = np.random.default_rng(seed)

    # try/finally so the environment is released even if training raises.
    try:
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        Q = np.zeros((n_states, n_actions), dtype=np.float32)

        epsilon = epsilon_start
        rewards_history = []

        for ep in range(episodes):
            state, _ = env.reset(seed=seed + ep)
            action = epsilon_greedy(Q, state, n_actions, epsilon, rng)

            total_reward = 0.0

            for _ in range(max_steps):
                next_state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward

                # On-policy: the action actually taken next is the one used
                # in the TD target.
                next_action = epsilon_greedy(Q, next_state, n_actions, epsilon, rng)

                # Only a true terminal state has zero future value. A
                # truncation (e.g. a time limit) merely stops the rollout, so
                # the target must still bootstrap from the next state-action.
                if terminated:
                    td_target = reward
                else:
                    td_target = reward + gamma * Q[next_state, next_action]
                Q[state, action] += alpha * (td_target - Q[state, action])

                state, action = next_state, next_action
                if terminated or truncated:
                    break

            rewards_history.append(total_reward)
            epsilon = max(epsilon_end, epsilon * epsilon_decay)

            if (ep + 1) % 1000 == 0:
                print(f"Episode {ep+1:>6} | eps={epsilon:.3f} | avg(last1000)={np.mean(rewards_history[-1000:]):.2f}")
    finally:
        env.close()

    return Q, rewards_history
