<a href="https://colab.research.google.com/github/Tejaswini170104/CH5020-Term-paper-presentation/blob/main/CartPole_v1_sarsa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import gymnasium as gym
import random

# Constants
GAMMA = 0.99  # Discount factor applied to the next state-action value in the TD target
EPSILON_START = 1  # Exploration rate at episode 0, before exponential decay
EPSILON_MIN = 0.01  # Floor that the decayed exploration rate never drops below
MAX_EPISODES = 100000  # Upper limit on training episodes per run
MAX_STEPS = 500  # Step cap per episode; also the reference return used to compute regret
NUM_BINS = 20  # Number of bin edges per state dimension, and the Q-table size along each state axis
SOLVED_CRITERIA = 500  # Mean reward over the last 100 episodes at which training stops early
CONSECUTIVE_EPISODES = 50  # NOTE(review): not referenced anywhere in the visible code - confirm whether it is still needed
NUM_RUNS = 1  # Number of runs per config

# Default hyperparameters
ALPHA = 0.1  # Learning rate (step size) of the SARSA update
LAMBDA_DECAY = 0.001  # Rate of the exponential epsilon decay: epsilon = EPSILON_START * exp(-LAMBDA_DECAY * episode)

# State discretization bounds: one [low, high] pair per observation dimension.
# Presumably ordered as CartPole's (cart position, cart velocity, pole angle,
# pole angular velocity) - verify against the environment's observation space.
state_bounds = [
    [-4.8, 4.8],
    [-3.0, 3.0],
    [-0.418, 0.418],
    [-3.0, 3.0]
]

def discretize_state(state, bounds=None, num_bins=None):
    """Map a continuous observation to a tuple of bin indices for Q-table lookup.

    Each dimension is bucketed with ``np.digitize`` against ``num_bins`` evenly
    spaced edges spanning that dimension's ``[low, high]`` range.

    Args:
        state: Sequence of continuous values, one per observation dimension.
        bounds: Sequence of ``[low, high]`` pairs, one per dimension. Defaults
            to the module-level ``state_bounds``.
        num_bins: Number of bin edges per dimension. Defaults to the
            module-level ``NUM_BINS``.

    Returns:
        Tuple of Python ints, each in ``[0, num_bins - 1]``.
    """
    if bounds is None:
        bounds = state_bounds
    if num_bins is None:
        num_bins = NUM_BINS

    indices = []
    for value, (low, high) in zip(state, bounds):
        edges = np.linspace(low, high, num_bins)
        raw_index = np.digitize(value, edges) - 1
        # Values below `low` produce -1, which NumPy indexing would wrap around
        # to the *last* bin and alias very low readings with very high ones.
        # Clamp so out-of-range values land in the nearest edge bin instead.
        indices.append(int(np.clip(raw_index, 0, num_bins - 1)))
    return tuple(indices)


def choose_action(q_table, state, epsilon):
    """Pick an action epsilon-greedily from the Q-table.

    Args:
        q_table: Array indexed by the discretized state, with the action
            values along the last axis.
        state: Tuple of bin indices identifying the current state.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        The chosen action index as a Python int.
    """
    action_values = q_table[state]
    if random.uniform(0, 1) < epsilon:
        # Sample over however many actions the table holds rather than
        # assuming exactly two.
        return random.randrange(len(action_values))
    return int(np.argmax(action_values))


def sarsa_q_table(env, alpha, lambda_decay):
    """Train a tabular SARSA agent on ``env`` and summarize its performance.

    Runs up to ``MAX_EPISODES`` episodes with an exponentially decaying
    epsilon-greedy policy, stopping early once the mean reward over the last
    100 episodes reaches ``SOLVED_CRITERIA``. Progress is printed every 100
    episodes.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.n`` in the Gymnasium style.
        alpha: Learning rate of the SARSA update.
        lambda_decay: Rate of the exponential epsilon decay per episode.

    Returns:
        Tuple ``(avg_reward, avg_regret)``: the mean episode return and the
        mean of ``MAX_STEPS - return`` over all episodes that were run.
    """
    from collections import deque  # local import keeps this block self-contained

    window_size = 100
    q_table = np.zeros((NUM_BINS, NUM_BINS, NUM_BINS, NUM_BINS, env.action_space.n))
    returns, regrets = [], []
    # A bounded deque drops the oldest entry automatically in O(1).
    rolling_rewards = deque(maxlen=window_size)

    for episode in range(MAX_EPISODES):
        epsilon = max(EPSILON_MIN, EPSILON_START * np.exp(-lambda_decay * episode))
        observation, _info = env.reset()
        state = discretize_state(observation)
        action = choose_action(q_table, state, epsilon)
        total_reward = 0

        for _step in range(MAX_STEPS):
            observation, reward, terminated, truncated, _info = env.step(action)
            next_state = discretize_state(observation)
            next_action = choose_action(q_table, next_state, epsilon)
            # Only a true terminal state has zero future value. A truncated
            # episode (time limit) was merely cut short, so it still
            # bootstraps from the next state-action value.
            future_value = 0.0 if terminated else GAMMA * q_table[next_state][next_action]
            td_target = reward + future_value
            q_table[state][action] += alpha * (td_target - q_table[state][action])
            state, action = next_state, next_action
            total_reward += reward
            if terminated or truncated:
                break

        returns.append(total_reward)
        regrets.append(MAX_STEPS - total_reward)  # Regret is how far we are from max possible reward
        rolling_rewards.append(total_reward)

        if len(rolling_rewards) == window_size and np.mean(rolling_rewards) >= SOLVED_CRITERIA:
            print(f"Environment solved after {episode + 1} episodes!")
            break

        if episode % 100 == 0:
            print(f"Episode {episode}: Avg Reward (last 100): {np.mean(rolling_rewards):.2f}, Epsilon: {epsilon:.4f}")

    avg_reward = np.mean(returns)
    avg_regret = np.mean(regrets)

    return avg_reward, avg_regret


def train():
    """Run ``NUM_RUNS`` independent SARSA training runs and print the results.

    Each run creates a fresh ``CartPole-v1`` environment, trains with the
    default ``ALPHA`` and ``LAMBDA_DECAY``, and prints that run's average
    reward and regret. The averages across all runs are printed at the end.
    Nothing is returned.
    """
    mean_rewards, mean_regrets = [], []

    for run in range(NUM_RUNS):
        env = gym.make("CartPole-v1")
        try:
            avg_reward, avg_regret = sarsa_q_table(env, ALPHA, LAMBDA_DECAY)
        finally:
            # Release the environment's resources even if training raises.
            env.close()
        mean_rewards.append(avg_reward)
        mean_regrets.append(avg_regret)
        print(f"Run {run + 1}: Avg Reward = {avg_reward:.2f}, Avg Regret = {avg_regret:.2f}")

    print(f"Final Results - Avg Reward: {np.mean(mean_rewards):.2f}, Avg Regret: {np.mean(mean_regrets):.2f}")


# Start training: executes NUM_RUNS SARSA runs and prints per-run and final averages
train()


Episode 0: Avg Reward (last 100): 16.00, Epsilon: 1.0000
Episode 100: Avg Reward (last 100): 25.83, Epsilon: 0.9048
Episode 200: Avg Reward (last 100): 23.47, Epsilon: 0.8187
Episode 300: Avg Reward (last 100): 28.02, Epsilon: 0.7408
Episode 400: Avg Reward (last 100): 28.93, Epsilon: 0.6703
Episode 500: Avg Reward (last 100): 34.90, Epsilon: 0.6065
Episode 600: Avg Reward (last 100): 39.48, Epsilon: 0.5488
Episode 700: Avg Reward (last 100): 44.25, Epsilon: 0.4966
Episode 800: Avg Reward (last 100): 56.96, Epsilon: 0.4493
Episode 900: Avg Reward (last 100): 59.69, Epsilon: 0.4066
Episode 1000: Avg Reward (last 100): 72.59, Epsilon: 0.3679
Episode 1100: Avg Reward (last 100): 96.18, Epsilon: 0.3329
Episode 1200: Avg Reward (last 100): 94.36, Epsilon: 0.3012
Episode 1300: Avg Reward (last 100): 94.55, Epsilon: 0.2725
Episode 1400: Avg Reward (last 100): 111.58, Epsilon: 0.2466
Episode 1500: Avg Reward (last 100): 111.43, Epsilon: 0.2231
Episode 1600: Avg Reward (last 100): 137.47, Epsil