In [10]:
import gymnasium as gym
import numpy as np
import random

# Create environment
env = gym.make("CartPole-v1")

# Observation space (state size)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print("State size:", state_size)
print("Action size:", action_size)

State size: 4
Action size: 2


In [11]:
# Discretize states
def discretize_state(state, bins=(6, 12)):
    cart_pos, cart_vel, pole_angle, pole_vel = state
    cart_pos_bins = np.linspace(-2.4, 2.4, bins[0])
    cart_vel_bins = np.linspace(-3.0, 3.0, bins[1])
    pole_angle_bins = np.linspace(-0.21, 0.21, bins[0])
    pole_vel_bins = np.linspace(-3.5, 3.5, bins[1])

    state_idx = (
        np.digitize(cart_pos, cart_pos_bins),
        np.digitize(cart_vel, cart_vel_bins),
        np.digitize(pole_angle, pole_angle_bins),
        np.digitize(pole_vel, pole_vel_bins),
    )
    return state_idx

In [12]:
# Initialize Q-table
q_table = np.zeros((7, 13, 7, 13, action_size))  # (bins + 1 for digitize output)

# Hyperparameters
alpha = 0.1   # learning rate
gamma = 0.99  # discount factor
epsilon = 1.0 # exploration rate
epsilon_decay = 0.995
epsilon_min = 0.01
episodes = 5000

In [13]:
for episode in range(episodes):
    state = discretize_state(env.reset()[0])  # get initial state
    done = False
    total_reward = 0

    while not done:
        # ε-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        next_state, reward, done, _, _ = env.step(action)
        next_state = discretize_state(next_state)

        # Update Q-value
        best_next_action = np.argmax(q_table[next_state])
        q_table[state + (action,)] += alpha * (
            reward + gamma * q_table[next_state + (best_next_action,)] - q_table[state + (action,)]
        )

        state = next_state
        total_reward += reward

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    if episode % 500 == 0:
        print(f"Episode {episode}, Total reward: {total_reward}, Epsilon: {epsilon:.2f}")

Episode 0, Total reward: 24.0, Epsilon: 0.99
Episode 500, Total reward: 62.0, Epsilon: 0.08
Episode 1000, Total reward: 111.0, Epsilon: 0.01
Episode 1500, Total reward: 100.0, Epsilon: 0.01
Episode 2000, Total reward: 115.0, Epsilon: 0.01
Episode 2500, Total reward: 100.0, Epsilon: 0.01
Episode 3000, Total reward: 103.0, Epsilon: 0.01
Episode 3500, Total reward: 66.0, Epsilon: 0.01
Episode 4000, Total reward: 103.0, Epsilon: 0.01
Episode 4500, Total reward: 96.0, Epsilon: 0.01


In [14]:
state = discretize_state(env.reset()[0])
done = False
total_reward = 0

while not done:
    env.render()
    action = np.argmax(q_table[state])
    next_state, reward, done, _, _ = env.step(action)
    next_state = discretize_state(next_state)
    state = next_state
    total_reward += reward

print("Test Reward:", total_reward)
env.close()

Test Reward: 114.0
