In [70]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

env = gym.make('CartPole-v1')

n_bins = (6, 12, 6, 12)
bins = [
    np.linspace(-4.8, 4.8, n_bins[0]),
    np.linspace(-4, 4, n_bins[1]),
    np.linspace(-0.418, 0.418, n_bins[2]),
    np.linspace(-4, 4, n_bins[3])
]

def discretize_state(state):
    state_idx = []
    for i in range(len(state)):
        state_idx.append(np.digitize(state[i], bins[i]) - 1)
    return tuple(state_idx)

Q = defaultdict(lambda: np.zeros(env.action_space.n))
alpha = 0.1
gamma = 0.99
epsilon = 0.1
episodes = 1000

for episode in range(episodes):
    state, info = env.reset()
    state = discretize_state(state)

    terminated = False
    truncated = False

    while not terminated and not truncated:
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])

        next_state, reward, terminated, truncated, info = env.step(action)

        next_state = discretize_state(next_state)
        done = terminated or truncated

        Q[state][action] += alpha * (float(reward) + gamma * np.max(Q[next_state]) - Q[state][action])
        state = next_state

total_steps = 0
test_episodes = 100
for _ in range(test_episodes):
    state, info = env.reset()
    state = discretize_state(state)
    steps = 0
    terminated = False
    truncated = False
    while not terminated and not truncated:
        action = np.argmax(Q[state])
        next_state, _, terminated, truncated, _ = env.step(action)
        next_state = discretize_state(next_state)
        steps += 1
        state = next_state

    total_steps += steps

env.close()
print(f"Average steps per episode: {total_steps / test_episodes}")

Average steps per episode: 9.82
