In [1]:
!pip install gymnasium



In [2]:
import gymnasium as gym
import numpy as np

env = gym.make('CartPole-v1')
np.random.seed(0)

# Initialize variables
alpha = 0.1
gamma = 0.9999
epsilon = 1
nA = env.action_space.n
epsilon_decay = 0.995  # Epsilon decay rate
epsilon_min = 0.01  # Minimum exploration rate

# Initialize bin_bounds with number of bins
cartPosSpace = np.linspace(-2.4, 2.4, 10)
cartVelSpace = np.linspace(-3.0, 3.0, 10)
poleAngleSpace = np.linspace(-0.418, 0.418, 10)
poleVelSpace = np.linspace(-3.0, 3.0, 10)

# Initialize state-action values function
Q = np.random.uniform(-0.5,0.5,(11, 11, 11, 11, nA))


def epsilon_greedy(Q , state_index, epsilon):
    if np.random.uniform(0,1) > epsilon:
        return np.argmax(Q[state_index])
    else:
        return np.random.choice(nA)

def discretize_state(observation):
    cartX, cartXdot, cartTheta, cartThetadot = observation
    cartX_bin = int(np.digitize(cartX, cartPosSpace))
    cartXdot_bin = int(np.digitize(cartXdot, cartVelSpace))
    cartTheta_bin = int(np.digitize(cartTheta, poleAngleSpace))
    cartThetadot_bin = int(np.digitize(cartThetadot, poleVelSpace))
    return (cartX_bin, cartXdot_bin, cartTheta_bin, cartThetadot_bin)

episode_rewards = []
num_episodes = 5000

for episode in range(num_episodes):
    episode_reward = 0
    episode_buffer = []

    currentState, _ = env.reset()
    state = discretize_state(currentState)

    done = False

    while not done:
        if episode % 2 == 0:  
            action = epsilon_greedy(Q, state, epsilon)
        else:
            action = np.argmax(Q[state])
        #action = epsilon_greedy(Q, state, epsilon)
        next_state, reward, done, _ , _ = env.step(action)
        episode_reward += reward
        next_state_discrete = discretize_state(next_state)
        episode_buffer.append((state, action, reward))
        state = next_state_discrete

    episode_rewards.append(episode_reward)

    # Update state value function using Monte Carlo update
    G = 0
    for t in range(len(episode_buffer) - 1, -1, -1):
        gammas = [gamma**i for i in range(len(episode_buffer))]
        state, action, reward = episode_buffer[t]
        G = gammas[t] * G + reward
        state_index = state + (action,)
        Q[state_index] += alpha * (G - Q[state_index])

    if episode > 200:
        epsilon -= 1 / num_episodes if epsilon > 0 else 0
    
    avg_reward = np.mean(episode_rewards[-100:])
    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode + 1}, Average Reward (last 100 episodes): {avg_reward}, epsilon = {epsilon}")
        
    if avg_reward >= 195:
        print(f"Solved after {episode} episodes! Average Rewards is now {avg_reward}")
        break

env.close()

Episode: 100, Average Reward (last 100 episodes): 47.18, epsilon = 1
Episode: 200, Average Reward (last 100 episodes): 53.08, epsilon = 1
Episode: 300, Average Reward (last 100 episodes): 54.16, epsilon = 0.9802000000000022
Episode: 400, Average Reward (last 100 episodes): 52.69, epsilon = 0.9602000000000044
Episode: 500, Average Reward (last 100 episodes): 56.09, epsilon = 0.9402000000000066
Episode: 600, Average Reward (last 100 episodes): 53.25, epsilon = 0.9202000000000088
Episode: 700, Average Reward (last 100 episodes): 53.85, epsilon = 0.900200000000011
Episode: 800, Average Reward (last 100 episodes): 55.73, epsilon = 0.8802000000000132
Episode: 900, Average Reward (last 100 episodes): 69.3, epsilon = 0.8602000000000154
Episode: 1000, Average Reward (last 100 episodes): 58.01, epsilon = 0.8402000000000176
Episode: 1100, Average Reward (last 100 episodes): 78.34, epsilon = 0.8202000000000198
Episode: 1200, Average Reward (last 100 episodes): 167.72, epsilon = 0.800200000000022
E