In [None]:
import gym
import numpy as np
import random

# Create the MountainCar-v0 environment shared by every cell below.
env = gym.make('MountainCar-v0')

# Seed every source of randomness the notebook draws from, so that
# Restart Kernel -> Run All reproduces the same run:
#   * env.reset(seed=...)       seeds the environment's own RNG (initial states);
#                               it is the replacement for the deprecated env.seed(...).
#   * env.action_space.seed(...) seeds action_space.sample(), which uses a separate
#                               RNG that neither NumPy's nor `random`'s seed touches.
#   * np.random.seed / random.seed cover NumPy's global RNG and the stdlib RNG used
#                               for the epsilon-greedy coin flip.
SEED = 0
env.reset(seed=SEED)
env.action_space.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)


  deprecation(
  deprecation(
  deprecation(


In [None]:
# ---------------------------------------------------------------------------
# Q-learning configuration
# ---------------------------------------------------------------------------

# Update rule
learning_rate = 0.1            # alpha: step size applied to each TD error
discount_factor = 0.99         # gamma: weight given to future reward

# Epsilon-greedy exploration schedule
epsilon = 1.0                  # starting probability of taking a random action
epsilon_min = 0.01             # epsilon is only decayed while it is above this value
epsilon_decay = 0.995          # multiplicative decay applied once per episode

# Training budget
num_episodes = 5_000           # number of training episodes
max_steps_per_episode = 200    # per-episode step cap (MountainCar env maxes out at 200)


In [None]:
# Number of discrete bins per state dimension
num_bins = (20, 20)  # (position, velocity)

# (low, high) bounds of each observation dimension, as reported by the environment
state_bounds = list(zip(env.observation_space.low, env.observation_space.high))

# Left edge of every bin, one array per dimension.
# endpoint=False splits [low, high] into exactly num_bins[i] equal-width bins, so
# `np.digitize(x, edges) - 1` spreads in-range values over all indices
# 0 .. num_bins[i] - 1. With the default endpoint=True the same call would give only
# num_bins[i] - 1 intervals, and the last index would be reached only when x is
# exactly equal to the upper bound.
state_bins = [
    np.linspace(low, high, n_bins, endpoint=False)
    for (low, high), n_bins in zip(state_bounds, num_bins)
]

def discretize_state(state, bins=None):
    """Map a continuous observation to a tuple of bin indices.

    Args:
        state: Sequence of continuous values, one per state dimension.
        bins: Optional sequence of 1-D ascending edge arrays, one per dimension.
            Defaults to the module-level ``state_bins``.

    Returns:
        A tuple of Python ints, one per dimension, each clamped to
        ``[0, len(edges) - 1]`` so it is always a valid, non-negative index.
        Being a tuple, it can be used directly to index the Q-table.
    """
    if bins is None:
        bins = state_bins

    indices = []
    for value, edges in zip(state, bins):
        raw_index = int(np.digitize(value, edges)) - 1
        # A value below the first edge yields -1, which NumPy indexing would
        # silently interpret as "last bin"; clamp it to the first bin instead.
        indices.append(min(max(raw_index, 0), len(edges) - 1))
    return tuple(indices)


In [None]:
# Q-table: one entry per (position bin, velocity bin, action), all starting at zero
action_space_size = env.action_space.n
q_table = np.zeros((*num_bins, action_space_size))


In [None]:
def select_action(state, epsilon):
    """Choose an action for `state` using an epsilon-greedy rule.

    A uniform draw from [0, 1] is compared against `epsilon`: if the draw is
    smaller, a random action is sampled from the environment's action space;
    otherwise the action with the largest value in ``q_table[state]`` is returned.

    Args:
        state: Index (tuple of bin indices) into the global `q_table`.
        epsilon: Exploration probability.

    Returns:
        The chosen action.
    """
    exploit = not (random.uniform(0, 1) < epsilon)
    if exploit:
        # Greedy choice: highest-valued action for this state
        return np.argmax(q_table[state])
    # Exploratory choice: any action, sampled by the environment
    return env.action_space.sample()


In [None]:
def update_q_table(state, action, reward, next_state, alpha, gamma, done=False):
    """Apply one tabular Q-learning update to the global `q_table`, in place.

    Update rule:
        Q[s, a] += alpha * (reward + gamma * max_a' Q[s', a'] - Q[s, a])

    Args:
        state: Index (tuple of bin indices) of the state the action was taken in.
        action: Index of the action that was taken.
        reward: Reward observed for the transition.
        next_state: Index (tuple of bin indices) of the resulting state.
        alpha: Learning rate.
        gamma: Discount factor.
        done: Pass True when `next_state` is terminal. The bootstrap term is then
            dropped and the target is just `reward`. Defaults to False, which
            always bootstraps from `next_state`.

    Returns:
        None. `q_table[state][action]` is modified in place.
    """
    # Value of acting greedily from next_state; nothing follows a terminal state.
    future_value = 0.0 if done else np.max(q_table[next_state])
    td_target = reward + gamma * future_value
    td_error = td_target - q_table[state][action]
    q_table[state][action] += alpha * td_error


In [None]:
# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
# Note: `epsilon` and `q_table` are module-level and are updated in place, so
# re-running this cell continues from wherever the previous run left them.
for episode in range(num_episodes):
    # Start a new episode from the discretized initial observation
    state = discretize_state(env.reset())
    total_reward = 0

    for step in range(max_steps_per_episode):
        # Select action using epsilon-greedy policy
        action = select_action(state, epsilon)

        # Take the action and observe the result
        next_state, reward, done, _ = env.step(action)
        next_state = discretize_state(next_state)

        # Update Q-table
        update_q_table(state, action, reward, next_state, learning_rate, discount_factor)

        state = next_state
        total_reward += reward

        if done:
            break

    # Decay epsilon once per episode. The max() clamp stops the final decay step
    # from landing slightly below epsilon_min; the guard leaves epsilon untouched
    # if it is already at or below the minimum.
    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Print progress every 100 episodes (reward shown is for that episode only)
    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")


  if not isinstance(terminated, (bool, np.bool8)):


Episode 100, Total Reward: -200.0, Epsilon: 0.606
Episode 200, Total Reward: -200.0, Epsilon: 0.367
Episode 300, Total Reward: -200.0, Epsilon: 0.222
Episode 400, Total Reward: -200.0, Epsilon: 0.135
Episode 500, Total Reward: -200.0, Epsilon: 0.082
Episode 600, Total Reward: -200.0, Epsilon: 0.049
Episode 700, Total Reward: -200.0, Epsilon: 0.030
Episode 800, Total Reward: -200.0, Epsilon: 0.018
Episode 900, Total Reward: -200.0, Epsilon: 0.011
Episode 1000, Total Reward: -200.0, Epsilon: 0.010
Episode 1100, Total Reward: -163.0, Epsilon: 0.010
Episode 1200, Total Reward: -187.0, Epsilon: 0.010
Episode 1300, Total Reward: -200.0, Epsilon: 0.010
Episode 1400, Total Reward: -147.0, Epsilon: 0.010
Episode 1500, Total Reward: -200.0, Epsilon: 0.010
Episode 1600, Total Reward: -200.0, Epsilon: 0.010
Episode 1700, Total Reward: -200.0, Epsilon: 0.010
Episode 1800, Total Reward: -200.0, Epsilon: 0.010
Episode 1900, Total Reward: -200.0, Epsilon: 0.010
Episode 2000, Total Reward: -196.0, Epsi

In [None]:
# Greedy evaluation: run the learned Q-table with no exploration and no updates
num_eval_episodes = 10
total_rewards = []

for eval_episode in range(num_eval_episodes):
    total_reward = 0
    state = discretize_state(env.reset())

    for eval_step in range(max_steps_per_episode):
        # Always take the highest-valued action for the current state
        action = np.argmax(q_table[state])
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        state = discretize_state(next_state)
        if done:
            break

    # One total per evaluation episode
    total_rewards.append(total_reward)

mean_reward = np.mean(total_rewards)
print(f"Average reward over {num_eval_episodes} evaluation episodes: {mean_reward}")


Average reward over 10 evaluation episodes: -147.8
