In [1]:
#? imports
import gym
import numpy as np
import random

In [2]:
#? initialize the environment
env = gym.make('MountainCar-v0')

# sizes of the action and observation spaces (used when constructing the q table)
ACTION_SPACE      = env.action_space.n
# read the number of observation dimensions from the space's shape instead of
# drawing a random sample just to measure its length
OBSERVATION_SPACE = env.observation_space.shape[0]

In [3]:
#? build the q table
# number of discrete buckets along each observation dimension
Q_INCREMENTS = 20
DISCRETE_OS_SIZE = [Q_INCREMENTS for _ in range(OBSERVATION_SPACE)]

# one axis per observation dimension plus a trailing axis for the actions,
# filled with starting values drawn uniformly between -2 and 0
q_table = np.random.uniform(-2, 0, (*DISCRETE_OS_SIZE, ACTION_SPACE))

In [4]:
#? build a function that maps an observation to the index of its cell in the q table
def obs_To_Index(env, obs, increments):
    """Convert a continuous observation into a discrete q-table index.

    Parameters
    ----------
    env
        Environment whose ``observation_space.low`` and
        ``observation_space.high`` give the per-dimension bounds of ``obs``.
    obs
        Array-like observation, one value per dimension.
    increments : int
        Number of buckets per dimension; every returned component lies in
        ``0 .. increments - 1``.

    Returns
    -------
    tuple of int
        One bucket index per observation dimension, usable directly as the
        leading part of a q-table index.
    """

    # get the bounds of the observation_space
    obs_min = env.observation_space.low
    obs_max = env.observation_space.high

    # normalize the observation so each dimension spans 0 .. 1 across its bounds
    normalized = (obs - obs_min) / (obs_max - obs_min)

    # floor into equal-width buckets; rounding would produce increments + 1
    # distinct values (0 .. increments) and overflow a table with only
    # `increments` cells per axis
    buckets = np.floor(normalized * increments).astype(int)

    # an observation sitting exactly on (or outside) a bound is clamped into
    # the nearest valid bucket
    buckets = np.clip(buckets, 0, increments - 1)

    return tuple(buckets)

In [5]:
#? initialize parameters related to training
# run length and rendering cadence
EPOCHS       = 5_000  # number of environments to simulate
RENDER_EVERY = 500    # how often to render a run

# q-learning update coefficients
LEARNING_RATE = 0.1   # how quickly values in the q table change
DISCOUNT      = 0.95  # how much the agent cares about future rewards

In [6]:
#? train the agent by updating the q table
# NOTE(review): `env.reset()` returning only the observation and `env.step()`
# returning four values matches the pre-0.26 gym API — confirm the installed version.
for e in range(1, EPOCHS+1):

    # store the initial state of the environment
    observation = env.reset()
    done = False

    while not done:

        # render every [RENDER_EVERY] epochs
        if e % RENDER_EVERY == 0:
            env.render()

        # find the discrete cell corresponding to the current observation
        indice = obs_To_Index(env, observation, Q_INCREMENTS)

        # select the action with the highest q value for this cell
        action = q_table[indice].argmax()

        # take the action
        new_observation, reward, done, info = env.step(action)

        # the run is won once the position reaches the goal
        reached_goal = new_observation[0] >= env.goal_position

        # calculate the update target
        if reached_goal:
            # nothing follows the goal, so the target is the immediate reward
            # alone; bootstrapping from the goal cell would add a return that
            # never happens
            future_reward = reward
        else:
            new_indice = obs_To_Index(env, new_observation, Q_INCREMENTS)
            future_reward = reward + DISCOUNT * q_table[new_indice].max()

        # blend the old q value towards the target
        current_q = q_table[indice + (action,)]
        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * future_reward

        q_table[indice + (action,)] = new_q

        # update the current observation
        observation = new_observation

        if reached_goal:
            print(f'Won at epoch {e}')

env.close()

Won at epoch 579
Won at epoch 584
Won at epoch 591
Won at epoch 593
Won at epoch 594
Won at epoch 598
Won at epoch 600
Won at epoch 601
Won at epoch 602
Won at epoch 604
Won at epoch 610
Won at epoch 611
Won at epoch 613
Won at epoch 618
Won at epoch 619
Won at epoch 672
Won at epoch 689
Won at epoch 691
Won at epoch 714
Won at epoch 720
Won at epoch 724
Won at epoch 725
Won at epoch 730
Won at epoch 731
Won at epoch 733
Won at epoch 734
Won at epoch 783
Won at epoch 784
Won at epoch 799
Won at epoch 815
Won at epoch 832
Won at epoch 842
Won at epoch 848
Won at epoch 852
Won at epoch 853
Won at epoch 861
Won at epoch 878
Won at epoch 880
Won at epoch 882
Won at epoch 885
Won at epoch 895
Won at epoch 909
Won at epoch 913
Won at epoch 915
Won at epoch 944
Won at epoch 954
Won at epoch 960
Won at epoch 961
Won at epoch 988
Won at epoch 998
Won at epoch 1015
Won at epoch 1016
Won at epoch 1018
Won at epoch 1087
Won at epoch 1098
Won at epoch 1099
Won at epoch 1105
Won at epoch 1107
Won at