In [1]:
#? imports
import gym
import numpy as np
import random

In [2]:
#? initialize the environment
env = gym.make('MountainCar-v0')

# Sizes of the action and observation spaces (used when constructing the q table).
# The observation dimensionality is read from the space's declared shape instead of
# drawing a random sample, so no RNG state is consumed just to count dimensions.
ACTION_SPACE      = env.action_space.n
OBSERVATION_SPACE = env.observation_space.shape[0]

In [3]:
#? build the q table
Q_INCREMENTS = 20 # number of discrete buckets per observation dimension
Q_TABLE_SEED = 0  # fixed seed so the initial q table is identical on every run
DISCRETE_OS_SIZE = [Q_INCREMENTS] * OBSERVATION_SPACE

# One axis per observation dimension plus a trailing axis holding one value per
# action, initialised uniformly in [-2, 0). A locally seeded generator is used so
# the starting table is reproducible without touching NumPy's global RNG state.
# NOTE(review): this only fixes the table's initial values; the environment's own
# randomness is not seeded in this cell.
q_table = np.random.default_rng(Q_TABLE_SEED).uniform(
    low=-2, high=0, size=tuple(DISCRETE_OS_SIZE + [ACTION_SPACE])
)

In [4]:
#? build a function that maps an observation to the index of its discrete cell in the q table
def obs_To_Index(env, obs, increments):
    """Map a continuous observation to a tuple of discrete q-table indices.

    Each component of `obs` is rescaled to [0, 1] using the bounds of
    `env.observation_space` and then bucketed into `increments` equal-width
    cells.

    Args:
        env: object exposing `observation_space.low` and `observation_space.high`.
        obs: observation array with one entry per observation dimension.
        increments: number of buckets per dimension.

    Returns:
        Tuple of integer indices, one per dimension, each in
        [0, increments - 1].
    """

    # get the bounds of the observation_space
    obs_min = env.observation_space.low
    obs_max = env.observation_space.high

    # normalize the observation
    normalized = (obs - obs_min) / (obs_max - obs_min)

    # convert the normalized array to integer bucket numbers
    buckets = np.floor(normalized * increments).astype(int)

    # An observation sitting exactly on the upper bound normalizes to 1.0 and
    # floors to `increments`, one past the last valid cell; anything below the
    # lower bound would go negative and silently wrap around when used as an
    # index. Clamp both ends into the valid range.
    buckets = np.clip(buckets, 0, increments - 1)

    return tuple(buckets)

In [5]:
#? initialize parameters related to training
LEARNING_RATE = 0.1    # step size applied to each q table update
DISCOUNT      = 0.95   # weight given to the estimated future reward
EPOCHS        = 1_000  # how many episodes to run

RENDER_EVERY  = 500    # render one episode out of this many

In [6]:
#? train the agent by updating the q table
# The loop is wrapped in try/finally so the environment is closed even if an
# episode raises part-way through.
try:
    for e in range(1, EPOCHS+1):

        # store the initial state of the environment
        observation = env.reset()
        done = False

        # render every [RENDER_EVERY] epochs (decided once per episode)
        render_episode = (e % RENDER_EVERY == 0)

        while not done:

            if render_episode:
                env.render()

            # find the discrete cell corresponding to the current observation
            indice = obs_To_Index(env, observation, Q_INCREMENTS)

            # select the action to take
            # NOTE(review): selection is purely greedy; `random` is imported but
            # unused in the visible cells -- consider epsilon-greedy exploration.
            action = q_table[indice].argmax()

            # take the action
            new_observation, reward, done, info = env.step(action)

            reached_goal = new_observation[0] >= env.goal_position

            # calculate the update target
            if done and reached_goal:
                # The episode ended at the goal: a terminal state has no future
                # value, so the target is the immediate reward alone.
                target = reward
            else:
                # `done` by itself is not used as the terminal test because it may
                # be set for reasons other than reaching the goal (presumably a
                # step limit -- TODO confirm against the installed gym version);
                # in that case the next state still has value to bootstrap from.
                new_indice = obs_To_Index(env, new_observation, Q_INCREMENTS)
                target = reward + DISCOUNT * q_table[new_indice].max()

            # update the value in the q table
            current_q = q_table[indice + (action,)]
            q_table[indice + (action,)] = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * target

            # update the current observation
            observation = new_observation

            if reached_goal:
                print(f'Won on epoch {e}')

        # debug message
        if e % 50 == 0:
            print(f'Reached epoch {e}')
finally:
    env.close()

Won at epoch 488
Won at epoch 633
Won at epoch 736
Won at epoch 788
Won at epoch 804
Won at epoch 824
Won at epoch 826
Won at epoch 828
Won at epoch 829
Won at epoch 830
Won at epoch 835
Won at epoch 836
Won at epoch 837
Won at epoch 841
Won at epoch 842
Won at epoch 843
Won at epoch 844
Won at epoch 845
Won at epoch 846
Won at epoch 848
Won at epoch 849
Won at epoch 851
Won at epoch 853
Won at epoch 854
Won at epoch 860
Won at epoch 861
Won at epoch 862
Won at epoch 864
Won at epoch 865
Won at epoch 866
Won at epoch 867
Won at epoch 869
Won at epoch 870
Won at epoch 871
Won at epoch 872
Won at epoch 873
Won at epoch 903
Won at epoch 906
Won at epoch 909
Won at epoch 910
Won at epoch 912
Won at epoch 917
Won at epoch 918
Won at epoch 919
Won at epoch 921
Won at epoch 922
Won at epoch 992
Won at epoch 993
Won at epoch 995
Won at epoch 1000
