In [1]:
#? imports
import gym
import numpy as np
import random

In [2]:
#? initialize the environment
env = gym.make('MountainCar-v0')

# sizes of the action and observation spaces (used when constructing the q table)
ACTION_SPACE      = env.action_space.n
# read the number of observation dimensions from the space's shape instead of
# drawing a random sample just to measure its length
OBSERVATION_SPACE = env.observation_space.shape[0]

In [3]:
#? build the q table
# number of bins per observation dimension (i.e. the resolution of the q table)
Q_INCREMENTS = 10
DISCRETE_OS_SIZE = [Q_INCREMENTS for _ in range(OBSERVATION_SPACE)]

# one value per (discretized observation, action) pair, drawn uniformly from [-2, 0)
q_table = np.random.uniform(-2, 0, size=(*DISCRETE_OS_SIZE, ACTION_SPACE))

In [4]:
#? build a function that takes an observation and returns the index of the matching q table cell
def obs_To_Index(env, obs, increments):
    """Map a continuous observation to the index of its discretized cell.

    Each component of `obs` is rescaled to [0, 1] using the bounds of
    `env.observation_space` and then split into `increments` equal-width bins.

    Args:
        env: object exposing `observation_space.low` and `observation_space.high`.
        obs: observation array, element-wise compatible with those bounds.
        increments: number of bins per observation dimension.

    Returns:
        A tuple with one integer bin per dimension, each in [0, increments - 1].
    """

    # get the bounds of the observation_space
    obs_min = env.observation_space.low
    obs_max = env.observation_space.high

    # normalize the observation
    normalized = (obs - obs_min) / (obs_max - obs_min)

    # convert the normalized array to integer bins
    bins = np.floor(normalized * increments).astype(int)

    # an observation sitting exactly on the upper bound normalizes to 1.0 and
    # floors to `increments`, one past the last valid bin; clamp it into range
    bins = np.clip(bins, 0, increments - 1)

    return tuple(bins)

In [5]:
#? initialize parameters related to training
# learning hyperparameters
EPOCHS = 5_000          # how many episodes to simulate
DISCOUNT = 0.95         # weight the agent gives to future rewards
LEARNING_RATE = 0.1     # how fast values in the q table move toward new estimates
EPSILON = 0.5           # starting probability of taking a random action
EPSILON_DECAY = 0.9998  # EPSILON is multiplied by this factor after every epoch

# visualisation
RENDER_EVERY = 500      # render one run every this many epochs

In [6]:
#? train the agent by updating the q table
for e in range(1, EPOCHS+1):

    # store the initial state of the environment
    observation = env.reset()
    done = False

    while not done:

        # render every [RENDER_EVERY] epochs
        if e % RENDER_EVERY == 0:
            env.render()

        # find the discrete cell corresponding to the current observation
        indice = obs_To_Index(env, observation, Q_INCREMENTS)

        # select the action to take (epsilon-greedy)
        if random.uniform(0, 1) < EPSILON:
            action = env.action_space.sample() # random action (exploration)
        else:
            action = q_table[indice].argmax()  # action from the q table

        # take the action
        new_observation, reward, done, info = env.step(action)

        # the run is won once the first observation component reaches the goal position
        reached_goal = new_observation[0] >= env.goal_position

        # calculate the target value for the q table update
        if reached_goal:
            # terminal transition: nothing follows the goal state, so the target
            # is just the immediate reward (no bootstrapping from the q table)
            future_reward = reward
        else:
            # non-terminal transition (including a run that ends without reaching
            # the goal): add the discounted best value of the next cell
            new_indice = obs_To_Index(env, new_observation, Q_INCREMENTS)
            future_reward = reward + DISCOUNT * q_table[new_indice].max()

        # update the value in the q table
        current_q = q_table[indice + (action,)]
        new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * future_reward

        q_table[indice + (action,)] = new_q

        # update the current observation
        observation = new_observation

        # debug message upon winning
        if reached_goal:
            print(f'Won on epoch {e}')

    # reduce epsilon so later epochs explore less
    EPSILON = EPSILON * EPSILON_DECAY

    # debug message
    if e % 200 == 0:
        print(f'Reached epoch {e}')

env.close()

# save the q table
# NOTE(review): the file name says "t_table"; possibly a typo for "q_table" -
# left unchanged so anything that loads this path keeps working
np.save('4_trained_t_table.npy', q_table)