**Q-Learning** is about updating `Q-values` in the `Q-Table` to maximize the reward about going to a state for a particular step for an action.

In [18]:
import gym # toolkit for RL algo
import numpy as np # mathematical manipulation
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
# initialise the environment
env = gym.make('MountainCar-v0')

In [20]:
# Q-Learning settings
LEARNING_RATE = 0.1 # rate at which agent learns or updates Q-values
DISCOUNT = 0.95 # the factor of max reward to be considered after the step is taken
EPISODES = 25000 # number of iterations to run
SHOW_EVERY = 2000 # every number of episodes environment is rendered 

In [21]:
# discrete observation space size, for combinations of observation space samples
DISCRETE_OS_SIZE = [20] * len(env.observation_space.low)
DISCRETE_OS_SIZE

[20, 20]

In [22]:
# window size of each observation smaple
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE
discrete_os_win_size

array([0.09 , 0.007])

In [23]:
DISCRETE_OS_SIZE + [env.action_space.n]

[20, 20, 3]

As seen from above, here `q-table` can be thought of `20x20x3`, where `20x20` is the every possible combination of observation space samples(position,velocity) and `x3` can be thought of the action-space.

In [24]:
# initialise q-table with randomo q-values
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))
q_table

array([[[-1.69050913, -1.16948244, -1.26313923],
        [-0.90808084, -1.66328272, -0.31475987],
        [-0.40395714, -1.870182  , -0.67626117],
        ...,
        [-1.84295164, -1.35335332, -0.83033135],
        [-0.16397614, -0.6097854 , -1.29303367],
        [-1.37375663, -1.31751277, -1.92931083]],

       [[-0.00791838, -1.77304587, -0.32802657],
        [-1.05350572, -1.56665497, -1.37713454],
        [-0.22860324, -1.22899994, -1.62200979],
        ...,
        [-1.78616919, -1.28981191, -1.40489348],
        [-0.50521477, -1.8132981 , -1.68508131],
        [-1.77373601, -1.57999813, -0.89155134]],

       [[-1.72793273, -0.91497616, -0.43850907],
        [-0.54355632, -1.69615882, -1.05096149],
        [-1.54429143, -0.59016985, -1.48673438],
        ...,
        [-0.54433672, -0.89091968, -0.24599144],
        [-1.66048   , -0.26567434, -1.85599074],
        [-1.00906792, -1.48288185, -0.96911025]],

       ...,

       [[-1.20216727, -0.26740148, -0.62363353],
        [-1

In [25]:
# shape of q-table
q_table.shape

(20, 20, 3)

Q-Learning is based on **exploration-exploitation trade-off**, where the agent tries to `explore` more possible new states by taking an action, but also `exploit` for maximising the reward for accomplishment of the task.

In [26]:
def get_discrete_state(state):
    '''
    Converts the granular state of an agent to more
    discrete values, by considering the discrete window size
    '''
    discrete_state = (state - env.observation_space.low)/discrete_os_win_size
    return tuple(discrete_state.astype(np.int32))  # we use this tuple to look up the 3 Q values for the available actions in the q-table

In [27]:
# Exploration settings - random moves
epsilon = 1  # not a constant, qoing to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)
epsilon_decay_value

8.000640051204096e-05

In [28]:
# For stats for episodes
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}
STATS_EVERY = 100 # episodes after which to record stats

In [29]:
# initial discrete state of the agent
discrete_state = get_discrete_state(env.reset())
discrete_state

(6, 10)

In [30]:
done = False

In [31]:
for episode in range(EPISODES): # run through all the episodes
    
    episode_reward = 0
    
    if episode % SHOW_EVERY == 0: # render the environment every SHOW_EVERY episodes 
        render = True
        print(episode)
    else:
        render = False
        
    while not done: # run the agent till job is not done

        if np.random.random() > epsilon:
            # Get action from Q table for max q-value
            action = np.argmax(q_table[discrete_state]) # get the action with maximum q-value - exploitation
        else:
            # Get random action - exploration
            action = np.random.randint(0, env.action_space.n)
            
        new_state, reward, done, _ = env.step(action) # perform the step for that action

        episode_reward += reward
        
        new_discrete_state = get_discrete_state(new_state) # get the new discrete state after performing the action
        
        if render: # render the environment
            env.render()

        # If simulation did not end yet after last step - update Q table
        if not done:

            # Maximum possible Q value in next step (for new state)
            max_future_q = np.max(q_table[new_discrete_state])

            # Current Q value (for current state and performed action)
            current_q = q_table[discrete_state + (action,)]

            # And here's our equation for a new Q value for current state and action
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)

            # Update Q table with new Q value
            q_table[discrete_state + (action,)] = new_q


        # Simulation ended (for any reson) - if goal position is achived - update Q value with reward directly
        elif new_state[0] >= env.goal_position:
            # the reward at goal accomplishment is 0, because it is achieved
            q_table[discrete_state + (action,)] = 0

        # make new discrete state the state for next iteration 
        discrete_state = new_discrete_state
        
    # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
    
    ep_rewards.append(episode_reward)
    if not episode % STATS_EVERY: # equiavalent to if episode % STATS_EVERY == 0:
        average_reward = sum(ep_rewards[-STATS_EVERY:])/STATS_EVERY
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['max'].append(max(ep_rewards[-STATS_EVERY:]))
        aggr_ep_rewards['min'].append(min(ep_rewards[-STATS_EVERY:]))
        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')


env.close()

0
Episode:     0, average reward: -2.0, current epsilon: 1.00
Episode:   100, average reward:  0.0, current epsilon: 0.99
Episode:   200, average reward:  0.0, current epsilon: 0.98
Episode:   300, average reward:  0.0, current epsilon: 0.98
Episode:   400, average reward:  0.0, current epsilon: 0.97
Episode:   500, average reward:  0.0, current epsilon: 0.96
Episode:   600, average reward:  0.0, current epsilon: 0.95
Episode:   700, average reward:  0.0, current epsilon: 0.94
Episode:   800, average reward:  0.0, current epsilon: 0.94
Episode:   900, average reward:  0.0, current epsilon: 0.93
Episode:  1000, average reward:  0.0, current epsilon: 0.92
Episode:  1100, average reward:  0.0, current epsilon: 0.91
Episode:  1200, average reward:  0.0, current epsilon: 0.90
Episode:  1300, average reward:  0.0, current epsilon: 0.90
Episode:  1400, average reward:  0.0, current epsilon: 0.89
Episode:  1500, average reward:  0.0, current epsilon: 0.88
Episode:  1600, average reward:  0.0, 