In [33]:
import gym
import numpy as np
import random
import time

In [34]:
# Gym environment id used by every cell below (passed to gym.make).
# Table of environments- https://github.com/openai/gym/wiki/Table-of-environments
ENV_NAME = 'MountainCar-v0'

In [35]:
# Sanity check: play one episode with uniformly random actions and render it.
env = gym.make(ENV_NAME)

# Number of observation dimensions and number of discrete actions.
STATE_SIZE = len(env.observation_space.low)
ACTION_SIZE = env.action_space.n

# try/finally guarantees env.close() runs even if the rollout raises or the
# cell is interrupted from the notebook.
try:
    env.reset()

    done = False

    # Short pause before the rollout starts.
    time.sleep(1)
    while not done:
        # randint is inclusive on both ends, hence ACTION_SIZE-1.
        action = random.randint(0,ACTION_SIZE-1)
        new_state, reward, done, _ = env.step(action)
        env.render()
finally:
    env.close()

### Observation Space for MountainCar-v0
The observation is an `ndarray` with shape `(2,)` whose elements correspond to the following:

| Num | Observation                          | Min   | Max  | Unit         |
|-----|--------------------------------------|-------|------|--------------|
| 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m) |
| 1   | velocity of the car                  | -0.07 | 0.07 | velocity (v) |

### Action Space

There are 3 discrete deterministic actions:

| Num | Action                  |
|-----|-------------------------|
| 0   | Accelerate to the left  |
| 1   | Don't accelerate        |
| 2   | Accelerate to the right |

### Reward:
The goal is to reach the flag placed on top of the right hill as quickly as possible, so the agent is penalised with a reward of -1 for each timestep it isn't at the goal and is not penalised (reward = 0) when it reaches the goal.

### Starting State

The position of the car is assigned a uniform random value in *[-0.6 , -0.4]*. The starting velocity of the car is always assigned to 0.

### Episode Termination

The episode terminates if either of the following happens:

1. The position of the car is greater than or equal to 0.5 (the goal position on top of the right hill).
2. The length of the episode is 200.

In [36]:
def calculate_bucket_sizes(ob_space_box,memory):
    """
    Split a budget of roughly `memory` table cells across the observation
    dimensions, giving each dimension a bucket count proportional to the
    width of its range.

    Args:
        ob_space_box: object exposing `low` and `high` sequences of equal
            length (one entry per observation dimension).
        memory: target for the product of all bucket counts; must be > 0.

    Returns:
        list of int bucket counts, one per dimension, each at least 1.
        Because every count is truncated to an integer (and clamped to 1),
        the actual product only approximates `memory`.

    Raises:
        ValueError: if `memory` is not positive, the space has no
            dimensions, or any dimension has a zero, negative, infinite or
            NaN range (such a range cannot be split proportionally).
    """
    low,high = ob_space_box.low, ob_space_box.high
    if memory <= 0:
        raise ValueError('memory must be a positive number of cells')
    ranges = [b - a for a,b in zip(low,high)]
    if not ranges:
        raise ValueError('observation space has no dimensions')
    for width in ranges:
        # The chained comparison is False for NaN as well as for inf.
        if not (0 < width < float('inf')):
            raise ValueError('every observation dimension needs a finite, positive range')

    # Express every other dimension's width relative to dimension 0.
    start = ranges[0]
    ratios = [width/start for width in ranges[1:]]
    product = 1
    for ratio in ratios:
        product *= ratio

    # Solve bucket_start**n * product == memory for the count of dimension 0.
    bucket_start = (memory/product)**(1/len(low))

    # int() truncates, so a narrow dimension could end up with 0 buckets;
    # clamp to 1 so the resulting table never has an empty axis.
    mem_arr = [max(1,int(bucket_start))]
    for ratio in ratios:
        mem_arr.append(max(1,int(ratio * bucket_start)))
    return mem_arr

In [37]:
# Observation space of the environment: per-dimension lower and upper bounds
# of the continuous state.
env.observation_space

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [38]:
# Action space of the environment: the discrete set of actions we can take.
env.action_space

Discrete(3)

In [39]:
# Number of observation dimensions (length of the lower-bound vector).
STATE_SIZE = len(env.observation_space.low)
# Number of discrete actions available to the agent.
ACTION_SIZE = env.action_space.n

In [40]:
# Number of discrete buckets each continuous observation dimension is split into.
BUCKET_SIZE = 16
# One bucket count per observation dimension (the same resolution for all of them).
bucket_sizes = [BUCKET_SIZE] * STATE_SIZE

def get_bucket_idx(num,s_idx):
    """
    Util method to convert a continuous observation space value to bucket index.

    The range [low, high] of observation dimension `s_idx` is divided into
    `bucket_sizes[s_idx]` equal-width buckets.

    Args:
        num: observed value for that dimension.
        s_idx: index of the observation dimension.

    Returns:
        int bucket index in [0, bucket_sizes[s_idx] - 1]. Values at or above
        the upper bound map to the last bucket; values below the lower bound
        map to bucket 0.
    """
    low,hi = env.observation_space.low[s_idx],env.observation_space.high[s_idx]
    n_buckets = bucket_sizes[s_idx]
    # Fraction of the way from low to hi (0.0 at low, 1.0 at hi).
    offset = (num - low)/(hi - low)
    # Clamp on both sides: num == hi would give index n_buckets, and a value
    # below low would give a negative index, which numpy would silently wrap
    # around to the far end of the Q-table axis.
    return max(0,min(int(offset * n_buckets),n_buckets-1))

def get_action_space(state):
    """
    Util method to look up the q_table entry for a continuous state.

    Every component of `state` is converted to its bucket index and the
    indices are applied to q_table one axis per component, so the number of
    observation dimensions is never hard-coded. What is returned is the
    remaining slice of q_table (the per-action values for that state); it is
    a view, so callers that write into it modify q_table itself.
    """
    bucket_path = tuple(get_bucket_idx(value,dim) for dim,value in enumerate(state))
    return q_table[bucket_path]


In [41]:
# Example of get_bucket_idx on observation dimension 0, whose values lie in
# the range [-1.2, 0.6]: the lower bound maps to the first bucket and the
# upper bound is clamped into the last one.
print('lowest value gets bucket of index:',get_bucket_idx(-1.2,0))
print('-0.5 gets index:',get_bucket_idx(-0.5,0))
print('and so on')
print('highest value gets bucket of index:',get_bucket_idx(0.6,0),'which is maximum')

lowest value gets bucket of index: 0
-0.5 gets index: 6
and so on
highest value gets bucket of index: 15 which is maximum


In [42]:
# Learning rate: fraction of the temporal difference added to a Q-value in update_q_table.
lr = 0.05
# Discount applied to the best next-state Q-value in update_q_table.
discount_factor = 0.95

In [43]:
# Q-table with one axis per observation dimension (indexed by bucket) plus a
# final axis over actions, initialised with standard-normal random values.
# NOTE(review): no random seed is set in the visible cells, so the initial
# table (and therefore training) differs between runs.
q_table = np.random.randn(*bucket_sizes,ACTION_SIZE)

In [44]:
# Sanity check: one axis per observation dimension followed by the action axis.
q_table.shape

(16, 16, 3)

In [45]:
def update_q_table(old_state,new_state,action,reward):
    """
    Apply one Q-learning step for the transition old_state -> new_state.

    Called after `action` has been taken in `old_state` and has produced
    `reward` and `new_state`. The Q-value of (old_state, action) is moved a
    fraction `lr` of the way towards
    reward + discount_factor * (best Q-value available in new_state).
    q_table is modified in place; nothing is returned.
    """
    q_old = get_action_space(old_state)
    q_new = get_action_space(new_state)
    best_next = max(q_new)
    td_target = reward + discount_factor * best_next
    q_old[action] += lr*(td_target - q_old[action])
    

In [47]:
# Train the Q-table by playing episodes and updating after every step.
total_rewards = []      # total reward of every finished episode, in order
N_EPISODES = 10000      # number of training episodes
RENDER_EVERY = 1000     # render one full episode every this many episodes
REPORT_EVERY = 500      # print a progress line every this many episodes
REPORT_WINDOW = 10      # number of most recent episodes averaged in the report

for i in range(N_EPISODES):
    render_episode = i % RENDER_EVERY == 0
    if render_episode:
        print('rendering this env to check progress.')
    env = gym.make(ENV_NAME)
    # try/finally so the environment (and its render window) is closed even
    # when the cell is interrupted or a step raises.
    try:
        state = env.reset()

        done = False

        total_reward = 0

        while not done:
            # Always take the currently best-valued action.
            # NOTE(review): there is no explicit exploration (e.g. epsilon-greedy);
            # confirm that relying on the random q_table initialisation is intended.
            action = np.argmax(get_action_space(state))
            new_state, reward, done, _ = env.step(action)
            total_reward += reward

            update_q_table(state,new_state,action,reward)

            state = new_state

            if render_episode:
                env.render()
    finally:
        env.close()

    # Record the episode before reporting so the printed average includes
    # the episode named in the message.
    total_rewards.append(total_reward)

    if i % REPORT_EVERY == 0 and i > 0:
        print(f'reward reward avg for {i}th episode :',np.average(total_rewards[-REPORT_WINDOW:]))

rendering this env to check progress.
reward reward avg for 500th episode : -200.0
rendering this env to check progress.
reward reward avg for 1000th episode : -196.8


KeyboardInterrupt: 