In [1]:
import gym
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline

np.random.seed(0)

In [2]:
env = gym.make('MountainCar-v0')

[2017-05-30 10:10:42,833] Making new env: MountainCar-v0


In [3]:
α = 0.5
γ = 0.99

In [4]:
position_min, velocity_min = env.observation_space.low
position_max, velocity_max = env.observation_space.high

position_offset = (position_max - position_min) / 10
velocity_offset = (velocity_max - velocity_min) / 10

In [5]:
actions_num = env.action_space.n
steps_num = env.spec.timestep_limit

In [6]:
tilings_num = 10
tiles_num = 10

In [7]:
features_num = tilings_num * tiles_num**2

In [8]:
position_tiling = np.zeros((tilings_num, tiles_num+1))
velocity_tiling = np.zeros((tilings_num, tiles_num+1))

for i in range(tilings_num):
    position_tiling[i] = np.linspace(position_min - position_offset * (1 - i / (tilings_num - 1)),
                                     position_max + position_offset * i / (tilings_num - 1), tiles_num+1)
    velocity_tiling[i] = np.linspace(velocity_min - velocity_offset * (1 - i / (tilings_num - 1)),
                                     velocity_max + velocity_offset * i / (tilings_num - 1), tiles_num+1)

In [9]:
def get_features(position, velocity):
    return np.argmin(position_tiling < position, axis=1) * tiles_num + np.argmin(velocity_tiling < velocity, axis=1) \
            + np.arange(tilings_num)*100

In [10]:
Q = np.zeros((tilings_num * tiles_num**2, actions_num))

In [11]:
def next_action(position, velocity):
    return np.argmax(Q[get_features(position, velocity)].sum(axis=0))

In [12]:
epochs = 500

for _ in tqdm(range(epochs)):
    
    s = env.reset()
    
    for _ in range(steps_num):
        a = next_action(*s)
        s_, r, terminal, _ = env.step(a)
        
        t = get_features(*s)
        
        Q[t, a] += α * (r + γ * np.max(Q[get_features(*s_)].sum(axis=0)) - Q[t, a].sum()) / tilings_num
            
        s = s_
        
        if terminal:
            break
            

100%|████████████████████████████████████████| 500/500 [00:37<00:00, 17.39it/s]


In [18]:
s = env.reset()
R = 0

while True:
    env.render()
    s, r, terminal, _ = env.step(next_action(*s))
    R += r
    if terminal:
        print(R)
        break

-108.0
