In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
import os

In [2]:
env = gym.make("Acrobot-v1")
env.reset()

array([ 0.99702029, -0.0771398 ,  0.99802939,  0.06274821, -0.00786349,
       -0.03986648])

In [3]:
for step in range(100):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print(observation)
    time.sleep(0.02)
env.close()
env.reset()

[ 0.9976714  -0.06820394  0.99913902  0.04148749  0.09487007 -0.16780855]
[ 0.99851536 -0.05447081  0.99944044  0.03344864  0.03980326  0.09061267]
[ 0.99921918 -0.03950985  0.99908681  0.04272651  0.10714042  0.00307536]
[ 9.99999991e-01 -1.30983194e-04  9.99998570e-01  1.69106736e-03
  2.77248327e-01 -3.99961165e-01]
[ 0.99861935  0.05252999  0.9972709  -0.07382926  0.23663898 -0.33484695]
[ 0.99597392  0.08964344  0.99225162 -0.12424461  0.12656124 -0.1578329 ]
[ 0.9949355   0.1005154   0.99113626 -0.13284922 -0.01961456  0.07383396]
[ 0.99543338  0.09545883  0.9915072  -0.13005177 -0.03010572 -0.046793  ]
[ 0.99604097  0.08889533  0.98851833 -0.15110102 -0.0352038  -0.16302469]
[ 0.99666727  0.08157425  0.98128507 -0.19256068 -0.03818159 -0.25160635]
[ 0.99816383  0.06057198  0.97690904 -0.21365564 -0.1684583   0.03612507]
[ 0.99985316  0.01713629  0.98350801 -0.18086457 -0.25634813  0.28592568]
[ 0.9993142  -0.03702879  0.99417037 -0.10782061 -0.27219085  0.4314522 ]
[ 0.99734231 

array([ 0.99841289,  0.05631793,  0.99730797, -0.07332681, -0.09726379,
        0.02263332])

In [79]:
PI = np.pi
EPOCHS = 10000 # number of episodes
ALPHA = 0.8 # Learning rate
GAMMA = 0.9 # Discount factor
NUM_BINS = 20

def create_bins(num_bins_per_action = 10):
    bins_cosine_theta1 = np.linspace(-1, 1, num_bins_per_action)
    bins_sin_theta1 = np.linspace(-1, 1, num_bins_per_action)
    bins_cosine_theta2 = np.linspace(-1, 1, num_bins_per_action)
    bins_sin_theta2 = np.linspace(-1, 1, num_bins_per_action)
    bins_angular_velocity1 = np.linspace(-4*PI, 4*PI, num_bins_per_action)
    bins_angular_velocity2 = np.linspace(-9*PI, 9*PI, num_bins_per_action)
    bins = np.array([bins_cosine_theta1, bins_sin_theta1, bins_cosine_theta2, bins_sin_theta2, bins_angular_velocity1, bins_angular_velocity2])
    return bins

BINS = create_bins(NUM_BINS)

def discretize_observation(observations, bins):
    binned_observations = []
    for i , observation in enumerate(observations):
        binned_observations.append(np.digitize(observations[i], bins[i]))
    return tuple(binned_observations)

def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    # EXPLORATION
    if np.random.rand() <= epsilon:
        return env.action_space.sample()
    # EXPLOITATION
    else:
        return np.argmax(q_table[discrete_state])

def compute_next_q_value(old_q_value, reward, new_q_value):
    return old_q_value + ALPHA * (reward + GAMMA * new_q_value - old_q_value)

BURN_IN = 1
EPSILON_END =10000
EPSILON_REDUCE = 0.0001

def reduce_epsilon(epsilon, epoch):
    if BURN_IN <= epoch < EPSILON_END:
        return epsilon - EPSILON_REDUCE
    return epsilon

def fail(done, points, reward):
    if done and points > 250:
        reward = -100
    return reward


In [85]:
epsilon =1.0
rewards = []
log_interval = 500
render_interval = 10000

q_table_shape = (NUM_BINS, NUM_BINS, NUM_BINS, NUM_BINS, NUM_BINS, NUM_BINS, env.action_space.n)
q_table = np.zeros(q_table_shape)


In [98]:
%matplotlib

fig = plt.figure()
ax = fig.add_subplot(111)
ax.invert_yaxis()
plt.ion()
fig.canvas.draw()
plt.show()



rewards = []
mean_rewards = []
epochs = []

for epoch in range(EPOCHS):
    initial_state = env.reset()
    discrete_state = discretize_observation(initial_state, BINS)
    done = False
    points = 0

    epochs.append(epoch)

    while not done:
        action = epsilon_greedy_action_selection(epsilon, q_table, discrete_state)
        new_state, reward, done, info = env.step(action)

        new_discrete_state = discretize_observation(new_state, BINS)
        old_q_value = q_table[discrete_state + (action,)]
        new_q_value = np.max(q_table[new_discrete_state])

        reward = fail(done, points, reward)

        q_table[discrete_state + (action,)] = compute_next_q_value(old_q_value, reward, new_q_value)
        discrete_state = new_discrete_state
        points += 1

    
    epsilon = reduce_epsilon(epsilon, epoch)
    rewards.append(points)
    mean_rewards.append(np.mean(rewards[-50:]))
    if epoch % log_interval == 0:
        ax.clear()
        ax.plot(epochs, rewards, label='Rewards')
        ax.plot(epochs, mean_rewards, label='Mean Rewards')
        ax.invert_yaxis()
        plt.legend()
        fig.canvas.draw()
        plt.pause(0.01)
        plt.show()

    
env.close()

        

Using matplotlib backend: QtAgg


KeyboardInterrupt: 

In [102]:
observation = env.reset()
rewards = 0
for step in range(600):
    env.render()
    discrete_state = discretize_observation(observation, BINS)
    action = np.argmax(q_table[discrete_state])
    observation, reward, done, info = env.step(action)
    rewards += 1

    if done: 
        print("Finished with score of ", rewards)
        break
    time.sleep(0.01)

env.close()

Finished with score of  382
