In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
import os

In [2]:
# Seed every source of randomness used below (numpy's global RNG for the
# epsilon-greedy coin flip, the environment dynamics, and action sampling)
# so that Restart Kernel -> Run All reproduces the same run.
SEED = 42
np.random.seed(SEED)

env = gym.make("Acrobot-v1")
# NOTE(review): env.seed() is the seeding call of the 4-tuple step API this
# notebook uses; newer gym releases replace it with env.reset(seed=...).
env.seed(SEED)
env.action_space.seed(SEED)

# Last expression of the cell: displays the initial observation.
env.reset()

array([ 0.99522918,  0.09756471,  0.99992304,  0.01240634, -0.00124644,
       -0.06435354])

In [6]:
# Sanity check: drive the environment with random actions for at most 100
# steps, rendering each frame and printing the observation it produced.
for step in range(100):
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print(observation)
    if done:
        # Stepping an environment after it reported done is undefined
        # behaviour in gym, so stop the rollout here.
        break
    time.sleep(0.02)  # slow the animation down enough to watch it
env.close()
# Leave the environment in a fresh state; the cell displays that observation.
env.reset()

[-0.57343817  0.81924884 -0.46537616 -0.885113    1.72648526 -3.90805245]
[-0.74314442  0.66913106 -0.91870746 -0.39493873  0.59200595 -2.94298885]
[-0.75788012  0.65239384 -0.99748384  0.07089415 -0.33281836 -1.80639762]
[-0.65531938  0.75535191 -0.9483254   0.31729943 -1.13454872 -0.67933584]
[-0.37735208  0.92606987 -0.95211047  0.30575422 -2.18768572  0.82401089]
[ 0.16436785  0.98639911 -0.99967441  0.02551606 -3.34873646  2.04783923]
[ 0.81248815  0.58297771 -0.84637824 -0.53258227 -4.36548707  3.89162795]
[ 0.96645823 -0.25682385 -0.05378776 -0.99855239 -4.32523643  5.67121179]
[ 0.46317725 -0.88626567  0.94703807 -0.32112132 -3.87888544  6.98613487]
[-0.17650802 -0.9842992   0.5498345   0.83527362 -2.569441    5.84215063]
[-0.49214585 -0.87051276 -0.3783194   0.92567512 -0.83405203  3.99288457]
[-0.51004702 -0.86014652 -0.88273252  0.46987583  0.60020281  2.93336763]
[-0.27989155 -0.96003162 -0.99914256  0.04140226  1.88444902  1.51806121]
[ 0.20676791 -0.97839002 -0.99101704 -

array([ 0.99993494, -0.01140652,  0.99593561, -0.09006805, -0.01420477,
        0.0370999 ])

In [79]:
# --- Q-learning hyper-parameters -------------------------------------------
PI = np.pi
EPOCHS = 10000  # number of training episodes
ALPHA = 0.8     # learning rate
GAMMA = 0.9     # discount factor
NUM_BINS = 15   # bin edges generated per observation component


def create_bins(num_bins_per_action = 10):
    """Build the bin edges used to discretize a 6-component observation.

    Rows 0-3 span [-1, 1] (the cosine/sine components); rows 4 and 5 span
    [-4*PI, 4*PI] and [-9*PI, 9*PI] for the two angular velocities
    (presumably the environment's velocity limits -- confirm against gym).

    Args:
        num_bins_per_action: number of evenly spaced edges generated for
            every observation component.

    Returns:
        A 2-D array of shape (6, num_bins_per_action); row i holds the edges
        for observation component i.
    """
    component_ranges = [
        (-1, 1),            # cos(theta1)
        (-1, 1),            # sin(theta1)
        (-1, 1),            # cos(theta2)
        (-1, 1),            # sin(theta2)
        (-4*PI, 4*PI),      # angular velocity of link 1
        (-9*PI, 9*PI),      # angular velocity of link 2
    ]
    edges_per_component = [
        np.linspace(low, high, num_bins_per_action)
        for low, high in component_ranges
    ]
    return np.array(edges_per_component)


BINS = create_bins(NUM_BINS)

def discretize_observation(observations, bins):
    """Map a continuous observation onto a tuple of Q-table indices.

    Args:
        observations: sequence of scalar observation components.
        bins: indexable collection where ``bins[i]`` is the increasing array
            of bin edges for ``observations[i]``.

    Returns:
        A tuple of ints, one per component, each in ``[0, len(bins[i]) - 1]``
        so it can always index an axis of length ``len(bins[i])``.
    """
    binned_observations = []
    for i, value in enumerate(observations):
        edges = bins[i]
        # np.digitize returns len(edges) for any value >= edges[-1] (e.g. a
        # cosine of exactly 1.0 or a velocity sitting on its upper limit).
        # That index is one past the end of a table axis sized len(edges),
        # so fold it into the top bin.
        index = min(int(np.digitize(value, edges)), len(edges) - 1)
        binned_observations.append(index)
    return tuple(binned_observations)

def epsilon_greedy_action_selection(epsilon, q_table, discrete_state):
    """Pick an action with an epsilon-greedy policy.

    A uniform random number is drawn; if it is <= ``epsilon`` a random action
    is sampled from the module-level ``env`` (exploration), otherwise the
    action with the highest Q-value for ``discrete_state`` is returned
    (exploitation).
    """
    should_explore = np.random.rand() <= epsilon
    if should_explore:
        return env.action_space.sample()
    action_values = q_table[discrete_state]
    return np.argmax(action_values)

def compute_next_q_value(old_q_value, reward, new_q_value):
    """Return the updated Q-value after one temporal-difference step.

    The TD target is ``reward + GAMMA * new_q_value``; the old estimate is
    moved towards it by a fraction ``ALPHA`` of the TD error.
    """
    td_error = reward + GAMMA * new_q_value - old_q_value
    return old_q_value + ALPHA * td_error

# --- Linear epsilon decay schedule ------------------------------------------
BURN_IN = 1              # first epoch at which epsilon starts to decay
EPSILON_END = 10000      # epoch from which epsilon is no longer reduced
EPSILON_REDUCE = 0.0001  # amount subtracted from epsilon per decaying epoch


def reduce_epsilon(epsilon, epoch):
    """Return epsilon for the next epoch.

    Epsilon is lowered by ``EPSILON_REDUCE`` while
    ``BURN_IN <= epoch < EPSILON_END``; outside that window it is returned
    unchanged.
    """
    inside_decay_window = epoch >= BURN_IN and epoch < EPSILON_END
    if not inside_decay_window:
        return epsilon
    return epsilon - EPSILON_REDUCE

def fail(done, points, reward):
    """Replace the reward with a -100 penalty for episodes that end late.

    The penalty applies only when the episode is ``done`` and more than 250
    steps (``points``) had already been taken; otherwise ``reward`` is
    returned untouched.
    """
    ended_late = done and points > 250
    return -100 if ended_late else reward


In [85]:
# Mutable training state; re-run this cell to restart learning from scratch.
epsilon = 1.0           # start fully exploratory
rewards = []
log_interval = 500      # refresh the live plot every this many epochs
render_interval = 10000

# One table axis of size NUM_BINS per observation component (six of them),
# followed by one axis with an entry per discrete action.
q_table_shape = (NUM_BINS,) * 6 + (env.action_space.n,)
q_table = np.zeros(q_table_shape)


In [98]:
%matplotlib

fig = plt.figure()
ax = fig.add_subplot(111)
ax.invert_yaxis()
plt.ion()
fig.canvas.draw()
plt.show()



rewards = []
mean_rewards = []
epochs = []

for epoch in range(EPOCHS):
    initial_state = env.reset()
    discrete_state = discretize_observation(initial_state, BINS)
    done = False
    points = 0

    epochs.append(epoch)

    while not done:
        action = epsilon_greedy_action_selection(epsilon, q_table, discrete_state)
        new_state, reward, done, info = env.step(action)

        new_discrete_state = discretize_observation(new_state, BINS)
        old_q_value = q_table[discrete_state + (action,)]
        new_q_value = np.max(q_table[new_discrete_state])

        reward = fail(done, points, reward)

        q_table[discrete_state + (action,)] = compute_next_q_value(old_q_value, reward, new_q_value)
        discrete_state = new_discrete_state
        points += 1

    
    epsilon = reduce_epsilon(epsilon, epoch)
    rewards.append(points)
    mean_rewards.append(np.mean(rewards[-50:]))
    if epoch % log_interval == 0:
        ax.clear()
        ax.plot(epochs, rewards, label='Rewards')
        ax.plot(epochs, mean_rewards, label='Mean Rewards')
        ax.invert_yaxis()
        plt.legend()
        fig.canvas.draw()
        plt.pause(0.01)
        plt.show()

    
env.close()

        

Using matplotlib backend: QtAgg


KeyboardInterrupt: 

In [102]:
# Watch the learned policy: always take the greedy action for at most 600
# steps and report how many steps the episode needed.
observation = env.reset()
rewards = 0  # counts steps taken in this evaluation episode
for step in range(600):
    env.render()
    discrete_state = discretize_observation(observation, BINS)
    action = np.argmax(q_table[discrete_state])
    observation, reward, done, info = env.step(action)
    rewards += 1

    if not done:
        time.sleep(0.01)  # keep the animation watchable
        continue

    print("Finished with score of ", rewards)
    break

env.close()

Finished with score of  382
