In [13]:
from __future__ import print_function
import gym
import sys
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
env = gym.make("CartPole-v0")

In [3]:
print(env.observation_space)
print(env.action_space)
print("observation state space consists of cartpole position, cartpole velocity, pole angle, pole velocity")
if(env.observation_space.is_bounded):
    print("lower_bound on observation state spaces: ", env.observation_space.low)
    print("upper_bound on observation state spaces: ", env.observation_space.high)
print("action space consists of 0-push left, 1-push right")

Box(4,)
Discrete(2)
observation state space consists of cartpole position, cartpole velocity, pole angle, pole velocity
lower_bound on observation state spaces:  [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
upper_bound on observation state spaces:  [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
action space consists of 0-push left, 1-push right


In [29]:
# There are 4 discrete states and 2 discrete actions.
# Choose discretization criteria for the states
# Considering only pole angle and position
buckets=(1, 1, 6, 12,)
def discretize(obs):
        upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
        lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]
        ratios = [(obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i]) for i in range(len(obs))]
        new_obs = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(obs))]
        new_obs = [min(buckets[i] - 1, max(0, new_obs[i])) for i in range(len(obs))]
        return tuple(new_obs)

In [39]:
# Q-Learning approach
episodes = 10000
done = False
alpha = 0.1
gamma = 0.99
epsilon = 0.95

reward_history = []
average_reward_history = []
counter = 0

ns0 = int(10*(env.observation_space.high[0] - env.observation_space.low[0])) + 1
ns1 = int(10*(env.observation_space.high[2] - env.observation_space.low[2])) + 1
# q_table = np.random.uniform(low=-1, high=1, size=(ns0, ns1, env.action_space.n))
# q_table = np.zeros((ns0, ns1, env.action_space.n))
Q = np.zeros(buckets + (env.action_space.n,))

# for every episode
for episode in range(episodes):
    
    # reset environment
    current_state = discretize(env.reset())
#     q_current_state = [int(10*(current_state[0]-env.observation_space.low[0])), int(10*(current_state[2]-env.observation_space.low[2]))]
    
    # decay epsilon
    if epsilon > 0.1:
        epsilon = epsilon - 10/episodes
    if alpha < 0.9:
        alpha = alpha + 10/episodes
    
    # initialize any other value if needed
    done = False
    episodic_reward = 0

    # until episode ends
    while(not done):
        
        #choose when to render
#         if(episodes%10 == 9):
#             env.render()
            
        # choose an action to perform
        # epsilon-greedy strategy
        if(np.random.random() < epsilon): # exploration
            action = env.action_space.sample()
        else:                            # exploitation
#             action = np.argmax(q_table[q_current_state[0], q_current_state[1]])
            action = np.argmax(Q[current_state])
                
        
        # execute the action
        obs, reward, done, _ = env.step(action)
#         q_next_state = [int(10*(obs[0]-env.observation_space.low[0])), int(10*(obs[2]-env.observation_space.low[2]))]
        next_state = discretize(obs)
        
        # update Q table based on this action
#         q_table[q_current_state[0], q_current_state[1], action] = alpha*q_table[q_current_state[0], q_current_state[1], action] + (1-alpha)*(reward + gamma*np.argmax(q_table[q_next_state[0], q_next_state[1]]))
        Q[current_state] = alpha*Q[current_state] + (1-alpha)*(reward + gamma*np.argmax(Q[next_state]))
        
        # next iteration preparation
        current_state = next_state
        counter += 1
        
        episodic_reward += reward
        
    reward_history.append(episodic_reward)
    average_reward_history.append(np.mean(reward_history))
    average_reward = np.mean(reward_history)
    
    if(average_reward>=195 and episode>=100):
        print('Ran {} episodes. Solved after {} trials ✔'.format(episode, episode - 100))
        break
    
    if episode % 100 == 0:
        print('Epsilon value is {}'.format(epsilon))
        print('alhpa value is {}'.format(alpha))
        print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(episode, average_reward))
            
#             if(counter%100==99):
#                 fig = plt.figure(1)
#                 plt.clf()
#                 plt.subplot(121)
#                 plt.plot(reward_history,'r')
#                 plt.xlabel('Episode')
#                 plt.ylabel('Reward')
#                 plt.title('Reward Per Episode')
#                 plt.subplot(122)
#                 plt.plot(average_reward_history, 'b')
#                 plt.xlabel('Episode')
#                 plt.ylabel('average_reward')
#                 plt.title('average_reward plot')
#                 plt.pause(0.01)
#                 fig.canvas.draw()
            
env.close()

Epsilon value is 0.949
alhpa value is 0.101
[Episode 0] - Mean survival time over last 100 episodes was 14.0 ticks.
Epsilon value is 0.8489999999999999
alhpa value is 0.2010000000000001
[Episode 100] - Mean survival time over last 100 episodes was 20.128712871287128 ticks.
Epsilon value is 0.7489999999999998
alhpa value is 0.30100000000000016
[Episode 200] - Mean survival time over last 100 episodes was 20.412935323383085 ticks.
Epsilon value is 0.6489999999999997
alhpa value is 0.40100000000000025
[Episode 300] - Mean survival time over last 100 episodes was 19.398671096345517 ticks.
Epsilon value is 0.5489999999999996
alhpa value is 0.5010000000000003
[Episode 400] - Mean survival time over last 100 episodes was 18.306733167082296 ticks.
Epsilon value is 0.4489999999999995
alhpa value is 0.6010000000000004
[Episode 500] - Mean survival time over last 100 episodes was 17.461077844311376 ticks.
Epsilon value is 0.3489999999999994
alhpa value is 0.7010000000000005
[Episode 600] - Mean s

Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5300] - Mean survival time over last 100 episodes was 10.708922844746274 ticks.
Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5400] - Mean survival time over last 100 episodes was 10.698204036289576 ticks.
Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5500] - Mean survival time over last 100 episodes was 10.68314851845119 ticks.
Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5600] - Mean survival time over last 100 episodes was 10.667380824852705 ticks.
Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5700] - Mean survival time over last 100 episodes was 10.653394141378705 ticks.
Epsilon value is 0.0999999999999992
alhpa value is 0.9000000000000007
[Episode 5800] - Mean survival time over last 100 episodes was 10.639717290122393 ticks.
Epsilon value is 0.0999999999999992
alhpa value

In [24]:
np.zeros((1,1,6,12,) + (env.action_space.n,)).shape


(1, 1, 6, 12, 2)

In [19]:
import gym
import numpy as np
import math
from collections import deque

class QCartPoleSolver():
    def __init__(self, buckets=(1, 1, 6, 12,), n_episodes=1000, n_win_ticks=195, min_alpha=0.1, min_epsilon=0.1, gamma=1.0, ada_divisor=25, max_env_steps=None, quiet=False, monitor=False):
        self.buckets = buckets # down-scaling feature space to discrete range
        self.n_episodes = n_episodes # training episodes 
        self.n_win_ticks = n_win_ticks # average ticks over 100 episodes required for win
        self.min_alpha = min_alpha # learning rate
        self.min_epsilon = min_epsilon # exploration rate
        self.gamma = gamma # discount factor
        self.ada_divisor = ada_divisor # only for development purposes
        self.quiet = quiet

        self.env = gym.make('CartPole-v0')
        if max_env_steps is not None: self.env._max_episode_steps = max_env_steps
        if monitor: self.env = gym.wrappers.Monitor(self.env, 'tmp/cartpole-1', force=True) # record results for upload

        self.Q = np.zeros(self.buckets + (self.env.action_space.n,))

    def discretize(self, obs):
        upper_bounds = [self.env.observation_space.high[0], 0.5, self.env.observation_space.high[2], math.radians(50)]
        lower_bounds = [self.env.observation_space.low[0], -0.5, self.env.observation_space.low[2], -math.radians(50)]
        ratios = [(obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i]) for i in range(len(obs))]
        new_obs = [int(round((self.buckets[i] - 1) * ratios[i])) for i in range(len(obs))]
        new_obs = [min(self.buckets[i] - 1, max(0, new_obs[i])) for i in range(len(obs))]
        return tuple(new_obs)

    def choose_action(self, state, epsilon):
        return self.env.action_space.sample() if (np.random.random() <= epsilon) else np.argmax(self.Q[state])

    def update_q(self, state_old, action, reward, state_new, alpha):
        self.Q[state_old][action] += alpha * (reward + self.gamma * np.max(self.Q[state_new]) - self.Q[state_old][action])

    def get_epsilon(self, t):
        return max(self.min_epsilon, min(1, 1.0 - math.log10((t + 1) / self.ada_divisor)))

    def get_alpha(self, t):
        return max(self.min_alpha, min(1.0, 1.0 - math.log10((t + 1) / self.ada_divisor)))

    def run(self):
        scores = deque(maxlen=100)

        for e in range(self.n_episodes):
            current_state = self.discretize(self.env.reset())

            alpha = self.get_alpha(e)
            epsilon = self.get_epsilon(e)
            done = False
            i = 0

            while not done:
                # self.env.render()
                action = self.choose_action(current_state, epsilon)
                obs, reward, done, _ = self.env.step(action)
                new_state = self.discretize(obs)
                self.update_q(current_state, action, reward, new_state, alpha)
                current_state = new_state
                i += 1

            scores.append(i)
            mean_score = np.mean(scores)
            if mean_score >= self.n_win_ticks and e >= 100:
                if not self.quiet: print('Ran {} episodes. Solved after {} trials ✔'.format(e, e - 100))
                return e - 100
            if e % 100 == 0 and not self.quiet:
                print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        if not self.quiet: print('Did not solve after {} episodes 😞'.format(e))
        return e

if __name__ == "__main__":
    solver = QCartPoleSolver()
    solver.run()
    # gym.upload('tmp/cartpole-1', api_key='')

[Episode 0] - Mean survival time over last 100 episodes was 17.0 ticks.
[Episode 100] - Mean survival time over last 100 episodes was 40.75 ticks.
[Episode 200] - Mean survival time over last 100 episodes was 150.04 ticks.
Ran 241 episodes. Solved after 141 trials ✔
