In [33]:
import gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gymlogger.set_level(40) #error only
#import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

In [34]:
#Making the environment
env = gym.make("CartPole-v1")

## Initialize variables

In [35]:
#Defining Hypermeters
#LEARNING_RATE = 0.1
LEARNING_RATE = 0.1

DISCOUNT = 0.99 #instead of 0.95
EPISODES = 60000
total = 0
total_reward = 0

#0.25, 0.25, 0.01, 0.1
# Define observation and window size
Observation = [30, 30, 50, 50]
np_array_win_size = np.array([0.5, 0.5, 0.05, 0.5])

epsilon = 0.9
epsilon_decay = 0.995
epsilon_min = 0.1
epsilon_max = 0.95

prev_mean = 0
obs = env.reset()


new_env = env.step(0)[0]
print(new_env)
print(round(new_env[2], 2))

[ 0.00360035 -0.15824363  0.00600953  0.3338028 ]
0.01


## Create Q table

In [36]:
#Creating the q table
q_table = np.zeros((Observation + [env.action_space.n]))
q_table.shape

(30, 30, 50, 50, 2)

## Getting discrete state

In [37]:
#defining the discrete state
def get_discrete_state(state):
    
    #Given a state, round to the nearest 0.25
    #divide by the window size to find which bucket it belongs to
    #discrete_state =np.array([np.round(state[0]*4)/4, np.round(state[1] *4) /4, np.round(state[2], 1) ,np.round(state[3], 1)])
    #discrete_state =np.array([np.round(state[0]*2)/2, np.round(state[1] *2) /2, np.round(state[2]* 2)/2 ,np.round(state[3]* 2)/2])
    discrete_state = state/np_array_win_size+ np.array([15,10,1,10])
    #discrete_state = (discrete_state/np_array_win_size)
    #print(discrete_state)
    return tuple(discrete_state.astype(int))


print(get_discrete_state([4, -2, 0.418, 0.63751878]))

(23, 6, 9, 11)


## Running

In [None]:
#Training the agent
for episode in range(EPISODES):

    discrete_state = get_discrete_state(env.reset()[0])
    done = False
    episode_reward = 0
    episode_length = 0
    
    # update every 2000 episodes
    if episode % 2000 == 0:
        print("Episode: " + str(episode))

    while not done:
        
        # always get max for now
        if np.random.random() < epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)
        
        new_env = env.step(action)
        new_state = new_env[0]
        reward = new_env[1]
        done = new_env[2]
        episode_reward += reward
        episode_length += 1
        
        new_discrete_state = get_discrete_state(new_state)

        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[discrete_state + (action,)] = new_q

        discrete_state = new_discrete_state
        
        # check termination conditions
        if abs(new_state[0]) > 2.4 or abs(new_state[2]) > np.radians(12) or episode_length > 500:
            done = True
            
    total_reward += episode_reward
    
    if episode % 1000 == 0:
        mean_reward = total_reward / 1000
        print("Mean Reward: " + str(mean_reward))
        if mean_reward > prev_mean:
            epsilon *= epsilon_decay
        else:
            epsilon = max(epsilon_min, epsilon / epsilon_decay)
        prev_mean = mean_reward
        total_reward = 0

env.close()


Episode: 0
Mean Reward: 0.009
Mean Reward: 43.488
Episode: 2000
Mean Reward: 108.654
Mean Reward: 139.54
Episode: 4000
Mean Reward: 160.27
Mean Reward: 176.257
Episode: 6000
Mean Reward: 186.595
Mean Reward: 193.78
Episode: 8000
Mean Reward: 182.405
Mean Reward: 217.148
Episode: 10000
Mean Reward: 209.515
Mean Reward: 224.653
Episode: 12000
Mean Reward: 225.661
Mean Reward: 240.708
Episode: 14000
Mean Reward: 234.627
Mean Reward: 246.516
Episode: 16000
Mean Reward: 251.056
Mean Reward: 240.15
Episode: 18000
Mean Reward: 267.824
Mean Reward: 281.916
Episode: 20000
Mean Reward: 268.825
Mean Reward: 264.623
Episode: 22000
Mean Reward: 278.499
Mean Reward: 252.944
Episode: 24000
Mean Reward: 266.07
Mean Reward: 263.237
Episode: 26000
Mean Reward: 274.73
Mean Reward: 286.286
Episode: 28000
Mean Reward: 284.305
Mean Reward: 271.51
Episode: 30000
Mean Reward: 262.631
Mean Reward: 271.519
Episode: 32000
Mean Reward: 285.039
Mean Reward: 268.826
Episode: 34000
Mean Reward: 260.117
Mean Reward: 