In [50]:
import numpy as np
import time
import math
import random
import gym
from typing import Tuple
import matplotlib.pyplot as plt
from IPython import display
from sklearn.preprocessing import KBinsDiscretizer

## CartPole-V1 Enviornment Setup

In [51]:
env = gym.make('CartPole-v1')

## Enviornment Documentation

In [35]:
?env.env

## Visualization

In [33]:
#Hard coded policy value
policy = lambda obs: 0
for i in range(5):
    obs = env.reset()
    for j in range(30):
        actions = policy(obs)
        obs, reward, done, info = env.step(actions)
        env.render()
        print(obs)
        ## IF WANT TO PRESENT ANIMATION IN JUPYTER
        #plt.imshow(env.render(mode='rgb_array'))
        #display.display(plt.gcf())    
        #display.clear_output(wait=True)
        time.sleep(0.1)
env.close()

[ 0.02565401 -0.1647732   0.04665167  0.27335662]
[ 0.02235855 -0.36052868  0.0521188   0.580381  ]
[ 0.01514797 -0.55634075  0.06372643  0.8890162 ]
[ 0.00402116 -0.7522669   0.08150674  1.2010318 ]
[-0.01102418 -0.9483429   0.10552739  1.5181059 ]
[-0.02999104 -1.144571    0.1358895   1.841778  ]
[-0.05288246 -1.3409059   0.17272507  2.1733942 ]
[-0.07970057 -1.5372396   0.21619295  2.5140417 ]




[-0.11044537 -1.7333832   0.26647377  2.8644748 ]
[-0.14511304 -1.9290476   0.32376328  3.225032  ]
[-0.18369399 -2.1238244   0.3882639   3.59555   ]
[-0.22617048 -2.3171692   0.46017492  3.9752834 ]
[-0.27251387 -2.5083911   0.5396806   4.3628454 ]
[-0.32268167 -2.6966553   0.6269375   4.756181  ]
[-0.37661478 -2.8810008   0.7220611   5.1525936 ]
[-0.4342348 -3.060383   0.825113   5.5488324]
[-0.49544245 -3.2337406   0.93608963  5.94124   ]
[-0.56011724 -3.400087    1.0549145   6.3259325 ]
[-0.628119  -3.558625   1.1814331  6.698982 ]
[-0.6992915 -3.7088695  1.3154128  7.0565248]
[-0.7734689 -3.850772   1.4565432  7.3947597]
[-0.8504843 -3.9848351  1.6044384  7.709769 ]
[-0.930181  -4.1122074  1.7586337  7.997176 ]
[-1.0124252 -4.2347474  1.9185773  8.251681 ]
[-1.0971202 -4.355042   2.083611   8.466582 ]
[-1.184221  -4.4763546  2.2529426  8.633485 ]
[-1.273748  -4.6024528  2.4256122  8.742444 ]
[-1.3657972 -4.737281   2.6004612  8.782832 ]
[-1.4605428 -4.884448   2.7761178  8.745061 

KeyboardInterrupt: 

## Policy Function

In [52]:
#policy = lambda _,__,___, tip_velocity : int( tip_velocity > 0 )

## Buckets and Actions

In [64]:
#n_bucket = (1,1,6,3)
n_bucket = (6,12)
n_actions = env.action_space.n
print(n_actions)
print(n_bucket)

2
(6, 12)


## Reseting boundarys

In [54]:
state_val_bounds = list(zip(env.observation_space.low,env.observation_space.high))
#Reset cart velocity
state_val_bounds[1] = [-0.5,0.5]
#Reset pole angular velocity
state_val_bounds[3] = [-math.radians(50),math.radians(50)]
print(state_val_bounds[2][0])

-0.41887903


## Action index and Q value

In [55]:
act_index = len(n_bucket)
q_val_table = np.zeros(n_bucket+(n_actions,))
q_val_table.shape


(6, 12, 2)

## Define rate

In [56]:
min_exp_rate = 0.1
min_learn_rate = 0.01

## INITIALIZATIONS

In [57]:
episodes = 2000
discount = 0.95

In [58]:
## This function choose what action to perform
def sel_act(sel_val,exp_rate):
    #epsilon greedy algorithm
    #exploration and exploitation
    if random.random() < exp_rate:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_val_table[sel_val])
    return action

In [59]:
## This function choose what explore rate to use
def sel_exp_rate(input):
    return max(min_exp_rate,min(1,1.0 - math.log10((input+1)/25)))
## This function choose what learn rate to use
def sel_learn_rate(input):
    return max(min_learn_rate,min(1.0,1.0 - math.log10((input+1)/25)))

In [71]:
## Bucket state value
def bucket_state_val(stateval):
    est = KBinsDiscretizer(n_bins=n_bucket, encode='ordinal', strategy='uniform')
    est.fit([[state_val_bounds[2][0],state_val_bounds[3][0]], [state_val_bounds[2][1],state_val_bounds[3][1]]])
    return tuple(map(int,est.transform([[stateval[2], stateval[3]]])[0]))

In [72]:
## Update Q value
def Update_Q_val(best_q_val,reward):
    # Temporal difference
    return discount * best_q_val + reward
    

## Training

In [73]:
for episode in range(episodes):
    exp_rate = sel_exp_rate(episode)
    learn_rate = sel_learn_rate(episode)
    obs = env.reset()
    init_state_val = bucket_state_val(obs)
    print(init_state_val)
    done = False
    while done == False:
        action = sel_act(init_state_val,exp_rate)
        obs, reward, done, info = env.step(action)
        state_val = bucket_state_val(obs)
        best_q_val = np.max(q_val_table[state_val])
        learnt_val = Update_Q_val(best_q_val,reward)
        old_val = q_val_table[init_state_val][action]
        # Bellman equation
        q_val_table[init_state_val][action] = (1 - learn_rate) * old_val + learnt_val * learn_rate
        init_state_val = state_val
        env.render()
        #print('Episode: %d' % episode)
        #print(best_q_val)
        #print('Action: %d' % action)
        #print('Reward : %f' % reward)
env.close()

(3, 6)
(3, 5)
(2, 5)
(2, 5)
(3, 5)
(2, 6)
(3, 6)
(2, 6)
(2, 6)
(3, 5)
(3, 5)
(3, 6)
(2, 6)
(3, 6)
(2, 6)
(2, 5)
(2, 6)
(2, 6)
(3, 6)
(2, 5)
(2, 6)
(2, 6)
(3, 5)
(3, 6)
(3, 5)
(2, 5)
(3, 5)
(2, 6)
(2, 5)
(2, 5)
(2, 5)
(2, 6)
(2, 6)
(2, 5)
(3, 6)
(2, 6)
(3, 5)
(2, 6)
(2, 6)
(3, 5)
(3, 5)
(2, 5)
(3, 5)
(2, 5)
(2, 6)
(2, 6)
(3, 6)
(2, 6)
(2, 5)
(3, 5)
(3, 6)
(3, 6)
(3, 6)
(3, 5)
(2, 5)
(2, 5)
(3, 6)
(3, 5)
(3, 6)
(2, 6)
(3, 6)
(2, 5)
(2, 6)
(2, 6)
(3, 6)
(2, 6)
(3, 5)
(3, 5)
(2, 5)
(3, 6)
(3, 6)
(3, 6)
(2, 5)
(2, 5)
(3, 6)
(2, 5)
(2, 5)
(3, 5)
(2, 6)
(3, 6)
(3, 6)
(2, 6)
(3, 5)


KeyboardInterrupt: 

In [None]:
env.render(close=True)