In [1]:
#help taken from anirudh topiwala git-repo

import numpy as np
import gym
import random
import math
from time import sleep

In [2]:
## Initialize the "Cart-Pole" environment
env = gym.make('CartPole-v1')

## Defining the environment related constants

# Number of discrete states (bucket) per state dimension
NUM_BUCKETS = (5,5, 6, 4)  # (x, x', theta, theta')
# Number of discrete actions
NUM_ACTIONS = env.action_space.n # (left, right)
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-2.4, 2.4]
STATE_BOUNDS[3] = [-math.radians(15), math.radians(15)]
# Index of the action
ACTION_INDEX = len(NUM_BUCKETS)

## Creating a Q-Table for each state-action pair
qtable = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))
print(qtable.shape)

## Learning related constants


# Feel free to play with these hyperparameters

total_episodes =1200       # Total episodes
test_episodes = 100            # Test episodes
learning_rate = 1           # Learning rate
max_steps =  250              # Max steps per episode
gamma = 0.99                  # Discounting rate
STREAK_TO_END = 50
SOLVED_T = 150

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01            # Exponential decay rate for exploration prob

MIN_LEARNING_RATE = 0.1

(5, 5, 6, 4, 2)


In [3]:
# function used to discretize observations

def state_to_bucket(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_BUCKETS[i] - 1
        else:
            # Mapping the state bounds to the bucket array
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_BUCKETS[i]-1)*STATE_BOUNDS[i][0]/bound_width
            scaling = (NUM_BUCKETS[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)

In [4]:

num_streaks = 0
rewards = []
for episode in range(total_episodes):
    
    # state = env.reset()
    # this the change here from the frozen lake problem 
    # in that problem we have finite states here it is continuous states 
    # thus we discretize the states into a finite nuber of states 
    
    # Reset the environment
    obv = env.reset()

    # the initial state
    state = state_to_bucket(obv)
    
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Choose an action a in the current state (greedy or explore)
        
        # im doing an ephsilon greedy approach
        exp_exp_tradeoff = random.uniform(0, 1)  
        # exploitation (taking the max Q value for this state)
        if exp_exp_tradeoff > epsilon:
            # Enter code here            
            action = np.argmax(qtable[state])
            ## Hint: Greedily choose an action according to Q value

        # exploration
        else:
            # Enter code here
            ## Hint: Randomly choose an action
            action = env.action_space.sample()

        # Take this action and observe
        obv, reward, done, info = env.step(action)
        
        # Observe the result
        new_state = state_to_bucket(obv)

        # Do a Q update
        # Enter code here
        ## Hint: One line update equation convert to one line code, start with "qtable[state, action] = ..."
#         print(state + (action,))
        qtable[state + (action,)] = qtable[state + (action,)] + learning_rate * (reward + gamma * np.amax(qtable[new_state]) - qtable[state + (action,)])
#         x =  np.amax(qtable[new_state])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True:         
            print("Episode %d finished after %f time steps" % (episode,step))
#             print(qtable)
            if (step >= SOLVED_T):
               num_streaks += 1
            else:
                num_streaks = 0
            break

            #sleep(0.25)

        # It's considered done when it's solved over 100 times consecutively
        if num_streaks > STREAK_TO_END:
            break
        
    # Decay epsilon to reduce exploration as time progresses
    
    # Enter code here to assign a decay value to "decay_parameter"
    ## Hint: 
    ## 1. Use inbuilt polynomial, exponential(, or whatever works) functions to decay epsilon
    ## 2. "decay_parameter" is a function of "decay_rate" and "episode"
    
    epsilon = max(min_epsilon, min(1, 1.0 - math.log10((episode+1)/25.0))) 
    rewards.append(total_rewards)
    learning_rate = max(MIN_LEARNING_RATE, min(0.5, 1.0 - math.log10((episode+1)/25.0))) 
    
print("Score over time: " +  str(sum(rewards)/total_episodes))
print("Q values:")
print(qtable)

Episode 0 finished after 18.000000 time steps
Episode 1 finished after 15.000000 time steps
Episode 2 finished after 22.000000 time steps
Episode 3 finished after 11.000000 time steps
Episode 4 finished after 26.000000 time steps
Episode 5 finished after 66.000000 time steps
Episode 6 finished after 12.000000 time steps
Episode 7 finished after 9.000000 time steps
Episode 8 finished after 34.000000 time steps
Episode 9 finished after 14.000000 time steps
Episode 10 finished after 21.000000 time steps
Episode 11 finished after 13.000000 time steps
Episode 12 finished after 14.000000 time steps
Episode 13 finished after 12.000000 time steps
Episode 14 finished after 23.000000 time steps
Episode 15 finished after 21.000000 time steps
Episode 16 finished after 12.000000 time steps
Episode 17 finished after 47.000000 time steps
Episode 18 finished after 40.000000 time steps
Episode 19 finished after 14.000000 time steps
Episode 20 finished after 20.000000 time steps
Episode 21 finished afte

Score over time: 220.026666667
Q values:
[[[[[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]]


  [[[  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]
    [  0.           0.        ]]

   [[  0.           0.        ]
    [  0.      

In [5]:

for i_episode in range(100):
    observation = env.reset()
    time = 0
    for t in range(1000):
        env.render()
#         print(observation)
        state = state_to_bucket(observation)
        action = np.argmax(qtable[state])
        observation, reward, done, info = env.step(action)
        if done:
            print(done)
            break
        
        time = t+1
        
    print('Ran {} episodes. Solved after {} trials ✔'.format(i_episode+1, time))
env.close()


True
Ran 1 episodes. Solved after 499 trials ✔
True
Ran 2 episodes. Solved after 499 trials ✔
True
Ran 3 episodes. Solved after 499 trials ✔
True
Ran 4 episodes. Solved after 499 trials ✔
True
Ran 5 episodes. Solved after 499 trials ✔
True
Ran 6 episodes. Solved after 499 trials ✔
True
Ran 7 episodes. Solved after 499 trials ✔
True
Ran 8 episodes. Solved after 499 trials ✔
True
Ran 9 episodes. Solved after 499 trials ✔
True
Ran 10 episodes. Solved after 499 trials ✔
True
Ran 11 episodes. Solved after 499 trials ✔
True
Ran 12 episodes. Solved after 499 trials ✔
True
Ran 13 episodes. Solved after 499 trials ✔
True
Ran 14 episodes. Solved after 499 trials ✔
True
Ran 15 episodes. Solved after 499 trials ✔
True
Ran 16 episodes. Solved after 499 trials ✔
True
Ran 17 episodes. Solved after 499 trials ✔
True
Ran 18 episodes. Solved after 499 trials ✔
True
Ran 19 episodes. Solved after 499 trials ✔
True
Ran 20 episodes. Solved after 499 trials ✔
True
Ran 21 episodes. Solved after 499 trials ✔
T