In [21]:
import numpy as np
import math
# Set up environment
import gym
env = gym.make("CartPole-v0")

# Discretization bounds, one entry per observation component.
# Components 0 and 2 reuse the bounds declared by the observation space;
# components 1 and 3 get hand-picked finite ranges (+/-0.5 and +/-radians(50)),
# presumably because the space's own bounds for them are too wide to bucket
# usefully -- confirm against the Gym CartPole documentation.
upper_limits = [
    env.observation_space.high[0],
    0.5,
    env.observation_space.high[2],
    math.radians(50),
]
lower_limits = [
    env.observation_space.low[0],
    -0.5,
    env.observation_space.low[2],
    -math.radians(50),
]

# Number of discrete buckets used for each observation component.
buckets = (3, 3, 6, 6)

In [22]:
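# Unused: flat Q-table sized by `observation_space.n`, kept for reference only.
# The agent class below builds its table from the discretization `buckets` instead.
# q_table = np.zeros([env.observation_space.n, env.action_space.n])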

In [29]:
# Function for discretizing the env state.
# The data from the env is continuous and needs to be discretized before being
# passed to the agent.
def discretize_state(state, lower=None, upper=None, n_buckets=None):
    """Map a continuous observation onto a tuple of bucket indices.

    Each component is rescaled linearly from ``[lower[i], upper[i]]`` onto
    ``[0, n_buckets[i] - 1]``, rounded with the built-in ``round`` and then
    clamped into that range, so out-of-bounds values fall into the first or
    last bucket.

    Args:
        state: Indexable sequence of numeric observation components.
        lower: Per-component lower bounds. Defaults to the module-level
            ``lower_limits``.
        upper: Per-component upper bounds. Defaults to the module-level
            ``upper_limits``.
        n_buckets: Number of buckets per component. Defaults to the
            module-level ``buckets``.

    Returns:
        A tuple of ints, one bucket index per component of ``state``.

    Raises:
        IndexError: If ``state`` has more components than the bounds or
            bucket sequences.
        ZeroDivisionError: If a component's upper and lower bound are equal
            (plain Python numbers).
    """
    lower = lower_limits if lower is None else lower
    upper = upper_limits if upper is None else upper
    n_buckets = buckets if n_buckets is None else n_buckets

    discretized = []
    for i, value in enumerate(state):
        # Subtract the lower bound itself rather than adding abs(lower):
        # the two agree when lower <= 0, but only the subtraction is correct
        # for a range that starts above zero.
        scaling = (value - lower[i]) / (upper[i] - lower[i])
        index = int(round((n_buckets[i] - 1) * scaling))
        # Clamp so observations outside the bounds still map to a valid bucket.
        discretized.append(min(n_buckets[i] - 1, max(0, index)))
    return tuple(discretized)

In [45]:
# Define agent class
class cart_pole_agent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q-table is a NumPy array with one axis per entry of the module-level
    ``buckets`` tuple plus a trailing axis over ``env.action_space.n`` actions.
    States are expected to be tuples of bucket indices so that
    ``q_table[state]`` yields the vector of action values for that state.
    """

    # Constructor
    def __init__(self, min_learning_rate=0.1, min_epsilon=0.1, discount=1.0, decay_rate=25):
        """Create an agent with an all-zero Q-table.

        Args:
            min_learning_rate: Floor of the learning-rate schedule; also the
                initial ``learning_rate``.
            min_epsilon: Floor of the exploration schedule; also the initial
                ``epsilon``.
            discount: Discount factor applied to the successor state's value.
            decay_rate: Divisor of the episode count inside the logarithmic
                schedule; larger values keep the rates high for longer.
        """
        self.min_learning_rate = min_learning_rate
        self.learning_rate = min_learning_rate
        self.min_epsilon = min_epsilon
        self.epsilon = min_epsilon
        self.discount = discount
        self.decay_rate = decay_rate

        # Reads the module-level `buckets` and `env`.
        self.q_table = np.zeros(buckets + (env.action_space.n,))

    def choose_action(self, state):
        """Pick an action epsilon-greedily for the discretized ``state``.

        With probability ``self.epsilon`` a random action is sampled from the
        module-level ``env``'s action space; otherwise the index of the
        largest Q-value for ``state`` is returned.
        """
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        return np.argmax(self.q_table[state])

    def _decayed_value(self, episode_number, floor):
        """Shared schedule: ``1 - log10((episode + 1) / decay_rate)`` clipped to ``[floor, 1]``."""
        raw = 1.0 - math.log10((episode_number + 1) / self.decay_rate)
        return max(floor, min(1.0, raw))

    def get_learning_rate(self, episode_number):
        """Return the learning rate for ``episode_number``.

        The rate decreases as the episode number increases and never drops
        below ``min_learning_rate``.
        """
        return self._decayed_value(episode_number, self.min_learning_rate)

    def get_epsilon(self, episode_number):
        """Return the exploration rate for ``episode_number``.

        Epsilon controls random exploration; as we approach a solution the
        amount of randomness decreases, but never below ``min_epsilon``.
        """
        return self._decayed_value(episode_number, self.min_epsilon)

    def update_q_table(self, state, action, reward, new_state, done=False):
        """Apply one Q-learning update for the observed transition.

        Args:
            state: Discretized state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            new_state: Discretized successor state.
            done: Pass ``True`` when ``new_state`` is terminal so that no
                future value is bootstrapped from it. Defaults to ``False``,
                which always bootstraps from ``new_state``.
        """
        # A terminal successor has no future return to bootstrap from.
        future = 0.0 if done else self.discount * np.max(self.q_table[new_state])
        td_error = reward + future - self.q_table[state][action]
        self.q_table[state][action] += self.learning_rate * td_error
    
    

In [48]:
episodes = 500
agent = cart_pole_agent()

# Train the agent
for episode in range(episodes):
    # Refresh the exploration rate and step size for this episode
    agent.epsilon = agent.get_epsilon(episode)
    agent.learning_rate = agent.get_learning_rate(episode)

    # Begin the episode from a freshly reset, discretized state
    current_state = discretize_state(env.reset())

    # Every episode takes at least one step, so test `done` after stepping
    while True:
        action = agent.choose_action(current_state)
        new_state, reward, done, _ = env.step(action)
        new_state = discretize_state(new_state)

        # Learn from the transition, then move on to the successor state
        agent.update_q_table(current_state, action, reward, new_state)
        current_state = new_state

        if done:
            break

print("Training has finished!")

Training has finished!


In [54]:
# Run the trained agent

episodes = 5
try:
    for episode in range(episodes):
        done = False
        current_state = discretize_state(env.reset())

        while not done:
            env.render()
            # Act greedily during evaluation. agent.choose_action() would still
            # pick a random action with probability agent.epsilon, which the
            # training loop leaves at a non-zero value, so the rollout would
            # not show the learned policy.
            action = np.argmax(agent.q_table[current_state])
            observation, reward, done, _ = env.step(action)
            current_state = discretize_state(observation)
finally:
    # Always release the render window, even if a step or render call fails
    # or the cell is interrupted.
    env.close()