In [7]:
import sys
sys.path.append('c:/users/sadia/documents/python scripts/envs/gym/lib/site-packages')
import random
import math
import numpy as np
import gym
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

In [8]:
# cartpole bucket hyperparameters
#
# Each of the four CartPole observation components is discretised by
# Qlearner._bucket into *_BUCKETS equal-width buckets covering *_RANGE, given
# as (min, max). Values outside the range are clamped into the first or last
# bucket. The product of the four bucket counts is the number of rows in the
# Q-table.

# cart position
CARTPOLE_POSITION_BUCKETS = 2
CARTPOLE_POSITION_RANGE = (-2.0, 2.0)
# cart velocity
CARTPOLE_VELOCITY_BUCKETS = 8
CARTPOLE_VELOCITY_RANGE = (-1.2, 1.2)
# pole angle (theta)
CARTPOLE_THETA_BUCKETS = 16
CARTPOLE_THETA_RANGE = (-0.08, 0.08)
# pole angular velocity
CARTPOLE_THETA_VELOCITY_BUCKETS = 6
CARTPOLE_THETA_VELOCITY_RANGE = (-1.2, 1.2)

In [9]:
class Random:
    """Baseline agent that ignores observations and acts at random."""

    def action(self, env):
        """Return an action sampled from ``env.action_space``."""
        space = env.action_space
        return space.sample()

In [10]:
class Qlearner():
    """Tabular, epsilon-greedy Q-learning agent for CartPole.

    Continuous observations are discretised into buckets (see the
    CARTPOLE_* module constants) and flattened into a single row index of
    ``self.q_table``; each row holds one value per action.
    """

    def __init__(self, parameters):
        """Store the hyperparameters.

        parameters: dict with keys 'alpha' (weight given to the new estimate
        in the update), 'gamma' (discount applied to the best next-state
        value) and 'epsilon' (probability of taking a random action while
        training).
        """
        self.alpha = parameters['alpha']
        self.gamma = parameters['gamma']
        self.epsilon = parameters['epsilon']
        super().__init__()

    def initialize_cartpole_q_table(self, env):
        """Create the Q-table and seed it with a weak theta-based prior.

        The table has one row per discretised observation and one column per
        action in ``env.action_space``.
        """
        obs_space = (CARTPOLE_POSITION_BUCKETS * CARTPOLE_VELOCITY_BUCKETS
                     * CARTPOLE_THETA_BUCKETS * CARTPOLE_THETA_VELOCITY_BUCKETS)
        self.q_table = np.zeros([obs_space, env.action_space.n])

        # establish weak priors to optimise training - if theta < 0, move left,
        # if theta > 0 move right.
        # In the flat index built by _cartpole_obs_index, theta velocity is the
        # fastest-varying component and theta the next one, so the zero-based
        # theta bucket of row i is (i // THETA_VELOCITY_BUCKETS) % THETA_BUCKETS.
        rows = np.arange(obs_space)
        theta_bucket = (rows // CARTPOLE_THETA_VELOCITY_BUCKETS) % CARTPOLE_THETA_BUCKETS
        negative_theta = theta_bucket < (CARTPOLE_THETA_BUCKETS / 2)
        self.q_table[negative_theta, 0] = 0.1
        self.q_table[~negative_theta, 1] = 0.1

    def cartpole_training_action(self, env, observation):
        """Pick an epsilon-greedy action and remember ``observation``.

        The observation is stored as ``self.previous_observation`` so that
        the following ``cartpole_update`` call knows which row to update.
        """
        self.previous_observation = observation
        if random.uniform(0, 1) < self.epsilon:
            return env.action_space.sample() # explore
        else:
            return np.argmax(self.q_table[self._cartpole_obs_index(observation)]) # exploit

    def cartpole_evaluation_action(self, observation):
        """Return the greedy (highest-valued) action for ``observation``."""
        return np.argmax(self.q_table[self._cartpole_obs_index(observation)])

    def cartpole_update(self, observation, action, reward):
        """Apply one Q-learning update for the transition just taken.

        Updates the Q-table entry of (previous observation, action) using the
        reward gained plus the discounted best value available from
        ``observation``. No terminal flag is passed in, so the best next-state
        value is always included.
        """
        previous_index = self._cartpole_obs_index(self.previous_observation)
        old_value = self.q_table[previous_index, action]
        next_max = np.max(self.q_table[self._cartpole_obs_index(observation)])
        new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * next_max)
        self.q_table[previous_index, action] = new_value

    def _cartpole_obs_index(self, observation):
        """Map a continuous 4-component observation to a Q-table row index.

        Because cartpole observations are continuous, each component is
        bucketed and the (zero-based) buckets are combined in row-major
        order: position, velocity, theta, theta velocity.
        """
        position, velocity, theta, theta_velocity = observation

        bucketed_position = self._bucket(position, CARTPOLE_POSITION_BUCKETS, CARTPOLE_POSITION_RANGE)
        bucketed_velocity = self._bucket(velocity, CARTPOLE_VELOCITY_BUCKETS, CARTPOLE_VELOCITY_RANGE)
        bucketed_theta = self._bucket(theta, CARTPOLE_THETA_BUCKETS, CARTPOLE_THETA_RANGE)
        bucketed_theta_velocity = self._bucket(theta_velocity, CARTPOLE_THETA_VELOCITY_BUCKETS, CARTPOLE_THETA_VELOCITY_RANGE)

        # _bucket returns one-based bucket numbers, hence the "- 1" terms
        position_index = (bucketed_position - 1) * CARTPOLE_VELOCITY_BUCKETS * CARTPOLE_THETA_BUCKETS * CARTPOLE_THETA_VELOCITY_BUCKETS
        velocity_index = (bucketed_velocity - 1) * CARTPOLE_THETA_BUCKETS * CARTPOLE_THETA_VELOCITY_BUCKETS
        theta_index = (bucketed_theta - 1) * CARTPOLE_THETA_VELOCITY_BUCKETS
        theta_velocity_index = (bucketed_theta_velocity - 1)

        return position_index + velocity_index + theta_index + theta_velocity_index

    def _bucket(self, observation, num_buckets, obs_range):
        """Return the one-based bucket number of ``observation``.

        ``obs_range`` is a (min, max) pair split into ``num_buckets``
        equal-width buckets. Values at or below the minimum map to bucket 1
        and values above the maximum map to ``num_buckets``.
        """
        r_min, r_max = obs_range
        bucket_size = (r_max - r_min) / num_buckets
        # offset from the lower bound, so the range need not be centred on zero
        bucket = math.ceil((observation - r_min) / bucket_size)

        # bound
        return max(1, min(bucket, num_buckets))
        

In [12]:
class Driver:
    """Runs an agent in an environment: train, evaluate, plot, demonstrate.

    The environment is used through ``reset()``, ``step(action)`` (returning
    ``observation, reward, done, info``) and ``render()``.
    """

    def __init__(self, params):
        """Read the run configuration.

        params: dict with keys 'epochs' (number of train/evaluate rounds),
        'env' (the environment) and 'agent' (the agent to drive).
        """
        self.epochs = params['epochs']
        self.env = params['env']
        self.agent = params['agent']
        # total reward of each training / evaluation episode, in order
        self.training_rewards = []
        self.evaluation_rewards = []

    def run_cartpole_random(self):
        """Run the full loop with an agent exposing ``action(env)``."""
        training_action = lambda _observation: self.agent.action(self.env)
        update = lambda _observation, _action, _reward: None  # nothing to learn
        evaluation_action = training_action

        self.run(training_action, update, evaluation_action)

    def run_cartpole_qlearner(self):
        """Initialise the agent's Q-table, then run the full loop."""
        self.agent.initialize_cartpole_q_table(self.env)

        training_action = lambda observation: self.agent.cartpole_training_action(self.env, observation)
        update = lambda observation, action, reward: self.agent.cartpole_update(observation, action, reward)
        evaluation_action = lambda observation: self.agent.cartpole_evaluation_action(observation)

        self.run(training_action, update, evaluation_action)

    def run_cartpole_tdlearner(self):
        """Initialise the agent's policy, then run the full loop."""
        self.agent.initialize_cartpole_q_policy(self.env)

        training_action = lambda observation: self.agent.cartpole_training_action(self.env, observation)
        update = lambda observation, action, reward: self.agent.cartpole_update(observation, action, reward)
        evaluation_action = lambda observation: self.agent.cartpole_evaluation_action(observation)

        self.run(training_action, update, evaluation_action)

    # main engine: training and evaluation loop, plot then demonstrate
    def run(self, training_action, update, evaluation_action):
        """Alternate one training and one evaluation episode per epoch.

        training_action / evaluation_action: callables mapping an observation
        to an action. update: callable taking (observation, action, reward)
        after every training step.
        """
        for i in range(self.epochs):
            # report progress every 1000 epochs
            if ((i + 1) % 1000 == 0):
                print("progress: {}%".format(100 * (i + 1) // self.epochs))
            self.train_once(training_action, update)
            self.evaluate_once(evaluation_action)

        self.plot()

        try:
            self.demonstrate(evaluation_action)
        except NotImplementedError:
            print("Cannot demonstrate: render method on env not implemented.")

    # a single instance of training of the agent in the environment
    def train_once(self, training_action, update):
        """Play one episode, calling ``update`` after every step.

        Appends the episode's total reward to ``self.training_rewards``.
        """
        observation = self.env.reset()
        done = False
        episode_reward = 0
        while not done:
            action = training_action(observation)
            observation, reward, done, info = self.env.step(action)
            episode_reward += reward
            # observation is already the post-step observation here
            update(observation, action, reward)
        self.training_rewards.append(episode_reward)

    # a single instance of evaluation of the agent at its current level of training
    def evaluate_once(self, evaluation_action):
        """Play one episode without learning.

        Appends the episode's total reward to ``self.evaluation_rewards``.
        """
        observation = self.env.reset()
        done = False
        episode_reward = 0
        while not done:
            action = evaluation_action(observation)
            observation, reward, done, info = self.env.step(action)
            episode_reward += reward
        self.evaluation_rewards.append(episode_reward)

    # plot training and evaluation reward levels at each epoch
    def plot(self):
        """Show training and evaluation rewards as two stacked line plots."""
        # subplot position given as integers (rows, columns, index); a string
        # such as '211' is not accepted by current Matplotlib releases
        plt.subplot(2, 1, 1)
        plt.plot(self.training_rewards, linewidth=1)
        plt.title('Training reward over time')
        plt.ylabel('reward')
        plt.xlabel('iterations')

        plt.subplot(2, 1, 2)
        plt.plot(self.evaluation_rewards, linewidth=1)
        plt.title('Evaluation reward over time')
        plt.ylabel('reward')
        plt.xlabel('iterations')

        plt.show()

    # use the environments render method and print some additional info
    # to the console. permit user input for repeated demonstrations in a loop
    def demonstrate(self, evaluation_action):
        """Render episodes driven by ``evaluation_action`` until the user stops.

        After each episode the user is prompted; entering exactly 'Y' runs
        another one.
        """
        user_input = 'Y'
        while (user_input == 'Y'):
            observation = self.env.reset()
            done = False
            episode_reward = 0
            reward = 0  # reward of the previous step; none yet on step 0
            step = 0
            while not done:
                print(f"Step: {step} | Cumulative Reward: {episode_reward}")
                step += 1
                print("RENDERING...")
                self.env.render()
                action = evaluation_action(observation)
                print('observation: ', observation)
                print('action: ', action)
                print('reward: ', reward)
                observation, reward, done, info = self.env.step(action)
                episode_reward += reward

            user_input = input('Enter Y for another demo: ')
    

In [None]:
def cartpole_random():
    """Run the random-action baseline on CartPole-v1 for 1000 epochs."""
    random_agent = Random()
    settings = {
        'epochs': 1000,
        'env': gym.make('CartPole-v1'),
        'agent': random_agent,
    }
    Driver(settings).run_cartpole_random()

def cartpole_qlearner():
    """Train and evaluate a Qlearner on CartPole-v1 for 50000 epochs."""
    hyperparameters = {
        'alpha': 0.2,
        'gamma': 0.5,
        'epsilon': 0.1,
    }
    q_agent = Qlearner(hyperparameters)
    settings = {
        'epochs': 50000,
        'env': gym.make('CartPole-v1'),
        'agent': q_agent,
    }
    Driver(settings).run_cartpole_qlearner()

def cartpole_tdlearner():
    """Train and evaluate a TDlearner on CartPole-v1 for 50000 epochs."""
    hyperparameters = {
        'alpha': 0.2,
        'gamma': 0.5,
        'epsilon': 0.1,
    }
    td_agent = TDlearner(hyperparameters)
    settings = {
        'epochs': 50000,
        'env': gym.make('CartPole-v1'),
        'agent': td_agent,
    }
    Driver(settings).run_cartpole_tdlearner()
# script entry point: runs the Q-learning experiment; swap which call is
# commented out to run one of the other agents instead
if __name__ == '__main__':
    #cartpole_random()
    cartpole_qlearner()
    #cartpole_tdlearner()

progress: 2%
progress: 4%
progress: 6%
progress: 8%
progress: 10%
progress: 12%
progress: 14%
progress: 16%
progress: 18%
progress: 20%
progress: 22%
progress: 24%
progress: 26%
progress: 28%
progress: 30%
progress: 32%
progress: 34%
progress: 36%
progress: 38%
progress: 40%
progress: 42%
progress: 44%
progress: 46%
progress: 48%
progress: 50%
progress: 52%
progress: 54%
progress: 56%
progress: 58%
progress: 60%
progress: 62%
progress: 64%
progress: 66%
progress: 68%
progress: 70%
progress: 72%
progress: 74%
progress: 76%
progress: 78%
progress: 80%
progress: 82%
progress: 84%
progress: 86%
progress: 88%
progress: 90%
progress: 92%
progress: 94%
progress: 96%
progress: 98%
progress: 100%
