# average reward actor-critic with eligibility traces

Cart Pole source code: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gym
import time
from tqdm import tqdm
from itertools import product
import rl.features.tile_coding as tc
from rl.animations.video import write_video, show_video, simulate_episode

# Seed NumPy's global random number generator so that code drawing from
# np.random (e.g. the agent's np.random.choice action sampling) is repeatable.
# NOTE(review): this does not obviously seed the gym environment's own RNG --
# confirm whether the environment needs to be seeded separately.
np.random.seed(42)

# Cart Pole environment

In [None]:
"""
Description:
    A pole is attached by an un-actuated joint to a cart, which moves along
    a frictionless track. The pendulum starts upright, and the goal is to
    prevent it from falling over by increasing and reducing the cart's
    velocity.
Observation:
    Type: Box(4)
    Num     Observation               Min                     Max
    0       Cart Position             -4.8                    4.8
    1       Cart Velocity             -Inf                    Inf
    2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
    3       Pole Angular Velocity     -Inf                    Inf
Actions:
    Type: Discrete(2)
    Num   Action
    0     Push cart to the left
    1     Push cart to the right
    Note: The amount by which the velocity is reduced or increased is not
    fixed; it depends on the angle the pole is pointing. This is because
    the center of gravity of the pole increases the amount of energy needed
    to move the cart underneath it.
Reward:
    Reward is 1 for every step taken, including the termination step
Starting State:
    All observations are assigned a uniform random value in [-0.05..0.05]
Episode Termination:
    Pole Angle is more than 12 degrees.
    Cart Position is more than 2.4 (center of the cart reaches the edge of
    the display).
    Episode length is greater than 200 (500 for CartPole-v1, the version
    instantiated below).
Solved Requirements:
    Considered solved when the average return is greater than or equal to
    195.0 over 100 consecutive trials (475.0 for CartPole-v1).
"""


# Shared environment instance used for training and for the demo episodes.
# NOTE(review): the description string above quotes a 200-step limit and a
# 195.0 "solved" threshold; "CartPole-v1" is presumably registered with
# different values -- confirm against the installed gym version.
env = gym.make("CartPole-v1")

# Agent

In [None]:
class Agent:
    """Average-reward actor-critic agent with eligibility traces.

    Both the critic (state-value estimate) and the actor (softmax policy over
    linear action preferences) are linear in binary tile-coded features, so a
    "feature vector" is represented by the integer indices of its active tiles.

    Usage: call ``agent_start`` with the first observation of an episode, then
    ``agent_step`` after every environment transition. The action to execute
    next is always available as ``last_action``.

    Args:
        env: Environment providing a discrete ``action_space`` (with ``.n``).
        alpha_critic: Step size for the critic weights.
        lambda_critic: Trace-decay rate of the critic eligibility trace.
        alpha_actor: Step size for the actor weights.
        lambda_actor: Trace-decay rate of the actor eligibility traces.
        alpha_avg_reward: Step size for the average-reward estimate.
        num_tilings: Number of tilings requested from the tile coder.
        num_tiles: Scale factor applied to each normalised state dimension
            before tile coding.
        iht_size: Size of the tile coder's index hash table; also the length
            of every weight and trace vector.
    """

    def __init__(
        self,
        env,
        alpha_critic=0.01,
        lambda_critic=0.9,
        alpha_actor=0.01,
        lambda_actor=0.9,
        alpha_avg_reward=0.01,
        num_tilings=8,
        num_tiles=8,
        iht_size=4096,
    ):
        self.action_space = env.action_space
        self.actions = list(range(self.action_space.n))
        self.alpha_critic = alpha_critic
        self.lambda_critic = lambda_critic
        self.alpha_actor = alpha_actor
        self.lambda_actor = lambda_actor
        self.alpha_avg_reward = alpha_avg_reward

        # Information about the previous time step, needed for the TD update.
        self.last_action = None
        self.last_softmax_probs = None
        self.last_state = None
        self.last_features = None

        self.iht_size = iht_size
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles
        self.iht = tc.IHT(iht_size)

        # Critic weights start at one, actor preferences at zero (uniform policy).
        self.critic_w = np.ones(self.iht_size)
        self.actor_w = [np.zeros(self.iht_size) for _ in self.actions]
        # Eligibility traces are (re)created at the start of every episode.
        self.critic_z = None
        self.actor_z = None
        self.avg_reward = 0.

    def state_to_tiles(self, state):
        """Return the active tile indices (integer array) for ``state``.

        Each of the four state components is shifted and scaled to roughly
        ``[0, num_tiles]`` before being handed to the tile coder.
        """
        cart_pos, cart_vel, pole_ang, pole_vel = state
        cart_pos_scaled = ((cart_pos + 4.8) / 9.6) * self.num_tiles
        cart_vel_scaled = ((cart_vel + 5) / 10) * self.num_tiles  # assuming max cart velocity of 5
        pole_ang_scaled = ((pole_ang + 0.418) / 0.836) * self.num_tiles
        pole_vel_scaled = ((pole_vel + 5) / 10) * self.num_tiles  # assuming max pole velocity of 5
        scaled_state = [cart_pos_scaled, cart_vel_scaled, pole_ang_scaled, pole_vel_scaled]
        return np.array(tc.tiles(self.iht, self.num_tilings, scaled_state))

    @staticmethod
    def softmax_stable(logits):
        """Return softmax probabilities of ``logits``.

        The maximum logit is subtracted first so that ``exp`` cannot overflow.
        """
        shifted = logits - np.max(logits)
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    def select_action(self, softmax_probs):
        """Sample an action according to the probabilities ``softmax_probs``."""
        return np.random.choice(self.actions, p=softmax_probs)

    def agent_start(self, state):
        """Begin an episode: reset the traces, pick and return the first action."""
        features = self.state_to_tiles(state)
        softmax_probs = self.softmax_stable(self.get_all_actor_logits(features))
        self.last_action = self.select_action(softmax_probs)
        self.last_softmax_probs = softmax_probs
        self.last_state = state
        self.last_features = features
        self.critic_z = np.zeros(self.iht_size)
        self.actor_z = [np.zeros(self.iht_size) for _ in self.actions]
        return self.last_action

    def agent_step(self, reward, state, terminal):
        """Learn from one transition and choose the next action.

        Args:
            reward: Reward received for the previously selected action.
            state: Observation that resulted from that action.
            terminal: Truthy if ``state`` is terminal; the bootstrap value of
                ``state`` is then dropped from the TD error.

        The chosen action is stored in ``last_action``; nothing is returned.
        """
        features = self.state_to_tiles(state)
        softmax_probs = self.softmax_stable(self.get_all_actor_logits(features))
        action = self.select_action(softmax_probs)

        # Differential TD error.
        delta = (
            reward
            - self.avg_reward
            + self.get_critic_estimate(features) * (1 - terminal)
            - self.get_critic_estimate(self.last_features)
        )
        # Update the average-reward estimate.
        self.avg_reward += self.alpha_avg_reward * delta

        # Critic trace: decay EVERY component, then add the value gradient,
        # which is 1 on the active tiles and 0 elsewhere. (Indexing with
        # ``~features`` would be a bitwise NOT of the tile indices, i.e.
        # negative indices, and would not select the inactive tiles.)
        self.critic_z *= self.lambda_critic
        self.critic_z[self.last_features] += 1
        self.critic_w += self.alpha_critic * delta * self.critic_z

        # Actor traces: the gradient of ln pi(A|s) w.r.t. the weights of
        # action a is (1[a == A] - pi(a|s)) on the active tiles, 0 elsewhere.
        for a in self.actions:
            trace = self.actor_z[a]
            trace *= self.lambda_actor
            indicator = 1.0 if a == self.last_action else 0.0
            trace[self.last_features] += indicator - self.last_softmax_probs[a]
            self.actor_w[a] += self.alpha_actor * delta * trace

        # Remember the newly selected action for the next update.
        self.last_action = action
        self.last_softmax_probs = softmax_probs
        self.last_state = state
        self.last_features = features

    def get_critic_estimate(self, features):
        """Return the critic's value estimate for the active tiles ``features``."""
        return self.critic_w[features].sum()

    def get_actor_logit(self, features, action):
        """Return the actor's preference for ``action`` given active tiles ``features``."""
        return self.actor_w[action][features].sum()

    def get_all_actor_logits(self, features):
        """Return an array holding the preference of every action."""
        return np.array([self.get_actor_logit(features, a) for a in self.actions])

    def state_to_action(self, state):
        """Sample an action for ``state`` from the current policy without learning."""
        features = self.state_to_tiles(state)
        softmax_probs = self.softmax_stable(self.get_all_actor_logits(features))
        return self.select_action(softmax_probs)

# Helper functions

In [None]:
def episode(env, agent):
    """Play a single training episode and let the agent learn after every step.

    The environment is reset, the agent picks its first action via
    ``agent_start`` and is then fed every transition through ``agent_step``
    until the environment reports the end of the episode.

    Returns:
        A tuple ``(total_reward, observations)`` with the summed reward and the
        list of observations returned by ``env.step`` (the reset observation is
        not included).
    """
    visited = []
    total = 0
    agent.agent_start(env.reset())
    done = False
    while not done:
        obs, reward, done, info = env.step(agent.last_action)
        total += reward
        visited.append(obs)
        # An episode cut off by the time limit is not a true terminal state,
        # so the agent is told to keep bootstrapping in that case.
        truncated = info.get('TimeLimit.truncated')
        agent.agent_step(reward, obs, False if truncated else done)
    return total, visited


def run_experiment(env, agent, n_episodes=1000, title=None):
    """Train ``agent`` on ``env`` for ``n_episodes`` episodes with a progress bar.

    Args:
        env: Environment passed through to ``episode``.
        agent: Agent passed through to ``episode``; it keeps learning across
            episodes.
        n_episodes: Number of episodes to run.
        title: Optional description shown in front of the progress bar.

    Returns:
        A tuple ``(reward_list, state_lists)`` holding, per episode, the total
        reward and the list of observations returned by ``episode``.
    """
    reward_list = []
    state_lists = []
    # The description never changes, so set it once at construction instead of
    # calling set_description (which refreshes the bar) on every episode.
    for _ in tqdm(range(n_episodes), desc=title or None):
        episode_reward, state_list = episode(env, agent)
        reward_list.append(episode_reward)
        state_lists.append(state_list)
    return reward_list, state_lists


def animate_episode(env, agent, max_steps=500, fps=30):
    """Render one episode on screen without training the agent.

    Args:
        env: Environment to run and render.
        agent: Either the string ``'random'`` (actions are drawn with
            ``env.action_space.sample()``) or an object providing
            ``state_to_action(observation)``.
        max_steps: Upper bound on the number of rendered steps.
        fps: Target frames per second; the loop sleeps ``1 / fps`` seconds
            after every rendered step.

    The environment is always closed, even if stepping or rendering raises
    or the animation is interrupted.
    """
    if agent == 'random':
        def action_fnc(_obs):
            return env.action_space.sample()
    else:
        action_fnc = agent.state_to_action

    frame_delay = 1 / fps
    try:
        obs = env.reset()
        terminal = False
        steps = 0
        while not terminal and steps < max_steps:
            obs, _, terminal, _ = env.step(action_fnc(obs))
            env.render()
            steps += 1
            time.sleep(frame_delay)
    finally:
        # Release the render window even when the loop is aborted.
        env.close()

# Example episodes

In [None]:
# Baseline demo with a random agent: the policy ignores the observation and
# returns whatever env.action_space.sample() draws. The episode is played back
# through the project's simulate_episode helper.
simulate_episode(env, lambda x: env.action_space.sample(), width=500, play_type='autoplay')

# Start the experiment with hyperparameter grid search

In [None]:
# Hyperparameter grid: one agent is trained for every combination of the
# values below (2 values for each of 5 parameters = 32 runs). Keys must match
# the keyword arguments of Agent.__init__.
# NOTE(review): the following dict looks like a good configuration found in an
# earlier run -- confirm:
# {'alpha_critic': 0.0625, 'alpha_actor': 0.0078125, 'alpha_avg_reward': 0.0078125, 'lambda_critic': 0.5, 'lambda_actor': 0.8}
grid = {
    'alpha_critic': [2**-4, 2**-5],
    'alpha_actor': [2**-7, 2**-8],
    'alpha_avg_reward': [2**-6, 2**-7],
    'lambda_critic': [0.8, 0.5],
    'lambda_actor': [0.8, 0.5]
}


def get_hyperparameter_dict(grid):
    """Yield one ``{parameter: value}`` dict per point of the Cartesian grid.

    ``grid`` maps each parameter name to the list of values to try. The
    combinations are produced in ``itertools.product`` order, i.e. the last
    parameter varies fastest.
    """
    names = tuple(grid)
    for combo in product(*grid.values()):
        yield dict(zip(names, combo))

In [None]:
# Train a fresh agent for every hyperparameter combination of the grid and
# collect, per run, the trained agent, its setup, the per-episode rewards and
# all visited observations.
results = dict()
for i, agent_setup in enumerate(get_hyperparameter_dict(grid)):
    agent = Agent(env, **agent_setup)
    rewards, observations = run_experiment(env, agent, n_episodes=1000, title='Agent Setup {}: {}'.format(i, agent_setup))
    # Flatten the per-episode observation lists into one array of observations.
    observations = np.array([s for lst in observations for s in lst])
    results['run' + str(i)] = {'agent': agent, 'agent_setup': agent_setup, 'rewards': rewards, 'observations': observations}

In [None]:
# Learning curves: 50-episode rolling mean of the episode reward, one line per run.
fig, ax = plt.subplots(figsize=(16, 7))

ax.set_title("Rolling mean of total reward per episode", fontsize=20)
ax.set_xlabel("Episode Number", fontsize=16)
ax.set_ylabel("Reward per Episode", fontsize=16)
for run_result in results.values():
    smoothed_rewards = pd.Series(run_result['rewards']).rolling(window=50).mean()
    ax.plot(smoothed_rewards, label=run_result['agent_setup'])
# Anchor the legend below the axes; the offset grows with the number of runs.
ax.legend(loc='center', bbox_to_anchor=(0.5, -(len(results) / 40)));

In [None]:
# For every run, print its setup followed by the best 50-episode rolling mean reward.
for run_result in results.values():
    print(run_result['agent_setup'])
    rolling_mean = pd.Series(run_result['rewards']).rolling(window=50).mean()
    print(rolling_mean.max())

In [None]:
# Pick the agent whose mean reward over its last 50 episodes is highest
# (the first such run wins on ties) and keep its observations alongside.
best_average_reward = -np.inf
for run_result in results.values():
    final_average_reward = np.mean(run_result['rewards'][-50:])
    if not (final_average_reward > best_average_reward):
        continue
    best_average_reward = final_average_reward
    best_agent = run_result['agent']
    observations = run_result['observations']

In [None]:
# Demo of the trained agent: actions are sampled from the best agent's policy
# (state_to_action does not update any weights).
# NOTE(review): env.env presumably unwraps the time-limit wrapper so that the
# episode can run up to max_steps=2000 -- confirm.
simulate_episode(env.env, best_agent.state_to_action, max_steps=2000, width=600, play_type='autoplay')