In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gym
import time
from tqdm import tqdm
from IPython.display import clear_output

# Seed NumPy's global random generator so that the np.random calls made in
# this notebook are repeatable across "Restart & Run All".
# NOTE(review): gym's environment / action-space sampling presumably uses its
# own generators and is not covered by this seed -- confirm if exact
# reproducibility matters.
np.random.seed(42)

# Taxi V3 environment


<img src="https://storage.googleapis.com/lds-media/images/Reinforcement_Learning_Taxi_Env.width-1200.png">

In [None]:
"""
Passenger locations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)
- 4: in taxi

Destinations:
- 0: R(ed)
- 1: G(reen)
- 2: Y(ellow)
- 3: B(lue)

Actions:
There are 6 discrete deterministic actions:
- 0: move south
- 1: move north
- 2: move east
- 3: move west
- 4: pickup passenger
- 5: drop off passenger

Rewards:
There is a default per-step reward of -1,
except for delivering the passenger, which is +20,
or executing "pickup" and "drop-off" actions illegally, which is -10.
"""


# Create the Taxi-v3 environment. This single `env` instance is shared by the
# cells below (agent construction, training, animation and rendering).
env = gym.make('Taxi-v3')

# Agent class

In [None]:
class Agent:
    """Tabular temporal-difference control agent with an epsilon-greedy policy.

    Supports three update rules, selected by ``agent_type``: ``'q-learning'``,
    ``'sarsa'`` and ``'expected-sarsa'``.

    The agent is driven through three calls:

    * ``agent_start(state)`` picks and stores the first action of an episode;
    * ``agent_step(reward, state)`` applies a bootstrapped update for the
      stored (state, action) pair and stores the next pair;
    * ``agent_end(reward)`` applies an update whose target is the reward only
      (no bootstrapping).

    Parameters
    ----------
    env
        Environment exposing ``observation_space.n`` and ``action_space.n``.
    agent_type : str
        One of ``'q-learning'``, ``'sarsa'``, ``'expected-sarsa'``.
    initial_estimate : float
        Initial value of every entry of the Q-table.
    step_size : float
        Learning rate applied to the TD error.
    discount : float
        Discount factor applied to the next-state value estimate.
    epsilon : float
        Probability of taking a uniformly random action.

    Raises
    ------
    ValueError
        If ``agent_type`` is not one of the supported names.
    """

    def __init__(self, env, agent_type, initial_estimate=0, step_size=0.1, discount=1., epsilon=0.1):
        # Q-table of shape (n_states, n_actions); always floating point.
        self.q_values = np.full(
            [env.observation_space.n, env.action_space.n], initial_estimate, dtype=float
        )
        self.action_space = env.action_space
        self.learning_step_fnc = self.get_learning_fnc(agent_type)
        self.discount = discount
        self.step_size = step_size
        self.epsilon = epsilon

        # (state, action) pair whose value is updated by the next learning step.
        self.last_action = None
        self.last_state = None

    def get_learning_fnc(self, agent_type):
        """Return the bound update method that implements ``agent_type``."""
        if agent_type == 'q-learning':
            return self.q_learning_update_step
        elif agent_type == 'sarsa':
            return self.sarsa_update_step
        elif agent_type == 'expected-sarsa':
            return self.expected_sarsa_update_step
        else:
            raise ValueError("Agent type '{}' is not supported.".format(agent_type))

    @staticmethod
    def argmax(q_values):
        """Return the index of the largest value, breaking ties uniformly at random."""
        max_value = np.max(q_values)
        max_indices = np.where(q_values == max_value)[0]
        return np.random.choice(max_indices)

    def select_action(self, state):
        """Pick an epsilon-greedy action for ``state``.

        Exploratory actions are drawn with ``np.random`` (rather than the
        action space's own sampler) so that ``np.random.seed`` makes the
        agent's whole action selection repeatable.
        """
        if np.random.uniform() < self.epsilon:
            return np.random.randint(self.action_space.n)
        return self.argmax(self.q_values[state, :])

    def agent_start(self, state):
        """Choose and remember the first action of an episode."""
        self.last_action = self.select_action(state)
        self.last_state = state

    def agent_step(self, reward, state):
        """Learn from the transition into ``state`` and choose the next action.

        The next action is selected *before* the update because SARSA needs
        it to build its target.
        """
        # select next action
        action = self.select_action(state)
        # update q-value estimates
        self.learning_step_fnc(reward, state, action)
        # remember the new (state, action) pair for the following update
        self.last_action = action
        self.last_state = state

    def agent_end(self, reward):
        """Update the stored (state, action) pair with target ``reward`` (no bootstrap)."""
        td_error = reward - self.q_values[self.last_state, self.last_action]
        self.q_values[self.last_state, self.last_action] += self.step_size * td_error

    def q_learning_update_step(self, reward, state, _):
        """Q-learning: bootstrap from the largest action value in ``state``."""
        previous_q_values = self.q_values[self.last_state, :]
        current_q_values = self.q_values[state, :]
        # q learning update: q <- q + alpha * ((r + discount * q_next) - q)
        td_error = reward + self.discount * np.max(current_q_values) - previous_q_values[self.last_action]
        self.q_values[self.last_state, self.last_action] += self.step_size * td_error

    def sarsa_update_step(self, reward, state, action):
        """SARSA: bootstrap from the value of the action actually selected next."""
        previous_q_values = self.q_values[self.last_state, :]
        current_q_values = self.q_values[state, :]
        td_error = reward + self.discount * current_q_values[action] - previous_q_values[self.last_action]
        self.q_values[self.last_state, self.last_action] += self.step_size * td_error

    def expected_sarsa_update_step(self, reward, state, action):
        """Expected SARSA: bootstrap from the expected value under the epsilon-greedy policy.

        ``action`` is accepted for signature compatibility with the other
        update rules and is not used.
        """
        previous_q_values = self.q_values[self.last_state, :]
        current_q_values = self.q_values[state, :]
        # Under epsilon-greedy the greedy action has probability
        # (1 - eps) + eps / n and every other action eps / n, so
        #   E[q] = (1 - eps) * max(q) + eps * mean(q).
        # Tied greedy actions share the same value, so no tie-breaking (and no
        # random draw) is needed to compute the expectation.
        expected_value = (
            (1 - self.epsilon) * np.max(current_q_values)
            + self.epsilon * np.mean(current_q_values)
        )
        # apply the update
        td_error = reward + self.discount * expected_value - previous_q_values[self.last_action]
        self.q_values[self.last_state, self.last_action] += self.step_size * td_error

    def get_locations(self, env, pass_loc, dest_idx):
        """
        Return the encoded state index of every cell of the 5x5 grid (in
        row-major order) for a fixed passenger location and destination.

        dest_idx: 0 -> (0,0), 1 -> (0,4), 2 -> (4,0), 3 -> (4,3)
        pass_loc same as for dest_idx, except that there's also
        loc 4 -> 'inside taxi'.
        """
        return np.array([
            env.encode(i, j, pass_loc=pass_loc, dest_idx=dest_idx)
            for i in range(5)
            for j in range(5)
        ])

    def render(self, env, pass_loc):
        """Draw one 5x5 heatmap per destination of the greedy state value max_a Q(s, a).

        ``pass_loc`` fixes the passenger location used for all four panels.
        """
        dest_loc_map = {0: (0,0), 1: (0,4), 2: (4,0), 3: (4,3)}

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # Greedy value of every state; identical for all panels, so compute once.
        state_values = np.max(self.q_values, axis=1)

        dest_indices = range(4)
        for ax, dest_idx in zip(np.ravel(axes), dest_indices):
            ax.set_title('Destination: {}'.format(dest_loc_map[dest_idx]), fontsize=18)
            locs = self.get_locations(env, pass_loc=pass_loc, dest_idx=dest_idx)
            sns.heatmap(state_values[locs].reshape((5,5)), cmap='rocket', ax=ax)
            # get_locations enumerates the first encode() argument in the outer
            # loop, so heatmap rows follow it and columns follow the second.
            ax.set_xlabel('taxi column')
            ax.set_ylabel('taxi row')

# Functions to run, train on, and animate episodes

In [None]:
def episode(env, agent):
    """Run one (training) episode and return its cumulative (undiscounted) reward.

    The environment is expected to follow the API used here: ``env.reset()``
    returns the initial observation and ``env.step(action)`` returns a
    ``(observation, reward, terminal, info)`` 4-tuple.

    Each transition is handed to the agent exactly once: non-terminal
    transitions go to ``agent.agent_step`` (bootstrapped update + next action),
    the terminal transition goes to ``agent.agent_end`` (update toward the
    reward only). Calling ``agent_step`` on the terminal transition as well
    would bootstrap from the terminal state's Q-values and then make
    ``agent_end`` update an action value *of the terminal state* instead of
    the state-action pair that actually led there.
    """
    observation = env.reset()
    agent.agent_start(observation)
    cumulative_reward = 0
    terminal = False
    while not terminal:
        observation, reward, terminal, _ = env.step(agent.last_action)
        cumulative_reward += reward
        if terminal:
            agent.agent_end(reward)
        else:
            agent.agent_step(reward, observation)
    return cumulative_reward


def run_experiment(env, agent, n_episodes=2000):
    """Train ``agent`` on ``env`` and return the list of per-episode cumulative rewards.

    Parameters
    ----------
    env, agent
        Passed unchanged to ``episode`` for every episode.
    n_episodes : int
        Number of episodes to run (default 2000).

    Returns
    -------
    list
        One cumulative reward per episode, in the order the episodes were run.
    """
    return [episode(env, agent) for _ in tqdm(range(n_episodes))]


def animate_episode(agent, interval, title=None):
    """Play a single episode on the module-level ``env`` and redraw it after every step.

    ``agent.select_action`` chooses each action; no learning update is made.
    ``interval`` is the pause, in seconds, before each step. When ``title`` is
    truthy the frame is printed as ``title`` followed by the 'ansi' rendering,
    otherwise the environment renders itself in 'human' mode.
    """

    def _draw_frame():
        # Text frame with a heading, or the environment's own display.
        if not title:
            env.render('human')
            return
        print(title + env.render('ansi'))

    clear_output(wait=True)
    state = env.reset()
    _draw_frame()

    # The episode cannot be over before the first step, so step-then-check.
    while True:
        time.sleep(interval)
        chosen_action = agent.select_action(state)
        state, _reward, finished, _info = env.step(chosen_action)
        clear_output(wait=True)
        _draw_frame()
        if finished:
            break

def animate_episodes(agent, n_episodes, interval):
    """Animate ``n_episodes`` episodes back to back, each under a numbered heading.

    Episodes are numbered from 1; ``interval`` is forwarded to
    ``animate_episode`` unchanged.
    """
    for zero_based_index in range(n_episodes):
        heading = ' episode {} \n'.format(zero_based_index + 1)
        animate_episode(agent, interval, heading)

# Initialize agent

In [None]:
# Q-learning agent with a zero-initialised Q-table, 1% exploration (epsilon)
# and a step size of 0.1.
# Uncomment the lines below to also create SARSA / Expected SARSA agents with
# the same initial estimate and epsilon for a side-by-side comparison.
agent_qlearning = Agent(env, 'q-learning', initial_estimate=0, epsilon=0.01, step_size=0.1)
# agent_sarsa = Agent(env, 'sarsa', initial_estimate=0, epsilon=0.01)
# agent_expected_sarsa = Agent(env, 'expected-sarsa', initial_estimate=0, epsilon=0.01)

In [None]:
# Baseline: watch the agent for 2 episodes *before* any training has happened
# (0.05 s pause between frames).
animate_episodes(agent_qlearning, 2, 0.05)

# Run experiment

In [None]:
# Train the agent and keep the cumulative reward of every episode for plotting.
# Uncomment the matching lines to train the SARSA / Expected SARSA agents too
# (they must have been created in the "Initialize agent" cell first).
rewards_qlearning = run_experiment(env, agent_qlearning)
# rewards_sarsa = run_experiment(env, agent_sarsa)
# rewards_expected_sarsa = run_experiment(env, agent_expected_sarsa)

In [None]:
# Learning curve: cumulative reward per episode, smoothed with a 20-episode
# rolling mean. Everything is drawn on the explicit `ax` so the figure carries
# its own title, axis labels and legend.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(pd.Series(rewards_qlearning).rolling(window=20).mean(), label='Q-learning')
# ax.plot(pd.Series(rewards_sarsa).rolling(window=20).mean(), label='SARSA')
# ax.plot(pd.Series(rewards_expected_sarsa).rolling(window=20).mean(), label='Expected SARSA')
ax.set_title('Learning curve on Taxi-v3')
ax.set_xlabel('Episode')
ax.set_ylabel('Cumulative reward (20-episode rolling mean)')
ax.legend();

# Animate learned agent

In [None]:
# Replay 5 episodes with the trained agent (0.2 s pause between frames).
# Actions still come from `select_action`, so the agent keeps exploring with
# probability epsilon during the replay.
animate_episodes(agent_qlearning, 5, 0.2)

In [None]:
# Heatmaps of the learned greedy state values (max over actions of Q), one
# panel per destination, with the passenger already in the taxi (pass_loc=4).
agent_qlearning.render(env, 4)