In [55]:
import numpy as np
from collections import defaultdict

class Agent:
    """Tabular temporal-difference control agent with an epsilon-greedy policy.

    The action-value table ``Q`` maps each state to an array of ``nA``
    estimates (all zeros until updated).  ``step`` refines the table with one
    of three update rules chosen by ``control``: 'Sarsa', 'Sarsamax'
    (Q-learning) or 'Expected-Sarsa'.
    """

    # Update rules accepted by the ``control`` constructor argument.
    _CONTROLS = ('Sarsa', 'Sarsamax', 'Expected-Sarsa')

    def __init__(self, nA, control='Sarsa', alpha=0.01, gamma=1, epsilon=0.005):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - control: update rule, one of 'Sarsa', 'Sarsamax', 'Expected-Sarsa'
        - alpha: step size applied to every TD update
        - gamma: discount factor applied to the bootstrapped value
        - epsilon: exploration probability of the epsilon-greedy policy

        Raises
        ======
        - ValueError: if control is not one of the supported update rules
        """
        if control not in self._CONTROLS:
            # Fail fast: an unknown rule would otherwise make step() a silent
            # no-op and the agent would never learn.
            raise ValueError(
                "unknown control %r; expected one of %s"
                % (control, ', '.join(self._CONTROLS)))
        self.nA = nA
        self.control = control
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    def __get_probs(self, state):
        """ Return the epsilon-greedy action probabilities for a state.

        States that have no entry in the Q-table yet get a uniform
        distribution; the membership test does not create an entry.
        """
        if state not in self.Q:
            return np.ones(self.nA) / self.nA
        probs = np.ones(self.nA) * self.epsilon / self.nA
        best_a = np.argmax(self.Q[state])
        probs[best_a] = 1 - self.epsilon + self.epsilon / self.nA
        return probs

    def __td_update(self, state, action, target):
        """ Move Q[state][action] towards target by a fraction alpha. """
        old_val = self.Q[state][action]
        self.Q[state][action] = old_val + self.alpha * (target - old_val)

    def __sarsa(self, state, action, reward, next_state, done):
        """ Sarsa: bootstrap from an action sampled in next_state. """
        if done:
            # Nothing follows a terminal transition, so there is no
            # bootstrapped value to add.
            target = reward
        else:
            # The bootstrap action must come from the policy at next_state.
            # step() is not told which action the caller will actually take
            # next, so one is sampled from the current policy here.
            next_action = self.select_action(next_state)
            target = reward + self.gamma * self.Q[next_state][next_action]
        self.__td_update(state, action, target)

    def __sarsa_max(self, state, action, reward, next_state, done):
        """ Sarsamax (Q-learning): bootstrap from the best value in next_state. """
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(self.Q[next_state])
        self.__td_update(state, action, target)

    def __exp_sarsa(self, state, action, reward, next_state, done):
        """ Expected Sarsa: bootstrap from the policy-weighted value of next_state. """
        if done:
            target = reward
        else:
            # Probabilities and action values must both refer to next_state.
            probs = self.__get_probs(next_state)
            expected_sarsa = np.dot(probs, self.Q[next_state])
            target = reward + self.gamma * expected_sarsa
        self.__td_update(state, action, target)

    def select_action(self, state):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        return np.random.choice(np.arange(self.nA), p=self.__get_probs(state))

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)

        Raises
        ======
        - ValueError: if self.control was changed to an unsupported rule
        """
        updaters = {
            'Sarsa': self.__sarsa,
            'Sarsamax': self.__sarsa_max,
            'Expected-Sarsa': self.__exp_sarsa,
        }
        try:
            update = updaters[self.control]
        except KeyError:
            raise ValueError("unknown control %r" % (self.control,))
        return update(state, action, reward, next_state, done)

In [57]:
from monitor import interact
import gym
import numpy as np

# Experiment-wide settings reused by every training cell below.
# `window` is also the episode count quoted in the plot's y-axis label.
num_episodes, window = 20000, 100

# NOTE(review): newer gym releases may only register this task as 'Taxi-v3';
# confirm against the installed gym version.
env = gym.make('Taxi-v2')
nA = env.action_space.n  # number of discrete actions, handed to each Agent

In [None]:
# Sarsa experiment. interact() comes from monitor.py (not shown here); its
# two return values are presumably the windowed average-reward series and the
# best average seen — confirm against monitor.py.
sarsa_agent = Agent(nA, control='Sarsa')
sarsa_avg_rewards, sarsa_best_avg_reward = interact(
    env, sarsa_agent, num_episodes, window)

In [None]:
# Sarsamax experiment, run with the same environment, episode budget and
# averaging window as the other agents so the curves are comparable.
sarsa_max_agent = Agent(nA, control='Sarsamax')
sarsa_max_avg_rewards, sarsa_max_best_avg_reward = interact(
    env, sarsa_max_agent, num_episodes, window)

In [None]:
# Expected Sarsa experiment.
# NOTE(review): the result names here do not follow the
# `<agent>_avg_rewards` / `<agent>_best_avg_reward` pattern used by the Sarsa
# and Sarsamax cells; any consumer must read `exp_sarsa_rewards`.
exp_sarsa_agent = Agent(nA, control='Expected-Sarsa')
exp_sarsa_rewards, exp_sarsa_avg_reward = interact(
    env, exp_sarsa_agent, num_episodes, window)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the average-reward curves of the three agents on shared axes.
# The x positions are derived from the Sarsa series (the original referenced
# an undefined `avg_rewards`); this assumes interact() returned equally long
# series for every agent — TODO confirm.
x = np.linspace(0, num_episodes, len(sarsa_avg_rewards), endpoint=False)
plt.plot(x, np.asarray(sarsa_avg_rewards))
plt.plot(x, np.asarray(sarsa_max_avg_rewards))
# The Expected Sarsa cell binds its series to `exp_sarsa_rewards`, not
# `exp_sarsa_avg_rewards`.
plt.plot(x, np.asarray(exp_sarsa_rewards))

plt.xlabel('Episode Number')
plt.ylabel('Average Reward (Over Next %d Episodes)' % window)
# Legend entries follow the order of the plot() calls above.
plt.legend(['Sarsa', 'Sarsamax', 'Exp. Sarsa'], loc='lower right')

plt.show()