# SARSA

In [1]:
import import_ipynb
from Environments import Boxworld_Stochastic
from Environments import run_experiments
from Environments import run_single_exp
import numpy as np
from collections import namedtuple
import matplotlib.pyplot as plt
import random

# First define a namedtuple to hold useful information for actions
Action = namedtuple('Action', 'name index delta_i delta_j' )
up = Action('up', 0, -1, 0)    
down = Action('down', 1, 1, 0)    
left = Action('left', 2, 0, -1)    
right = Action('right', 3, 0, 1) 

# Use a dictionary to convert indices to actions using the index as a key
# Useful for sampling actions for a given state
index_to_actions = {}
for action in [up, down, left, right]:
    index_to_actions[action.index] = action 
    
# Helpful function to convert action in string format to the action object
str_to_actions = {}
for action in [up,down,left,right]:
    str_to_actions[action.name] = action

importing Jupyter notebook from Environments.ipynb


# Epsilon-Greedy Policy

In [2]:
 class E_Greedy_Policy():
        
        
        def __init__(self, epsilon, decay):
            
            self.eps = epsilon
            self.eps_start = epsilon
            self.decay = decay
            
            
        def __call__(self, state, q_values):
            
            greedy = random.random() > self.eps
            
            if greedy:
                action_index = np.argmax(q_values[state])
                
            
            else:
                action_index = random.choice( (0,1,2,3) )
                
            return index_to_actions[action_index].name
        
        # Each timestep the epsilon should decay in order for the policy to be GLIE
        def update_epsilon(self):
        
            self.epsilon = self.epsilon*self.decay
        
        def reset(self):
            self.epsilon = self.eps_start

# SARSA



In [3]:
class SARSA:
    
    def __init__(self, env, policy, gamma, alpha):
        
        self.env = env
        self.env_size = env.size
        self.policy = policy
        self.gamma = gamma
        self.alpha = alpha
        
        self.q_values = np.zeros((self.env_size**4, 4))
        
        self.coord_to_index = env.coord_to_index
        self.index_pairs_to_state = env.index_pairs_to_state
        
    def update_values(self, state, action, rew, state_next, act_next):
        
        self.q_values[state,action] = self.q_values[state,action] + self.alpha * ( rew + self.gamma * self.q_values[state_next,act_next] - self.q_values[state,action]) 
        
        
    # ** Need to pass env in order to get current coordinates of box (in init would only get starting coords...)        
    def display_values(self, env):
        
        value_matrix = np.zeros( (self.env_size,self.env_size) )
        
        # Get index of box coordinates
        box_index = self.coord_to_index[env.position_box[0],env.position_box[1]]
        print([env.position_box[0],env.position_box[1]])
            
        for i in range(1, self.env_size-1):
                for j in range(1, self.env_size-1):

                    agent_index = self.coord_to_index[i,j]
                    state = self.index_pairs_to_state[box_index,agent_index]
                    value_matrix[i,j] = max(self.q_values[state])
                        
        return value_matrix
    
    

# Test
Let's see how (if at all) the agent's performance improves after one episode of SARSA learning

In [16]:
# Function to train agent using SARSA over 1 episode
def train_SARSA_on_one_episode(env,sarsa,policy):
    s = env.reset()
    done = False
    while not done:
        action = epolicy(s, sarsa.q_values)
        
        state_next, rew, done = env.step(action)
        
        action_next = epolicy(state_next,sarsa.q_values)
        
        # Need to convert actions to indices for q-matrix
        a_index = str_to_actions[action].index
        a_next_index = str_to_actions[action_next].index
        
        sarsa.update_values(s,a_index,rew,state_next,a_next_index)
        
        s = state_next
        action = action_next
        
        policy.update_epsilon()

        
def run_single_exp(envir, policy, q_values):
    
        state = envir.reset()
        done = False

        total_reward = 0

        while not done:

            action = policy(state, q_values)
            state, reward, done = envir.step(action)

            total_reward += reward

        return total_reward


def run_experiments(envir, policy, policy_eval_algo, number_exp):
    all_rewards = []
    for n in range(number_exp):

        final_reward = run_single_exp(envir, policy, policy_eval_algo.q_values)
        all_rewards.append(final_reward)

    max_reward = max(all_rewards)
    mean_reward = np.mean(all_rewards)
    var_reward = np.std(all_rewards)

    return all_rewards, max_reward, mean_reward, var_reward

# Tests



In [24]:
boxworld = Boxworld_Stochastic(6)
epolicy = E_Greedy_Policy(1,0.999)
sarsa = SARSA(boxworld,epolicy,gamma=0.9,alpha=0.1)
epolicy.reset()
s = boxworld.reset()
boxworld.display()

X X X X X X 
X E . . ! X 
X . . . . X 
X . . A . X 
X E B . E X 
X X X X X X 



In [25]:
_, _, mu, var = run_experiments(boxworld, epolicy, sarsa, 50)
print(f' mean: {mu}, var: {var}')

for episode in range(1000):
    train_SARSA_on_one_episode(boxworld,sarsa,epolicy)
    epolicy.update_epsilon()
    
_, _, mu, var = run_experiments(boxworld, epolicy, sarsa, 50)
print(f' mean: {mu}, var: {var}')

 mean: -105.08, var: 127.49413162965581
 mean: -146.82, var: 193.73783213404656


In [29]:
boxworld.reset()
epolicy = E_Greedy_Policy(1,0.99999)
epolicy.reset()
sarsa = SARSA(boxworld,epolicy, gamma=0.6, alpha=0.1)

for episode in range(1000):
    train_SARSA_on_one_episode(boxworld,sarsa,epolicy)
    epolicy.update_epsilon()
    # Every 10 epochs, run some experiments to test how well the agent is performing
    if (episode % 50) ==0:
        
        _, _, mean_reward, var_reward = run_experiments(boxworld, epolicy, sarsa, 100)
        print('Episode: ', episode, ' ,epsilon: ',epolicy.epsilon, ' , mean reward: ',  mean_reward, ' , var reward: ',  var_reward)
        
        


Episode:  0  ,epsilon:  0.9998900054998355  , mean reward:  -152.49  , var reward:  182.56995891986173
Episode:  50  ,epsilon:  0.9634639706168329  , mean reward:  -105.18  , var reward:  127.8632378754738
Episode:  100  ,epsilon:  0.9317596783431792  , mean reward:  -119.74  , var reward:  143.17804440625665
Episode:  150  ,epsilon:  0.8963444158717474  , mean reward:  -135.38  , var reward:  151.55301250717517
Episode:  200  ,epsilon:  0.8650735690827434  , mean reward:  -125.81  , var reward:  140.4604353545866
Episode:  250  ,epsilon:  0.8337589817845581  , mean reward:  -150.14  , var reward:  170.4821410001646
Episode:  300  ,epsilon:  0.8039878749247285  , mean reward:  -125.54  , var reward:  166.2026726620243
Episode:  350  ,epsilon:  0.7775313935736281  , mean reward:  -108.63  , var reward:  134.58370295098885
Episode:  400  ,epsilon:  0.7495506079718819  , mean reward:  -134.75  , var reward:  153.44460726920317
Episode:  450  ,epsilon:  0.7252189959143652  , mean reward:  