In [37]:
import jdc
# --
import numpy as np
# --
from rl_glue import RLGlue
# --
from Agent import BaseAgent 
from Environment import BaseEnvironment  
# --
from manager import Manager
# --
from itertools import product
# --
from tqdm import tqdm

In [38]:
# Create empty CliffWalkEnvironment class.
# These methods will be filled in later cells.
class WindyWorld(BaseEnvironment):
    def env_init(self, env_info={}):
        raise NotImplementedError

    def env_start(self, state):
        raise NotImplementedError

    def env_step(self, state):
        raise NotImplementedError

    def env_end(self, reward):
        raise NotImplementedError
        
    def env_cleanup(self, reward):
        raise NotImplementedError
    
    # helper method
    def state(self, loc):
        raise NotImplementedError

In [39]:
%%add_to WindyWorld
def env_init(self, env_info={}):
    """Setup for the environment called when the experiment first starts.
    Note:
    Initialize a tuple with the reward, first state, boolean
    indicating if it's terminal.
    """
        
    # Note, we can setup the following variables later, in env_start() as it is equivalent. 
    # Code is left here to adhere to the note above, but these variables are initialized once more
    # in env_start() [See the env_start() function below.]
        
    reward = None
    state = None # See Aside
    termination = None
    self.reward_state_term = (reward, state, termination)
        
    # Assign width and height of the enviroment
    self.grid_w = env_info.get('grid_width', 10)
    self.grid_h = env_info.get('grid_height', 7)
        
    # Assign goal state and start state for the agent
    self.start_loc = (self.grid_h//2, 0)
    self.goal_loc = (self.grid_h//2, self.grid_w - 3)
        
    self.winds = env_info.get('winds', [0, 0, 0, 1, 1, 1, 2, 2, 1, 0])
        
    assert (len(self.winds) == self.grid_w)    
        

In [40]:
%%add_to WindyWorld
def state(self, loc):
    # Return the state for a location. Each loc has its code
    loc_cod = loc[0] * self.grid_w + loc[1]
    return loc_cod

In [72]:
a.append('a')
a

[array([1, 2, 3, 4, 5]), array([1, 2, 3, 4, 5]), 'a', 'a']

In [79]:
%%add_to WindyWorld

def display_policy(self, policy):
    policy_graph = np.zeros((self.grid_h, self.grid_w))
    for i in range(self.grid_h):
        for j in range(self.grid_w):
            loc = (i, j)
            loc_cod = self.state(loc) 
            best_action = np.argmax(policy[loc_cod])
            if best_action == 0:
                policy_graph[i,j] = 0
            elif best_action == 1:
                policy_graph[i,j] = 1
            elif best_action == 2:
                policy_graph[i,j] = 2
            else:
                policy_graph[i,j] = 3
    print('------------------------------------------')
    print(policy_graph)
            

In [42]:
%%add_to WindyWorld
def env_start(self):
    """The first method called when the episode starts, called before the
    agent starts.

    Returns:
        The first state from the environment.
    """
    reward = 0
    self.agent_loc = self.start_loc
    state = self.state(self.agent_loc)
    termination = False
    self.reward_state_term = (reward, state, termination)
    
    return state

In [43]:
%%add_to WindyWorld

def env_step(self, action):
    """A step taken by the environment.

    Args:
        action: The action taken by the agent

    Returns:
        (float, state, Boolean): a tuple of the reward, state,
            and boolean indicating if it's terminal.
    """
#     print('--------------')
#     print('Action: ',action)
#     print('Initial Agent loc: ',self.agent_loc)
    state_wind = self.winds[self.agent_loc[1]]
    loc_after_wind = None
    if self.agent_loc[0] - state_wind >= 0:
        loc_after_wind = (self.agent_loc[0] - state_wind, self.agent_loc[1])
    elif self.agent_loc[0] - state_wind == -1:
        loc_after_wind = (0, self.agent_loc[1])
    else:
        loc_after_wind = (self.agent_loc[0], self.agent_loc[1])
    
#     print('After wind loc: ',loc_after_wind, ' Wind: ', state_wind)
    
    if action == 0: # UP (Task 1)
        ### START CODE HERE ###
        # Hint: Look at the code given for the other actions and think about the logic in them.
        # Stay.
        possible_next_loc = (loc_after_wind[0] - 1, loc_after_wind[1])
        if possible_next_loc[0] >= 0:
            self.agent_loc = possible_next_loc
        else:
            pass
        ### END CODE HERE ###
    elif action == 1: # LEFT
        possible_next_loc = (loc_after_wind[0], loc_after_wind[1] - 1)
        if possible_next_loc[1] >= 0: # Within Bounds?
            self.agent_loc = possible_next_loc
        else:
            pass # Stay.
    elif action == 2: # DOWN
        possible_next_loc = (loc_after_wind[0] + 1, loc_after_wind[1])
        if possible_next_loc[0] < self.grid_h: # Within Bounds?
            self.agent_loc = possible_next_loc
        else:
            pass # Stay.
    elif action == 3: # RIGHT
        possible_next_loc = (loc_after_wind[0], loc_after_wind[1] + 1)
        if possible_next_loc[1] < self.grid_w: # Within Bounds?
            self.agent_loc = possible_next_loc
        else:
            pass # Stay.
    else: 
        raise Exception(str(action) + " not in recognized actions [0: Up, 1: Left, 2: Down, 3: Right]!")
    
#     print('Final Agent loc: ',self.agent_loc)
    reward = -1
    terminal = False

    ### START CODE HERE ###
    # Hint: Consider the initialization of reward and terminal variables above. Then, note the 
    # conditional statements and comments given below and carefully ensure to set the variables reward 
    # and terminal correctly for each case.
    if self.agent_loc == self.goal_loc: # Reached Goal!
        terminal = True      
    ### END CODE HERE ###
#     print('New Location ',self.agent_loc)
#     print('R_S_T: ', reward, self.state(self.agent_loc), terminal)
    self.reward_state_term = (reward, self.state(self.agent_loc), terminal)
    return self.reward_state_term

In [44]:
%%add_to WindyWorld

def env_cleanup(self):
    """Cleanup done after the environment ends"""
    self.agent_loc = self.start_loc

In [45]:
class WWAgent(BaseAgent):
    def agent_init(self, agent_info={}):
        raise NotImplementedError
        
    def agent_start(self, state):
        raise NotImplementedError

    def agent_step(self, reward, state):
        raise NotImplementedError

    def agent_end(self, reward):
        raise NotImplementedError

    def agent_cleanup(self):        
        raise NotImplementedError
        
    def agent_message(self, message):
        raise NotImplementedError
    
    def update_policy(self):
        raise NotImplementedError

In [46]:
%%add_to WWAgent

def agent_init(self, agent_info={}):
    """Setup for the agent called when the experiment first starts."""

    self.grid_h = agent_info.get("grid_height", 7)
    self.grid_w = agent_info.get("grid_height", 10)
    
    
    # Create a random number generator with the provided seed to seed the agent for reproducibility.
    self.rand_generator = np.random.RandomState(agent_info.get("seed",0))
    self.possible_actions = 4
    
    # Define epsilon
    self.epsilon = agent_info.get("epsilon",0.1)
    
    # Create a random policy based in epsilon - greedy
    self.policy = agent_info.get("policy", np.ones((self.grid_w * self.grid_h, self.possible_actions)))
    self.policy *= (self.epsilon / (self.possible_actions - 1))

    mask = np.random.randint(0, 4, self.policy.shape[0])
    self.policy[range(self.policy.shape[0]), mask] = 1 - self.epsilon
    
    # Discount factor (gamma) to use in the updates.
    self.discount = agent_info.get("discount")
    # The learning rate or step size parameter (alpha) to use in updates.
    self.step_size = agent_info.get("step_size")

    # Initialize an array of zeros that will hold the values.
    # Recall that the policy can be represented as a (# States, # Actions) array. With the 
    # assumption that this is the case, we can use the first dimension of the policy to
    # initialize the array for values.
    self.q_values = np.zeros_like(self.policy)
    

In [47]:
%%add_to WWAgent
def agent_start(self, state):
    """The first method called when the episode starts, called after
    the environment starts.
    Args:
        state (Numpy array): the state from the environment's env_start function.
    Returns:
        The first action the agent takes.
    """
    # The policy can be represented as a (# States, # Actions) array. So, we can use 
    # the second dimension here when choosing an action.
    action = self.rand_generator.choice(range(self.policy.shape[1]), p=self.policy[state])
    self.last_state = state
    self.last_action = action
    return action

In [48]:
%%add_to WWAgent
def update_policy(self):
    max_action = np.argmax(self.q_values[self.last_state])
    self.policy[self.last_state] = self.epsilon / (self.possible_actions - 1)
    self.policy[self.last_state, max_action] = 1 - self.epsilon
    

In [49]:
%%add_to WWAgent
def agent_step(self, reward, state, action):
    """A step taken by the agent.
    Args:
        reward (float): the reward received for taking the last action taken
        state (Numpy array): the state from the
        environment's step after the last action, i.e., where the agent ended up after the
        last action
    Returns:
        The action the agent is taking.
    """
    # Update q_values 
    self.q_values[self.last_state, self.last_action] += self.step_size * (reward + self.discount * self.q_values[state, action] - self.q_values[self.last_state, self.last_action]) 
    
    # Update policy values for last state
    self.update_policy()
    
    # Choose new action
    action = self.rand_generator.choice(range(self.policy.shape[1]), p=self.policy[state])
    self.last_state = state
    self.last_action = action

    return action

In [58]:
%%add_to WWAgent
def agent_end(self, reward):
    """Run when the agent terminates.
    Args:
        reward (float): the reward the agent received for entering the terminal state.
    """
    # Update Q values
    self.q_values[self.last_state, self.last_action] += self.step_size * (reward - self.q_values[self.last_state, self.last_action])
    
    # Update policy values for last state
    self.update_policy()
    

In [50]:
%%add_to WWAgent
def agent_cleanup(self):
    """Cleanup done after the agent ends."""
    self.last_state = None

In [53]:
%%add_to WWAgent
def agent_message(self, message):
    """A function used to pass information from the agent to the experiment.
    Args:
        message: The message passed to the agent.
    Returns:
        The response (or answer) to the message.
    """
    if message == "get_values":
        return self.q_values
    else:
        raise Exception("TDAgent.agent_message(): Message not understood!")

In [86]:
def run_experiment(env_info, agent_info, 
                   num_episodes=5000,
                   experiment_name=None,
                   plot_freq=200,
                   true_values_file=None,
                   value_error_threshold=1e-8):
    env = WindyWorld
    agent = WWAgent
    rl_glue = RLGlue(env, agent)
    rl_glue.rl_init(agent_info, env_info)
    policy = rl_glue.agent.policy
#     manager = Manager(env_info, policy, true_values_file=true_values_file, experiment_name=experiment_name)
    for episode in range(1, num_episodes + 1):
        rl_glue.rl_episode(0) # no step limit
#         print('Episode: ', episode)
        if episode % plot_freq == 0:
            print('Episode number: ', episode)
            q_values = rl_glue.agent.agent_message("get_values")
            rl_glue.environment.display_policy(rl_glue.agent.policy)
#             print('Q_Values: ', q_values)
#             manager.visualize(q_values, episode)

    q_values = rl_glue.agent.agent_message("get_values")
    #if true_values_file is not None:
        # Grading: The Manager will check that the values computed using your TD agent match 
        # the true values (within some small allowance) across the states. In addition, it also
        # checks whether the root mean squared value error is close to 0.
        #manager.run_tests(values, value_error_threshold)
    
    return q_values

In [87]:
env_info = {"grid_height": 7, "grid_width": 10, "seed": 0}
agent_info = {"discount": 1, "step_size": 0.01, "seed": 0}

true_values_file = "optimal_policy_q_value_fn.npy"
_ = run_experiment(env_info, agent_info, num_episodes=30000, experiment_name="SARSA in Windy Grid World",
                   plot_freq=500, true_values_file=true_values_file)

Episode number:  500
------------------------------------------
[[0. 2. 2. 0. 0. 1. 0. 3. 2. 2.]
 [0. 2. 1. 1. 0. 0. 3. 2. 3. 2.]
 [2. 0. 3. 2. 1. 2. 0. 0. 2. 2.]
 [2. 3. 3. 3. 2. 2. 2. 3. 2. 2.]
 [1. 3. 0. 2. 2. 2. 1. 3. 1. 1.]
 [1. 1. 3. 3. 2. 0. 0. 3. 2. 2.]
 [3. 0. 3. 1. 3. 1. 2. 0. 2. 3.]]
Episode number:  1000
------------------------------------------
[[0. 0. 1. 2. 3. 3. 3. 3. 3. 2.]
 [0. 0. 2. 0. 3. 0. 1. 0. 3. 2.]
 [3. 1. 1. 1. 2. 2. 0. 0. 2. 2.]
 [3. 0. 2. 0. 2. 2. 0. 3. 2. 2.]
 [2. 1. 1. 2. 2. 0. 1. 2. 1. 1.]
 [3. 1. 3. 2. 2. 0. 0. 2. 2. 3.]
 [0. 2. 3. 3. 3. 1. 2. 0. 1. 2.]]
Episode number:  1500
------------------------------------------
[[0. 2. 1. 2. 1. 3. 3. 2. 3. 2.]
 [1. 1. 2. 0. 0. 3. 1. 3. 0. 2.]
 [3. 3. 3. 2. 2. 2. 3. 1. 1. 2.]
 [0. 0. 2. 2. 1. 2. 0. 3. 1. 2.]
 [2. 3. 3. 2. 2. 2. 1. 2. 1. 1.]
 [3. 1. 3. 2. 3. 0. 0. 3. 1. 1.]
 [1. 1. 0. 2. 3. 1. 2. 0. 3. 0.]]
Episode number:  2000
------------------------------------------
[[3. 3. 2. 2. 3. 3. 2. 3. 3. 2.]
 [1. 0. 2. 0

Episode number:  14500
------------------------------------------
[[0. 3. 3. 0. 2. 2. 3. 3. 3. 2.]
 [0. 1. 3. 3. 3. 3. 3. 3. 0. 2.]
 [0. 1. 2. 0. 3. 0. 2. 0. 2. 2.]
 [2. 3. 0. 0. 3. 3. 2. 3. 2. 2.]
 [2. 3. 2. 3. 3. 3. 1. 2. 1. 1.]
 [1. 2. 2. 3. 0. 0. 0. 2. 2. 2.]
 [3. 3. 3. 1. 3. 1. 2. 0. 3. 2.]]
Episode number:  15000
------------------------------------------
[[0. 0. 0. 2. 3. 3. 3. 3. 3. 2.]
 [2. 3. 2. 3. 3. 3. 3. 3. 0. 2.]
 [0. 0. 3. 3. 3. 0. 3. 0. 2. 2.]
 [3. 2. 2. 3. 3. 3. 2. 3. 2. 2.]
 [3. 3. 1. 3. 3. 3. 1. 2. 1. 1.]
 [1. 3. 2. 3. 3. 0. 0. 2. 1. 2.]
 [1. 3. 3. 3. 3. 1. 2. 0. 2. 2.]]
Episode number:  15500
------------------------------------------
[[0. 2. 1. 0. 3. 3. 3. 3. 3. 2.]
 [0. 2. 2. 0. 0. 3. 0. 3. 0. 2.]
 [3. 2. 3. 3. 1. 3. 3. 0. 3. 2.]
 [3. 3. 2. 3. 3. 2. 3. 3. 2. 2.]
 [2. 2. 2. 3. 3. 3. 1. 2. 1. 1.]
 [1. 3. 2. 3. 3. 0. 0. 2. 2. 1.]
 [1. 3. 3. 3. 3. 1. 2. 0. 2. 2.]]
Episode number:  16000
------------------------------------------
[[0. 0. 1. 0. 2. 3. 3. 3. 3. 2.]
 [0. 1.

Episode number:  28500
------------------------------------------
[[2. 3. 0. 0. 0. 3. 3. 3. 3. 2.]
 [3. 0. 1. 0. 3. 0. 0. 3. 3. 2.]
 [0. 1. 3. 1. 3. 3. 0. 0. 3. 2.]
 [2. 3. 0. 3. 3. 0. 3. 3. 2. 2.]
 [2. 1. 3. 3. 2. 3. 1. 2. 1. 1.]
 [2. 2. 2. 3. 3. 0. 0. 2. 2. 2.]
 [1. 3. 1. 3. 3. 1. 2. 0. 3. 1.]]
Episode number:  29000
------------------------------------------
[[0. 2. 3. 3. 3. 3. 3. 3. 3. 2.]
 [0. 1. 3. 1. 3. 3. 3. 3. 0. 2.]
 [1. 1. 3. 3. 3. 3. 3. 2. 1. 2.]
 [0. 0. 3. 3. 3. 2. 3. 3. 2. 2.]
 [3. 3. 2. 3. 3. 0. 1. 2. 1. 1.]
 [0. 3. 2. 3. 2. 0. 0. 2. 1. 2.]
 [3. 2. 3. 3. 3. 1. 2. 0. 1. 1.]]
Episode number:  29500
------------------------------------------
[[2. 3. 0. 0. 3. 3. 3. 3. 3. 2.]
 [0. 3. 3. 3. 1. 3. 3. 3. 3. 2.]
 [0. 3. 3. 0. 3. 3. 3. 0. 2. 2.]
 [3. 2. 3. 2. 0. 3. 1. 3. 2. 2.]
 [2. 2. 0. 2. 3. 3. 1. 2. 1. 1.]
 [2. 3. 2. 3. 3. 0. 0. 2. 2. 3.]
 [3. 2. 3. 3. 3. 1. 2. 0. 3. 2.]]
Episode number:  30000
------------------------------------------
[[3. 2. 3. 0. 3. 3. 3. 3. 3. 2.]
 [1. 0.