# TD with State Aggregation

In [6]:
!pip install -q jdc

In [7]:
import jdc
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
#@title base classes for agent, environment and their interaction

#!/usr/bin/env python

from __future__ import print_function
from abc import ABCMeta, abstractmethod

### An abstract class that specifies the Agent API

class BaseAgent:
    """Contract followed by every agent plugged into the experiment loop.

    Note:
        agent_init, agent_start, agent_step, agent_end, agent_cleanup, and
        agent_message make up the agent API.

        ``__metaclass__`` is the Python 2 spelling and has no effect under
        Python 3, so the ``@abstractmethod`` markers below are not enforced:
        a stub that a subclass leaves out simply returns ``None`` when
        called.
    """

    __metaclass__ = ABCMeta

    def __init__(self):
        """Create the agent; no state is set up at construction time."""

    @abstractmethod
    def agent_init(self, agent_info={}):
        """Configure the agent once, when the experiment first starts.

        Args:
            agent_info (dict): agent settings; the keys are defined by the
                concrete agent.
        """

    @abstractmethod
    def agent_start(self, observation):
        """Choose the first action of an episode.

        Called after the environment has started.

        Args:
            observation (Numpy array): the state observation produced by the
                environment's env_start function.
        Returns:
            The first action the agent takes.
        """

    @abstractmethod
    def agent_step(self, reward, observation):
        """Process one non-terminal transition and choose the next action.

        Args:
            reward (float): the reward received for the last action taken.
            observation (Numpy array): the state observation from the
                environment's step, i.e. where the agent ended up after the
                last action.
        Returns:
            The action the agent is taking.
        """

    @abstractmethod
    def agent_end(self, reward):
        """Process the final transition of an episode.

        Args:
            reward (float): the reward received for entering the terminal
                state.
        """

    @abstractmethod
    def agent_cleanup(self):
        """Release whatever the agent holds once the experiment is over."""

    @abstractmethod
    def agent_message(self, message):
        """Answer a query sent by the experiment.

        Args:
            message: The message passed to the agent.
        Returns:
            The response (or answer) to the message.
        """


### Abstract environment base class 

class BaseEnvironment:
    """Contract followed by every environment driven through RLGlue.

    Note:
        env_init, env_start, env_step, env_cleanup, and env_message make up
        the environment API.

        ``__metaclass__`` is the Python 2 spelling and has no effect under
        Python 3, so the ``@abstractmethod`` markers below are not enforced:
        a stub that a subclass leaves out simply returns ``None`` when
        called.
    """

    __metaclass__ = ABCMeta

    def __init__(self):
        # placeholder (reward, observation, termination) triple
        self.reward_obs_term = (None, None, None)

    @abstractmethod
    def env_init(self, env_info={}):
        """Configure the environment once, when the experiment first starts.

        Note:
            Initialize a tuple with the reward, first state observation, and
            a boolean indicating whether it is terminal.
        """

    @abstractmethod
    def env_start(self):
        """Begin an episode; called before the agent starts.

        Returns:
            The first state observation from the environment.
        """

    @abstractmethod
    def env_step(self, action):
        """Advance the environment by one step.

        Args:
            action: The action taken by the agent

        Returns:
            (float, state, Boolean): a tuple of the reward, state observation,
                and boolean indicating if it's terminal.
        """

    @abstractmethod
    def env_cleanup(self):
        """Release whatever the environment holds once the experiment is over."""

    @abstractmethod
    def env_message(self, message):
        """Answer a query sent to the environment.

        Args:
            message: the message passed to the environment

        Returns:
            the response (or answer) to the message
        """


### connects together an experiment, agent, and environment.

class RLGlue:
    """Connects one environment and one agent and drives their interaction.

    Args:
        env_class: environment class (not an instance); it is instantiated
            with no arguments.
        agent_class: agent class (not an instance); it is instantiated with
            no arguments.

    Attributes:
        total_reward: sum of all rewards seen since rl_init (or since the
            last rl_env_start, which resets it).
        num_steps: number of non-terminal steps counted so far.
        num_episodes: number of terminal transitions counted so far.
        last_action: the action most recently chosen by the agent.
    """

    def __init__(self, env_class, agent_class):
        self.environment = env_class()
        self.agent = agent_class()

        self.total_reward = None
        self.last_action = None
        self.num_steps = None
        self.num_episodes = None

    def rl_init(self, agent_init_info=None, env_init_info=None):
        """Initial method called when RLGlue experiment is created.

        Args:
            agent_init_info (dict): forwarded to the agent's agent_init;
                an empty dict is used when omitted.
            env_init_info (dict): forwarded to the environment's env_init;
                an empty dict is used when omitted.
        """
        # A fresh dict per call avoids sharing one mutable default between
        # experiments (the callee is free to mutate what it receives).
        if env_init_info is None:
            env_init_info = {}
        if agent_init_info is None:
            agent_init_info = {}

        self.environment.env_init(env_init_info)
        self.agent.agent_init(agent_init_info)

        self.total_reward = 0.0
        self.num_steps = 0
        self.num_episodes = 0

    def rl_start(self, agent_start_info=None, env_start_info=None):
        """Starts an episode: environment first, then the agent.

        The two arguments are accepted for API compatibility and are not
        used. The reward/step counters are intentionally left untouched, so
        they keep accumulating across episodes.

        Returns:
            tuple: (state, action)
        """
        last_state = self.environment.env_start()
        self.last_action = self.agent.agent_start(last_state)

        return (last_state, self.last_action)

    def rl_agent_start(self, observation):
        """Starts the agent.

        Args:
            observation: The first observation from the environment

        Returns:
            The action taken by the agent.
        """
        return self.agent.agent_start(observation)

    def rl_agent_step(self, reward, observation):
        """Step taken by the agent

        Args:
            reward (float): the last reward the agent received for taking the
                last action.
            observation : the state observation the agent receives from the
                environment.

        Returns:
            The action taken by the agent.
        """
        return self.agent.agent_step(reward, observation)

    def rl_agent_end(self, reward):
        """Run when the agent terminates

        Args:
            reward (float): the reward the agent received when terminating
        """
        self.agent.agent_end(reward)

    def rl_env_start(self):
        """Starts the environment on its own and resets the counters.

        Sets total_reward to 0.0 and num_steps to 1 before starting.

        Returns:
            The first state observation from the environment.
        """
        self.total_reward = 0.0
        self.num_steps = 1

        return self.environment.env_start()

    def rl_env_step(self, action):
        """Step taken by the environment based on action from agent

        Args:
            action: Action taken by agent.

        Returns:
            (float, state, Boolean): reward, state observation, boolean
                indicating termination.
        """
        ro = self.environment.env_step(action)
        (this_reward, _, terminal) = ro

        self.total_reward += this_reward

        # a terminal transition closes an episode instead of counting as a step
        if terminal:
            self.num_episodes += 1
        else:
            self.num_steps += 1

        return ro

    def rl_step(self):
        """Step taken by RLGlue, takes environment step and either step or
            end by agent.

        Returns:
            (float, state, action, Boolean): reward, last state observation,
                last action (None on termination), boolean indicating
                termination
        """
        (reward, last_state, term) = self.environment.env_step(self.last_action)

        self.total_reward += reward

        if term:
            self.num_episodes += 1
            self.agent.agent_end(reward)
            return (reward, last_state, None, term)

        self.num_steps += 1
        self.last_action = self.agent.agent_step(reward, last_state)
        return (reward, last_state, self.last_action, term)

    def rl_cleanup(self):
        """Cleanup done at end of experiment."""
        self.environment.env_cleanup()
        self.agent.agent_cleanup()

    def rl_agent_message(self, message):
        """Message passed to communicate with agent during experiment

        Args:
            message: the message (or question) to send to the agent

        Returns:
            The message back (or answer) from the agent
        """
        return self.agent.agent_message(message)

    def rl_env_message(self, message):
        """Message passed to communicate with environment during experiment

        Args:
            message: the message (or question) to send to the environment

        Returns:
            The message back (or answer) from the environment
        """
        return self.environment.env_message(message)

    def rl_episode(self, max_steps_this_episode):
        """Runs an RLGlue episode

        Args:
            max_steps_this_episode (Int): the maximum number of non-terminal
                steps to run in this episode; 0 means no limit.

        Returns:
            Boolean: True if the episode reached a terminal state, False if
                it was cut off by the step limit.
        """
        is_terminal = False

        # num_steps accumulates across episodes, so the limit has to be
        # measured relative to where this episode started; comparing the raw
        # counter would shorten (or skip) every episode after the first.
        steps_before_episode = self.num_steps

        self.rl_start()

        while not is_terminal:
            steps_this_episode = self.num_steps - steps_before_episode
            if max_steps_this_episode != 0 and steps_this_episode >= max_steps_this_episode:
                break
            is_terminal = self.rl_step()[3]

        return is_terminal

    def rl_return(self):
        """The total reward

        Returns:
            float: the total reward
        """
        return self.total_reward

    def rl_num_steps(self):
        """The total number of steps taken

        Returns:
            Int: the total number of steps taken
        """
        return self.num_steps

    def rl_num_episodes(self):
        """The number of episodes

        Returns
            Int: the total number of episodes
        """
        return self.num_episodes


In [None]:
#@title random walk policy evaluation - create true_V.npy file


def compute_value_function(num_states, action_range):
    """Compute the true state values of the random walk by iterative policy evaluation.

    The policy moves left or right with equal probability, and the jump
    size is uniform over 1..action_range. Jumps that leave the chain are
    absorbed by a terminal state with reward -1 (left) or +1 (right);
    every other transition has reward 0. Sweeps update V in place and
    stop once the largest change in a sweep is at most 1e-6.

    Side effect:
        The resulting array is also written to ``./true_V.npy``.

    Args:
        num_states (int): number of non-terminal states, numbered 1..num_states.
        action_range (int): largest jump size in either direction.

    Returns:
        numpy array of length num_states holding the values of states
        1..num_states (entry i is the value of state i + 1).
    """
    state_prob = 0.5 / float(action_range)
    gamma = 1
    theta = 0.000001

    # index 0 stands for both terminal states and must stay at 0
    V = np.zeros(num_states + 1)

    # np.inf rather than the np.infty alias, which NumPy 2.0 removed
    delta = np.inf
    while delta > theta:
        delta = 0.0
        for s in range(1, num_states + 1):
            v = V[s]
            value_sum = 0.0
            for transition in range(1, action_range + 1):
                right = s + transition
                right_reward = 0
                if right > num_states:
                    right_reward = 1
                    right = 0

                left = s - transition
                left_reward = 0
                if left < 1:
                    left_reward = -1
                    left = 0

                value_sum += state_prob * ((right_reward + gamma * V[right]) + (left_reward + gamma * V[left]))

            V[s] = value_sum
            delta = max(delta, np.abs(v - V[s]))

    np.save("./true_V", V[1:])
    return V[1:]

### if __name__ == '__main__':
num_states = 500
action_range = 100
# V holds one value per state: V[i] is the value of state i + 1
V = compute_value_function(num_states, action_range)

np.save("./true_V", V)
# Plot every state against its own value; the previous
# range(1, num_states) / V[1:] pairing dropped state 1 and shifted the
# remaining values one state to the left.
plt.plot(range(1, num_states + 1), V)
plt.xlabel('State')
plt.ylabel('Value')
plt.show()

### data1 = np.load('./test.npy') 
### data2 = np.load('./true_V.npy')
### if (data1 == data2).all:
###   print('all right')
### np.save('./true_V.npy', np.array('here goes the array'))

In [10]:
#@title write the known state distribution to the state_distribution.npy file

# data = np.load('./state_distribution.npy') 
# data

np.save('./state_distribution.npy', np.array([0.00050342, 0.00051537, 0.00051983, 0.00052783, 0.00054043,
       0.00054095, 0.00055063, 0.0005632 , 0.0005723 , 0.00057897,
       0.00058859, 0.00059669, 0.00060697, 0.00060958, 0.00062441,
       0.00062788, 0.00064452, 0.00065015, 0.00065469, 0.00066292,
       0.00067739, 0.00068159, 0.00069353, 0.00069643, 0.00071412,
       0.00071868, 0.00072884, 0.00075045, 0.00074827, 0.0007581 ,
       0.00077134, 0.00077444, 0.00079118, 0.00080066, 0.00080477,
       0.00081647, 0.00082221, 0.00083824, 0.00085436, 0.00085694,
       0.00085677, 0.00088564, 0.00088572, 0.00089417, 0.00090236,
       0.00092084, 0.00093196, 0.00093285, 0.0009458 , 0.00096468,
       0.00097549, 0.00098674, 0.00099336, 0.00100658, 0.0010176 ,
       0.00103931, 0.00103993, 0.00105593, 0.00106808, 0.00108118,
       0.00109397, 0.00110813, 0.00111901, 0.0011267 , 0.0011282 ,
       0.00114946, 0.00116087, 0.00118098, 0.00118296, 0.00120058,
       0.00121217, 0.00122346, 0.00123674, 0.00125523, 0.00125821,
       0.00128223, 0.00128554, 0.00130661, 0.00132195, 0.00132658,
       0.001337  , 0.00134872, 0.00136579, 0.00136663, 0.00139404,
       0.00139848, 0.001419  , 0.00142815, 0.00144499, 0.00146277,
       0.00147143, 0.00148438, 0.00149689, 0.00150875, 0.00151946,
       0.00152384, 0.00154264, 0.00156307, 0.00157091, 0.00158682,
       0.00161182, 0.001613  , 0.0016272 , 0.00163835, 0.00164645,
       0.00165212, 0.00165297, 0.00168245, 0.00168208, 0.00170092,
       0.0017003 , 0.00172867, 0.0017209 , 0.0017406 , 0.00175318,
       0.00177206, 0.00177385, 0.00179813, 0.00179566, 0.00181001,
       0.00180087, 0.00182957, 0.00183724, 0.00185188, 0.00185671,
       0.00187875, 0.00187678, 0.00189374, 0.00189855, 0.00192199,
       0.00193324, 0.0019392 , 0.00193704, 0.00194829, 0.00196719,
       0.00198096, 0.00197397, 0.00200054, 0.00200993, 0.00201649,
       0.00201641, 0.00202741, 0.00204063, 0.00205622, 0.00206629,
       0.00208476, 0.00208844, 0.00209567, 0.00210024, 0.00231221,
       0.00232935, 0.00233713, 0.00234784, 0.00236371, 0.00235567,
       0.00238131, 0.00238263, 0.00238604, 0.00239073, 0.00240666,
       0.00240008, 0.00241935, 0.00243903, 0.00243995, 0.0024497 ,
       0.00246551, 0.002471  , 0.00247267, 0.00248375, 0.00249487,
       0.00250789, 0.00250768, 0.00251874, 0.00251356, 0.00254146,
       0.00255166, 0.00254615, 0.00255837, 0.00256789, 0.0025661 ,
       0.00258586, 0.0025787 , 0.0025969 , 0.00259521, 0.00259499,
       0.00261569, 0.0026059 , 0.00263882, 0.00262394, 0.00263241,
       0.0026314 , 0.00264764, 0.00263598, 0.00266424, 0.00267746,
       0.00266757, 0.00268087, 0.00269125, 0.00268402, 0.00269576,
       0.00270081, 0.00270205, 0.00270602, 0.00272495, 0.00273038,
       0.00272314, 0.00273873, 0.00273984, 0.00273815, 0.00275035,
       0.0027457 , 0.00276034, 0.00276651, 0.00276281, 0.00278392,
       0.00277704, 0.00278361, 0.00279305, 0.00280185, 0.00278799,
       0.00279638, 0.00278878, 0.0028044 , 0.00280381, 0.00281524,
       0.00279908, 0.00281635, 0.00282225, 0.00281668, 0.00283035,
       0.00282996, 0.00282696, 0.00283561, 0.00283615, 0.00283524,
       0.00283788, 0.00285564, 0.00285225, 0.0028593 , 0.00284709,
       0.0028474 , 0.00285042, 0.00286586, 0.00285936, 0.00286968,
       0.00286822, 0.00285015, 0.00285607, 0.00285977, 0.04377746,
       0.00286391, 0.00286415, 0.00286389, 0.00285858, 0.00285776,
       0.00284733, 0.00284927, 0.00285587, 0.00284777, 0.00285377,
       0.00284738, 0.00286175, 0.00284187, 0.00284762, 0.00284123,
       0.00283183, 0.00283757, 0.00284797, 0.00282714, 0.00281623,
       0.00282297, 0.00280843, 0.00280987, 0.0028102 , 0.00280245,
       0.00280249, 0.00281425, 0.0027992 , 0.00279464, 0.0027992 ,
       0.00279439, 0.0027763 , 0.00277307, 0.00277019, 0.00278148,
       0.00277169, 0.00275489, 0.00276468, 0.00275734, 0.00274451,
       0.00274257, 0.00274218, 0.00275072, 0.00273704, 0.00271091,
       0.00270943, 0.00271375, 0.00271696, 0.00271064, 0.00270924,
       0.00269711, 0.00269526, 0.00267912, 0.00268229, 0.00267865,
       0.00266763, 0.00264579, 0.00264951, 0.00264949, 0.00263432,
       0.00261388, 0.00262694, 0.00263121, 0.00261701, 0.00261951,
       0.00260331, 0.00259392, 0.00259246, 0.00258876, 0.00258065,
       0.00256513, 0.00256525, 0.00255693, 0.00253721, 0.00253501,
       0.00253614, 0.00252541, 0.00251543, 0.00250009, 0.00248048,
       0.00248539, 0.00249251, 0.002474  , 0.002468  , 0.00243952,
       0.00243311, 0.00244324, 0.00243121, 0.00242437, 0.00242182,
       0.00241158, 0.00237987, 0.00238674, 0.00237942, 0.00236205,
       0.00235565, 0.00234366, 0.00234175, 0.00232925, 0.00232238,
       0.00209493, 0.00210441, 0.00208087, 0.00207885, 0.00207061,
       0.00206829, 0.00204614, 0.00204075, 0.00202141, 0.00202227,
       0.00201676, 0.00199749, 0.00198952, 0.00198503, 0.0019751 ,
       0.00196309, 0.00193906, 0.00193038, 0.00192386, 0.00192063,
       0.00191264, 0.00188757, 0.00187937, 0.00187628, 0.00185891,
       0.00184988, 0.00184497, 0.00183835, 0.00182716, 0.00181147,
       0.00181871, 0.00177915, 0.00178643, 0.00177317, 0.00176385,
       0.00175148, 0.00174691, 0.00173135, 0.00171385, 0.00170172,
       0.00170096, 0.00167898, 0.00167445, 0.00165476, 0.00165786,
       0.0016416 , 0.00163549, 0.00162126, 0.00160757, 0.00159714,
       0.00157578, 0.00156311, 0.00156005, 0.00154052, 0.00152658,
       0.00152329, 0.00150449, 0.00149567, 0.00147009, 0.00146162,
       0.00146175, 0.00142883, 0.00142605, 0.00140757, 0.00139762,
       0.00139032, 0.00137479, 0.00136601, 0.00134701, 0.00133754,
       0.00132832, 0.00131237, 0.00129362, 0.00127879, 0.0012761 ,
       0.00126337, 0.00125122, 0.00124542, 0.00122344, 0.00120191,
       0.00119003, 0.00118888, 0.00116669, 0.0011473 , 0.00114438,
       0.00113435, 0.00112177, 0.00110673, 0.00109323, 0.00109464,
       0.00107809, 0.00106341, 0.00104904, 0.00102916, 0.00102669,
       0.00101378, 0.00100317, 0.00098875, 0.00098067, 0.00096445,
       0.00095721, 0.00094804, 0.00094091, 0.00091977, 0.00092059,
       0.00090069, 0.00089598, 0.00088747, 0.00087217, 0.00086631,
       0.00085375, 0.00084779, 0.00083345, 0.00082541, 0.00081696,
       0.00079618, 0.00078721, 0.00077798, 0.00077422, 0.00076572,
       0.00075419, 0.00074821, 0.0007377 , 0.00072146, 0.0007267 ,
       0.00070789, 0.00069964, 0.00069625, 0.00067577, 0.00067271,
       0.00066238, 0.00065126, 0.00064614, 0.00063833, 0.00062539,
       0.00061828, 0.00060413, 0.00060095, 0.00059326, 0.00058701,
       0.00057868, 0.000573  , 0.00056377, 0.0005522 , 0.00054181,
       0.00053344, 0.00052228, 0.00051599, 0.00051122, 0.00050052]))

In [11]:
#@title helpers for visualization

# Function to plot result
def plot_result(agent_parameters, directory):
    """Plot learned state values and RMSVE learning curves for each sweep.

    One figure is shown per entry of ``agent_parameters["num_groups"]``.
    The left panel overlays the learned value estimates for every step
    size with the true values loaded from ``./true_V.npy``; the right
    panel overlays the matching RMSVE curves.

    Args:
        agent_parameters: mapping with iterable entries "num_groups" and
            "step_size" naming the sweeps whose results are plotted.
        directory: folder holding the per-sweep result files
            ``V_TD_agent_agg_states_<groups>_step_size_<step>.npy`` and
            ``RMSVE_TD_agent_agg_states_<groups>_step_size_<step>.npy``,
            where every "." is removed from the file stem.

    Note:
        Titles, axis labels and tick positions are hard-coded (500 states,
        2000 episodes, 50 runs).
    """
    true_V = np.load('./true_V.npy')

    for num_g in agent_parameters["num_groups"]:
        value_curves = []
        rmsve_curves = []

        # two plots: learned state-value and learning curve (RMSVE)
        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

        for step_size in agent_parameters["step_size"]:
            curve_label = "approximate values: state aggregation: {}, step-size: {}".format(num_g, step_size)

            # value panel first, then the RMSVE panel
            for prefix, axis, handles in (('V', ax[0], value_curves),
                                          ('RMSVE', ax[1], rmsve_curves)):
                # dots are stripped from the stem, so a step size such as
                # 0.01 appears as "001" in the file name
                filename = '{}_TD_agent_agg_states_{}_step_size_{}'.format(prefix, num_g, step_size).replace('.', '')
                series = np.load('{}/{}.npy'.format(directory, filename))

                curve, = axis.plot(range(1, len(series) + 1), series, label=curve_label)
                handles.append(curve)

        # plot1:
        # add True V (raw string: "\p" is not a valid escape sequence)
        graph_true_V, = ax[0].plot(range(1, len(true_V) + 1), true_V, label=r"$v_\pi$")

        ax[0].legend(handles=[*value_curves, graph_true_V])

        ax[0].set_title("Learned State Value after 2000 episodes")
        ax[0].set_xlabel('State')
        ax[0].set_ylabel('Value\n scale', rotation=0, labelpad=15)

        plt1_xticks = [1, 100, 200, 300, 400, 500]
        plt1_yticks = [-1.0, 0.0, 1.0]
        ax[0].set_xticks(plt1_xticks)
        ax[0].set_xticklabels(plt1_xticks)
        ax[0].set_yticks(plt1_yticks)
        ax[0].set_yticklabels(plt1_yticks)

        # plot2:
        ax[1].legend(handles=rmsve_curves)

        ax[1].set_title("Learning Curve")
        ax[1].set_xlabel('Episodes')
        ax[1].set_ylabel('RMSVE\n averaged over 50 runs', rotation=0, labelpad=40)

        # tick positions index RMSVE samples while the labels show episodes
        # (presumably one sample per 10 episodes; confirm against the
        # experiment's evaluation frequency)
        plt2_xticks = range(0, 210, 20)
        plt2_xticklabels = range(0, 2100, 200)
        plt2_yticks = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
        ax[1].set_xticks(plt2_xticks)
        ax[1].set_xticklabels(plt2_xticklabels)
        ax[1].set_yticks(plt2_yticks)
        ax[1].set_yticklabels(plt2_yticks)

        plt.tight_layout()
        plt.suptitle("{}-State Aggregation".format(num_g), fontsize=16, fontweight='bold', y=1.03)
        plt.show()


In [12]:
class RandomWalkEnvironment(BaseEnvironment):
    """Random walk on a chain of states with a terminal state at each end."""

    def env_init(self, env_info={}):
        """
        Configure the random walk when the experiment first starts.

        env_info is expected to contain:
        {
            num_states: 500 [int],
            start_state: 250 [int],
            left_terminal_state: 0 [int],
            right_terminal_state: 501 [int],
            seed: int
        }
        "seed" is optional; the other keys are required.
        """

        # each run gets its own generator so it can be seeded independently
        self.rand_generator = np.random.RandomState(env_info.get("seed"))

        self.num_states = env_info["num_states"]
        self.start_state = env_info["start_state"]
        self.left_terminal_state = env_info["left_terminal_state"]
        self.right_terminal_state = env_info["right_terminal_state"]

    def env_start(self):
        """
        Begin an episode at the configured start state; called before the
        agent starts.

        Returns:
            The first state from the environment.
        """

        # (reward, state, is_terminal) of the episode's initial situation
        self.reward_state_term = (0.0, self.start_state, False)

        return self.start_state

    def env_step(self, action):
        """Move the walker left or right by a random number of states.

        Args:
            action: direction of movement, 0 for left or 1 for right [int]

        Returns:
            (float, state, Boolean): a tuple of the reward, state,
                and boolean indicating if it's terminal. The reward is -1
                when terminating on the left, 1 when terminating on the
                right, and 0 otherwise.

        Raises:
            ValueError: if action is neither 0 nor 1.
        """

        previous_state = self.reward_state_term[1]

        # The jump size is drawn from 1..100 in the chosen direction; a move
        # past either end is absorbed into that end's terminal state.
        if action == 0:
            jump = self.rand_generator.choice(range(-100,0))
            current_state = max(self.left_terminal_state, previous_state + jump)
        elif action == 1:
            jump = self.rand_generator.choice(range(1,101))
            current_state = min(self.right_terminal_state, previous_state + jump)
        else:
            raise ValueError("Wrong action value")

        if current_state == self.left_terminal_state:
            # terminate left
            outcome = (-1.0, current_state, True)
        elif current_state == self.right_terminal_state:
            # terminate right
            outcome = (1.0, current_state, True)
        else:
            outcome = (0.0, current_state, False)

        self.reward_state_term = outcome

        return outcome
        

In [13]:
def agent_policy(rand_generator, state):
    """
    Pick left (0) or right (1) with equal probability.

    Args:
        rand_generator: random number generator providing ``choice``
        state: current state; ignored, because this policy is the same in
            every state

    Returns:
        chosen action [int]
    """
    return rand_generator.choice([0,1])

In [14]:
def get_state_feature(num_states_in_group, num_groups, state):
    """
    Encode a state as the one-hot indicator of the group it belongs to.

    States are numbered from 1, and consecutive runs of
    num_states_in_group states share a group. For example, with
    num_states_in_group = 20 and num_groups = 5, states 1~20 all map to
    [1, 0, 0, 0, 0].

    Args:
        num_states_in_group [int]
        num_groups [int]
        state [int] : 1~500

    Returns:
        one_hot_vector [numpy array] of length num_groups
    """
    # shift to 0-based numbering before locating the group
    group_index = (state - 1) // num_states_in_group

    one_hot_vector = np.zeros(num_groups)
    one_hot_vector[group_index] = 1

    return one_hot_vector


In [None]:
# -----------
# Tested Cell
# -----------

# Given that num_states = 10 and num_groups = 5, test get_state_feature()
# There are states 1~10, and the state feature vector would be of size 5.
# Only one element would be active for any state feature vector.

# get_state_feature() should support various values of num_states, num_groups, not just this example
# For simplicity, assume num_states will always be perfectly divisible by num_groups
num_states = 10
num_groups = 5
num_states_in_group = int(num_states / num_groups)

# (message printed for the case, probed state, expected one-hot feature)
feature_cases = [
    ("1st group: {}", 1, [1, 0, 0, 0, 0]),
    ("2nd group: {}", 3, [0, 1, 0, 0, 0]),
    ("3rd group: {}", 6, [0, 0, 1, 0, 0]),
    ("4th group: {}", 7, [0, 0, 0, 1, 0]),
    ("5th group: {}", 10, [0, 0, 0, 0, 1]),
]

for message, state, expected in feature_cases:
    features = get_state_feature(num_states_in_group, num_groups, state)
    print(message.format(features))

    assert np.all(features == expected)

In [16]:
# Create TDAgent
class TDAgent(BaseAgent):
    """Semi-gradient TD(0) agent with state-aggregation features."""

    def __init__(self):
        # hyper-parameters stay unset until agent_init is called
        self.num_states = self.num_groups = None
        self.step_size = self.discount_factor = None

    def agent_init(self, agent_info={}):
        """Configure the agent when the experiment first starts.

        agent_info is expected to contain:
        {
            num_states: 500 [int],
            num_groups: int,
            step_size: float,
            discount_factor: float,
            seed: int
        }
        """

        # each run gets its own generator so it can be seeded independently
        self.rand_generator = np.random.RandomState(agent_info.get("seed"))

        self.num_states = agent_info.get("num_states")
        self.num_groups = agent_info.get("num_groups")
        self.step_size = agent_info.get("step_size")
        self.discount_factor = agent_info.get("discount_factor")

        # feature table: row (state - 1) is the one-hot feature of `state`
        group_size = int(self.num_states / self.num_groups)
        feature_rows = [
            get_state_feature(group_size, self.num_groups, state)
            for state in range(1, self.num_states + 1)
        ]
        self.all_state_features = np.array(feature_rows)

        # one weight per group, all starting at zero
        self.weights = np.zeros(self.num_groups)

        self.last_state = None
        self.last_action = None

    def _semi_gradient_update(self, target, feature):
        # move the weights toward `target` for the state encoded by `feature`
        delta = target - np.dot(feature, self.weights)
        self.weights += self.step_size * delta * feature

    def agent_start(self, state):
        """Choose the first action of an episode; called after the
        environment starts.
        Args:
            state [int]: the state from the environment's env_start function.
        Returns:
            self.last_action [int] : The first action the agent takes.
        """

        # remember where the episode began and what was chosen there
        self.last_state = state
        self.last_action = agent_policy(self.rand_generator, state)

        return self.last_action

    def agent_step(self, reward, state):
        """Learn from one non-terminal transition and choose the next action.
        Args:
            reward [float]: the reward received for taking the last action taken
            state [int]: the state from the environment's step, where the agent ended up after the last step
        Returns:
            self.last_action [int] : The action the agent is taking.
        """

        current_state_feature = self.all_state_features[state - 1]
        last_state_feature = self.all_state_features[self.last_state - 1]

        # bootstrap from the current estimate of the state just reached
        td_target = reward + self.discount_factor * np.dot(current_state_feature, self.weights)
        self._semi_gradient_update(td_target, last_state_feature)

        self.last_state = state
        self.last_action = agent_policy(self.rand_generator, state)

        return self.last_action

    def agent_end(self, reward):
        """Learn from the final transition of an episode.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """

        last_state_feature = self.all_state_features[self.last_state - 1]

        # no bootstrap term on the final transition: the target is the reward alone
        self._semi_gradient_update(reward, last_state_feature)

        return

    def agent_message(self, message):
        # placeholder; a later notebook cell attaches the real implementation
        raise NotImplementedError



Run the following code to verify `agent_init()`

In [None]:
# -----------
# Tested Cell
# -----------

agent_info = dict(
    num_states=500,
    num_groups=10,
    step_size=0.1,
    discount_factor=1.0,
    seed=1,
)

agent = TDAgent()
agent.agent_init(agent_info)

# a freshly initialised agent has one zero weight per group
assert np.all(agent.weights == 0)
assert agent.weights.shape == (10,)

# echo the configured attributes
print("num_states: {}".format(agent.num_states))
print("num_groups: {}".format(agent.num_groups))
print("step_size: {}".format(agent.step_size))
print("discount_factor: {}".format(agent.discount_factor))

print("weights shape: {}".format(agent.weights.shape))
print("weights init. value: {}".format(agent.weights))


In [None]:
# -----------
# Tested Cell
# -----------

agent_info = dict(
    num_states=500,
    num_groups=10,
    step_size=0.1,
    discount_factor=1.0,
    seed=1,
)

# the episode is assumed to begin in state 250
state = 250

agent = TDAgent()
agent.agent_init(agent_info)
action = agent.agent_start(state)

# with seed 1 the first sampled action is 1, and the agent must remember both
assert action == 1
assert agent.last_state == 250
assert agent.last_action == 1

print("Agent state: {}".format(agent.last_state))
print("Agent selected action: {}".format(agent.last_action))


In [None]:
# -----------
# Tested Cell
# -----------

agent_info = dict(
    num_states=500,
    num_groups=10,
    step_size=0.1,
    discount_factor=0.9,
    seed=1,
)

agent = TDAgent()
agent.agent_init(agent_info)

# arbitrary non-zero weights make an incorrect update visible
agent.weights = np.array([-1.5, 0.5, 1., -0.5, 1.5, -0.5, 1.5, 0.0, -0.5, -1.0])

# episode begins in state 50
start_state = 50
action = agent.agent_start(start_state)

assert action == 1

# first transition: reward 10.0, landing in state 120
reward = 10.0
next_state = 120
action = agent.agent_step(reward, next_state)

assert action == 1

print("Updated weights: {}".format(agent.weights))
# only the weight of state 50's group (index 0) may have changed
assert np.allclose(agent.weights, [-0.26, 0.5, 1., -0.5, 1.5, -0.5, 1.5, 0., -0.5, -1.])

assert agent.last_state == 120
assert agent.last_action == 1

print("last state: {}".format(agent.last_state))
print("last action: {}".format(agent.last_action))

# second transition: reward -22, landing in state 222
reward = -22
next_state = 222
action = agent.agent_step(reward, next_state)

assert action == 0

# this time only the weight of state 120's group (index 2) may have changed
assert np.allclose(agent.weights, [-0.26, 0.5, -1.165, -0.5, 1.5, -0.5, 1.5, 0, -0.5, -1])
assert agent.last_state == 222
assert agent.last_action == 0

In [None]:
# -----------
# Tested Cell
# -----------

agent_info = dict(
    num_states=500,
    num_groups=10,
    step_size=0.1,
    discount_factor=0.9,
    seed=1,
)

agent = TDAgent()
agent.agent_init(agent_info)

# arbitrary non-zero weights make an incorrect update visible
agent.weights = np.array([-1.5, 0.5, 1., -0.5, 1.5, -0.5, 1.5, 0.0, -0.5, -1.0])

# episode begins in state 50
start_state = 50
action = agent.agent_start(start_state)

assert action == 1

# the very next transition ends the episode with reward 10.0
agent.agent_end(10.0)
print("Updated weights: {}".format(agent.weights))

# only the weight of state 50's group (index 0) may have changed
assert np.allclose(agent.weights, [-0.35, 0.5, 1., -0.5, 1.5, -0.5, 1.5, 0., -0.5, -1.])

In [21]:
%%add_to TDAgent

def agent_message(self, message):
    """Answer a query sent to the agent by the experiment.

    Args:
        message (str): the request. Only 'get state value' is recognised.

    Returns:
        The product ``self.all_state_features @ self.weights`` when
        ``message`` is 'get state value'; None for any other message.
    """
    # Guard clause: anything but the supported request yields None.
    if message != 'get state value':
        return None

    # Matrix product of the stored feature array with the current weights.
    return self.all_state_features @ self.weights

In [None]:
# -----------
# Tested Cell
# -----------

# Configuration for the check: 20 states aggregated into 5 groups.
agent_info = dict(
    num_states=20,
    num_groups=5,
    step_size=0.1,
    discount_factor=1.0,
)

agent = TDAgent()
agent.agent_init(agent_info)

# Ask the freshly initialised agent for its state-value estimates.
test_state_val = agent.agent_message('get state value')

# Expect one value per state, and no non-zero entry right after initialisation.
assert test_state_val.shape == (20,)
assert not np.any(test_state_val != 0)

print("State value shape: {}".format(test_state_val.shape))
print("Initial State value for all states: {}".format(test_state_val))

In [23]:
# Here we provide you with the true state value and state distribution.
# Both arrays are loaded from .npy files in the current working directory and
# are read as module-level globals by calc_RMSVE.
true_state_val = np.load('./true_V.npy')    
state_distribution = np.load('./state_distribution.npy')

def calc_RMSVE(learned_state_val):
    """Return the root-mean-squared value error of a learned value estimate.

    The squared difference between the module-level ``true_state_val`` and
    ``learned_state_val`` is weighted element-wise by the module-level
    ``state_distribution``, summed, and square-rooted.

    Args:
        learned_state_val: learned state values; must have the same length
            as ``true_state_val`` and ``state_distribution``.

    Returns:
        The square root of the distribution-weighted sum of squared errors.
    """
    # All three sequences must line up element by element.
    assert len(true_state_val) == len(learned_state_val) == len(state_distribution)

    squared_error = np.square(true_state_val - learned_state_val)
    weighted_error = state_distribution * squared_error
    return np.sqrt(np.sum(weighted_error))

In [24]:
import os

# Define function to run experiment
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Sweep agent settings, train each for several runs, and save the results.

    For every (num_groups, step_size) combination in ``agent_parameters`` the
    agent is trained for ``num_runs`` runs of ``num_episodes`` episodes each.
    The RMSVE (via ``calc_RMSVE``) is recorded before training and then every
    ``episode_eval_frequency`` episodes, averaged over the runs, and saved to
    ``results/RMSVE_<setting>.npy``. The state values reported by the agent at
    the end of one selected run are saved to ``results/V_<setting>.npy``.

    Args:
        environment: environment passed unchanged to ``RLGlue``.
        agent: agent passed unchanged to ``RLGlue``.
        environment_parameters (dict): reads "num_states", "start_state",
            "left_terminal_state", "right_terminal_state" and
            "discount_factor".
        agent_parameters (dict): reads "num_groups" and "step_size"; each is
            an iterable of values to sweep over.
        experiment_parameters (dict): reads "num_runs", "num_episodes" and
            "episode_eval_frequency".

    Returns:
        None. Results are written to the ``results`` directory, which is
        created if it does not exist.
    """
    import time  # function-scope import; only the pause before the progress bar needs it

    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters["num_runs"]
    num_episodes = experiment_parameters["num_episodes"]
    eval_frequency = experiment_parameters["episode_eval_frequency"]

    # Run whose learned state values are saved. Run 50 is used whenever at
    # least 50 runs are requested; with fewer runs the last one is used so the
    # saved array is never just the untouched zero initialisation.
    state_val_run = min(50, num_runs)

    # Sweep Agent parameters
    for num_agg_states in agent_parameters["num_groups"]:
        for step_size in agent_parameters["step_size"]:

            # RMSVE accumulated at each evaluation point.
            # size: num_episodes // eval_frequency + 1 (includes evaluation at the beginning of training)
            agent_rmsve = np.zeros(num_episodes // eval_frequency + 1)

            # learned state value at the end of the selected run
            agent_state_val = np.zeros(environment_parameters["num_states"])

            env_info = {"num_states": environment_parameters["num_states"],
                        "start_state": environment_parameters["start_state"],
                        "left_terminal_state": environment_parameters["left_terminal_state"],
                        "right_terminal_state": environment_parameters["right_terminal_state"]}

            agent_info = {"num_states": environment_parameters["num_states"],
                          "num_groups": num_agg_states,
                          "step_size": step_size,
                          "discount_factor": environment_parameters["discount_factor"]}

            print('Setting - num. agg. states: {}, step_size: {}'.format(num_agg_states, step_size))
            # Short pause before the progress bar starts (presumably so the
            # line printed above is shown first). time.sleep avoids spawning a
            # shell and works on platforms without a `sleep` command.
            time.sleep(0.2)

            # one agent setting
            for run in tqdm(range(1, num_runs + 1)):
                # Each run uses its own seed for both environment and agent.
                env_info["seed"] = run
                agent_info["seed"] = run
                rl_glue.rl_init(agent_info, env_info)

                # Compute initial RMSVE before training
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[0] += calc_RMSVE(current_V)

                for episode in range(1, num_episodes + 1):
                    # run episode
                    rl_glue.rl_episode(0)  # no step limit

                    if episode % eval_frequency == 0:
                        current_V = rl_glue.rl_agent_message("get state value")
                        agent_rmsve[episode // eval_frequency] += calc_RMSVE(current_V)

                # store only one run of state value
                if run == state_val_run:
                    agent_state_val = rl_glue.rl_agent_message("get state value")

            # rmsve averaged over runs
            agent_rmsve /= num_runs

            # Dots are stripped so the step size does not add extra '.' to the file name.
            save_name = "{}_agg_states_{}_step_size_{}".format('TD_agent', num_agg_states, step_size).replace('.','')

            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs('results', exist_ok=True)

            # save the state value of the selected single run (not an average)
            np.save("results/V_{}".format(save_name), agent_state_val)

            # save avg. rmsve
            np.save("results/RMSVE_{}".format(save_name), agent_rmsve)

In [None]:
#### Run Experiment

# Amount of training: number of runs, episodes per run, and how often
# (in episodes) the RMSVE is evaluated.
experiment_parameters = dict(
    num_runs=50,
    num_episodes=2000,
    episode_eval_frequency=10,  # evaluate every 10 episodes
)

# Environment layout and discount factor handed to run_experiment.
environment_parameters = dict(
    num_states=500,
    start_state=250,
    left_terminal_state=0,
    right_terminal_state=501,
    discount_factor=1.0,
)

# Agent settings to sweep: every entry is a list because run_experiment
# iterates over all (num_groups, step_size) combinations.
agent_parameters = dict(
    num_groups=[10],
    step_size=[0.01, 0.05, 0.1],
)

current_env = RandomWalkEnvironment
current_agent = TDAgent

# Train and save the results, then plot what was written to ./results.
run_experiment(
    current_env,
    current_agent,
    environment_parameters,
    agent_parameters,
    experiment_parameters,
)
plot_result(agent_parameters, './results')