# Semi-gradient TD with a Neural Network

In [14]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os, shutil
from tqdm import tqdm

In [15]:
#@title base classes for agent, environment and their interaction

#!/usr/bin/env python

from __future__ import print_function
from abc import ABCMeta, abstractmethod

### An abstract class that specifies the Agent API

class BaseAgent:
    """Interface for an agent that interacts with an RL environment.

    Note:
        Subclasses are expected to provide agent_init, agent_start,
        agent_step, agent_end, agent_cleanup, and agent_message.
    """

    # NOTE(review): Python 3 does not honor a class-level ``__metaclass__``
    # attribute, so the @abstractmethod markers below are not enforced at
    # instantiation time.
    __metaclass__ = ABCMeta

    def __init__(self):
        """Create the agent; no state is set up here."""

    @abstractmethod
    def agent_init(self, agent_info={}):
        """Configure the agent once, when the experiment first starts.

        Args:
            agent_info (dict): agent-specific settings.
        """

    @abstractmethod
    def agent_start(self, observation):
        """Begin an episode; called after the environment has started.

        Args:
            observation (Numpy array): the state observation returned by the
                environment's env_start function.
        Returns:
            The first action the agent takes.
        """

    @abstractmethod
    def agent_step(self, reward, observation):
        """Take one step within an episode.

        Args:
            reward (float): the reward received for the last action taken.
            observation (Numpy array): the state observation from the
                environment's step, i.e. where the agent ended up after the
                last action.
        Returns:
            The action the agent is taking.
        """

    @abstractmethod
    def agent_end(self, reward):
        """Finish an episode when the agent reaches termination.

        Args:
            reward (float): the reward received for entering the terminal
                state.
        """

    @abstractmethod
    def agent_cleanup(self):
        """Release anything the agent holds once it is finished."""

    @abstractmethod
    def agent_message(self, message):
        """Exchange information between the experiment and the agent.

        Args:
            message: The message passed to the agent.
        Returns:
            The response (or answer) to the message.
        """


### Abstract environment base class 

class BaseEnvironment:
    """Interface for an environment driven by RLGlue.

    Note:
        Subclasses are expected to provide env_init, env_start, env_step,
        env_cleanup, and env_message.
    """

    # NOTE(review): Python 3 does not honor a class-level ``__metaclass__``
    # attribute, so the @abstractmethod markers below are not enforced at
    # instantiation time.
    __metaclass__ = ABCMeta

    def __init__(self):
        # (reward, observation, termination) placeholder; all unknown for now.
        self.reward_obs_term = (None, None, None)

    @abstractmethod
    def env_init(self, env_info={}):
        """Configure the environment once, when the experiment first starts.

        Note:
            Initialize a tuple with the reward, first state observation, and
            a boolean indicating whether it is terminal.
        """

    @abstractmethod
    def env_start(self):
        """Begin an episode; called before the agent starts.

        Returns:
            The first state observation from the environment.
        """

    @abstractmethod
    def env_step(self, action):
        """Advance the environment by one step.

        Args:
            action: The action taken by the agent.

        Returns:
            (float, state, Boolean): a tuple of the reward, state observation,
                and boolean indicating if it's terminal.
        """

    @abstractmethod
    def env_cleanup(self):
        """Release anything the environment holds once it is finished."""

    @abstractmethod
    def env_message(self, message):
        """Answer a request for information from the experiment.

        Args:
            message: the message passed to the environment.

        Returns:
            the response (or answer) to the message.
        """


### connects together an experiment, agent, and environment.

class RLGlue:
    """Connects an experiment, an agent, and an environment.

    Args:
        env_class: the environment class; it is instantiated with no
            arguments.
        agent_class: the agent class; it is instantiated with no arguments.

    Note:
        ``total_reward``, ``num_steps`` and ``num_episodes`` are reset by
        rl_init and accumulate across episodes run through rl_start/rl_step
        (rl_env_start is the only other method that resets any of them).
    """

    def __init__(self, env_class, agent_class):
        self.environment = env_class()
        self.agent = agent_class()

        # Counters stay None until rl_init is called.
        self.total_reward = None
        self.last_action = None
        self.num_steps = None
        self.num_episodes = None

    def rl_init(self, agent_init_info=None, env_init_info=None):
        """Initialize the environment, then the agent, and reset the counters.

        Args:
            agent_init_info (dict): passed to the agent's agent_init; an empty
                dict is used when omitted.
            env_init_info (dict): passed to the environment's env_init; an
                empty dict is used when omitted.
        """
        # A fresh dict per call avoids sharing one mutable default between
        # experiments.
        self.environment.env_init({} if env_init_info is None else env_init_info)
        self.agent.agent_init({} if agent_init_info is None else agent_init_info)

        self.total_reward = 0.0
        self.num_steps = 0
        self.num_episodes = 0

    def rl_start(self, agent_start_info=None, env_start_info=None):
        """Start an episode: start the environment, then the agent.

        Args:
            agent_start_info: accepted for API compatibility; unused.
            env_start_info: accepted for API compatibility; unused.

        Returns:
            tuple: (state, action)
        """
        last_state = self.environment.env_start()
        self.last_action = self.agent.agent_start(last_state)

        return (last_state, self.last_action)

    def rl_agent_start(self, observation):
        """Start the agent.

        Args:
            observation: The first observation from the environment.

        Returns:
            The action taken by the agent.
        """
        return self.agent.agent_start(observation)

    def rl_agent_step(self, reward, observation):
        """Step taken by the agent.

        Args:
            reward (float): the last reward the agent received for taking the
                last action.
            observation: the state observation the agent receives from the
                environment.

        Returns:
            The action taken by the agent.
        """
        return self.agent.agent_step(reward, observation)

    def rl_agent_end(self, reward):
        """Run when the agent terminates.

        Args:
            reward (float): the reward the agent received when terminating.
        """
        self.agent.agent_end(reward)

    def rl_env_start(self):
        """Start the environment on its own and reset reward/step counters.

        Sets ``total_reward`` to 0.0 and ``num_steps`` to 1.

        Returns:
            The first state observation from the environment.
        """
        self.total_reward = 0.0
        self.num_steps = 1

        return self.environment.env_start()

    def rl_env_step(self, action):
        """Step taken by the environment based on an action from the agent.

        Args:
            action: Action taken by agent.

        Returns:
            (float, state, Boolean): reward, state observation, boolean
                indicating termination.
        """
        ro = self.environment.env_step(action)
        (this_reward, _, terminal) = ro

        self.total_reward += this_reward

        if terminal:
            self.num_episodes += 1
        else:
            self.num_steps += 1

        return ro

    def rl_step(self):
        """Take an environment step, then either an agent step or agent end.

        Returns:
            (float, state, action, Boolean): reward, last state observation,
                last action (None on termination), boolean indicating
                termination.
        """
        (reward, last_state, term) = self.environment.env_step(self.last_action)

        self.total_reward += reward

        if term:
            self.num_episodes += 1
            self.agent.agent_end(reward)
            return (reward, last_state, None, term)

        self.num_steps += 1
        self.last_action = self.agent.agent_step(reward, last_state)
        return (reward, last_state, self.last_action, term)

    def rl_cleanup(self):
        """Cleanup done at end of experiment."""
        self.environment.env_cleanup()
        self.agent.agent_cleanup()

    def rl_agent_message(self, message):
        """Pass a message to the agent during the experiment.

        Args:
            message: the message (or question) to send to the agent.

        Returns:
            The message back (or answer) from the agent.
        """
        return self.agent.agent_message(message)

    def rl_env_message(self, message):
        """Pass a message to the environment during the experiment.

        Args:
            message: the message (or question) to send to the environment.

        Returns:
            The message back (or answer) from the environment.
        """
        return self.environment.env_message(message)

    def rl_episode(self, max_steps_this_episode):
        """Run one episode.

        Args:
            max_steps_this_episode (Int): the maximum number of non-terminal
                steps to run in this episode; 0 means no limit.

        Returns:
            Boolean: True if the episode reached a terminal state, False if
                it was cut off by the step limit.
        """
        is_terminal = False

        self.rl_start()

        # num_steps accumulates across episodes (rl_start does not reset it),
        # so the cap is applied to the steps taken since this episode began.
        steps_before_episode = self.num_steps

        while (not is_terminal) and (
            max_steps_this_episode == 0
            or self.num_steps - steps_before_episode < max_steps_this_episode
        ):
            is_terminal = self.rl_step()[3]

        return is_terminal

    def rl_return(self):
        """The total reward.

        Returns:
            float: the total reward
        """
        return self.total_reward

    def rl_num_steps(self):
        """The total number of steps taken.

        Returns:
            Int: the total number of steps taken
        """
        return self.num_steps

    def rl_num_episodes(self):
        """The number of episodes.

        Returns:
            Int: the total number of episodes
        """
        return self.num_episodes


In [16]:
#  An additional abstract class that specifies the optimizer API 


from abc import ABCMeta, abstractmethod

class BaseOptimizer:
    """Interface for an optimizer that turns an update direction into new weights."""

    # NOTE(review): Python 3 does not honor a class-level ``__metaclass__``
    # attribute, so the @abstractmethod markers below are not enforced at
    # instantiation time.
    __metaclass__ = ABCMeta

    def __init__(self):
        """Create the optimizer; configuration happens in optimizer_init."""

    @abstractmethod
    def optimizer_init(self, optimizer_info):
        """Configure the optimizer from the optimizer_info settings."""

    @abstractmethod
    def update_weights(self, weights, g):
        """Given weights and update g, return the updated weights."""


In [17]:
#@title RandomWalk environment 


class RandomWalkEnvironment(BaseEnvironment):
    """Random-walk environment whose actions jump 1 to 100 states left or right."""

    def env_init(self, env_info={}):
        """Store the random-walk parameters and seed the random generator.

        Assume env_info dict contains:
        {
            num_states: 500,
            start_state: 250,
            left_terminal_state: 0,
            right_terminal_state: 501,
            seed: int
        }
        """
        # A per-run generator keeps runs reproducible for a given seed.
        self.rand_generator = np.random.RandomState(env_info.get("seed"))

        self.num_states = env_info["num_states"]
        self.start_state = env_info["start_state"]
        self.left_terminal_state = env_info["left_terminal_state"]
        self.right_terminal_state = env_info["right_terminal_state"]

    def env_start(self):
        """Reset to the start state; called before the agent starts.

        Returns:
            The first state observation from the environment.
        """
        self.reward_obs_term = (0.0, self.start_state, False)
        return self.reward_obs_term[1]

    def env_step(self, action):
        """Move the walker and report the outcome.

        Args:
            action: 0 to jump left, 1 to jump right.

        Returns:
            (float, state, Boolean): a tuple of the reward, state observation,
                and boolean indicating if it's terminal.

        Raises:
            ValueError: if action is neither 0 nor 1.
        """
        last_state = self.reward_obs_term[1]

        # The jump size is drawn from 1..100; the landing state is clipped to
        # the terminal state on the side being moved towards.
        if action == 0:
            current_state = max(
                self.left_terminal_state,
                last_state + self.rand_generator.choice(range(-100, 0)),
            )
        elif action == 1:
            current_state = min(
                self.right_terminal_state,
                last_state + self.rand_generator.choice(range(1, 101)),
            )
        else:
            raise ValueError("Wrong action value")

        # Reward is -1 at the left terminal, +1 at the right, 0 elsewhere.
        if current_state == self.left_terminal_state:
            reward, is_terminal = -1.0, True
        elif current_state == self.right_terminal_state:
            reward, is_terminal = 1.0, True
        else:
            reward, is_terminal = 0.0, False

        self.reward_obs_term = (reward, current_state, is_terminal)
        return self.reward_obs_term

In [18]:
#@title helpers for visualization

# Display strings for the result plots, keyed by agent/data name.
_NN_VALUES_LEGEND = "approximate values learned by\n TD with neural network"
_TILECODING_VALUES_LEGEND = "approximate values learned by\n TD with tile-coding"
_NN_CURVE_LEGEND = "TD with neural network"
_TILECODING_CURVE_LEGEND = "TD with tile-coding"
_RMSVE_AXIS_LABEL = "RMSVE\n averaged\n over\n 20 runs"

# Legend entries for the learned state-value plot.
plt1_legend_dict = {
    "td_agent": _NN_VALUES_LEGEND,
    "td_agent_5000_episodes": _NN_VALUES_LEGEND,
    "td_agent_tilecoding": _TILECODING_VALUES_LEGEND,
}

# Legend entries for the learning-curve plot.
plt2_legend_dict = {
    "td_agent": _NN_CURVE_LEGEND,
    "td_agent_5000_episodes": _NN_CURVE_LEGEND,
    "td_agent_tilecoding": _TILECODING_CURVE_LEGEND,
}

# Y-axis label for the learning-curve plot.
plt2_label_dict = {
    "td_agent": _RMSVE_AXIS_LABEL,
    "td_agent_5000_episodes": _RMSVE_AXIS_LABEL,
    "td_agent_tilecoding": _RMSVE_AXIS_LABEL,
}


# Function to plot result
def plot_result(data_name_array):
    """Plot learned state values and RMSVE learning curves for saved runs.

    For each name in data_name_array this reads './results/V_<name>.npy' and
    './results/RMSVE_<name>.npz' (with any '.' stripped from the file stem),
    and it reads the reference values from './true_V.npy'.

    Args:
        data_name_array: names of the saved experiments; each must be a key
            of plt1_legend_dict and plt2_legend_dict, and the first one must
            be a key of plt2_label_dict.
    """
    true_V = np.load('./true_V.npy')

    plt1_agent_sweeps = []
    plt2_agent_sweeps = []

    # Two plots: learned state-value (left) and learning curve / RMSVE (right).
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))

    for data_name in data_name_array:

        # plot1: only the last row of the saved value array is drawn.
        filename = 'V_{}'.format(data_name).replace('.','')
        current_agent_V = np.load('./results/{}.npy'.format(filename))
        current_agent_V = current_agent_V[-1, :]

        plt1_x_legend = range(1, len(current_agent_V) + 1)
        graph_current_agent_V, = ax[0].plot(plt1_x_legend, current_agent_V, label=plt1_legend_dict[data_name])
        plt1_agent_sweeps.append(graph_current_agent_V)

        # plot2: RMSVE averaged over axis 0 of the saved array.
        filename = 'RMSVE_{}'.format(data_name).replace('.','')
        RMSVE_data = np.load('./results/{}.npz'.format(filename))
        current_agent_RMSVE = np.mean(RMSVE_data["rmsve"], axis = 0)

        plt2_x_legend = np.arange(0, RMSVE_data["num_episodes"]+1, RMSVE_data["eval_freq"])
        graph_current_agent_RMSVE, = ax[1].plot(plt2_x_legend, current_agent_RMSVE, label=plt2_legend_dict[data_name])
        plt2_agent_sweeps.append(graph_current_agent_RMSVE)

    # plot1: add the true values; the raw string keeps the backslash of \pi
    # without relying on an invalid escape sequence.
    plt1_x_legend = range(1, len(true_V) + 1)
    graph_true_V, = ax[0].plot(plt1_x_legend, true_V, label=r"$v_{\pi}$")

    ax[0].legend(handles=[*plt1_agent_sweeps, graph_true_V], fontsize = 13)

    ax[0].set_title("State Value", fontsize = 15)
    ax[0].set_xlabel('State', fontsize = 14)
    ax[0].set_ylabel('Value\n scale', rotation=0, labelpad=15, fontsize = 14)

    plt1_xticks = [1, 100, 200, 300, 400, 500]
    plt1_yticks = [-1.0, 0.0, 1.0]
    ax[0].set_xticks(plt1_xticks)
    ax[0].set_xticklabels(plt1_xticks, fontsize=13)
    ax[0].set_yticks(plt1_yticks)
    ax[0].set_yticklabels(plt1_yticks, fontsize=13)

    # plot2: the y-axis label is taken from the first requested data name.
    ax[1].legend(handles=plt2_agent_sweeps, fontsize = 13)

    ax[1].set_title("Learning Curve", fontsize = 15)
    ax[1].set_xlabel('Episodes', fontsize = 14)
    ax[1].set_ylabel(plt2_label_dict[data_name_array[0]], rotation=0, labelpad=40, fontsize = 14)

    plt2_yticks = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
    ax[1].tick_params(axis="x", labelsize=13)
    ax[1].set_yticks(plt2_yticks)
    ax[1].set_yticklabels(plt2_yticks, fontsize = 13)

    plt.tight_layout()
    plt.show()

In [19]:
def my_matmul(x1, x2):
    """
    Given 2-D matrices x1 and x2, return their matrix product.

    Two sparse shapes of x1 are handled without a full matmul:
    a row vector with exactly one nonzero entry, and a column vector with
    exactly one nonzero entry. Everything else goes through np.matmul.
    """
    x1_non_zero_indices = x1.nonzero()

    if x1.shape[0] == 1 and len(x1_non_zero_indices[1]) == 1:
        # Only one row of x2 contributes; it must be scaled by the nonzero
        # entry of x1, which is not necessarily 1 (e.g. a ReLU activation).
        col = x1_non_zero_indices[1]
        return x1[0, col] * x2[col, :]

    if x1.shape[1] == 1 and len(x1_non_zero_indices[0]) == 1:
        # Outer product with a single nonzero row: fill just that row.
        row = x1_non_zero_indices[0]
        result = np.zeros((x1.shape[0], x2.shape[1]))
        result[row, :] = x2 * x1[row, 0]
        return result

    return np.matmul(x1, x2)

In [20]:
def get_value(s, weights):
    """
    Evaluate the two-layer network on input s and return its output v.

    weights[0] and weights[1] each hold a weight matrix "W" and a bias "b";
    the hidden layer uses a ReLU activation and the output layer is linear.
    """
    hidden_layer, output_layer = weights[0], weights[1]

    pre_activation = my_matmul(s, hidden_layer["W"]) + hidden_layer["b"]
    hidden = np.maximum(pre_activation, 0)  # ReLU

    return my_matmul(hidden, output_layer["W"]) + output_layer["b"]

In [22]:
def get_gradient(s, weights):
    """
    Return the gradient of the network output v with respect to each weight.

    The result mirrors the layout of weights: a list with one dict per layer,
    holding the gradients for "W" and "b" (W0, b0, W1, b1) at input s.
    """
    grads = [{} for _ in weights]

    # Hidden activations after ReLU; a unit passes gradient only where it is
    # strictly positive.
    hidden = np.maximum(my_matmul(s, weights[0]["W"]) + weights[0]["b"], 0)
    active_output_weights = weights[1]["W"].T * (hidden > 0)

    grads[0]["W"] = my_matmul(s.T, active_output_weights)
    grads[0]["b"] = active_output_weights
    grads[1]["W"] = hidden.T
    grads[1]["b"] = 1

    return grads


In [24]:
class SGD(BaseOptimizer):
    """Fixed-step-size optimizer: adds step_size * g to every parameter."""

    def __init__(self):
        """Create the optimizer; step_size is set in optimizer_init."""

    def optimizer_init(self, optimizer_info):
        """Read the step size for stochastic gradient descent.

        Assume optimizer_info dict contains:
        {
            step_size: float
        }
        """
        self.step_size = optimizer_info.get("step_size")

    def update_weights(self, weights, g):
        """
        Given weights and update g, return updated weights.

        weights is modified in place and also returned; g must have the same
        per-layer keys as weights.
        """
        for layer_index, layer_weights in enumerate(weights):
            for param in layer_weights:
                layer_weights[param] += self.step_size * g[layer_index][param]

        return weights

In [26]:
class Adam(BaseOptimizer):
    """Adam optimizer keeping per-parameter first and second moment estimates."""

    def __init__(self):
        """Create the optimizer; all state is set up in optimizer_init."""

    def optimizer_init(self, optimizer_info):
        """Read the hyper-parameters and zero the moment estimates.

        Assume optimizer_info dict contains:
        {
            num_states: integer,
            num_hidden_layer: integer,
            num_hidden_units: integer,
            step_size: float,
            beta_m: float
            beta_v: float
            epsilon: float
        }
        """
        # Network shape, used to size the moment estimates.
        self.num_states = optimizer_info.get("num_states")
        self.num_hidden_layer = optimizer_info.get("num_hidden_layer")
        self.num_hidden_units = optimizer_info.get("num_hidden_units")

        # Adam hyper-parameters.
        self.step_size = optimizer_info.get("step_size")
        self.beta_m = optimizer_info.get("beta_m")
        self.beta_v = optimizer_info.get("beta_v")
        self.epsilon = optimizer_info.get("epsilon")

        self.layer_size = np.array([self.num_states, self.num_hidden_units, 1])

        # First (m) and second (v) moment estimates, one dict per layer, all
        # starting at zero with the same shapes as the network parameters.
        num_layers = self.num_hidden_layer + 1
        self.m = [{} for _ in range(num_layers)]
        self.v = [{} for _ in range(num_layers)]

        for layer in range(num_layers):
            fan_in, fan_out = self.layer_size[layer], self.layer_size[layer + 1]
            for moment in (self.m, self.v):
                moment[layer]["W"] = np.zeros((fan_in, fan_out))
                moment[layer]["b"] = np.zeros((1, fan_out))

        # Running powers beta^t, used for the bias-corrected m_hat and v_hat.
        self.beta_m_product = self.beta_m
        self.beta_v_product = self.beta_v

    def update_weights(self, weights, g):
        """
        Given weights and update g, return updated weights.

        weights is modified in place and also returned. The moment estimates
        are updated for every parameter, and the beta powers advance once per
        call.
        """
        for layer, layer_weights in enumerate(weights):
            for param in layer_weights:
                update = g[layer][param]

                # Exponential moving averages of g and g squared.
                self.m[layer][param] = self.beta_m * self.m[layer][param] + (1 - self.beta_m) * update
                self.v[layer][param] = self.beta_v * self.v[layer][param] + (1 - self.beta_v) * (update * update)

                # Bias-corrected estimates.
                m_hat = self.m[layer][param] / (1 - self.beta_m_product)
                v_hat = self.v[layer][param] / (1 - self.beta_v_product)

                layer_weights[param] += self.step_size * m_hat / (np.sqrt(v_hat) + self.epsilon)

        self.beta_m_product *= self.beta_m
        self.beta_v_product *= self.beta_v

        return weights


In [27]:
def one_hot(state, num_states):
    """
    Given num_states and a state, return the one-hot encoding of the state.

    States are numbered from 1, so state k sets column k - 1.

    Returns:
        A numpy array of shape (1, num_states) with a single 1.

    Raises:
        IndexError: if the state does not map to a column in
            [0, num_states). Without this check a state below 1 would wrap
            around through negative indexing and silently encode a different
            state.
    """
    index = int((state - 1))
    if not 0 <= index < num_states:
        raise IndexError(
            "state {} is outside the valid range 1..{}".format(state, num_states)
        )

    one_hot_vector = np.zeros((1, num_states))
    one_hot_vector[0, index] = 1

    return one_hot_vector

In [28]:
class TDAgent(BaseAgent):
    """Semi-gradient TD(0) agent with a neural-network value function.

    Actions are chosen uniformly at random from {0, 1}; learning only updates
    the value estimate of the visited states.
    """

    def __init__(self):
        self.name = "td_agent"

    def agent_init(self, agent_info={}):
        """Set up the network, the optimizer, and the random generators.

        Assume agent_info dict contains:
        {
            num_states: integer,
            num_hidden_layer: integer,
            num_hidden_units: integer,
            step_size: float,
            discount_factor: float,
            beta_m: float
            beta_v: float
            epsilon: float
            seed: int
        }
        """
        # Separate generators for weight initialization and for the policy,
        # both created from the same seed.
        seed = agent_info.get("seed")
        self.rand_generator = np.random.RandomState(seed)
        self.policy_rand_generator = np.random.RandomState(seed)

        self.num_states = agent_info.get("num_states")
        self.num_hidden_layer = agent_info.get("num_hidden_layer")
        self.num_hidden_units = agent_info.get("num_hidden_units")
        self.discount_factor = agent_info.get("discount_factor")

        # Network structure: one-hot input -> hidden units -> scalar value.
        self.layer_size = np.array([self.num_states, self.num_hidden_units, 1])

        # Each layer's W and b are drawn from a zero-mean normal whose
        # standard deviation is sqrt(2 / number of inputs to the layer).
        num_layers = self.num_hidden_layer + 1
        self.weights = [{} for _ in range(num_layers)]
        for layer in range(num_layers):
            ins, outs = self.layer_size[layer], self.layer_size[layer + 1]
            scale = np.sqrt(2/ins)
            self.weights[layer]['W'] = self.rand_generator.normal(0, scale, (ins, outs))
            self.weights[layer]['b'] = self.rand_generator.normal(0, scale, (1, outs))

        # The optimizer receives the network shape and the Adam settings.
        optimizer_keys = (
            "num_states",
            "num_hidden_layer",
            "num_hidden_units",
            "step_size",
            "beta_m",
            "beta_v",
            "epsilon",
        )
        self.optimizer = Adam()
        self.optimizer.optimizer_init({key: agent_info[key] for key in optimizer_keys})

        self.last_state = None
        self.last_action = None

    def agent_policy(self, state):
        """Return action 0 or 1 with equal probability; state is not used."""
        return self.policy_rand_generator.choice([0,1])

    def agent_start(self, state):
        """Begin an episode; called after the environment starts.

        Args:
            state: the state from the environment's env_start function.
        Returns:
            The first action the agent takes.
        """
        self.last_state = state
        self.last_action = self.agent_policy(state)

        return self.last_action

    def _apply_td_update(self, delta, last_state_vec):
        """Scale the value gradient at last_state_vec by delta and update the weights."""
        grads = get_gradient(last_state_vec, self.weights)

        g = [
            {param: delta * grads[layer][param] for param in layer_weights}
            for layer, layer_weights in enumerate(self.weights)
        ]

        self.weights = self.optimizer.update_weights(self.weights, g)

    def agent_step(self, reward, state):
        """Learn from a non-terminal transition and pick the next action.

        Args:
            reward (float): the reward received for taking the last action taken
            state: the state from the environment's step, where the agent
                ended up after the last step
        Returns:
            The action the agent is taking.
        """
        # TD error: reward + discount * v(state) - v(last_state), with both
        # values computed before the weights change.
        last_state_vec = one_hot(self.last_state, self.num_states)
        last_value = get_value(last_state_vec, self.weights)

        state_vec = one_hot(state, self.num_states)
        value = get_value(state_vec, self.weights)

        delta = reward + self.discount_factor * value - last_value

        self._apply_td_update(delta, last_state_vec)

        self.last_state = state
        self.last_action = self.agent_policy(state)

        return self.last_action

    def agent_end(self, reward):
        """Learn from the final transition of an episode.

        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # No bootstrap term: the TD error uses the reward alone as the target.
        last_state_vec = one_hot(self.last_state, self.num_states)
        last_value = get_value(last_state_vec, self.weights)
        delta = reward - last_value

        self._apply_td_update(delta, last_state_vec)

    def agent_message(self, message):
        """Answer 'get state value' with the current value of every state.

        Returns:
            A numpy array of length num_states (index k holds the value of
            state k + 1), or None for any other message.
        """
        if message != 'get state value':
            return None

        state_value = np.zeros(self.num_states)
        for state in range(1, self.num_states + 1):
            s = one_hot(state, self.num_states)
            state_value[state - 1] = get_value(s, self.weights)
        return state_value

In [None]:
#@title random walk policy evaluation - create true_V.npy file


def compute_value_function(num_states, action_range):
    """
    Compute the value function of the random walk by iterative policy evaluation.

    From every state the walk jumps 1..action_range states left or right,
    each outcome having probability 0.5 / action_range. Jumping past state 1
    ends the episode with reward -1, jumping past state num_states ends it
    with reward +1; all other rewards are 0 and there is no discounting.

    The result is also saved to './true_V.npy'.

    :param num_states: number of non-terminal states (numbered 1..num_states).
    :param action_range: largest jump size in either direction.
    :return: array of length num_states; entry k is the value of state k + 1.
    """
    state_prob = 0.5 / float(action_range)
    gamma = 1
    theta = 0.000001

    # Index 0 is never updated, so it stays 0 and doubles as the value of
    # both terminal states.
    V = np.zeros(num_states+1)

    # Sweep in place until the largest change in a sweep is at most theta.
    delta = np.inf
    while delta > theta:
        delta = 0.0
        for s in range(1, num_states+1):
            v = V[s]
            value_sum = 0.0
            for transition in range(1, action_range+1):
                right = s + transition
                right_reward = 0
                if right > num_states:
                    right_reward = 1
                    right = 0

                left = s - transition
                left_reward = 0
                if left < 1:
                    left_reward = -1
                    left = 0

                value_sum += state_prob * ((right_reward + gamma * V[right]) + (left_reward + gamma * V[left]))

            V[s] = value_sum
            delta = max(delta, np.abs(v-V[s]))

    np.save("./true_V", V[1:])
    return V[1:]

### if __name__ == '__main__':
num_states = 500
action_range = 100
V = compute_value_function(num_states, action_range)

np.save("./true_V", V)
# V has one entry per state, with V[k] the value of state k + 1, so every
# entry is plotted against states 1..num_states.
plt.plot(range(1, num_states + 1), V)
plt.xlabel('State')
plt.ylabel('Value')
plt.show()

### data1 = np.load('./test.npy') 
### data2 = np.load('./true_V.npy')
### if (data1 == data2).all:
###   print('all right')
### np.save('./true_V.npy', np.array('here goes the array'))

In [34]:
#@title write the known true state distribution to the state_distribution.npy file

# data = np.load('./state_distribution.npy') 
# data

np.save('./state_distribution.npy', np.array([0.00050342, 0.00051537, 0.00051983, 0.00052783, 0.00054043,
       0.00054095, 0.00055063, 0.0005632 , 0.0005723 , 0.00057897,
       0.00058859, 0.00059669, 0.00060697, 0.00060958, 0.00062441,
       0.00062788, 0.00064452, 0.00065015, 0.00065469, 0.00066292,
       0.00067739, 0.00068159, 0.00069353, 0.00069643, 0.00071412,
       0.00071868, 0.00072884, 0.00075045, 0.00074827, 0.0007581 ,
       0.00077134, 0.00077444, 0.00079118, 0.00080066, 0.00080477,
       0.00081647, 0.00082221, 0.00083824, 0.00085436, 0.00085694,
       0.00085677, 0.00088564, 0.00088572, 0.00089417, 0.00090236,
       0.00092084, 0.00093196, 0.00093285, 0.0009458 , 0.00096468,
       0.00097549, 0.00098674, 0.00099336, 0.00100658, 0.0010176 ,
       0.00103931, 0.00103993, 0.00105593, 0.00106808, 0.00108118,
       0.00109397, 0.00110813, 0.00111901, 0.0011267 , 0.0011282 ,
       0.00114946, 0.00116087, 0.00118098, 0.00118296, 0.00120058,
       0.00121217, 0.00122346, 0.00123674, 0.00125523, 0.00125821,
       0.00128223, 0.00128554, 0.00130661, 0.00132195, 0.00132658,
       0.001337  , 0.00134872, 0.00136579, 0.00136663, 0.00139404,
       0.00139848, 0.001419  , 0.00142815, 0.00144499, 0.00146277,
       0.00147143, 0.00148438, 0.00149689, 0.00150875, 0.00151946,
       0.00152384, 0.00154264, 0.00156307, 0.00157091, 0.00158682,
       0.00161182, 0.001613  , 0.0016272 , 0.00163835, 0.00164645,
       0.00165212, 0.00165297, 0.00168245, 0.00168208, 0.00170092,
       0.0017003 , 0.00172867, 0.0017209 , 0.0017406 , 0.00175318,
       0.00177206, 0.00177385, 0.00179813, 0.00179566, 0.00181001,
       0.00180087, 0.00182957, 0.00183724, 0.00185188, 0.00185671,
       0.00187875, 0.00187678, 0.00189374, 0.00189855, 0.00192199,
       0.00193324, 0.0019392 , 0.00193704, 0.00194829, 0.00196719,
       0.00198096, 0.00197397, 0.00200054, 0.00200993, 0.00201649,
       0.00201641, 0.00202741, 0.00204063, 0.00205622, 0.00206629,
       0.00208476, 0.00208844, 0.00209567, 0.00210024, 0.00231221,
       0.00232935, 0.00233713, 0.00234784, 0.00236371, 0.00235567,
       0.00238131, 0.00238263, 0.00238604, 0.00239073, 0.00240666,
       0.00240008, 0.00241935, 0.00243903, 0.00243995, 0.0024497 ,
       0.00246551, 0.002471  , 0.00247267, 0.00248375, 0.00249487,
       0.00250789, 0.00250768, 0.00251874, 0.00251356, 0.00254146,
       0.00255166, 0.00254615, 0.00255837, 0.00256789, 0.0025661 ,
       0.00258586, 0.0025787 , 0.0025969 , 0.00259521, 0.00259499,
       0.00261569, 0.0026059 , 0.00263882, 0.00262394, 0.00263241,
       0.0026314 , 0.00264764, 0.00263598, 0.00266424, 0.00267746,
       0.00266757, 0.00268087, 0.00269125, 0.00268402, 0.00269576,
       0.00270081, 0.00270205, 0.00270602, 0.00272495, 0.00273038,
       0.00272314, 0.00273873, 0.00273984, 0.00273815, 0.00275035,
       0.0027457 , 0.00276034, 0.00276651, 0.00276281, 0.00278392,
       0.00277704, 0.00278361, 0.00279305, 0.00280185, 0.00278799,
       0.00279638, 0.00278878, 0.0028044 , 0.00280381, 0.00281524,
       0.00279908, 0.00281635, 0.00282225, 0.00281668, 0.00283035,
       0.00282996, 0.00282696, 0.00283561, 0.00283615, 0.00283524,
       0.00283788, 0.00285564, 0.00285225, 0.0028593 , 0.00284709,
       0.0028474 , 0.00285042, 0.00286586, 0.00285936, 0.00286968,
       0.00286822, 0.00285015, 0.00285607, 0.00285977, 0.04377746,
       0.00286391, 0.00286415, 0.00286389, 0.00285858, 0.00285776,
       0.00284733, 0.00284927, 0.00285587, 0.00284777, 0.00285377,
       0.00284738, 0.00286175, 0.00284187, 0.00284762, 0.00284123,
       0.00283183, 0.00283757, 0.00284797, 0.00282714, 0.00281623,
       0.00282297, 0.00280843, 0.00280987, 0.0028102 , 0.00280245,
       0.00280249, 0.00281425, 0.0027992 , 0.00279464, 0.0027992 ,
       0.00279439, 0.0027763 , 0.00277307, 0.00277019, 0.00278148,
       0.00277169, 0.00275489, 0.00276468, 0.00275734, 0.00274451,
       0.00274257, 0.00274218, 0.00275072, 0.00273704, 0.00271091,
       0.00270943, 0.00271375, 0.00271696, 0.00271064, 0.00270924,
       0.00269711, 0.00269526, 0.00267912, 0.00268229, 0.00267865,
       0.00266763, 0.00264579, 0.00264951, 0.00264949, 0.00263432,
       0.00261388, 0.00262694, 0.00263121, 0.00261701, 0.00261951,
       0.00260331, 0.00259392, 0.00259246, 0.00258876, 0.00258065,
       0.00256513, 0.00256525, 0.00255693, 0.00253721, 0.00253501,
       0.00253614, 0.00252541, 0.00251543, 0.00250009, 0.00248048,
       0.00248539, 0.00249251, 0.002474  , 0.002468  , 0.00243952,
       0.00243311, 0.00244324, 0.00243121, 0.00242437, 0.00242182,
       0.00241158, 0.00237987, 0.00238674, 0.00237942, 0.00236205,
       0.00235565, 0.00234366, 0.00234175, 0.00232925, 0.00232238,
       0.00209493, 0.00210441, 0.00208087, 0.00207885, 0.00207061,
       0.00206829, 0.00204614, 0.00204075, 0.00202141, 0.00202227,
       0.00201676, 0.00199749, 0.00198952, 0.00198503, 0.0019751 ,
       0.00196309, 0.00193906, 0.00193038, 0.00192386, 0.00192063,
       0.00191264, 0.00188757, 0.00187937, 0.00187628, 0.00185891,
       0.00184988, 0.00184497, 0.00183835, 0.00182716, 0.00181147,
       0.00181871, 0.00177915, 0.00178643, 0.00177317, 0.00176385,
       0.00175148, 0.00174691, 0.00173135, 0.00171385, 0.00170172,
       0.00170096, 0.00167898, 0.00167445, 0.00165476, 0.00165786,
       0.0016416 , 0.00163549, 0.00162126, 0.00160757, 0.00159714,
       0.00157578, 0.00156311, 0.00156005, 0.00154052, 0.00152658,
       0.00152329, 0.00150449, 0.00149567, 0.00147009, 0.00146162,
       0.00146175, 0.00142883, 0.00142605, 0.00140757, 0.00139762,
       0.00139032, 0.00137479, 0.00136601, 0.00134701, 0.00133754,
       0.00132832, 0.00131237, 0.00129362, 0.00127879, 0.0012761 ,
       0.00126337, 0.00125122, 0.00124542, 0.00122344, 0.00120191,
       0.00119003, 0.00118888, 0.00116669, 0.0011473 , 0.00114438,
       0.00113435, 0.00112177, 0.00110673, 0.00109323, 0.00109464,
       0.00107809, 0.00106341, 0.00104904, 0.00102916, 0.00102669,
       0.00101378, 0.00100317, 0.00098875, 0.00098067, 0.00096445,
       0.00095721, 0.00094804, 0.00094091, 0.00091977, 0.00092059,
       0.00090069, 0.00089598, 0.00088747, 0.00087217, 0.00086631,
       0.00085375, 0.00084779, 0.00083345, 0.00082541, 0.00081696,
       0.00079618, 0.00078721, 0.00077798, 0.00077422, 0.00076572,
       0.00075419, 0.00074821, 0.0007377 , 0.00072146, 0.0007267 ,
       0.00070789, 0.00069964, 0.00069625, 0.00067577, 0.00067271,
       0.00066238, 0.00065126, 0.00064614, 0.00063833, 0.00062539,
       0.00061828, 0.00060413, 0.00060095, 0.00059326, 0.00058701,
       0.00057868, 0.000573  , 0.00056377, 0.0005522 , 0.00054181,
       0.00053344, 0.00052228, 0.00051599, 0.00051122, 0.00050052]))

In [None]:
# Reference arrays read by calc_RMSVE, loaded from .npy files in the current
# working directory (presumably the true state values and the state
# distribution used as weights -- confirm against how the files were made).
true_state_val, state_distribution = [
    np.load(npy_file)
    for npy_file in ('./true_V.npy', './state_distribution.npy')
]

def calc_RMSVE(learned_state_val):
    """Return the root-mean-squared value error (RMSVE) of a value estimate.

    The squared difference between the module-level ``true_state_val`` and
    ``learned_state_val`` is weighted element-wise by the module-level
    ``state_distribution``, summed, and square-rooted.

    Args:
        learned_state_val: array-like holding one value per state, in the same
            order as ``true_state_val``. Any shape is accepted as long as the
            number of entries matches; it is flattened to 1-D before use.

    Returns:
        The RMSVE as a scalar.

    Raises:
        ValueError: if ``true_state_val``, ``learned_state_val`` and
            ``state_distribution`` do not hold the same number of entries.
    """
    # Flatten to 1-D first: comparing only len() would let a (N, 1) estimate
    # broadcast against (N,) targets into an N x N matrix and silently
    # return a wrong error.
    true_val = np.ravel(true_state_val)
    learned_val = np.ravel(learned_state_val)
    weights = np.ravel(state_distribution)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not (true_val.size == learned_val.size == weights.size):
        raise ValueError(
            "size mismatch: true_state_val has {}, learned_state_val has {}, "
            "state_distribution has {} entries".format(
                true_val.size, learned_val.size, weights.size))

    MSVE = np.sum(weights * np.square(true_val - learned_val))
    return np.sqrt(MSVE)

# Define function to run experiment
def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters):
    """Run one agent setting for several runs and save the results to disk.

    For every run, ``rl_glue.rl_init`` is called with the run number as the
    seed for both agent and environment. The RMSVE (via ``calc_RMSVE``) is
    recorded before training and after every ``episode_eval_frequency``
    episodes, and the state values the agent reports at the end of the run
    are kept.

    Args:
        environment: environment handed to ``RLGlue``.
        agent: agent handed to ``RLGlue``.
        environment_parameters (dict): must provide "num_states",
            "start_state", "left_terminal_state", "right_terminal_state" and
            "discount_factor".
        agent_parameters (dict): must provide "num_hidden_layer",
            "num_hidden_units", "step_size", "beta_m", "beta_v" and "epsilon".
        experiment_parameters (dict): must provide "num_runs", "num_episodes"
            and "episode_eval_frequency".

    Returns:
        None. Results are written to disk:
        ``./results/V_<name>`` (state values, one row per run, saved with
        ``np.save``) and ``./results/RMSVE_<name>`` (arrays ``rmsve``,
        ``eval_freq`` and ``num_episodes``, saved with ``np.savez``), where
        ``<name>`` is ``rl_glue.agent.name`` with dots removed.
    """
    import time  # function-scope import; only needed for the pause below

    rl_glue = RLGlue(environment, agent)

    num_runs = experiment_parameters["num_runs"]
    num_episodes = experiment_parameters["num_episodes"]
    eval_frequency = experiment_parameters["episode_eval_frequency"]

    # RMSVE per run: column 0 is measured before training, column k after
    # k * eval_frequency episodes.
    agent_rmsve = np.zeros((num_runs, int(num_episodes / eval_frequency) + 1))

    # State values reported at the end of each run (one row per run).
    agent_state_val = np.zeros((num_runs, environment_parameters["num_states"]))

    env_info = {"num_states": environment_parameters["num_states"],
                "start_state": environment_parameters["start_state"],
                "left_terminal_state": environment_parameters["left_terminal_state"],
                "right_terminal_state": environment_parameters["right_terminal_state"]}

    agent_info = {"num_states": environment_parameters["num_states"],
                  "num_hidden_layer": agent_parameters["num_hidden_layer"],
                  "num_hidden_units": agent_parameters["num_hidden_units"],
                  "step_size": agent_parameters["step_size"],
                  "discount_factor": environment_parameters["discount_factor"],
                  "beta_m": agent_parameters["beta_m"],
                  "beta_v": agent_parameters["beta_v"],
                  "epsilon": agent_parameters["epsilon"]
                 }

    # Report the configured network size instead of a hard-coded "100".
    print('Setting - Neural Network with {} hidden units'.format(
        agent_parameters["num_hidden_units"]))
    # One-second pause kept from the original (previously a shell `sleep 1`);
    # presumably it lets the message appear before the tqdm bar starts.
    time.sleep(1)

    # one agent setting
    for run in tqdm(range(1, num_runs + 1)):
        # The run number doubles as the seed for both sides.
        env_info["seed"] = run
        agent_info["seed"] = run
        rl_glue.rl_init(agent_info, env_info)

        # Compute initial RMSVE before training
        current_V = rl_glue.rl_agent_message("get state value")
        agent_rmsve[run - 1, 0] = calc_RMSVE(current_V)

        for episode in range(1, num_episodes + 1):
            # run episode
            rl_glue.rl_episode(0)  # no step limit

            if episode % eval_frequency == 0:
                current_V = rl_glue.rl_agent_message("get state value")
                agent_rmsve[run - 1, int(episode / eval_frequency)] = calc_RMSVE(current_V)
            elif episode == num_episodes:
                # The last episode is not an evaluation point: still fetch the
                # final values so the row stored below is up to date.
                current_V = rl_glue.rl_agent_message("get state value")

        agent_state_val[run - 1, :] = current_V

    save_name = "{}".format(rl_glue.agent.name).replace('.', '')

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs('./results', exist_ok=True)

    # save the per-run state values
    np.save("./results/V_{}".format(save_name), agent_state_val)

    # save the per-run RMSVE curves together with the evaluation settings
    np.savez("./results/RMSVE_{}".format(save_name),
             rmsve=agent_rmsve,
             eval_freq=eval_frequency,
             num_episodes=num_episodes)


# ---- Configure and launch the experiment ----

# Number of runs, episodes per run, and how often RMSVE is recorded
experiment_parameters = {
    "num_runs": 20,
    "num_episodes": 1000,
    "episode_eval_frequency": 10,  # evaluate every 10 episodes
}

# Settings for the environment
environment_parameters = {
    "num_states": 500,
    "start_state": 250,
    "left_terminal_state": 0,
    "right_terminal_state": 501,
    "discount_factor": 1.0,
}

# Settings for the agent
agent_parameters = {
    "num_hidden_layer": 1,
    "num_hidden_units": 100,
    "step_size": 0.001,
    "beta_m": 0.9,
    "beta_v": 0.999,
    "epsilon": 0.0001,
}

# Environment and agent handed to run_experiment
current_env, current_agent = RandomWalkEnvironment, TDAgent

# Train and write the result files
run_experiment(
    environment=current_env,
    agent=current_agent,
    environment_parameters=environment_parameters,
    agent_parameters=agent_parameters,
    experiment_parameters=experiment_parameters,
)

# Plot the saved results for this agent
plot_result(["td_agent"])

# Bundle the ./results directory into results.zip
shutil.make_archive(base_name='results', format='zip', root_dir='./results')