# Building the environment

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import numpy as np
import gym
from gym.spaces import Discrete, Box, Dict
from gym.envs.registration import EnvSpec

import ray
from ray import tune
from ray.rllib.agents.dqn.dqn_policy_graph import *
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import MultiAgentEnv
from ray.rllib.models.preprocessors import DictFlatteningPreprocessor, Preprocessor

from ray.tune import run_experiments
from ray.tune.registry import register_env

### Environment Below


In [1]:
def reward_calculator(travel_time, marginal_cost, soc_fac):
    """Return the reward of every agent as the negative of its social cost.

    The cost of an agent is ``travel_time + soc_fac * marginal_cost`` and the
    reward is ``-cost`` (see the "Reward" section of ``RoutingEnv``), so that
    maximising the reward minimises the cost.

    Args:
        travel_time: dict mapping agent id -> travel time of the chosen route.
        marginal_cost: dict mapping agent id -> marginal cost of the chosen
            route. Must contain every key of ``travel_time``.
        soc_fac: weight given to the marginal cost.

    Returns:
        dict mapping agent id -> reward.

    Raises:
        KeyError: if an agent of ``travel_time`` is missing in
            ``marginal_cost``.
    """
    return {
        agent: -(time + soc_fac * marginal_cost[agent])
        for agent, time in travel_time.items()
    }

## Routing Environment 
class RoutingEnv(MultiAgentEnv):
    """
    Description:
        The cars start at the same origin point, Point A, and need to reach the same destination, Point Z.
        Each car can reach Point Z via a variety of routing choices described in a given network, in which
        each route introduces different travel times and congestion.
        The goal is to minimize the average travel time of the cars.

    Observation:
        Num  Observation                 Min          Max
        0    Previous Route Choice        0      num_paths-1
        1    Route Travel Time            0           +Inf
        (A "Comm Message" observation in (-Inf, +Inf) is planned but not implemented.)

    Actions:
        Num  Action                      Min          Max
        0    Future Path Choice           0      num_paths-1
        (A "Comm Message" action in (-Inf, +Inf) is planned but not implemented.)

    Reward:
        Reward for each car is determined by the following formula:
        marginal_cost = d[t(x_e)]/d[x_e]
        Cost = route_travel_time + λ(marginal_cost)
        Reward = -Cost
        ***
        route_travel_time: Travel time of the route previously taken by the car
        marginal_cost: Cost that the car's route choice imposes on everyone else.
                       The formula above captures the change in the travel time
                       with respect to the change in vehicle flow on a given road.
        λ: Weight Toward Social Good (between 0 and 1), given as config['soc_fac']

        The reward itself is computed by the module-level `reward_calculator`.

    Starting State:
        Every car observes route 0 and a travel time of 0.

    Episode Termination:
        Currently every car (and "__all__") is marked done after a single step.
        Planned criteria, not implemented yet:
            Cars keep a consistent routing distribution.
            Episode length is greater than 200.
        Solved Requirements
        Considered solved when the average travel time is less than or equal to the theoretical social optimum.

    Note:
        `step` relies on a module-level `network` object exposing
        `update_flow_from_dict`, `travel_time` and `marginal_cost`.
    """

    def __init__(self, config):
        """
        Build the observation and action spaces from the environment config.

        Args:
            config: dict with the keys
                'network'   name of the network,
                'num_paths' number of routes a car can choose from,
                'soc_fac'   weight of the marginal cost in the reward,
                'num_veh'   number of cars (agents).
        """
        self.network_name = config['network']
        self.num_paths = config['num_paths']
        self.soc_fac = config['soc_fac']
        self.num_veh = config['num_veh']
        self.num_obs = 2
        self.num_actions = 1
        self.state = None
        # Make observation space
        obs_spaces = {
            'prev_route': Discrete(self.num_paths),
            'prev_time': Box(low=0,
                             high=float('+inf'),
                             shape=(1,),
                             dtype=np.float32)
        }
        # The Dict observation is flattened so that the policy sees one array
        self.preprocessor = DictFlatteningPreprocessor(Dict(obs_spaces))
        self.observation_space = self.preprocessor.observation_space
        # Make the action space
        self.action_space = Discrete(self.num_paths)  # int between 0 and num_paths-1

    def get_state(self, **kwargs):
        """Return the latest per-agent observations (None before `reset`)."""
        return self.state

    def reset(self):
        """
        Reset the environment.

        Returns:
            dict mapping 'car_<i>' (i in 0..num_veh-1) to the flattened
            initial observation (route 0, travel time 0).
        """
        # Create initial observations for each vehicle
        start = {
            'prev_route': 0,
            'prev_time': 0
        }
        self.state = {
            'car_{}'.format(i): self.preprocessor.transform(start)
            for i in range(self.num_veh)
        }
        return self.state

    def _validate_action(self, agent, rl_action):
        """
        Return `rl_action` as a plain int.

        NumPy integers are accepted as well as Python ints, since policies
        commonly emit NumPy scalars.

        Raises:
            ValueError: if the action is not an integer in [0, num_paths).
        """
        is_integer = isinstance(rl_action, (int, np.integer))
        if not is_integer or not 0 <= rl_action < self.num_paths:
            raise ValueError(
                'Action of agent {!r} must be an integer in [0, {}), got {!r}'
                .format(agent, self.num_paths, rl_action))
        return int(rl_action)

    def step(self, action_dict):
        """
        Apply the route choice of every agent simultaneously.

        Args:
            action_dict: dict mapping agent id -> chosen path index.

        Returns:
            Tuple (obs_dict, rew_dict, done, info_dict), each keyed by agent
            id; `done` additionally holds the "__all__" key.

        Raises:
            ValueError: if an action is not a valid path index.
        """
        # Validate every action first so that the network is left untouched
        # when one of them is invalid.
        path_choices = {
            agent: self._validate_action(agent, rl_action)
            for agent, rl_action in action_dict.items()
        }

        # paths_flow_dict stores the number of vehicles on every chosen path
        paths_flow_dict = {}
        for path_choice in path_choices.values():
            paths_flow_dict[path_choice] = paths_flow_dict.get(path_choice, 0) + 1

        # update the path travel times of the network given the path flows
        network.update_flow_from_dict(paths_flow_dict)

        # Calculate states and done for each agent
        obs_dict, done, info_dict = {}, {}, {}
        travel_time = {}
        marginal_cost = {}
        for agent, path_choice in path_choices.items():
            # network.travel_time(path) returns the travel time of the path
            travel_time[agent] = network.travel_time(path_choice)
            # network.marginal_cost(path) returns the marginal cost of the path
            marginal_cost[agent] = network.marginal_cost(path_choice)
            new_obs = {
                'prev_route': path_choice,
                'prev_time': travel_time[agent]
            }
            obs_dict[agent] = self.preprocessor.transform(new_obs)
            # Set done and infos
            done[agent] = True
            info_dict[agent] = {}

        # The social factor is an attribute of the env, not a global.
        rew_dict = reward_calculator(travel_time, marginal_cost, self.soc_fac)
        self.state = obs_dict
        done["__all__"] = True

        return obs_dict, rew_dict, done, info_dict

NameError: name 'MultiAgentEnv' is not defined

### The following code runs the experiment for the multiagent problem.

In [None]:
# Experiment set-up: every vehicle shares a single DQN policy.

network_name = 'Braess'
nb_veh = 1
# Build the Network object for the chosen scenario.
# ----- TO DO -----: import the network class (it is not imported in the
# visible code). Note that the instance is bound over the class name.
network = network(network_name, nb_veh)
# nb_paths is expected to be a property of the network object.

"""
define a function (class instantiation) which have for parameter a network name, 
and the number of vehicles, 
the return the num paths.

interface of the class network
class network:
    def __init__(self, network_name, nb_veh):
        load the network which correspond to the network_name
        define the nb_veh as the nb_veh
        from nb_veh and the intern demand define the number of flow that each veh represent
        also define __nb_paths to give it to the Env

    @property
    def nb_paths(self):
        return self.__nb_paths
"""

nb_path = network.nb_paths

# Configuration handed to RoutingEnv (here and through the experiment spec).
env_config = dict(
    network=network_name,
    num_veh=nb_veh,
    num_paths=nb_path,
    soc_fac=0.5,
)

# A throw-away environment instance, only used to read the spaces.
routing_env = RoutingEnv(env_config)
car_obs_space, car_act_space = (
    routing_env.observation_space,
    routing_env.action_space,
)

# Policy-specific overrides.
config = dict(gamma=0.85)
policy_graphs = dict(
    vehicles=(DQNPolicyGraph, car_obs_space, car_act_space, config),
)

if __name__ == "__main__":
    # Make the environment available to tune under a string id.
    env_creator_name = 'multi_routing'
    register_env(env_creator_name, lambda env_cfg: RoutingEnv(env_cfg))
    ray.init()

    # Every agent id is mapped onto the single 'vehicles' policy.
    multiagent_spec = {
        'policy_graphs': policy_graphs,
        'policy_mapping_fn': tune.function(lambda agent_id: 'vehicles'),
    }
    dqn_experiment = {
        'run': 'DQN',
        'env': env_creator_name,
        'stop': {'training_iteration': 10},
        'config': {
            'env_config': env_config,
            'multiagent': multiagent_spec,
        },
    }

    # put additional experiments to run concurrently in this dict
    experiments = {'route-DQN': dqn_experiment}

    run_experiments(experiments)