# Building the environment

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import Networks
import numpy as np
import gym
from gym.spaces import Discrete, Box, Dict
from gym.envs.registration import EnvSpec

import ray
from ray import tune
from ray.rllib.agents.dqn.dqn_policy_graph import *
from ray.rllib.agents.ppo.ppo_policy_graph import *
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import MultiAgentEnv
from ray.rllib.models.preprocessors import DictFlatteningPreprocessor, Preprocessor

from ray.tune import run_experiments
from ray.tune.registry import register_env

---- TESTING ----
The Nash case
[3.75, 3.75, 3.75]
[1.5, 0.75, 0.75]
The social optimum case
[3.25, 3.5, 3.5]
[1.0, 0.5, 0.5]


### Environment Below


In [2]:
def reward_calculator(travel_time, marginal_cost, soc_fac):
    """Return the per-agent reward, i.e. the negated social-aware cost.

    For every agent in ``travel_time`` the reward is
    ``-(travel_time[agent] + soc_fac * marginal_cost[agent])``.

    Args:
        travel_time: dict mapping agent id -> travel time of its route.
        marginal_cost: dict mapping agent id -> marginal cost of its route;
            must contain every key of ``travel_time``.
        soc_fac: weight applied to the marginal cost.

    Returns:
        dict mapping agent id -> reward, with the same keys (and order)
        as ``travel_time``.
    """
    # TODO: add the Bregman divergence term.
    return {
        agent: -(agent_time + soc_fac * marginal_cost[agent])
        for agent, agent_time in travel_time.items()
    }

## Routing Environment 
class RoutingEnv(MultiAgentEnv):
    """
    Description:
        The cars start at the same origin point, Point A, and need to reach the same destination, Point Z.
        Each car can reach Point Z via a variety of routing choices described in a given network, in which
        each route introduces different travel times and congestion.
        The goal is to minimize the average travel times amongst each of the cars.

    Observation:
        A dict observation flattened by DictFlatteningPreprocessor:
        Key          Observation                 Min          Max
        prev_route   Previous Route Choice        0       num_paths-1
        prev_time    Route Travel Time            0           +Inf
        (A communication message is planned but not implemented yet.)

    Actions:
        Num     Action                      Min          Max
        0       Future Path Choice           0       num_paths-1
        (A communication message is planned but not implemented yet.)

    Reward:
        Reward for each car is determined by the following formula:
        marginal_cost = d[t(x_e)]/d[x_e]
        Cost = route_travel_time + λ(marginal_cost)
        Reward = -Cost
        ***
        route_travel_time: Travel time of the route previously taken by the car
        marginal_cost: Cost that the car's route choice imposes on everyone else.
                       The formula above captures the change in the travel flow
                       with respect to the change in vehicle flow on a given road.
        λ: Weight Toward Social Good (between 0 and 1), given as config['soc_fac']

    Starting State:
        Every car starts with prev_route = 0 and prev_time = 0.

    Episode Termination:
        As implemented, every episode ends after a single step: `step` marks
        every agent and "__all__" as done.
        The originally planned criteria (cars keep a consistent routing
        distribution, episode length greater than 200) are not implemented.
        Solved Requirements
        Considered solved when the average travel time is less than or equal to the theoretical social optimum.

    NOTE: `step` reads the module-level `network` object (not an attribute of
    the environment), so that object must exist before `step` is called.
    """

    def __init__(self, config):
        """
        Build the observation and action spaces from the environment config.

        Args:
            config: mapping with the keys
                'network'   -- name of the network,
                'num_paths' -- number of available paths,
                'soc_fac'   -- weight of the marginal cost in the reward,
                'num_veh'   -- number of vehicles (agents),
                'log_path'  -- (optional) file to which `step` appends the
                               actions and rewards; when absent, the
                               original hard-coded location is used.
        """
        self.network_name = config['network']
        self.num_paths = config['num_paths']
        self.soc_fac = config['soc_fac']
        self.num_veh = config['num_veh']
        # None means "use the default location computed in step()".
        self.log_path = config.get('log_path')
        self.num_obs = 2
        self.num_actions = 1
        self.state = None
        # Make observation space
        obs_spaces = {
            'prev_route': Discrete(self.num_paths),
            'prev_time': Box(low=0,
                             high=float('+inf'),
                             shape=(1,),
                             dtype=np.float32)
        }
        self.preprocessor = DictFlatteningPreprocessor(Dict(obs_spaces))
        self.observation_space = self.preprocessor.observation_space
        # Make the action space
        self.action_space = Discrete(self.num_paths)  # int between 0 and num_paths-1

    def get_state(self, **kwargs):
        """Return the latest observation dict (None before the first reset)."""
        return self.state

    def reset(self):
        """
        Reset the environment and return the initial observations.

        Returns:
            dict mapping 'car_<i>' (for i in range(num_veh)) to the
            preprocessed starting observation (route 0, travel time 0).
        """
        # Create initial observations for each vehicle
        start = {
            'prev_route': 0,
            'prev_time': 0
        }
        self.state = {'car_{}'.format(i): self.preprocessor.transform(start) for i in range(self.num_veh)}
        return self.state

    def step(self, action_dict):
        """
        Apply the path choice of every agent simultaneously.

        Args:
            action_dict: dict mapping agent id -> chosen path index
                (convertible to an int in [0, network.nb_paths)).

        Returns:
            (obs_dict, rew_dict, done, info_dict), each keyed by agent id;
            `done` additionally contains "__all__". Every entry of `done`
            is True, i.e. the episode ends after this step.

        Side effects:
            Updates the flows of the module-level `network` and appends the
            actions and rewards to the log file.
        """
        # TODO: add the communication channel.
        obs_dict, done, info_dict = {}, {}, {}

        # Count how many agents picked each path.
        paths_flow_dict = {}
        for agent, rl_action in action_dict.items():
            # agent is the string id of the agent, rl_action its path choice.
            rl_action = int(rl_action)
            assert 0 <= rl_action < network.nb_paths
            paths_flow_dict[rl_action] = paths_flow_dict.get(rl_action, 0) + 1

        # update the path travel times of the network given the path flows
        network.update_flow_from_dict(paths_flow_dict)

        # Calculate states and done for each agent
        travel_time = {}
        marginal_cost = {}
        for agent, path_choice in action_dict.items():
            # Already range-checked in the loop above.
            path_choice = int(path_choice)
            # network.travel_time(path) returns the travel time of the path
            travel_time[agent] = network.travel_time(path_choice)
            # network.marginal_cost(path) returns the marginal cost of the path
            marginal_cost[agent] = network.marginal_cost(path_choice)
            new_obs = {
                'prev_route': path_choice,
                'prev_time': travel_time[agent]
            }
            obs_dict[agent] = self.preprocessor.transform(new_obs)
            # Single-step episodes: every agent is done right away.
            done[agent] = True
            info_dict[agent] = {}
        rew_dict = reward_calculator(travel_time, marginal_cost, self.soc_fac)
        self.state = obs_dict

        log_path = self.log_path
        if log_path is None:
            log_path = ("/Users/theophile/Documents/Classes/FLOW/Project/learning_wardrop/test_lambda_"
                        + str(self.soc_fac) + "_gamma_" + str(0) + "_trail_" + str(1))
        # The context manager closes the file even if a write fails.
        with open(log_path, 'a') as log_file:
            log_file.write("Actions: " + str(action_dict) + '\n')
            log_file.write("Reward: " + str(rew_dict) + '\n')

        done["__all__"] = True

        return obs_dict, rew_dict, done, info_dict

### The following code runs the experiment for the multiagent problem.

Remark:
On the Braess network with 4 vehicles, we should get:
- if the social factor is 0 (Nash equilibrium): an average reward of -3.75, with 2 cars on the first path and 1 car on each of the second and third paths
- if the social factor is 1 (social optimum): an average travel time of -3.5 (the corresponding reward has not been filled in yet), with 2 cars on the first path and 1 car on each of the second and third paths

In [None]:

# Build the network, the environment configuration and the policy shared by
# every vehicle, then (when run as a script) launch the DQN experiment.

network_name = 'Braess'
nb_veh = 4
# Build the network object from its name and the number of vehicles.
# NOTE: RoutingEnv.step reads this module-level `network` directly, so it
# must be defined before any environment step is taken.
network = Networks.network(network_name, nb_veh)
# nb_paths is expected to be exposed as a property of the network
# (see the interface sketch in the string below).

"""
define a function (class instantiation) which have for parameter a network name, 
and the number of vehicles, 
the return the num paths.

interface of the class network
class network:
    def __init__(self, network_name, nb_veh):
        load the network which correspond to the network_name
        define the nb_veh as the nb_veh
        from nb_veh and the intern demand define the number of flow that each veh represent
        also define __nb_paths to give it to the Env

    @property
    def nb_paths(self):
        return self.__nb_paths
"""

nb_path = network.nb_paths

# Configuration handed to RoutingEnv (both here and by RLlib via 'env_config').
env_config = {
    'network': network_name,
    'num_veh': nb_veh,
    'num_paths': nb_path,
    'soc_fac': 0 # weight of the marginal cost in the reward; change per experiment
}
# A throw-away environment instance, used only to read the spaces.
routing_env = RoutingEnv(env_config)
car_obs_space = routing_env.observation_space
car_act_space = routing_env.action_space
# Per-policy config override (presumably the discount factor -- confirm against RLlib docs).
config = {"gamma": 0.0}
# A single policy, 'vehicles', is defined; every agent is mapped to it below.
policy_graphs = {
    'vehicles': (DQNPolicyGraph, car_obs_space, car_act_space, config)
}

if __name__ == "__main__":
    print("Begin")
    env_creator_name = 'multi_routing'
    # Register the environment under the name referenced by 'env' below.
    register_env(env_creator_name, lambda config: RoutingEnv(config))
    ray.init()
    experiments = {
        'route-DQN': {
            'run': 'DQN',
            'env': 'multi_routing',
            'stop': {
                'training_iteration': 100
            },
            'config': {
                'env_config': env_config,
                'multiagent': {
                    'policy_graphs': policy_graphs,
                    # Every agent id maps to the shared 'vehicles' policy.
                    'policy_mapping_fn': tune.function(lambda agent_id: 'vehicles')
                }
            }
        },
        # put additional experiments to run concurrently here
    }
    # NOTE: "End" marks the end of the setup; it is printed before the
    # experiments are actually run.
    print("End")
    
    run_experiments(experiments)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2018-12-02_14-17-27_4123/logs.
Waiting for redis server at 127.0.0.1:46916 to respond...


Begin


Waiting for redis server at 127.0.0.1:15033 to respond...
Starting the Plasma object store with 6.871947672999999 GB memory using /tmp.

View the web UI at http://localhost:8889/notebooks/ray_ui.ipynb?token=e307b41ab410f6df019cf44bd7b3f7debfeccd4e5ee6fb2e

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs
Memory usage on this node: 8.9/17.2 GB

Created LogSyncer for /Users/theophile/ray_results/route-DQN/DQN_multi_routing_0_2018-12-02_14-17-28058wpch7 -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 8.9/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING



End


Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-17-42
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.0
  episode_reward_mean: -15.0235
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 1000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: .nan
    max_exploration: 1.0
    min_exploration: 1.0
    num_steps_sampled: 1000
    num_steps_trained: 0
    num_target_updates: 1
    opt_peak_throughput: 0.0
    opt_samples: .nan
    replay_time_ms: .nan
    sample_time_ms: 10.741
    update_time_ms: 0.001
  iterations_since_restore: 1
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -3.755875
  time_since_restore: 3.111064910888672
  time_this_iter_s: 3.111064910888672
  time_total_s: 3.111064910888672
  timestamp: 1543789062
  timesteps_since_restore: 1000
  timesteps_this_iter: 1000
  timesteps_total: 1000
  training_iter

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.8/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 36 s, 7 iter, 7000 ts, -15.4 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-18-22
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.0
  episode_reward_mean: -15.537
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 8000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.15
    max_exploration: 0.31400000000000006
    min_exploration: 0.31400000000000006
    num_steps_sampled: 8000
    num_steps_trained: 56000
    num_target_updates: 15
    opt_peak_throughput: 4475.297
    opt_samples: 32.0
    replay_time_ms: 3.322
    sample_time_ms: 11.562
    update_time_ms: 0.001
  iterations_since_restore: 8
  node_ip: 10.142.38.66
  num_metr

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-18-56
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -16.0205
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 14000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.165
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 14000
    num_steps_trained: 104000
    num_target_updates: 27
    opt_peak_throughput: 4465.96
    opt_samples: 32.0
    replay_time_ms: 3.063
    sample_time_ms: 11.439
    update_time_ms: 0.001
  iterations_since_restore: 14
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -4.005125
  time_since_restore: 76.59287691116333
  time_this_iter_s: 5.600429058074951
  time_total_s: 76.59287691116333
  timestamp: 1543789136
  timesteps_since_restore: 14000
  timesteps_this_

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.9/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 110 s, 20 iter, 20000 ts, -15.9 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-19-36
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.9765
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 21000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 6.908
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 21000
    num_steps_trained: 160000
    num_target_updates: 41
    opt_peak_throughput: 4632.319
    opt_samples: 32.0
    replay_time_ms: 3.157
    sample_time_ms: 11.234
    update_time_ms: 0.001
  iterations_since_restore: 21
  node_ip: 10.142.38.66

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-20-10
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.979
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 27000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.051
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 27000
    num_steps_trained: 208000
    num_target_updates: 53
    opt_peak_throughput: 4538.354
    opt_samples: 32.0
    replay_time_ms: 3.311
    sample_time_ms: 11.609
    update_time_ms: 0.001
  iterations_since_restore: 27
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -3.99475
  time_since_restore: 150.59256219863892
  time_this_iter_s: 5.739639043807983
  time_total_s: 150.59256219863892
  timestamp: 1543789210
  timesteps_since_restore: 27000
  timesteps_this

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.9/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 184 s, 33 iter, 33000 ts, -16 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-20-50
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.0
  episode_reward_mean: -16.029
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 34000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 6.835
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 34000
    num_steps_trained: 264000
    num_target_updates: 67
    opt_peak_throughput: 4681.584
    opt_samples: 32.0
    replay_time_ms: 3.207
    sample_time_ms: 11.974
    update_time_ms: 0.001
  iterations_since_restore: 34
  node_ip: 10.142.38.66
  

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-21-24
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.9505
  episode_reward_min: -16.0
  episodes_this_iter: 1000
  episodes_total: 40000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 6.988
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 40000
    num_steps_trained: 312000
    num_target_updates: 79
    opt_peak_throughput: 4579.247
    opt_samples: 32.0
    replay_time_ms: 3.08
    sample_time_ms: 11.013
    update_time_ms: 0.001
  iterations_since_restore: 40
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -3.987625
  time_since_restore: 223.81462216377258
  time_this_iter_s: 5.8126490116119385
  time_total_s: 223.81462216377258
  timestamp: 1543789284
  timesteps_since_restore: 40000
  timesteps_th

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.7/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 256 s, 46 iter, 46000 ts, -16 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-22-02
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.9395
  episode_reward_min: -16.0
  episodes_this_iter: 1000
  episodes_total: 47000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.664
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 47000
    num_steps_trained: 368000
    num_target_updates: 93
    opt_peak_throughput: 4175.514
    opt_samples: 32.0
    replay_time_ms: 3.537
    sample_time_ms: 13.268
    update_time_ms: 0.001
  iterations_since_restore: 47
  node_ip: 10.142.38.66
 

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-22-36
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.975
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 53000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.505
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 53000
    num_steps_trained: 416000
    num_target_updates: 105
    opt_peak_throughput: 4264.102
    opt_samples: 32.0
    replay_time_ms: 3.593
    sample_time_ms: 13.009
    update_time_ms: 0.001
  iterations_since_restore: 53
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -3.99375
  time_since_restore: 295.90749430656433
  time_this_iter_s: 5.801539659500122
  time_total_s: 295.90749430656433
  timestamp: 1543789356
  timesteps_since_restore: 53000
  timesteps_thi

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.8/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 329 s, 59 iter, 59000 ts, -16.1 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-23-16
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.944
  episode_reward_min: -16.0
  episodes_this_iter: 1000
  episodes_total: 60000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 6.638
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 60000
    num_steps_trained: 472000
    num_target_updates: 119
    opt_peak_throughput: 4821.056
    opt_samples: 32.0
    replay_time_ms: 3.257
    sample_time_ms: 11.405
    update_time_ms: 0.001
  iterations_since_restore: 60
  node_ip: 10.142.38.66

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-23-50
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -15.952
  episode_reward_min: -16.0
  episodes_this_iter: 1000
  episodes_total: 66000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 6.785
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 66000
    num_steps_trained: 520000
    num_target_updates: 130
    opt_peak_throughput: 4716.527
    opt_samples: 32.0
    replay_time_ms: 3.122
    sample_time_ms: 11.269
    update_time_ms: 0.001
  iterations_since_restore: 66
  node_ip: 10.142.38.66
  num_metric_batches_dropped: 0
  pid: 4140
  policy_reward_mean:
    vehicles: -3.988
  time_since_restore: 369.21275067329407
  time_this_iter_s: 5.526295900344849
  time_total_s: 369.21275067329407
  timestamp: 1543789430
  timesteps_since_restore: 66000
  timesteps_this_

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/8 CPUs, 0/0 GPUs
Memory usage on this node: 9.8/17.2 GB
Result logdir: /Users/theophile/ray_results/route-DQN
RUNNING trials:
 - DQN_multi_routing_0:	RUNNING [pid=4140], 402 s, 72 iter, 72000 ts, -16 rew

Result for DQN_multi_routing_0:
  custom_metrics: {}
  date: 2018-12-02_14-24-28
  done: false
  episode_len_mean: 1.0
  episode_reward_max: -14.5
  episode_reward_mean: -16.0075
  episode_reward_min: -17.0
  episodes_this_iter: 1000
  episodes_total: 73000
  experiment_id: e64e0f202d094d40a940a456ddf5bd06
  hostname: C02X23AUJHD3
  info:
    grad_time_ms: 7.071
    max_exploration: 0.020000000000000018
    min_exploration: 0.020000000000000018
    num_steps_sampled: 73000
    num_steps_trained: 576000
    num_target_updates: 144
    opt_peak_throughput: 4525.424
    opt_samples: 32.0
    replay_time_ms: 3.167
    sample_time_ms: 11.058
    update_time_ms: 0.001
  iterations_since_restore: 73
  node_ip: 10.142.38.66


In [None]:
def get_tt_mc(action_dict, network, soc_fact):
    """Recompute travel times, marginal costs and rewards for one joint action.

    Args:
        action_dict: dict mapping agent id -> chosen path index
            (anything convertible with ``int``).
        network: object providing ``update_flow_from_dict``, ``travel_time``
            and ``marginal_cost``; its flows are updated in place.
        soc_fact: weight of the marginal cost passed to ``reward_calculator``.

    Returns:
        Tuple ``(travel_time, marginal_cost, rew_dict)`` of dicts keyed by
        agent id.
    """
    # Number of agents on each chosen path.
    flow_per_path = {}
    for raw_choice in action_dict.values():
        chosen = int(raw_choice)
        flow_per_path[chosen] = flow_per_path.get(chosen, 0) + 1
    network.update_flow_from_dict(flow_per_path)

    # Query the updated network for every agent's path.
    travel_time, marginal_cost = {}, {}
    for agent in action_dict:
        chosen = int(action_dict[agent])
        travel_time[agent] = network.travel_time(chosen)
        marginal_cost[agent] = network.marginal_cost(chosen)

    return travel_time, marginal_cost, reward_calculator(travel_time, marginal_cost, soc_fact)

In [None]:
import ast


def _print_iteration_summary(title, action_dict, reward_dict, network,
                             travel_time, marginal_cost, rew_dict):
    """Print the choices, rewards and costs of one logged step."""
    print(title)
    print("Path choice: " + str(action_dict))
    print("Reward ray: " + str(reward_dict))
    print("Travel time paths: " + str({"path " + str(i): network.travel_time(i) for i in range(3)}))
    print("Travel time cars: " + str(travel_time))
    print("Marginal cost: " + str(marginal_cost))
    print("Reward network: " + str(rew_dict))


# Read the log written by RoutingEnv.step (alternating "Actions: {...}" and
# "Reward: {...}" lines) and collect, for every record, the path choices, the
# logged rewards and the recomputed travel times.
# We want to plot the evolution of the path choice, of the reward and of the
# travel time.
j = 0
action_rows, reward_rows, travel_time_rows = [], [], []
with open("/Users/theophile/Documents/Classes/FLOW/Project/learning_wardrop/test_lambda_1_gamma_0_trail_1", 'r') as file:
    while True:
        j = j + 1
        actions = file.readline()
        rewards = file.readline()
        # Only parsing failures end the loop: at end of file readline()
        # returns '' and the split raises IndexError; a malformed record
        # raises ValueError/SyntaxError in literal_eval. Errors from the
        # network or numpy are no longer silently swallowed.
        try:
            parsed_actions = ast.literal_eval("{" + actions.split('{')[1].split('}')[0] + "}")
            parsed_rewards = ast.literal_eval("{" + rewards.split('{')[1].split('}')[0] + "}")
        except (IndexError, ValueError, SyntaxError):
            break
        action_dict, reward_dict = parsed_actions, parsed_rewards

        network = Networks.network(network_name, nb_veh)
        travel_time, marginal_cost, rew_dict = get_tt_mc(action_dict, network, 1)

        action_rows.append(np.fromiter(action_dict.values(), dtype=int))
        reward_rows.append(np.fromiter(reward_dict.values(), dtype=float))
        travel_time_rows.append(np.fromiter(travel_time.values(), dtype=float))
        if j == 1:
            _print_iteration_summary("------ First iteration ------", action_dict, reward_dict,
                                     network, travel_time, marginal_cost, rew_dict)

# Row 0 is a placeholder row of zeros (the plots skip it with [1:]); stacking
# once at the end avoids re-copying the whole array for every record.
seed_row = np.zeros((1, nb_veh), dtype=int)
Actions_plot = np.vstack([seed_row] + action_rows)
Reward_plot = np.vstack([seed_row] + reward_rows)
Travel_time_plot = np.vstack([seed_row] + travel_time_rows)

print()
if j > 1:
    # j was incremented once more for the read that hit the end of the log.
    _print_iteration_summary("------ Last iteration ------", action_dict, reward_dict,
                             network, travel_time, marginal_cost, rew_dict)
else:
    print("No complete record could be read from the log file.")


In [None]:
import matplotlib.pyplot as plt

# One column per car in each array; row 0 is the placeholder row and is
# skipped. Iterate over every column so that no car is left out (the former
# hard-coded range(3) dropped the last of the four cars).
for i in range(Actions_plot.shape[1]):
    per_car_series = (
        (Actions_plot, "Path choice of car "),
        (Reward_plot, "Rewards of car "),
        (Travel_time_plot, "Travel time of car "),
    )
    for series, label in per_car_series:
        plt.plot(series[1:, i], "+")
        plt.ylabel(label + str(i))
        plt.xlabel("Iteration")
        plt.show()
