In [1]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
import math
from gymnasium.spaces import Discrete, Box, Sequence, Dict
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output

from ray.rllib.utils.typing import AgentID

class PointCoverageEnv(MultiAgentEnv):
    """Multi-agent grid world in which agents try to stand on ("cover") targets.

    Agents and targets occupy integer cells of a ``width`` x ``height`` grid.
    At every step each acting agent moves by one cell (or stays still) and is
    clamped to the grid.  The episode terminates as soon as every target cell
    is occupied by an agent, and is truncated once ``steps`` exceeds
    ``max_steps`` (when that option is set).

    Observations contain the agent position, the other agents' positions (only
    when there is more than one agent) and the target positions, each divided
    by the grid width/height.  With ``observations_memory > 1`` the last
    ``observations_memory`` such snapshots are stacked under the keys
    ``t(-0)``, ``t(-1)``, ...  Unless ``use_nested_observation`` is set, the
    nested dict is flattened with ``gymnasium.spaces.utils.flatten``.
    """

    # Action index -> (dx, dy) grid displacement; index 4 keeps the agent still.
    actions_dict = [(0,-1),(0,1),(1,0),(-1,0),(0,0)]

    def __init__(self, config):
        """Read the environment settings from the mapping ``config``.

        Required keys: ``width``, ``height``, ``n_agents``, ``n_targets``.
        Optional keys: ``observations_memory`` (default 1), ``max_steps``
        (default ``None``, i.e. never truncate) and ``use_nested_observation``
        (default ``False``).
        """
        # NOTE(review): the MultiAgentEnv constructor is not invoked here;
        # confirm whether the RLlib version in use expects super().__init__().
        self.observations_memory = config.get("observations_memory", 1)
        self.width = config["width"]
        self.height = config["height"]
        self.n_agents = config["n_agents"]
        self.n_targets = config["n_targets"]
        self.max_steps = config.get("max_steps")
        self.use_nested_observation = config.get("use_nested_observation", False)
        self.agents = ['agent-' + str(i) for i in range(self.n_agents)]
        # From here on these instance attributes shadow the per-agent methods
        # of the same name: every agent shares one observation/action space.
        self.observation_space = self.observation_space('agent-0')
        self.action_space = Discrete(5)

    def unflatten_observation_space(self, agent):
        """Return the nested (dict-structured) observation space.

        ``agent`` is accepted for interface symmetry but not used: the space
        is the same for every agent.
        """
        coordinates_space = Box(low=np.array([0.0, 0.0], dtype=np.float32), high=np.array([1.0, 1.0], dtype=np.float32), dtype=np.float32)
        # Keys are inserted in the order position / other_agents / targets.
        obs_space = {"position": coordinates_space}
        if self.n_agents > 1:
            obs_space["other_agents"] = Dict({f"other_agent-{i}": coordinates_space for i in range(self.n_agents-1)})
        obs_space["targets"] = Dict({f"target-{i}": coordinates_space for i in range(self.n_targets)})
        obs_space = Dict(obs_space)

        if self.observations_memory > 1:
            # One copy of the single-step space per remembered time step.
            return Dict({f"t(-{i})": obs_space for i in range(self.observations_memory)})
        return obs_space

    def observation_space(self, agent):
        """Return the nested or flattened observation space, per configuration."""
        if self.use_nested_observation:
            return self.unflatten_observation_space(agent)
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        """Return the action space: one discrete action per ``actions_dict`` entry."""
        return Discrete(5)

    def __get_other_agents(self, agent):
        """Return every agent id except ``agent``, in ``self.agents`` order."""
        return [other for other in self.agents if other != agent]

    def __get_random_point(self):
        """Sample a uniformly random grid cell as an ``(x, y)`` tuple."""
        return (rnd.randint(0, self.width-1), rnd.randint(0, self.height-1))

    def __get_normalized_position(self, position):
        """Scale an ``(x, y)`` cell by the grid width and height."""
        return (position[0]/self.width, position[1]/self.height)

    def __get_unflatten_time_t_observation(self, agent):
        """Build the nested observation of ``agent`` for the current state only."""
        time_t_obs = {"position": self.__get_normalized_position(self.agent_pos[agent])}
        if self.n_agents > 1:
            time_t_obs["other_agents"] = {
                f"other_agent-{i}": self.__get_normalized_position(self.agent_pos[other])
                for i, other in enumerate(self.__get_other_agents(agent))
            }
        time_t_obs["targets"] = {
            f"target-{i}": self.__get_normalized_position(pos)
            for i, pos in enumerate(self.targets)
        }
        return time_t_obs

    def __get_observation(self, agent):
        """Return the observation of ``agent``, updating its memory buffer.

        When ``observations_memory > 1`` the oldest stored snapshot is dropped
        and the current one appended, so calling this method has a side effect
        on ``self.agents_memory``.
        """
        time_t_obs = self.__get_unflatten_time_t_observation(agent)

        if self.observations_memory > 1:
            memory = self.agents_memory[agent]
            memory.pop(0)
            memory.append(time_t_obs)
            # "t(-0)" is the newest snapshot, i.e. the last list element.
            obs = {f"t(-{i})": memory[self.observations_memory-1-i] for i in range(self.observations_memory)}
        else:
            obs = time_t_obs

        if self.use_nested_observation:
            return obs
        return flatten(self.unflatten_observation_space(agent), obs)

    def __get_not_covered_targets(self):
        """Return the set of target cells on which no agent currently stands."""
        return set(self.targets) - set(self.agent_pos.values())

    def __is_target_contended(self, target):
        """Return True when more than one agent stands on ``target``."""
        return sum(1 for pos in self.agent_pos.values() if pos == target) > 1

    def __get_reward(self, agent):
        """Return the step reward: -1 per step plus the shared coverage bonus.

        ``agent`` is unused because the reward is identical for every agent.
        """
        return -1 + self.__get_global_reward()

    def __get_global_reward(self):
        """Return 10 for every pending target that an agent now stands on.

        Distinct cells are counted on both sides, so duplicated target points
        (``reset`` samples targets independently) cannot yield a bonus without
        any agent actually standing on a pending target.
        """
        pending = set(self.not_covered_target)
        occupied = set(self.agent_pos.values())
        return len(pending & occupied) * 10

    def __update_agent_position(self, agent, x, y):
        """Move ``agent`` by ``(x, y)``, clamping the result to the grid."""
        old_x, old_y = self.agent_pos[agent]
        self.agent_pos[agent] = (max(min(old_x + x, self.width-1), 0),
                                 max(min(old_y + y, self.height-1), 0))

    def reset(self, seed=None, options=None):
        """Start a new episode with random agent and target cells.

        ``seed`` and ``options`` are accepted but not used.  Returns the
        per-agent observation dict and an empty info dict.
        """
        self.agent_pos = {agent: self.__get_random_point() for agent in self.agents}
        self.targets = [self.__get_random_point() for _ in range(self.n_targets)]
        self.not_covered_target = self.targets.copy()
        self.steps = 0
        # Pre-fill each memory buffer with the initial snapshot.
        self.agents_memory = {agent: [self.__get_unflatten_time_t_observation(agent)]*self.observations_memory for agent in self.agents}
        return {agent: self.__get_observation(agent) for agent in self.agents}, {}

    def step(self, actions):
        """Apply ``actions`` (agent id -> action index) and advance one step.

        Returns ``(observations, rewards, terminated, truncated, infos)``.
        The per-agent dicts only contain the acting agents that are not
        standing alone on a target; ``terminated`` and ``truncated`` also
        carry the ``'__all__'`` entry.
        """
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        self.old_agent_pos = self.agent_pos.copy()
        # Move everybody first so rewards/observations see the joint outcome.
        for agent, action in actions.items():
            dx, dy = self.actions_dict[action]
            self.__update_agent_position(agent, dx, dy)

        for agent in actions:
            position = self.agent_pos[agent]
            # NOTE(review): an agent standing alone on a target is skipped, so
            # it gets no reward entry for the step in which it covers the
            # target -- confirm this is intended.
            if position in self.targets and not self.__is_target_contended(position):
                continue
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_reward(agent)
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {}

        truncated['__all__'] = self.max_steps is not None and self.steps > self.max_steps

        # Targets reached during this step stop paying the coverage bonus.
        self.not_covered_target = list(set(self.not_covered_target) - set(self.agent_pos.values()))

        terminated['__all__'] = not self.__get_not_covered_targets()
        return observations, rewards, terminated, truncated, infos

    def render(self, mode='text'):
        """Print the grid: ``o`` agent, ``x`` target, ``*`` agent on a target.

        ``mode`` is accepted but not used.
        """
        occupied = set(self.agent_pos.values())
        targets = set(self.targets)
        lines = ['_' * (self.width+2)]
        for i in range(self.height):
            cells = []
            for j in range(self.width):
                cell = (j, i)
                if cell in occupied and cell in targets:
                    cells.append('*')
                elif cell in occupied:
                    cells.append('o')
                elif cell in targets:
                    cells.append('x')
                else:
                    cells.append(' ')
            lines.append('|' + ''.join(cells) + '|')
        lines.append('‾' * (self.width+2))
        print('\n'.join(lines))

    def get_agent_ids(self):
        """Return the list of agent ids (``'agent-0'``, ``'agent-1'``, ...)."""
        return self.agents

  from .autonotebook import tqdm as notebook_tqdm
2024-05-17 10:58:42,008	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-05-17 10:58:46,602	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
import json

# Smoke test: build a 10x10 environment with two agents and two targets,
# keeping the last two observation snapshots, then show the initial state.
observations_memory = 2

env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False, "observations_memory": observations_memory})
# reset() returns (observations per agent, info dict); the info dict is ignored.
obs, _ = env.reset() 
print(obs)
#print(json.dumps(obs['agent-0'], indent=2))
env.render()

#obs, _, _, _, _ = env.step({'agent-0': 1, 'agent-1': 2})
#print(json.dumps(obs['agent-0'], indent=2))

#obs, _, _, _, _ = env.step({'agent-0': 1, 'agent-1': 2})
#print(json.dumps(obs['agent-0'], indent=2))

{'agent-0': array([0.3, 0.2, 0.8, 0.8, 0.1, 0.3, 0.1, 0.8, 0.3, 0.2, 0.8, 0.8, 0.1,
       0.3, 0.1, 0.8], dtype=float32), 'agent-1': array([0.8, 0.8, 0.3, 0.2, 0.1, 0.3, 0.1, 0.8, 0.8, 0.8, 0.3, 0.2, 0.1,
       0.3, 0.1, 0.8], dtype=float32)}
____________
|          |
|          |
|   o      |
| x        |
|          |
|          |
|          |
|          |
| x      o |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾


In [5]:
#import ray

def customResultPrint(result):
    """Print a one-line summary of a training result dictionary.

    The line reports the training iteration, the sampler's mean episode
    reward and length, the trained agent/env step counters and the default
    policy's entropy and current learning rate.
    """
    header = f"iteration [{result['training_iteration']}] => "
    sampler = result['sampler_results']
    info = result['info']
    learner_stats = info['learner']['default_policy']['learner_stats']
    fields = [
        f"episode_reward_mean: {sampler['episode_reward_mean']}",
        f"episode_len_mean: {sampler['episode_len_mean']}",
        f"agent_steps_trained: {info['num_agent_steps_trained']}",
        f"env_steps_trained: {info['num_env_steps_trained']}",
        f"entropy: {learner_stats['entropy']}",
        f"learning_rate: {learner_stats['cur_lr']}",
    ]
    print(header + ", ".join(fields))

#ray.shutdown()
#ray.init()

## Single agent

### no memory

In [5]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

# Single agent, single target, 10x10 grid; observations hold only the current step.
observations_memory = 1
# The env factory ignores the config passed to it and always builds this fixed setup.
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "use_nested_observation": False, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# PPO with one env runner and no GPU.
algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


number of different environment steps: 61440




iteration [1] => episode_reward_mean: -85.78260869565217, episode_len_mean: 86.1304347826087, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.6025252640247345, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmp7jl06sar
iteration [2] => episode_reward_mean: -86.38297872340425, episode_len_mean: 86.65957446808511, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5903215453028678, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: -76.10126582278481, episode_len_mean: 76.51898734177215, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.5736982330679894, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: -72.35, episode_len_mean: 72.82, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.5585408955812454, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -66.82, episode_len_mean: 67.4, agent_steps_trained: 10240, env_steps_trained: 10240, en

### memory = 2

In [6]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

# Single agent, single target, 10x10 grid; observations stack the last two snapshots.
observations_memory = 2
# "use_nested_observation" is omitted here, so the environment default applies.
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# Rebinds the notebook-global `algo` to a fresh PPO instance.
algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 61440




iteration [1] => episode_reward_mean: -86.17391304347827, episode_len_mean: 86.43478260869566, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.6000442996621131, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmp5lv0z5ba
iteration [2] => episode_reward_mean: -75.31481481481481, episode_len_mean: 75.70370370370371, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5908732578158378, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: -76.29113924050633, episode_len_mean: 76.65822784810126, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.5680509522557258, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: -74.5, episode_len_mean: 74.91, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.5522387847304344, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -68.86, episode_len_mean: 69.36, agent_steps_trained: 10240, env_steps_trained: 10240, e

### memory = 3

In [7]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

# Single agent, single target, 10x10 grid; observations stack the last three snapshots.
observations_memory = 3
# "use_nested_observation" is omitted here, so the environment default applies.
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# Rebinds the notebook-global `algo` to a fresh PPO instance.
algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 61440




iteration [1] => episode_reward_mean: -86.95652173913044, episode_len_mean: 87.1304347826087, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.6030417501926422, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmp_dyj7nff
iteration [2] => episode_reward_mean: -82.06122448979592, episode_len_mean: 82.3265306122449, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5916752144694328, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: -75.23456790123457, episode_len_mean: 75.62962962962963, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.57153390198946, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: -74.33, episode_len_mean: 74.77, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.5667954951524734, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -64.91, episode_len_mean: 65.53, agent_steps_trained: 10240, env_steps_trained: 10240, entr

In [8]:
from IPython.display import clear_output
import time
import torch
from gymnasium.spaces.utils import flatten

# Roll out `algo` on a 100-wide, 20-high grid. `observations_memory` is whatever
# the most recently executed cell left in the notebook namespace; it must match
# the value `algo` was trained with for the observation sizes to agree.
# No "max_steps" is given, so truncated['__all__'] stays False and the rollout
# is bounded only by the range(100) loop below.
env = PointCoverageEnv({"height": 20, "width": 100, "n_agents": 1, "n_targets": 1, "observations_memory": observations_memory})
obs_space = env.observation_space
obs, _ = env.reset()
print(obs)
env.render()

for i in range(100):
    # obs maps agent id -> observation; the result maps agent id -> action.
    actions = algo.compute_actions({agent: o for agent, o in obs.items()})
    print(actions, "\n")
    
    obs, reward, terminated, truncated, info = env.step(actions)
    # Redraw in place: wipe the previous frame before printing the new one.
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[40]
______________________________________________________________________________________________________
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                   

## Two Agents

### no memory

In [9]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

# Two agents, two targets, 5x5 grid, episodes truncated after 30 steps;
# observations hold only the current step.
observations_memory = 1
register_env("my_env", lambda _: PointCoverageEnv({"height": 5, "width": 5, "n_agents": 2, "n_targets": 2, "max_steps": 30, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 50

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# PPO with scheduled learning rate and entropy coefficient.
algo2 = (
    PPOConfig()
    .training(gamma=0.99, 
              #lr=0.001,
              # [breakpoint, learning rate] pairs: 0.005 at 0, 0.001 from 1000 on
              # (breakpoints are presumably timesteps -- confirm).
              lr_schedule=[
                [0, 0.005],  
                [1000, 0.001],  
                [10000, 0.001],  
              ],
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter, 
              entropy_coeff_schedule = [
                [0, 0.8],  # Start with a relatively high entropy coefficient
                [40480, 0],  # Reaches 0 at breakpoint 40480 (presumably timesteps -- confirm)
              ])
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo2.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


number of different environment steps: 102400




iteration [1] => episode_reward_mean: -30.39189189189189, episode_len_mean: 27.60810810810811, agent_steps_trained: 3010, env_steps_trained: 2048, entropy: 1.6090558713132685, learning_rate: 0.004999999999999999
Checkpoint saved in directory /tmp/tmp_7zdus8e
iteration [2] => episode_reward_mean: -28.07, episode_len_mean: 26.64, agent_steps_trained: 5954, env_steps_trained: 4096, entropy: 1.5991164966063065, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: -27.16, episode_len_mean: 27.11, agent_steps_trained: 8806, env_steps_trained: 6144, entropy: 1.5966651743108575, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: -20.95, episode_len_mean: 24.35, agent_steps_trained: 11559, env_steps_trained: 8192, entropy: 1.5901681077480316, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -21.06, episode_len_mean: 23.63, agent_steps_trained: 14426, env_steps_trained: 10240, entropy: 1.5812660033052617, learning_rate: 0.00

### memory = 2

In [3]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

# Two agents, two targets, 5x5 grid, episodes truncated after 30 steps;
# observations stack the last two snapshots.
observations_memory = 2
# Local instance; only the commented-out multi_agent() call below refers to it.
env = PointCoverageEnv({"height": 5, "width": 5, "n_agents": 2, "n_targets": 2, "max_steps": 30, "observations_memory": observations_memory})
register_env("my_env", lambda _: PointCoverageEnv({"height": 5, "width": 5, "n_agents": 2, "n_targets": 2, "max_steps": 30, "observations_memory": observations_memory}))

train_batch_size = 4096
sgd_minibatch_size = 256
num_sgd_iter = 20
trainings = 50

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")

# NOTE(review): the traceback recorded below this cell ends with
# "my_policy_mapping_fn() got an unexpected keyword argument 'worker'", i.e. the
# mapping function is called with a `worker` keyword. If this function is
# re-enabled it has to accept that keyword (e.g. through **kwargs).
#def my_policy_mapping_fn(agent_id, episode):
    # return "agent-policy"

# Rebinds the notebook-global `algo2` to a fresh PPO instance.
algo2 = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.0005,
              kl_coeff=0.2, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    #.multi_agent(policies={"agent-policy": (None, env.observation_space, env.action_space, {})},
    #             policy_mapping_fn=my_policy_mapping_fn)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

#print(algo2.config.is_multi_agent())

for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo2.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


number of different environment steps: 204800


2024-05-17 10:15:55,338	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-17 10:16:09,711	INFO trainable.py:161 -- Trainable.setup took 17.566 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2024-05-17 10:16:09,738	ERROR actor_manager.py:519 -- Ray error, taking actor 1 out of service. [36mray::RolloutWorker.apply()[39m (pid=32151, ip=172.18.171.36, actor_id=3803ba9c2b5304c3c2171c6101000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f4ae8f2d1d0>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/utils/actor_manager.py", line 189, in apply
    raise e
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/utils/actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

True


RayTaskError(TypeError): [36mray::RolloutWorker.apply()[39m (pid=32151, ip=172.18.171.36, actor_id=3803ba9c2b5304c3c2171c6101000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f4ae8f2d1d0>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/utils/actor_manager.py", line 189, in apply
    raise e
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/utils/actor_manager.py", line 178, in apply
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py", line 99, in <lambda>
    (lambda w: w.sample())
               ^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/rollout_worker.py", line 685, in sample
    batches = [self.input_reader.next()]
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/sampler.py", line 91, in next
    batches = [self.get_data()]
               ^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/sampler.py", line 273, in get_data
    item = next(self._env_runner)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/env_runner_v2.py", line 348, in run
    outputs = self.step()
              ^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/env_runner_v2.py", line 374, in step
    active_envs, to_eval, outputs = self._process_observations(
                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/env_runner_v2.py", line 540, in _process_observations
    policy_id: PolicyID = episode.policy_for(agent_id)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/rllib/evaluation/episode_v2.py", line 119, in policy_for
    policy_id = self._agent_to_policy[agent_id] = self.policy_mapping_fn(
                                                  ^^^^^^^^^^^^^^^^^^^^^^^
TypeError: my_policy_mapping_fn() got an unexpected keyword argument 'worker'

In [8]:
from IPython.display import clear_output
import time

# Roll out `algo2` on a 5x5 grid with two agents and two targets.
# NOTE(review): the recorded run of this cell failed with
# "mat1 and mat2 shapes cannot be multiplied (2x16 and 24x256)". With two agents
# and two targets one snapshot flattens to 8 values, so 16 matches
# observations_memory = 2 here, while 24 would match a policy built with
# observations_memory = 3 -- confirm which `algo2` is loaded before running.
observations_memory = 2
# No "max_steps" is given, so truncated['__all__'] stays False and the rollout
# is bounded only by the range(100) loop below.
env = PointCoverageEnv({"height": 5, "width": 5, "n_agents": 2, "n_targets": 2, "observations_memory": observations_memory})
obs, _ = env.reset()
env.render()

for i in range(100):
    # obs maps agent id -> observation; the result maps agent id -> action.
    actions = algo2.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    # Redraw in place: wipe the previous frame before printing the new one.
    clear_output()
    print(f"[{i}]")
    env.render()
    #print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


_______
|o    |
|   x |
|  o  |
|    x|
|     |
‾‾‾‾‾‾‾
deepmind


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x16 and 24x256)

## Three Agents

In [80]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


# Three agents, three targets, 5x5 grid, episodes truncated after 30 steps.
# "observations_memory" is omitted, so the environment default applies.
register_env("my_env", lambda _: PointCoverageEnv({"height": 5, "width": 5, "n_agents": 3, "n_targets": 3, "max_steps": 30}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 50

# Upper bound on sampled env steps, assuming one train batch per train() call.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# PPO with scheduled learning rate and entropy coefficient.
algo3 = (
    PPOConfig()
    .training(gamma=0.99, 
              #lr=0.001,
              # [breakpoint, learning rate] pairs: 0.005 at 0, 0.001 from 1000 on
              # (breakpoints are presumably timesteps -- confirm).
              lr_schedule=[
                [0, 0.005],  
                [1000, 0.001],  
                [10000, 0.001],  
              ],
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter, 
              entropy_coeff_schedule = [
                [0, 0.8],  # Start with a relatively high entropy coefficient
                [40480, 0],  # Reaches 0 at breakpoint 40480 (presumably timesteps -- confirm)
              ])
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo3.train()
    customResultPrint(result)
    # Checkpoint on iterations 0, 5, 10, ...
    if i % 5 == 0:
        checkpoint_dir = algo3.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 102400




iteration [1] => episode_reward_mean: -18.105263157894736, episode_len_mean: 26.92105263157895, agent_steps_trained: 3826, env_steps_trained: 2048, entropy: 1.609091877085822, learning_rate: 0.005
Checkpoint saved in directory /tmp/tmpxadcgia4
iteration [2] => episode_reward_mean: -18.36, episode_len_mean: 27.42, agent_steps_trained: 7508, env_steps_trained: 4096, entropy: 1.603918524299349, learning_rate: 0.001
iteration [3] => episode_reward_mean: -13.75, episode_len_mean: 27.31, agent_steps_trained: 10915, env_steps_trained: 6144, entropy: 1.595703953046065, learning_rate: 0.001
iteration [4] => episode_reward_mean: -11.53, episode_len_mean: 26.55, agent_steps_trained: 14476, env_steps_trained: 8192, entropy: 1.5948491188196035, learning_rate: 0.001
iteration [5] => episode_reward_mean: -11.56, episode_len_mean: 26.83, agent_steps_trained: 18006, env_steps_trained: 10240, entropy: 1.5882749713384188, learning_rate: 0.001
iteration [6] => episode_reward_mean: -12.65, episode_len_mean

In [86]:
from IPython.display import clear_output
import time

# Roll out `algo3` on a 10x10 grid with three agents and three targets.
# No "max_steps" is given, so truncated['__all__'] stays False and the rollout
# is bounded only by the range(100) loop below.
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 3, "n_targets": 3})
obs, _ = env.reset()
env.render()

for i in range(100):
    # obs maps agent id -> observation; the result maps agent id -> action.
    actions = algo3.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    # Redraw in place: wipe the previous frame before printing the new one.
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[99]
____________
|   o      |
|          |
|      x   |
| x        |
|          |
|          |
|        o |
|          |
|      o   |
|         x|
‾‾‾‾‾‾‾‾‾‾‾‾
{'agent-0': array([0.8, 0.6, 0.3, 0. , 0.6, 0.8, 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32), 'agent-1': array([0.6, 0.8, 0.3, 0. , 0.8, 0.6, 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32), 'agent-2': array([0.6, 0.8, 0.8, 0.6, 0.3, 0. , 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32)}
{'agent-0': -1, 'agent-1': -1, 'agent-2': -1}


## Two agents, DQN

In [10]:
from ray.rllib.algorithms.dqn.dqn import DQNConfig
from ray.rllib.algorithms import DQN
from ray.tune.registry import register_env

# Number of past observations stacked into each agent's observation.
observations_memory = 3

# Register a 5x5, two-agent / two-target environment under the name "my_env".
register_env(
    "my_env",
    lambda _: PointCoverageEnv(
        {
            "height": 5,
            "width": 5,
            "n_agents": 2,
            "n_targets": 2,
            "max_steps": 30,
            "observations_memory": observations_memory,
        }
    ),
)

# Settings for the prioritized multi-agent replay buffer.
replay_config = {
    "type": "MultiAgentPrioritizedReplayBuffer",
    "capacity": 60000,
    "prioritized_replay_alpha": 0.5,
    "prioritized_replay_beta": 0.5,
    "prioritized_replay_eps": 3e-6,
}

# Build the DQN configuration in one fluent chain: CPU only, one env runner.
config = (
    DQNConfig()
    .training(replay_buffer_config=replay_config)
    .resources(num_gpus=0)
    .env_runners(num_env_runners=1)
    .environment("my_env")
)
# Allow five times the default sampling timeout.
config.sample_timeout_s = config.sample_timeout_s * 5

algo2 = DQN(config=config)

# Run 30 training iterations, reporting mean episode reward and length.
for i in range(30):
    result = algo2.train()
    print(f"[{i}] mean_reward: {result['sampler_results']['episode_reward_mean']}, mean_len: {result['sampler_results']['episode_len_mean']}")
    #customResultPrint(result)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


KeyboardInterrupt: 

# Tianshou

In [3]:
import tianshou
import torch
from tianshou.highlevel.config import SamplingConfig
from tianshou.highlevel.env import (
    EnvFactoryRegistered,
    VectorEnvType,
)
from tianshou.highlevel.experiment import DQNExperimentBuilder, ExperimentConfig,  PPOExperimentBuilder
from tianshou.highlevel.params.policy_params import DQNParams
from tianshou.highlevel.trainer import (
    EpochTestCallbackDQNSetEps,
    EpochTrainCallbackDQNSetEps,
    EpochStopCallbackRewardThreshold
)
from tianshou.env import DummyVectorEnv
from tianshou.utils.net.discrete import (Actor, Net, Critic)
from tianshou.utils.net.common import ActorCritic
from tianshou.policy.base import BasePolicy
from tianshou.policy.modelfree.ppo import PPOPolicy
from tianshou.data.collector import Collector
from tianshou.data.buffer.vecbuf import VectorReplayBuffer
from tianshou.trainer.base import OnpolicyTrainer
from tianshou.data.batch import Batch 
from tianshou.policy.multiagent.mapolicy import MultiAgentPolicyManager
print(tianshou.__version__)

1.0.0


In [None]:
# https://tianshou.org/en/stable/02_notebooks/L7_Experiment.html

In [9]:
# Sanity check: build a 10x10 two-agent environment with flat observations and
# look at what reset() returns, raw and wrapped in a tianshou Batch.
env = PointCoverageEnv(
    {
        "height": 10,
        "width": 10,
        "n_agents": 2,
        "n_targets": 2,
        "max_steps": 100,
        "use_nested_observation": False,
    }
)
obs, _ = env.reset()

# Raw per-agent observation dict.
print(obs)
# The same observations wrapped in a Batch.
print(Batch(obs))
# Shape of the observation space, later used as the network input shape.
print(env.observation_space.shape)

{'agent-0': array([0.7, 0.8, 0.8, 0. , 0.9, 0.5, 0.1, 0.3], dtype=float32), 'agent-1': array([0.8, 0. , 0.7, 0.8, 0.9, 0.5, 0.1, 0.3], dtype=float32)}
Batch(
    agent-0: array([0.7, 0.8, 0.8, 0. , 0.9, 0.5, 0.1, 0.3], dtype=float32),
    agent-1: array([0.8, 0. , 0.7, 0.8, 0.9, 0.5, 0.1, 0.3], dtype=float32),
)
(8,)


In [5]:
# Number of parallel environments for training / evaluation, and torch device.
train_size, test_size = 20, 10
device = "cpu"


def _make_env():
    """Return a fresh 10x10, two-agent / two-target env with flat observations."""
    return PointCoverageEnv(
        {
            "height": 10,
            "width": 10,
            "n_agents": 2,
            "n_targets": 2,
            "max_steps": 100,
            "use_nested_observation": False,
        }
    )


# One reference env for reading the spaces, plus vectorised train/test envs.
env = _make_env()
train_envs = DummyVectorEnv([_make_env for _ in range(train_size)])
test_envs = DummyVectorEnv([_make_env for _ in range(test_size)])

assert env.observation_space.shape is not None
assert isinstance(env.action_space, Discrete)

# Actor and critic share a single two-layer MLP as preprocessing network.
net = Net(
    state_shape=env.observation_space.shape,
    hidden_sizes=[64, 64],
    device=device,
)
actor = Actor(
    preprocess_net=net,
    action_shape=env.action_space.n,
    device=device,
).to(device)
critic = Critic(preprocess_net=net, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)

# A single optimizer covers both the actor and the critic parameters.
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

# Discrete action space, hence a categorical action distribution.
dist = torch.distributions.Categorical
policy: BasePolicy = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=dist,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)

# Both agents share the very same PPO policy object.
# NOTE(review): the recorded run of this cell ends with
# "AttributeError: 'dict' object has no attribute 'agent_id'". Presumably
# MultiAgentPolicyManager expects PettingZoo-style observations that carry an
# `agent_id`, which this dict-per-agent environment does not provide -- TODO confirm.
mapolicy_manager = MultiAgentPolicyManager(policies=[policy, policy], env=env)

train_collector = Collector(
    policy=mapolicy_manager,
    env=train_envs,
    buffer=VectorReplayBuffer(20000, len(train_envs)),
)
test_collector = Collector(policy=mapolicy_manager, env=test_envs)

# NOTE(review): the 195 reward threshold in stop_fn looks unreachable given the
# negative mean rewards logged for this environment elsewhere -- confirm.
trainer = OnpolicyTrainer(
    policy=mapolicy_manager,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=50000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    step_per_collect=2000,
    stop_fn=lambda mean_reward: mean_reward >= 195,
)
result = trainer.run()

result.pprint_asdict()

AttributeError: 'dict' object has no attribute 'agent_id'