In [56]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
import math
from gymnasium.spaces import Discrete, Box, Sequence, Dict
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output

from ray.rllib.utils.typing import AgentID

class PointCoverageEnv(MultiAgentEnv):
    """Multi-agent grid world in which agents must move onto ("cover") target cells.

    The grid has ``width`` x ``height`` cells. Agents and targets are placed on cells
    drawn at random in ``reset()``. At each step every acting agent moves by one of the
    displacements in ``actions_dict``, clamped to the grid. The episode terminates once
    every target cell is occupied by an agent and is truncated once ``max_steps`` steps
    have been taken (when a limit is configured).

    Observations are flattened ``float32`` vectors built from grid-normalized
    coordinates: the agent's own position, the positions of the other agents (only when
    ``n_agents > 1``) and the positions of all targets.

    Config keys: ``width``, ``height``, ``n_agents``, ``n_targets`` (required) and
    ``max_steps`` (optional; no step limit when absent).
    """

    # (dx, dy) displacement for each of the 5 discrete actions; index 4 means "stay".
    actions_dict = [(0,-1),(0,1),(1,0),(-1,0),(0,0)]

    def __init__(self, config):
        """Read the configuration and build the per-agent observation/action spaces.

        Args:
            config: mapping with ``width``, ``height``, ``n_agents``, ``n_targets`` and,
                optionally, ``max_steps``.
        """
        self.width = config["width"]
        self.height = config["height"]
        self.n_agents = config["n_agents"]
        self.n_targets = config["n_targets"]
        # None means "no step limit".
        self.max_steps = config.get("max_steps")
        self.agents = ['agent-' + str(i) for i in range(self.n_agents)]
        # NOTE: the two instance attributes below shadow the same-named methods, so on an
        # instance `observation_space` / `action_space` are spaces, not callables.
        self.observation_space = self.observation_space('agent-0')
        self.action_space = Discrete(5)

    def unflatten_observation_space(self, agent):
        """Return the structured (Dict) observation space of ``agent``.

        The space is the same for every agent; ``agent`` is accepted for API symmetry.
        The ``other_agents`` entry is present only when there is more than one agent.
        """
        coordinates_space = Box(
            low=np.array([0.0, 0.0], dtype=np.float32),
            high=np.array([1.0, 1.0], dtype=np.float32),
            dtype=np.float32,
        )
        spaces = {"position": coordinates_space}
        if self.n_agents > 1:
            spaces["other_agents"] = Dict(
                {f"other_agent-{i}": coordinates_space for i in range(self.n_agents-1)}
            )
        spaces["targets"] = Dict(
            {f"target-{i}": coordinates_space for i in range(self.n_targets)}
        )
        return Dict(spaces)

    def observation_space(self, agent):
        """Return the flattened observation space of ``agent``."""
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        """Return the action space of ``agent``: one index per entry of ``actions_dict``."""
        return Discrete(5)

    def __get_other_agents(self, agent):
        """Return the ids of all agents except ``agent``, in ``self.agents`` order."""
        return [other for other in self.agents if other != agent]

    def __get_random_point(self):
        """Return a random ``(x, y)`` cell inside the grid."""
        return (rnd.randint(0, self.width-1), rnd.randint(0, self.height-1))

    def __get_normalized_position(self, position):
        """Scale an ``(x, y)`` cell by the grid size so both coordinates lie in [0, 1)."""
        return (position[0]/self.width, position[1]/self.height)

    def __get_observation(self, agent):
        """Build the flattened observation vector of ``agent`` from the current state."""
        observation = {
            "position": self.__get_normalized_position(self.agent_pos[agent]),
        }
        if self.n_agents > 1:
            observation["other_agents"] = {
                f"other_agent-{i}": self.__get_normalized_position(self.agent_pos[other])
                for i, other in enumerate(self.__get_other_agents(agent))
            }
        observation["targets"] = {
            f"target-{i}": self.__get_normalized_position(pos)
            for i, pos in enumerate(self.targets)
        }
        return flatten(self.unflatten_observation_space(agent), observation)

    def __get_not_covered_targets(self):
        """Return the set of target cells that no agent currently occupies."""
        return set(self.targets) - set(self.agent_pos.values())

    def __is_target_contended(self, target):
        """Return True when more than one agent stands on the cell ``target``."""
        return len([t for t in self.agent_pos.values() if target == t]) > 1

    def __get_reward(self, agent):
        """Return the step reward: -1 per step plus the shared coverage bonus.

        The reward is currently identical for every agent; ``agent`` is unused. The
        unreachable per-agent reward logic that used to follow the return was removed.
        """
        return -1 + self.__get_global_reward()

    def __get_global_reward(self):
        """Return 10 for each target that became covered during the current step.

        Uses a set intersection so that duplicate target cells (possible because targets
        are drawn independently) are not counted as covered before any agent reaches them.
        """
        newly_covered = set(self.not_covered_target) & set(self.agent_pos.values())
        return len(newly_covered) * 10

    def __update_agent_position(self, agent, x, y):
        """Move ``agent`` by ``(x, y)``, clamping the result to the grid bounds."""
        old_x, old_y = self.agent_pos[agent]
        self.agent_pos[agent] = (max(min(old_x + x, self.width-1), 0),
                                 max(min(old_y + y, self.height-1), 0))

    def reset(self, seed=None, options=None):
        """Place agents and targets at random cells and return the initial observations.

        NOTE(review): ``seed`` and ``options`` are accepted for API compatibility but are
        not used; positions come from the module-level ``random`` generator.

        Returns:
            A tuple ``(observations, infos)`` where ``observations`` maps every agent id
            to its flattened observation and ``infos`` is an empty dict.
        """
        self.agent_pos = {agent: self.__get_random_point() for agent in self.agents}
        self.targets = [self.__get_random_point() for _ in range(self.n_targets)]
        self.not_covered_target = self.targets.copy()
        self.steps = 0
        return {agent: self.__get_observation(agent) for agent in self.agents}, {}

    def step(self, actions):
        """Apply ``actions`` (agent id -> action index) and advance the episode one step.

        Agents that end the step alone on a target cell get no observation, reward or
        flags for this step; every other acting agent does.
        NOTE(review): this means the agent that covers a target never receives the +10
        bonus for it itself — confirm this is intended.

        Returns:
            ``(observations, rewards, terminated, truncated, infos)`` dicts keyed by agent
            id; ``terminated`` and ``truncated`` also carry the ``'__all__'`` key.
        """
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        self.old_agent_pos = self.agent_pos.copy()
        for agent, action in actions.items():
            dx, dy = self.actions_dict[action]
            self.__update_agent_position(agent, dx, dy)

        for agent in actions.keys():
            position = self.agent_pos[agent]
            covers_alone = position in self.targets and not self.__is_target_contended(position)
            if covers_alone:
                continue
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_reward(agent)
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {}

        # `>=` so that an episode lasts at most `max_steps` steps (was `>`, which allowed
        # max_steps + 1 steps).
        truncated['__all__'] = self.max_steps is not None and self.steps >= self.max_steps

        # Must happen after the rewards above: the coverage bonus compares the agents'
        # new positions against the targets that were uncovered before this step.
        self.not_covered_target = list(set(self.not_covered_target) - set(self.agent_pos.values()))

        terminated['__all__'] = len(self.__get_not_covered_targets()) == 0
        return observations, rewards, terminated, truncated, infos

    def render(self, mode='text'):
        """Print the grid: ``*`` covered target, ``o`` agent, ``x`` uncovered target.

        ``mode`` is accepted but unused; the output is always printed as text.
        """
        agent_cells = set(self.agent_pos.values())
        target_cells = set(self.targets)

        lines = ['_' * (self.width+2)]
        for row in range(self.height):
            symbols = []
            for col in range(self.width):
                cell = (col, row)
                if cell in agent_cells and cell in target_cells:
                    symbols.append('*')
                elif cell in agent_cells:
                    symbols.append('o')
                elif cell in target_cells:
                    symbols.append('x')
                else:
                    symbols.append(' ')
            lines.append('|' + ''.join(symbols) + '|')
        lines.append('‾' * (self.width+2))
        print('\n'.join(lines))

    def get_agent_ids(self):
        """Return the list of agent ids (``'agent-0'`` ... ``'agent-<n_agents-1>'``)."""
        return self.agents

In [53]:
# Smoke test: build a 10x10 environment with two agents and two targets, reset it,
# then show each agent's flattened observation and a text rendering of the grid.
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2})
obs, _ = env.reset() 
print(obs['agent-0'])
print(obs['agent-1'])
env.render()

[0.9 0.4 0.3 0.3 0.2 0.7 0.6 0.1]
[0.3 0.3 0.9 0.4 0.2 0.7 0.6 0.1]
____________
|          |
|      x   |
|          |
|   o      |
|         o|
|          |
|          |
|  x       |
|          |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾


In [79]:
import ray

def customResultPrint(result):
    """Print a one-line summary of a training result dict.

    Reads the training iteration, the sampler's mean episode reward and length, the
    trained agent/env step counters, and the default policy's entropy and current
    learning rate from ``result``, and prints them on a single line.
    """
    iteration = result['training_iteration']

    sampler = result['sampler_results']
    reward_mean = sampler['episode_reward_mean']
    len_mean = sampler['episode_len_mean']

    info = result['info']
    agent_steps = info['num_agent_steps_trained']
    env_steps = info['num_env_steps_trained']

    learner_stats = info['learner']['default_policy']['learner_stats']
    entropy = learner_stats['entropy']
    cur_lr = learner_stats['cur_lr']

    fields = (
        ("episode_reward_mean", reward_mean),
        ("episode_len_mean", len_mean),
        ("agent_steps_trained", agent_steps),
        ("env_steps_trained", env_steps),
        ("entropy", entropy),
        ("learning_rate", cur_lr),
    )
    summary = ", ".join(f"{label}: {value}" for label, value in fields)
    print(f"iteration [{iteration}] => {summary}")

# Shut down any Ray instance left over from a previous run of this cell before
# starting a new one, so the cell can be re-executed.
ray.shutdown()
ray.init()

2024-05-16 15:37:03,228	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.9
Ray version:,2.21.0
Dashboard:,http://127.0.0.1:8265


## Single Agent

In [29]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


# Register the single-agent / single-target 10x10 environment (step limit 100) under
# the name that the PPO config below refers to.
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100}))

# PPO batch settings and the number of `algo.train()` calls to run.
train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

# One training iteration consumes one train batch of environment steps.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# Build the PPO algorithm: fixed learning rate, one env runner, CPU only.
algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

# Train, printing a one-line summary per iteration and saving a checkpoint on
# iterations 1, 6, 11, ... (every fifth loop index, starting at 0).
for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


number of different environment steps: 61440


2024-05-16 15:06:23,505	INFO trainable.py:161 -- Trainable.setup took 18.406 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


iteration [1] => episode_reward_mean: -83.45833333333333, episode_len_mean: 83.66666666666667, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.6018952220678329, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmpf1je2vgt
iteration [2] => episode_reward_mean: -73.2909090909091, episode_len_mean: 73.7090909090909, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5881302535533905, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: -76.13924050632912, episode_len_mean: 76.55696202531645, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.5708488523960114, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: -71.95, episode_len_mean: 72.46, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.556837111711502, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -73.11, episode_len_mean: 73.65, agent_steps_trained: 10240, env_steps_trained: 10240, ent

In [36]:
from IPython.display import clear_output
import time
import torch
from gymnasium.spaces.utils import flatten

# Roll out the trained single-agent policy on a larger (50x20) grid. No `max_steps` is
# configured here, so the loop bound below is the only step limit.
env = PointCoverageEnv({"height": 20, "width": 50, "n_agents": 1, "n_targets": 1})
obs_space = env.observation_space
obs, _ = env.reset()
env.render()

for i in range(100):
    # `obs` is already an {agent_id: observation} dict, so it is passed as-is.
    actions = algo.compute_actions(obs)

    obs, reward, terminated, truncated, info = env.step(actions)
    # wait=True defers clearing until the new frame is printed, which avoids flicker.
    clear_output(wait=True)
    print(f"[{i}]")
    # Printed after clearing so the actions that produced this frame stay visible
    # (previously they were printed and then immediately erased by clear_output).
    print(actions, "\n")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[33]
____________________________________________________
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                                                  |
|                               *                  |
|                                                  |
|                                                  |
|                                        

## Two Agents

In [57]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


# Register the two-agent / two-target 5x5 environment (step limit 30) under the name
# that the PPO config below refers to.
register_env("my_env", lambda _: PointCoverageEnv({"height": 5, "width": 5, "n_agents": 2, "n_targets": 2, "max_steps": 30}))

# PPO batch settings and the number of `algo2.train()` calls to run.
train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 50

# One training iteration consumes one train batch of environment steps.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# Build the PPO algorithm with learning-rate and entropy-coefficient schedules.
# NOTE(review): schedule entries are presumably [timestep, value] pairs — confirm
# against the RLlib documentation.
algo2 = (
    PPOConfig()
    .training(gamma=0.99, 
              #lr=0.001,
              lr_schedule=[
                [0, 0.005],  
                [1000, 0.001],  
                [10000, 0.001],  
              ],
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter, 
              entropy_coeff_schedule = [
                [0, 0.8],  # Start with relatively high entropy coefficient
                [40480, 0],  # Decrease the entropy coefficient to 0 by schedule point 40480
              ])
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

# Train, printing a one-line summary per iteration and saving a checkpoint on
# iterations 1, 6, 11, ... (every fifth loop index, starting at 0).
for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo2.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


number of different environment steps: 102400




iteration [1] => episode_reward_mean: -29.33783783783784, episode_len_mean: 27.635135135135137, agent_steps_trained: 2939, env_steps_trained: 2048, entropy: 1.6092265833507884, learning_rate: 0.004999999999999999
Checkpoint saved in directory /tmp/tmpaoix4pvi
iteration [2] => episode_reward_mean: -28.75, episode_len_mean: 26.81, agent_steps_trained: 5918, env_steps_trained: 4096, entropy: 1.6015533913265576, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: -26.17, episode_len_mean: 26.12, agent_steps_trained: 8782, env_steps_trained: 6144, entropy: 1.5955791971900246, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: -22.33, episode_len_mean: 24.54, agent_steps_trained: 11547, env_steps_trained: 8192, entropy: 1.5917757105827333, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -23.24, episode_len_mean: 24.13, agent_steps_trained: 14487, env_steps_trained: 10240, entropy: 1.5833317550745878, learning_rate: 0.0

In [75]:
from IPython.display import clear_output
import time

# Roll out the trained two-agent policy on a larger (30x10) grid. No `max_steps` is
# configured here, so the loop bound below is the only step limit.
env = PointCoverageEnv({"height": 10, "width": 30, "n_agents": 2, "n_targets": 2})
obs, _ = env.reset()
env.render()

for i in range(100):
    # Only agents present in `obs` get an action; the environment omits agents that
    # are alone on a target, so those agents stay where they are.
    actions = algo2.compute_actions(obs)
    obs, reward, terminated, truncated, info = env.step(actions)
    # wait=True defers clearing until the new frame is printed, which avoids flicker.
    clear_output(wait=True)
    print(f"[{i}]")
    # Printed after clearing so the actions that produced this frame stay visible
    # (previously they were printed and then immediately erased by clear_output).
    print(actions, "\n")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[54]
________________________________
|                              |
|                              |
|                              |
|                     *        |
|                              |
|                              |
|                              |
|                              |
|  *                           |
|                              |
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
{}
{}


In [26]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env

# FIXME: this cell does not run. `check_env` raises the AssertionError recorded in the
# cell output because PointCoverageEnv.reset() returns a dict of per-agent
# observations rather than a single numpy array.
env = PointCoverageEnv({"height": 10, "width": 50, "n_agents": 2, "n_targets": 2})
check_env(env)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=25000)

obs, _ = env.reset()
# FIXME: the loop has no exit condition, and PointCoverageEnv.step() returns five
# values (observations, rewards, terminated, truncated, infos) while four are
# unpacked here.
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)

AssertionError: The observation returned by `reset()` method must be a numpy array

## Three Agents

In [80]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


# Register the three-agent / three-target 5x5 environment (step limit 30) under the
# name that the PPO config below refers to.
register_env("my_env", lambda _: PointCoverageEnv({"height": 5, "width": 5, "n_agents": 3, "n_targets": 3, "max_steps": 30}))

# PPO batch settings and the number of `algo3.train()` calls to run.
train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 50

# One training iteration consumes one train batch of environment steps.
total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


# Build the PPO algorithm with learning-rate and entropy-coefficient schedules.
# NOTE(review): schedule entries are presumably [timestep, value] pairs — confirm
# against the RLlib documentation.
algo3 = (
    PPOConfig()
    .training(gamma=0.99, 
              #lr=0.001,
              lr_schedule=[
                [0, 0.005],  
                [1000, 0.001],  
                [10000, 0.001],  
              ],
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter, 
              entropy_coeff_schedule = [
                [0, 0.8],  # Start with relatively high entropy coefficient
                [40480, 0],  # Decrease the entropy coefficient to 0 by schedule point 40480
              ])
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

# Train, printing a one-line summary per iteration and saving a checkpoint on
# iterations 1, 6, 11, ... (every fifth loop index, starting at 0).
for i in range(trainings):
    result = algo3.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo3.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 102400




iteration [1] => episode_reward_mean: -18.105263157894736, episode_len_mean: 26.92105263157895, agent_steps_trained: 3826, env_steps_trained: 2048, entropy: 1.609091877085822, learning_rate: 0.005
Checkpoint saved in directory /tmp/tmpxadcgia4
iteration [2] => episode_reward_mean: -18.36, episode_len_mean: 27.42, agent_steps_trained: 7508, env_steps_trained: 4096, entropy: 1.603918524299349, learning_rate: 0.001
iteration [3] => episode_reward_mean: -13.75, episode_len_mean: 27.31, agent_steps_trained: 10915, env_steps_trained: 6144, entropy: 1.595703953046065, learning_rate: 0.001
iteration [4] => episode_reward_mean: -11.53, episode_len_mean: 26.55, agent_steps_trained: 14476, env_steps_trained: 8192, entropy: 1.5948491188196035, learning_rate: 0.001
iteration [5] => episode_reward_mean: -11.56, episode_len_mean: 26.83, agent_steps_trained: 18006, env_steps_trained: 10240, entropy: 1.5882749713384188, learning_rate: 0.001
iteration [6] => episode_reward_mean: -12.65, episode_len_mean

In [86]:
from IPython.display import clear_output
import time

# Roll out the trained three-agent policy on a larger (10x10) grid. No `max_steps` is
# configured here, so the loop bound below is the only step limit.
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 3, "n_targets": 3})
obs, _ = env.reset()
env.render()

for i in range(100):
    # Only agents present in `obs` get an action; the environment omits agents that
    # are alone on a target, so those agents stay where they are.
    actions = algo3.compute_actions(obs)
    obs, reward, terminated, truncated, info = env.step(actions)
    # wait=True defers clearing until the new frame is printed, which avoids flicker.
    clear_output(wait=True)
    print(f"[{i}]")
    # Printed after clearing so the actions that produced this frame stay visible
    # (previously they were printed and then immediately erased by clear_output).
    print(actions, "\n")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[99]
____________
|   o      |
|          |
|      x   |
| x        |
|          |
|          |
|        o |
|          |
|      o   |
|         x|
‾‾‾‾‾‾‾‾‾‾‾‾
{'agent-0': array([0.8, 0.6, 0.3, 0. , 0.6, 0.8, 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32), 'agent-1': array([0.6, 0.8, 0.3, 0. , 0.8, 0.6, 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32), 'agent-2': array([0.6, 0.8, 0.8, 0.6, 0.3, 0. , 0.1, 0.3, 0.6, 0.2, 0.9, 0.9],
      dtype=float32)}
{'agent-0': -1, 'agent-1': -1, 'agent-2': -1}
