In [74]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
import math
from gymnasium.spaces import Discrete, Box, Sequence, Dict
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output

from ray.rllib.utils.typing import AgentID

class PointCoverageEnv(MultiAgentEnv):

    actions_dict = [(0,-1),(0,1),(1,0),(-1,0),(0,0)]

    def __init__(self, config):
        self.width = config["width"]
        self.height = config["height"]
        self.n_agents = config["n_agents"]
        self.n_targets = config["n_targets"]
        self.max_steps = config["max_steps"] if "max_steps" in config.keys() else None
        self.agents = ['agent-' + str(i) for i in range(self.n_agents)]
        self.observation_space = self.observation_space('agent-0')
        self.action_space = Discrete(5)

    def unflatten_observation_space(self, agent):
       coordinates_space = Box(low=np.array([0, 0]), high=np.array([self.width-1, self.height-1]), dtype=np.int32)
       if self.n_agents > 1:
            return Dict({
                "position": coordinates_space,
                "other_agents": Dict({f"other_agent-{i}": coordinates_space for i in range(self.n_agents-1)}),
                "targets": Dict({f"target-{i}": coordinates_space for i in range(self.n_targets)})
            })
       else:
           return Dict({
                "position": coordinates_space,
                "targets": Dict({f"target-{i}": coordinates_space for i in range(self.n_targets)})
            })

    def observation_space(self, agent):
       return flatten_space(self.unflatten_observation_space(agent))
       #return self.unflatten_observation_space(agent)

    def action_space(self, agent):
        return Discrete(5)
    
    def __get_other_agents(self, agent):
        return [other for other in self.agents if other != agent]

    def __get_random_point(self):
        return (rnd.randint(0, self.width-1), rnd.randint(0, self.height-1))
    
    def __get_observation(self, agent):
        if self.n_agents > 1:
            return flatten(self.unflatten_observation_space(agent), 
                {
                    "position": self.agent_pos[agent],
                    "other_agents": {f"other_agent-{i}": self.agent_pos[other] for i, other in enumerate(self.__get_other_agents(agent))},
                    "targets": {f"target-{i}": pos for i, pos in enumerate(self.targets)}
                }
            )
        else:
            return flatten(self.unflatten_observation_space(agent), 
                {
                    "position": self.agent_pos[agent],
                    "targets": {f"target-{i}": pos for i, pos in enumerate(self.targets)}
                }
            )

    def __get_not_covered_targets(self):
        return set(self.targets) - set(self.agent_pos.values())

    def __is_target_contended(self, target):
        return len([t for t in self.agent_pos.values() if target == t]) > 1

    def __get_reward(self, agent):
        return -1 + self.__get_global_reward()
        if self.agent_pos[agent] in self.targets:
            if self.agent_pos[agent] in [pos[1] for pos in self.old_agent_pos if pos[0] != agent]:
                return -1 # someone was already covering the target -> no +10 reward
            if self.__is_target_contended(self.agent_pos[agent]):
                return -2 # someone arrived at the target at the same time of me -> someone has to leave
            return 10
        else:
            return -1
    
    def __get_global_reward(self):
        return (len(self.not_covered_target) - len(set(self.not_covered_target) - set(self.agent_pos.values())))*10
    
    def __update_agent_position(self, agent, x, y):
        self.agent_pos[agent] = (max(min(self.agent_pos[agent][0] + x, self.width-1), 0),
                                 max(min(self.agent_pos[agent][1] + y, self.height-1), 0))

    def reset(self, seed=None, options=None):
        self.agent_pos = {agent: self.__get_random_point() for agent in self.agents}
        self.targets = [self.__get_random_point() for _ in range(self.n_targets)]
        self.not_covered_target = self.targets.copy()
        self.steps = 0;
        return {agent: self.__get_observation(agent) for agent in self.agents}, {}
     
    def step(self, actions):
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        self.old_agent_pos = self.agent_pos.copy()
        for agent, action in actions.items():
            self.__update_agent_position(agent, self.actions_dict[action][0], self.actions_dict[action][1])

        for agent in actions.keys():
            if not (self.agent_pos[agent] in self.targets and not self.__is_target_contended(self.agent_pos[agent])):
                observations[agent] = self.__get_observation(agent)
                rewards[agent] = self.__get_reward(agent)
                terminated[agent] = False
                truncated[agent] = False
                infos[agent] = {}
        
        if self.max_steps != None and self.steps > self.max_steps:
            truncated['__all__'] = True
        else:
            truncated['__all__'] = False

        self.not_covered_target = list(set(self.not_covered_target) - set(self.agent_pos.values())) 

        terminated['__all__'] = len(self.__get_not_covered_targets()) == 0
        return observations, rewards, terminated, truncated, infos
     
    def render(self, mode='text'):
        str = '_' * (self.width+2) + '\n'
        for i in range(self.height):
            str = str + "|"
            for j in range(self.width):
                if (j,i) in self.agent_pos.values() and (j,i) in self.targets:
                    str = str + '*'
                elif (j,i) in self.agent_pos.values():
                    str = str + 'o'
                elif (j,i) in self.targets:
                    str = str + 'x'
                else:
                    str = str + ' '
            str = str + '|\n'
        str = str + '‾' * (self.width+2)
        print(str)

    def get_agent_ids(self):
       return self.agents

In [14]:
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2})
obs, _ = env.reset() 
print(obs['agent-0'])
print(obs['agent-1'])
env.render()

actions = {"agent-0": 1, "agent-1": 0}
obs, _, _, _, _ = env.step(actions)
print(obs['agent-0'])
env.render()

[7 2 4 7 1 2 6 4]
[4 7 7 2 1 2 6 4]
____________
|          |
|          |
| x     o  |
|          |
|      x   |
|          |
|          |
|    o     |
|          |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾
[7 1 4 8 1 2 6 4]
____________
|          |
|       o  |
| x        |
|          |
|      x   |
|          |
|          |
|          |
|    o     |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾


In [91]:
import ray
ray.shutdown()
ray.init()

2024-05-16 12:02:42,628	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.9
Ray version:,2.21.0
Dashboard:,http://127.0.0.1:8265


## Single agent

In [15]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 30}))

algo = (
    PPOConfig()
    .training(gamma=0.99, lr=0.001, kl_coeff=0.5, train_batch_size=2048, sgd_minibatch_size=256, num_sgd_iter=10)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(20):
    result = algo.train()
    print(result["sampler_results"])
    #print(result["info"]["learner"]["default_policy"]["learner_stats"]["total_loss"])
    print(f"[{i}]")
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))




{'episode_reward_max': -4.0, 'episode_reward_min': -31.0, 'episode_reward_mean': -29.1, 'episode_len_mean': 29.228571428571428, 'episode_media': {}, 'episodes_this_iter': 70, 'episodes_timesteps_total': 2046, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [-31.0, -31.0, -31.0, -21.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -15.0, -31.0, -31.0, -4.0, -31.0, -31.0, -31.0, -31.0, -10.0, -31.0, -15.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -16.0, -31.0, -31.0, -31.0, -29.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -21.0, -31.0, -31.0, -31.0, -31.0, -31.0, -31.0, -15.0, -31.0, -31.0, -31.0], 'episode_lengths': [31, 31, 31, 22, 31, 31, 31, 31, 31, 31, 31, 16, 31, 31, 5, 31, 31, 31, 31, 11, 31, 16, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 17

In [16]:
from IPython.display import clear_output
import time
import torch
from gymnasium.spaces.utils import flatten

env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 30})
obs_space = env.observation_space
obs, _ = env.reset()
env.render()

for i in range(100):
    actions = algo.compute_actions({agent: o for agent, o in obs.items()})
    print(actions, "\n")
    
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[11]
____________
|          |
|          |
|          |
|        * |
|          |
|          |
|          |
|          |
|          |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾
{}
{}


## Two Agents

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit


register_env("my_env", lambda _: PointCoverageEnv({"height": 4, "width": 4, "n_agents": 2, "n_targets": 2, "max_steps": 50}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 10

print(f"number of different environment steps: {trainings*train_batch_size}")

algo2 = (
    PPOConfig()
    .training(gamma=0.99, 
              #lr=0.001,
              lr_schedule=[
                [0, 0.01],  
                [1000, 0.001],  
                [10000, 0.001],  
              ],
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter, 
              entropy_coeff_schedule = [
                [0, 1],  # Start with relatively high entropy coefficient
                [20480, 0],  # Gradually decrease entropy coefficient over 10,000 iterations
              ])
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

def customResultPrint(result):
    print(f"iteration [{result['training_iteration']}] => 
          episode_reward_mean: {result['sampler_results']['episode_reward_mean']}, 
          episode_len_mean: {result['sampler_results']['episode_len_mean']},
          agent_steps_trained: {result['info']['num_agent_steps_trained']},
          env_steps_trained: {result['info']['num_env_steps_trained']}
          ")

for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    #print(result["sampler_results"])
    #print(result["info"]["learner"]["default_policy"]["learner_stats"]["total_loss"])
    print(f"[{i}]")
    if i % 5 == 0:
        checkpoint_dir = algo2.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 20480


2024-05-16 12:03:00,806	INFO trainable.py:161 -- Trainable.setup took 12.736 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.



agent_timesteps_total: 2677
connector_metrics:
  ObsPreprocessorConnector_ms: 0.0050098176986452135
  StateBufferConnector_ms: 0.0049023401169549854
  ViewRequirementAgentConnector_ms: 0.10862577529180617
counters:
  num_agent_steps_sampled: 2677
  num_agent_steps_trained: 2677
  num_env_steps_sampled: 2048
  num_env_steps_trained: 2048
custom_metrics: {}
date: 2024-05-16_12-03-05
done: false
env_runner_results:
  connector_metrics:
    ObsPreprocessorConnector_ms: 0.0050098176986452135
    StateBufferConnector_ms: 0.0049023401169549854
    ViewRequirementAgentConnector_ms: 0.10862577529180617
  custom_metrics: {}
  episode_len_mean: 32.34920634920635
  episode_media: {}
  episode_return_max: 26.0
  episode_return_mean: -29.650793650793652
  episode_return_min: -102.0
  episode_reward_max: 26.0
  episode_reward_mean: -29.650793650793652
  episode_reward_min: -102.0
  episodes_this_iter: 63
  episodes_timesteps_total: 2038
  hist_stats:
    episode_lengths: [51, 4, 46, 17, 29, 16, 11, 

SyntaxError: unterminated string literal (detected at line 40) (3292121196.py, line 40)

In [87]:
from IPython.display import clear_output
import time

env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2})
obs, _ = env.reset()
env.render()

for i in range(100):
    actions = algo2.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[31]
____________
|          |
|          |
|          |
|         o|
|         x|
|          |
|          |
|          |
|         *|
|          |
‾‾‾‾‾‾‾‾‾‾‾‾
{'agent-1': array([9, 8, 9, 3, 9, 8, 9, 4], dtype=int32)}
{'agent-1': -1}


KeyboardInterrupt: 

In [7]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

env = PointCoverageEnv({"height": 10, "width": 50, "n_agents": 2, "n_targets": 2})
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=25000)

obs, _ = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


NotImplementedError: Nested observation spaces are not supported (Tuple/Dict space inside Tuple/Dict space).