In [134]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output

class PointCoverageEnv(MultiAgentEnv):

    actions_dict = [(0,-1),(0,1),(1,0),(-1,0)]

    def __init__(self, config):
        self.observations_memory = config["observations_memory"] if "observations_memory" in config.keys() else 1
        self.width = config["width"]
        self.height = config["height"]
        self.n_agents = config["n_agents"]
        self.n_targets = config["n_targets"]
        self.max_steps = config["max_steps"] if "max_steps" in config.keys() else None
        self.use_nested_observation = config["use_nested_observation"] if "use_nested_observation" in config.keys() else False
        self.agents = ['agent-' + str(i) for i in range(self.n_agents)]
        self.observation_space = self.observation_space('agent-0')
        self.action_space = Discrete(4)

    def unflatten_observation_space(self, agent):
        coordinates_space = Box(low=np.array([0.0, 0.0], dtype=np.float32), high=np.array([1.0, 1.0], dtype=np.float32), dtype=np.float32)
        obs_space = {"position": coordinates_space,
                     "targets": Dict({f"target-{i}": coordinates_space for i in range(self.n_targets)})}
        if self.n_agents > 1:
            obs_space = {"position": coordinates_space,
                        "other_agents": Dict({f"other_agent-{i}": coordinates_space for i in range(self.n_agents-1)}),
                        "targets": Dict({f"target-{i}": coordinates_space for i in range(self.n_targets)})}
        
        obs_space = Dict(obs_space)

        if self.observations_memory > 1:
            return Dict({f"t(-{i})": obs_space for i in range(self.observations_memory)})
        return obs_space

    def observation_space(self, agent):
       if self.use_nested_observation:
           return self.unflatten_observation_space(agent)
       return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        return Discrete(5)
    
    def __get_other_agents(self, agent):
        return [other for other in self.agents if other != agent]

    def __get_random_point(self):
        return (rnd.randint(0, self.width-1), rnd.randint(0, self.height-1))
    
    def __get_normalized_position(self, position):
        return (position[0]/self.width, position[1]/self.height)

    def __get_unflatten_time_t_observation(self, agent):
        time_t_obs = {"position": self.__get_normalized_position(self.agent_pos[agent]),
               "targets": {f"target-{i}": self.__get_normalized_position(pos) for i, pos in enumerate(self.targets)}}
        if self.n_agents > 1:
            time_t_obs = {"position": self.__get_normalized_position(self.agent_pos[agent]),
               "other_agents": {f"other_agent-{i}": self.__get_normalized_position(self.agent_pos[other]) for i, other in enumerate(self.__get_other_agents(agent))},
               "targets": {f"target-{i}": self.__get_normalized_position(pos) for i, pos in enumerate(self.targets)}}
        return time_t_obs

    def __get_observation(self, agent):
        time_t_obs = self.__get_unflatten_time_t_observation(agent)

        obs = {}
        if self.observations_memory > 1:
            self.agents_memory[agent].pop(0)
            self.agents_memory[agent].append(time_t_obs)
            obs = {f"t(-{i})": self.agents_memory[agent][self.observations_memory-1-i] for i in range(self.observations_memory)}
        else:
            obs = time_t_obs

        if self.use_nested_observation:
            return obs
        return flatten(self.unflatten_observation_space(agent), obs)

    def __get_not_covered_targets(self):
        return set(self.targets) - set(self.agent_pos.values())

    def __is_target_contended(self, target):
        return list(self.agent_pos.values()).count(target) > 1

    def __get_reward(self, agent):
        return -1 + self.__get_global_reward()
        if self.agent_pos[agent] in self.targets:
            if self.agent_pos[agent] in [pos[1] for pos in self.old_agent_pos if pos[0] != agent]:
                return -1 # someone was already covering the target -> no +10 reward
            if self.__is_target_contended(self.agent_pos[agent]):
                return -2 # someone arrived at the target at the same time of me -> someone has to leave
            return 10
        else:
            return -1
    
    def __get_global_reward(self):
        return 0#(len(self.not_covered_target) - len(set(self.not_covered_target) - set(self.agent_pos.values())))*10
    
    def __update_agent_position(self, agent, x, y):
        self.agent_pos[agent] = (max(min(self.agent_pos[agent][0] + x, self.width-1), 0),
                                 max(min(self.agent_pos[agent][1] + y, self.height-1), 0))

    def reset(self, seed=None, options=None):
        self.agent_pos = {agent: self.__get_random_point() for agent in self.agents}
        self.targets = [self.__get_random_point() for _ in range(self.n_targets)]
        self.not_covered_target = self.targets.copy()
        self.steps = 0;
        self.agents_memory = {agent: [self.__get_unflatten_time_t_observation(agent)]*self.observations_memory for agent in self.agents}
        return {agent: self.__get_observation(agent) for agent in self.agents}, {}
     
    def step(self, actions):
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        self.old_agent_pos = self.agent_pos.copy()
        for agent, action in actions.items():
            self.__update_agent_position(agent, self.actions_dict[action][0], self.actions_dict[action][1])

        for agent in actions.keys():
            if not (self.agent_pos[agent] in self.targets and not self.__is_target_contended(self.agent_pos[agent])):
                observations[agent] = self.__get_observation(agent)
                rewards[agent] = self.__get_reward(agent)
                terminated[agent] = False
                truncated[agent] = False
                infos[agent] = {}
        
        if self.max_steps != None and self.steps > self.max_steps:
            truncated['__all__'] = True
        else:
            truncated['__all__'] = False

        self.not_covered_target = list(set(self.not_covered_target) - set(self.agent_pos.values())) 

        terminated['__all__'] = len(self.__get_not_covered_targets()) == 0
        return observations, rewards, terminated, truncated, infos
     
    def render(self, mode='text'):
        str = '_' * (self.width+2) + '\n'
        for i in range(self.height):
            str = str + "|"
            for j in range(self.width):
                if (j,i) in self.agent_pos.values() and (j,i) in self.targets:
                    str = str + '*'
                elif (j,i) in self.agent_pos.values():
                    str = str + 'o'
                elif (j,i) in self.targets:
                    str = str + 'x'
                else:
                    str = str + ' '
            str = str + '|\n'
        str = str + '‾' * (self.width+2)
        print(str)

    def get_agent_ids(self):
       return self.agents

In [135]:
import json

observations_memory = 2

env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False, "observations_memory": observations_memory})
obs, _ = env.reset() 
print(obs)
#print(json.dumps(obs['agent-0'], indent=2))
env.render()

#obs, _, _, _, _ = env.step({'agent-0': 1, 'agent-1': 2})
#print(json.dumps(obs['agent-0'], indent=2))

#obs, _, _, _, _ = env.step({'agent-0': 1, 'agent-1': 2})
#print(json.dumps(obs['agent-0'], indent=2))

{'agent-0': array([0.3, 0.3, 0.4, 0.2, 0.2, 0.8, 0.2, 0.1, 0.3, 0.3, 0.4, 0.2, 0.2,
       0.8, 0.2, 0.1], dtype=float32), 'agent-1': array([0.4, 0.2, 0.3, 0.3, 0.2, 0.8, 0.2, 0.1, 0.4, 0.2, 0.3, 0.3, 0.2,
       0.8, 0.2, 0.1], dtype=float32)}
____________
|          |
|  x       |
|    o     |
|   o      |
|          |
|          |
|          |
|          |
|  x       |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾


In [136]:
#import ray

def customResultPrint(result):
    print(f"iteration [{result['training_iteration']}] => " +
          f"episode_reward_mean: {result['sampler_results']['episode_reward_mean']}, " +
          f"episode_len_mean: {result['sampler_results']['episode_len_mean']}, " +
          f"agent_steps_trained: {result['info']['num_agent_steps_trained']}, " +
          f"env_steps_trained: {result['info']['num_env_steps_trained']}, " + 
          f"entropy: {result['info']['learner']['default_policy']['learner_stats']['entropy']}, " +
          f"learning_rate: {result['info']['learner']['default_policy']['learner_stats']['cur_lr']}")

#ray.shutdown()
#ray.init()

## Single agent

### no memory

In [137]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 1
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "use_nested_observation": False, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 61440


KeyboardInterrupt: 

### memory = 2

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 2
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 61440




iteration [1] => episode_reward_mean: -86.17391304347827, episode_len_mean: 86.43478260869566, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.6000442996621131, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmp5lv0z5ba
iteration [2] => episode_reward_mean: -75.31481481481481, episode_len_mean: 75.70370370370371, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5908732578158378, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: -76.29113924050633, episode_len_mean: 76.65822784810126, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.5680509522557258, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: -74.5, episode_len_mean: 74.91, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.5522387847304344, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: -68.86, episode_len_mean: 69.36, agent_steps_trained: 10240, env_steps_trained: 10240, e

### memory = 3

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 3
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 1, "n_targets": 1, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 2048
sgd_minibatch_size = 256
num_sgd_iter = 10
trainings = 30

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


algo = (
    PPOConfig()
    .training(gamma=0.99, 
              lr=0.001,
              kl_coeff=0.5, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 61440




iteration [1] => episode_reward_mean: 32.32, episode_len_mean: 80.8, agent_steps_trained: 2048, env_steps_trained: 2048, entropy: 1.5994981303811073, learning_rate: 0.0010000000000000002
Checkpoint saved in directory /tmp/tmpo1c36i6l
iteration [2] => episode_reward_mean: 41.924528301886795, episode_len_mean: 77.11320754716981, agent_steps_trained: 4096, env_steps_trained: 4096, entropy: 1.5748886257410049, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: 55.19767441860465, episode_len_mean: 71.1046511627907, agent_steps_trained: 6144, env_steps_trained: 6144, entropy: 1.5543632209300995, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: 73.73, episode_len_mean: 57.25, agent_steps_trained: 8192, env_steps_trained: 8192, entropy: 1.5233711302280426, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: 93.93, episode_len_mean: 36.91, agent_steps_trained: 10240, env_steps_trained: 10240, entropy: 1.489880283176899, le

In [None]:
from IPython.display import clear_output
import time
import torch
from gymnasium.spaces.utils import flatten

env = PointCoverageEnv({"height": 20, "width": 100, "n_agents": 1, "n_targets": 1, "observations_memory": observations_memory})
obs_space = env.observation_space
obs, _ = env.reset()
print(obs)
env.render()

for i in range(100):
    actions = algo.compute_actions({agent: o for agent, o in obs.items()})
    print(actions, "\n")
    
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[99]
______________________________________________________________________________________________________
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                                                    |
|                                                                   

## Two Agents

### no memory

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 1
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 4096*2
sgd_minibatch_size = 256*2
num_sgd_iter = 30
trainings = 30

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")


algo2 = (
    PPOConfig()
    .training(gamma=0.95, 
              lr=0.0005,
              kl_coeff=0.2, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    if i % 5 == 0:
        checkpoint_dir = algo2.save().checkpoint.path
        print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 245760




iteration [1] => episode_reward_mean: -133.48235294117646, episode_len_mean: 95.31764705882352, agent_steps_trained: 12282, env_steps_trained: 8192, entropy: 1.597996175116387, learning_rate: 0.0005000000000000001
Checkpoint saved in directory /tmp/tmp5yoe4j_1
iteration [2] => episode_reward_mean: -118.35, episode_len_mean: 88.41, agent_steps_trained: 24373, env_steps_trained: 16384, entropy: 1.5717069007348323, learning_rate: 0.0005000000000000001


KeyboardInterrupt: 

## memory = 2

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 2
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 4096*2
sgd_minibatch_size = 256*2
num_sgd_iter = 30
trainings = 30

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")

#def my_policy_mapping_fn(agent_id, episode):
    # return "agent-policy"

algo2 = (
    PPOConfig()
    .training(gamma=0.95, 
              lr=0.0005,
              kl_coeff=0.2, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    #.multi_agent(policies={"agent-policy": (None, env.observation_space, env.action_space, {})},
    #             policy_mapping_fn=my_policy_mapping_fn)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

#print(algo2.config.is_multi_agent())

for i in range(trainings):
    result = algo2.train()
    customResultPrint(result)
    #if i % 5 == 0:
    #    checkpoint_dir = algo2.save().checkpoint.path
    #    print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 245760




iteration [1] => episode_reward_mean: -140.55172413793105, episode_len_mean: 93.36781609195403, agent_steps_trained: 12447, env_steps_trained: 8192, entropy: 1.3740812677476142, learning_rate: 0.0005000000000000001
iteration [2] => episode_reward_mean: -121.89, episode_len_mean: 84.47, agent_steps_trained: 24352, env_steps_trained: 16384, entropy: 1.3502507567405702, learning_rate: 0.0005000000000000001
iteration [3] => episode_reward_mean: -92.80672268907563, episode_len_mean: 68.78991596638656, agent_steps_trained: 35590, env_steps_trained: 24576, entropy: 1.310470469981905, learning_rate: 0.0005000000000000001
iteration [4] => episode_reward_mean: -64.40718562874251, episode_len_mean: 49.07185628742515, agent_steps_trained: 46671, env_steps_trained: 32768, entropy: 1.2618905988950577, learning_rate: 0.0005000000000000001
iteration [5] => episode_reward_mean: -34.43653250773994, episode_len_mean: 25.325077399380806, agent_steps_trained: 58438, env_steps_trained: 40960, entropy: 1.154

In [None]:
from IPython.display import clear_output
import time

observations_memory = 2
env = PointCoverageEnv({"height": 20, "width": 20, "n_agents": 2, "n_targets": 2, "observations_memory": observations_memory})
obs, _ = env.reset()
env.render()

for i in range(100):
    actions = algo2.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    #print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[11]
______________________
|                    |
|                   *|
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                *   |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
{}


## Three Agents

In [141]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
from gymnasium.wrappers.time_limit import TimeLimit

observations_memory = 2
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 3, "n_targets": 3, "max_steps": 100, "observations_memory": observations_memory}))

train_batch_size = 4096*2
sgd_minibatch_size = 256*2
num_sgd_iter = 30
trainings = 40

total_env_steps = trainings*train_batch_size

print(f"number of different environment steps: {total_env_steps}")

#def my_policy_mapping_fn(agent_id, episode):
    # return "agent-policy"

algo3 = (
    PPOConfig()
    .training(gamma=0.95, 
              lr=0.0005,
              kl_coeff=0.2, 
              train_batch_size=train_batch_size, 
              sgd_minibatch_size=sgd_minibatch_size, 
              num_sgd_iter=num_sgd_iter)
    .env_runners(num_env_runners=1)
    #.multi_agent(policies={"agent-policy": (None, env.observation_space, env.action_space, {})},
    #             policy_mapping_fn=my_policy_mapping_fn)
    .resources(num_gpus=0)
    .environment(env="my_env")
    .build()
)

#print(algo2.config.is_multi_agent())

for i in range(trainings):
    result = algo3.train()
    customResultPrint(result)
    #if i % 5 == 0:
    #    checkpoint_dir = algo2.save().checkpoint.path
    #    print(f"Checkpoint saved in directory {checkpoint_dir}")



number of different environment steps: 327680


2024-05-20 11:00:34,802	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-20 11:00:45,881	INFO trainable.py:161 -- Trainable.setup took 13.185 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


iteration [1] => episode_reward_mean: -173.85882352941175, episode_len_mean: 95.82352941176471, agent_steps_trained: 15032, env_steps_trained: 8192, entropy: 1.3733490586280823, learning_rate: 0.0005000000000000002
iteration [2] => episode_reward_mean: -161.99, episode_len_mean: 94.38, agent_steps_trained: 29244, env_steps_trained: 16384, entropy: 1.3558745746259335, learning_rate: 0.0005000000000000001
iteration [3] => episode_reward_mean: -157.53, episode_len_mean: 89.84, agent_steps_trained: 43751, env_steps_trained: 24576, entropy: 1.3404492764245897, learning_rate: 0.0005000000000000002
iteration [4] => episode_reward_mean: -125.77570093457943, episode_len_mean: 76.44859813084112, agent_steps_trained: 57459, env_steps_trained: 32768, entropy: 1.3144010079212678, learning_rate: 0.0005000000000000001
iteration [5] => episode_reward_mean: -118.28571428571429, episode_len_mean: 73.08035714285714, agent_steps_trained: 70985, env_steps_trained: 40960, entropy: 1.300217405343667, learnin

In [145]:
from IPython.display import clear_output
import time

observations_memory = 2
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 3, "n_targets": 3, "observations_memory":observations_memory})
obs, _ = env.reset()
env.render()

for i in range(100):
    actions = algo2.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[20]
____________
|          |
|          |
|          |
|         *|
|          |
| *        |
|          |
|         *|
|          |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾
{}
{}


## Two agents, DQN

In [179]:
import ray
ray.shutdown()
#ray.init(object_store_memory=(10**9))

In [180]:
from ray.rllib.algorithms.dqn.dqn import DQNConfig
from ray.rllib.algorithms import DQN
from ray.tune.registry import register_env
from ray import air
from ray import tune

observations_memory = 2
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "observations_memory": observations_memory}))

replay_config = {
        "capacity": 50000,
    }

config = (DQNConfig()
    .training(
        replay_buffer_config=replay_config,
        gamma = tune.grid_search([0.90, 0.95, 0.99]),                # Discount factor for future rewards
        lr = tune.grid_search([0.01, 0.001, 0.0005]),                 # Learning rate
        train_batch_size = tune.grid_search([128, 256]),       # Batch size for training
        dueling=tune.grid_search([True, False]),
        double_q=tune.grid_search([True, False]),
        #model={"fcnet_hiddens": [16], "fcnet_activation": "relu"},  # Model architecture
        #adam_epsilon=0.5
    )
    .environment("my_env")
)

tune.Tuner(
    "DQN",
    run_config=air.RunConfig(stop={"training_iteration":5}),
    param_space=config.to_dict()
).fit()

0,1
Current time:,2024-05-20 12:29:38
Running for:,00:07:40.53
Memory:,3.6/3.7 GiB

Trial name,# failures,error file
DQN_my_env_ca129_00000,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00000_0_double_q=True,dueling=True,gamma=0.9000,lr=0.0100,train_batch_size=128_2024-05-20_12-21-58/error.txt"
DQN_my_env_ca129_00001,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00001_1_double_q=False,dueling=True,gamma=0.9000,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00002,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00002_2_double_q=True,dueling=False,gamma=0.9000,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00003,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00003_3_double_q=False,dueling=False,gamma=0.9000,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00004,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00004_4_double_q=True,dueling=True,gamma=0.9500,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00005,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00005_5_double_q=False,dueling=True,gamma=0.9500,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00006,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00006_6_double_q=True,dueling=False,gamma=0.9500,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00007,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00007_7_double_q=False,dueling=False,gamma=0.9500,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00008,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00008_8_double_q=True,dueling=True,gamma=0.9900,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"
DQN_my_env_ca129_00009,1,"/tmp/ray/session_2024-05-20_12-21-53_440403_856/artifacts/2024-05-20_12-21-58/DQN_2024-05-20_12-21-53/driver_artifacts/DQN_my_env_ca129_00009_9_double_q=False,dueling=True,gamma=0.9900,lr=0.0100,train_batch_size=128_2024-05-20_12-21-59/error.txt"

Trial name,status,loc,double_q,dueling,gamma,lr,train_batch_size
DQN_my_env_ca129_00055,PENDING,,False,False,0.95,0.001,256
DQN_my_env_ca129_00057,PENDING,,False,True,0.99,0.001,256
DQN_my_env_ca129_00059,PENDING,,False,False,0.99,0.001,256
DQN_my_env_ca129_00060,PENDING,,True,True,0.9,0.0005,256
DQN_my_env_ca129_00061,PENDING,,False,True,0.9,0.0005,256
DQN_my_env_ca129_00062,PENDING,,True,False,0.9,0.0005,256
DQN_my_env_ca129_00063,PENDING,,False,False,0.9,0.0005,256
DQN_my_env_ca129_00064,PENDING,,True,True,0.95,0.0005,256
DQN_my_env_ca129_00065,PENDING,,False,True,0.95,0.0005,256
DQN_my_env_ca129_00066,PENDING,,True,False,0.95,0.0005,256


2024-05-20 12:22:02,373	ERROR tune_controller.py:1331 -- Trial task failed for trial DQN_my_env_ca129_00005
Traceback (most recent call last):
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff981bac298066d7330a605f4601000000 Worker ID: 94ac3c3d43a31140af4113acf7fcb810a8e59a24f994235471d8e42e Node ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a Worker IP address: 172.23.82.135 Worker port: 42089 Worker PID: 12416 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[33m(raylet)[0m [2024-05-20 12:23:39,747 E 12008 12008] (raylet) node_manager.cc:3002: 13 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a, IP: 172.23.82.135) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.23.82.135`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[33m(raylet)[0m [2024-05-20 12:23:39,747 E 12008 12008] (raylet) worker_pool.cc:549: 

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff73a23c0bf571374e7b17de9901000000 Worker ID: 0a8fc328081b8029871a7e182fb6920d1637a0b6773659bcc8688c42 Node ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a Worker IP address: 172.23.82.135 Worker port: 46203 Worker PID: 12414 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffee41d1f0eea0a7d6d32693dd01

[33m(raylet)[0m [2024-05-20 12:27:14,031 E 12008 12008] (raylet) node_manager.cc:3002: 13 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a, IP: 172.23.82.135) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.23.82.135`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[33m(raylet)[0m [2024-05-20 12:27:14,036 E 12008 12008] (raylet) worker_pool.cc:549: 

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: fffffffffffffffff7e76abbcea1234c653db2f201000000 Worker ID: 13965c09980b1db13464c0eaf739080d9e51be37137e66a5dc85b634 Node ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a Worker IP address: 172.23.82.135 Worker port: 33353 Worker PID: 13586 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


2024-05-20 12:27:19,463	ERROR tune_controller.py:1331 -- Trial task failed for trial DQN_my_env_ca129_00015
Traceback (most recent call last):
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffec6c290bae20cdb9589132a001000000 Worker ID: b9faf86994356a42e4c56b2c842a534daa93d4ea61807e6f910c2978 Node ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a Worker IP address: 172.23.82.135 Worker port: 34921 Worker PID: 14011 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


2024-05-20 12:27:41,396	ERROR tune_controller.py:1331 -- Trial task failed for trial DQN_my_env_ca129_00023
Traceback (most recent call last):
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffb5b7fe6e58e5b275d09d562c01000000 Worker ID: bffbf7a906f71c779b1e34320b890df41b35d5891194e541c0776d47 Node ID: 710b910025519a0350de60aae0726b0cd1496778ebf0af8e8d6b654a Worker IP address: 172.23.82.135 Worker port: 35917 Worker PID: 15508 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.[32m [repeated 2x across cluster][0m


2024-05-20 12:29:31,124	ERROR tune_controller.py:1331 -- Trial task failed for trial DQN_my_env_ca129_00048
Traceback (most recent call last):
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^

In [160]:
from ray.rllib.algorithms.dqn.dqn import DQNConfig
from ray.rllib.algorithms import DQN
from ray.tune.registry import register_env

observations_memory = 2
register_env("my_env", lambda _: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "observations_memory": observations_memory}))

replay_config = {
        "capacity": 50000,
    }

config = (DQNConfig()
    .training(
        replay_buffer_config=replay_config,
        gamma=0.90,                # Discount factor for future rewards
        lr=0.001,                 # Learning rate
        train_batch_size=64,       # Batch size for training
        #model={"fcnet_hiddens": [16], "fcnet_activation": "relu"},  # Model architecture
        #dueling=True,              # Use dueling DQN
        #double_q=True,             # Use double Q-learning
        #adam_epsilon=0.5
        )
    .environment("my_env")
)
config.sample_timeout_s *= 5 

algoDQN = config.build()

for i in range(5):
    result = algoDQN.train()
    print(f"[{i}] mean_reward: {result['sampler_results']['episode_reward_mean']}, mean_len: {result['sampler_results']['episode_len_mean']}")



[0] mean_reward: -144.6, mean_len: 99.2
[1] mean_reward: -150.52631578947367, mean_len: 100.05263157894737
[2] mean_reward: -140.29032258064515, mean_len: 96.16129032258064
[3] mean_reward: -135.0952380952381, mean_len: 94.30952380952381
[4] mean_reward: -134.78846153846155, mean_len: 95.4423076923077


In [147]:
from IPython.display import clear_output
import time

observations_memory = 2
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "observations_memory":observations_memory})
obs, _ = env.reset()
env.render()

for i in range(100):
    actions = algoDQN.compute_actions(obs)
    print(actions, "\n")
    obs, reward, terminated, truncated, info = env.step(actions)
    clear_output()
    print(f"[{i}]")
    env.render()
    print(obs)
    print(reward)
    time.sleep(0.5)

    if terminated['__all__'] or truncated['__all__']:
        break


[19]
____________
|      x   |
|          |
|          |
|     o    |
|       x  |
|          |
|          |
|          |
|   o      |
|          |
‾‾‾‾‾‾‾‾‾‾‾‾
{'agent-0': array([0.3, 0.8, 0.5, 0.3, 0.7, 0.4, 0.6, 0. , 0.3, 0.9, 0.5, 0.2, 0.7,
       0.4, 0.6, 0. ], dtype=float32), 'agent-1': array([0.5, 0.3, 0.3, 0.8, 0.7, 0.4, 0.6, 0. , 0.5, 0.2, 0.3, 0.9, 0.7,
       0.4, 0.6, 0. ], dtype=float32)}
{'agent-0': -1, 'agent-1': -1}


KeyboardInterrupt: 

# Tianshou

In [None]:
env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False})
obs, _ = env.reset()
print(obs)
print(Batch(obs)) 
print(env.observation_space.shape)

NameError: name 'PointCoverageEnv' is not defined

In [None]:
train_size, test_size = (20, 10)
device = "cpu"

env = PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False})
train_envs = DummyVectorEnv([lambda: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False}) for _ in range(train_size)])
test_envs = DummyVectorEnv([lambda: PointCoverageEnv({"height": 10, "width": 10, "n_agents": 2, "n_targets": 2, "max_steps": 100, "use_nested_observation": False}) for _ in range(test_size)])

assert env.observation_space.shape is not None
assert isinstance(env.action_space, Discrete) 

net = Net(state_shape=env.observation_space.shape, hidden_sizes=[64, 64], device=device)
actor = Actor(preprocess_net=net, action_shape=env.action_space.n, device=device).to(device)
critic = Critic(preprocess_net=net, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)

# optimizer of the actor and the critic
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

dist = torch.distributions.Categorical
policy: BasePolicy
policy = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=dist,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)

mapolicy_manager = MultiAgentPolicyManager(policies=[policy, policy], env=env)


train_collector = Collector(
    policy=mapolicy_manager,
    env=train_envs,
    buffer=VectorReplayBuffer(20000, len(train_envs)),
)
test_collector = Collector(policy=mapolicy_manager, env=test_envs)

result = OnpolicyTrainer(
    policy=mapolicy_manager,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=50000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    step_per_collect=2000,
    stop_fn=lambda mean_reward: mean_reward >= 195,
).run()

result.pprint_asdict()

AttributeError: 'dict' object has no attribute 'agent_id'