## Keep average distance

the agents goal is to position close to each others at a distance previously defined

challenges:
- deal with continuous space environment
- limited vision of an agent

### utils

In [1]:
from utils.vectors import Vector2D
from utils.canvas import CanvasWithBorders
from utils.algo_utils import (save_algo, load_algo)
from utils.simulations import (simulate_episode, simulate_random_episode, ppo_result_format)


### environment definition

In [2]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas

In [3]:
class EnvironmentConfiguration: 
    def __init__(self, n_agents, target_distance, speed, spawn_area=100, visible_nbrs=1, max_steps=None, spawn_area_schedule=None):
        self.n_agents = n_agents
        self.visible_nbrs = visible_nbrs
        self.target_distance = target_distance
        self.max_steps = max_steps
        self.speed = speed
        self.spawn_area = spawn_area
        self.spawn_area_schedule = spawn_area_schedule

class KeepTheDistance(MultiAgentEnv):

    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0

    def __init__(self, config: EnvironmentConfiguration):
        assert config.n_agents > config.visible_nbrs # just base case implemented 
             
        self.n_agents = config.n_agents
        self.visible_nbrs = config.visible_nbrs
        self.target_distance = config.target_distance
        self.max_steps = config.max_steps
        self.speed = config.speed
        self.spawn_area = config.spawn_area
        self.spawn_area_schedule = config.spawn_area_schedule
        if self.spawn_area_schedule != None:
            self.spawn_area_schedule_index = 0
            self.n_reset = 0
            self.spawn_area = self.spawn_area_schedule[0][1]
        
        self.agents_ids = ['agent-' + str(i) for i in range(self.n_agents)]
        self.agent_colors = {agent: self.rgb_to_hex(rnd.randint(0, 255), rnd.randint(0, 255), rnd.randint(0, 255)) for agent in self.agents_ids}
        self.observation_space = self.observation_space('agent-0')
        self.action_space = self.action_space("")

    def unflatten_observation_space(self, agent):
        #distance_vector = Box(low=-np.inf, high=np.inf, shape=(2,1), dtype=np.float32)
        #obs_space = Dict({"nbr-1": distance_vector})
        direction = Box(low=-1, high=1, shape=(2,1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1,1), dtype=np.float32)
        return Dict({f"nbr-{i}": Dict({'direction': direction, 'distance': distance}) for i in range(self.visible_nbrs)})

    def observation_space(self, agent):
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        direction = Box(low=-1.0, high=1.0, shape=(2,1), dtype=np.float32)
        speed = Box(0.0, 1.0, dtype=np.float32)
        return flatten_space(Tuple([direction, speed]))
    
    def __get_observation(self, agent):
        distance_vectors = [Vector2D.distance_vector(self.agents_pos[agent], self.agents_pos[nbr])  
                            for nbr in self.__get_n_closest_neighbours(agent, self.visible_nbrs)]

        obs = {
            f"nbr-{i}": {
                "direction": Vector2D.unit_vector(distance_vectors[i]).to_np_array(),
                "distance": np.log(1 + Vector2D.norm(distance_vectors[i])) #1 - np.exp(-alpha * x)
            }
            for i in range(len(distance_vectors))
            }
        return flatten(self.unflatten_observation_space(agent), obs)

    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def __total_distance_from_closest_neighbours(self, agent):
        return sum([abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance) for nbr in self.__get_n_closest_neighbours(agent, self.visible_nbrs)])

    def __get_local_reward(self, agent, action):
        last_action = self.last_actions[agent]
        self.last_actions[agent] = action

        # r0: negative of the distance from the closest neighbours
        reward_0 = -self.__total_distance_from_closest_neighbours(agent)

        # r1: improvement of the distance from the closest neighbours
        newDistance = self.__total_distance_from_closest_neighbours(agent)
        reward_1 = self.last_step_distances[agent] - newDistance
        self.last_step_distances[agent] = newDistance

        worst_possible_reward_1 = self.speed * (self.visible_nbrs + 1) # I move away from my nbrs and my nbrs are moving away from me at a speed = speed
        reward_1_normalized = reward_1 / worst_possible_reward_1 # [-1,+1] 

        # r2: bonus if the agent is very close to the target distance
        closest_nbrs = self.__get_n_closest_neighbours(agent, self.visible_nbrs)
        #reward_2 = sum([100 if abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance) < 0.5 else 0 for nbr in closest_nbrs])
        reward_2 = sum([max(0, 1 - (abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance))) for nbr in closest_nbrs])

        # r5: penalize when the distance between two agents is lower than the target one
        if self.target_distance > 0:
            reward_5 = sum([min(0, (Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance)/self.target_distance) for nbr in closest_nbrs])
        else:
            reward_5 = 0

        # r3: penalize rapid changes of direction
        reward_3 = -(1-Vector2D.similarity(Vector2D(action[0],action[1]), Vector2D(last_action[0],last_action[1])))

        #reward_4 = -action[2]*10
        return reward_1 + reward_2 + reward_5 # + reward_3 #+ reward_3# working for two agents using value for reward_2 equals to one

    def __get_global_reward(self):
        return 0
    
    def __get_other_agents(self, agent):
        return [other for other in self.agents_ids if other != agent]

    def __get_n_closest_neighbours(self, agent, n=1):
        distances = {other: Vector2D.distance(self.agents_pos[agent], self.agents_pos[other]) for other in self.__get_other_agents(agent)}
        return [neighbour[0] for neighbour in sorted(list(distances.items()), key=lambda d: d[1])[:n]]
        # return {neighbour[0]: neighbour[1] for neighbour in sorted(list(dst.items()), key=lambda d: d[0])[:n]}

    def __update_agent_position(self, agent, action):
        unit_movement = Vector2D(action[0], action[1])
        self.agents_pos[agent] = Vector2D.sum(self.agents_pos[agent], Vector2D.mul(unit_movement, action[2]*self.speed))

    def reset(self, seed=None, options=None):
        if self.spawn_area_schedule != None:
            self.n_reset += 1
            if (self.spawn_area_schedule_index < len(self.spawn_area_schedule)-1 and 
                self.n_reset >= self.spawn_area_schedule[self.spawn_area_schedule_index+1][0]):
                self.spawn_area_schedule_index += 1
                self.spawn_area = self.spawn_area_schedule[self.spawn_area_schedule_index][1]

        self.steps = 0
        self.agents_pos = {agent: Vector2D.get_random_point(max_x=self.spawn_area, max_y=self.spawn_area) for agent in self.agents_ids}
        self.last_step_distances = {agent: self.__total_distance_from_closest_neighbours(agent) for agent in self.agents_ids}
        self.last_actions = {agent: [0]*3 for agent in self.agents_ids}
        return {agent: self.__get_observation(agent) for agent in self.agents_ids}, {}
     
    def step(self, actions):
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        for agent, action in actions.items():
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {}

        truncated['__all__'] = False
        if self.max_steps != None and self.steps == self.max_steps:
            terminated['__all__'] = True
        else:
            terminated['__all__'] = False

        return observations, rewards, terminated, truncated, infos
     
    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def render(self):
        pass

    def get_agent_ids(self):
       return self.agents


class RenderableKeepTheDistance(KeepTheDistance):
    def render(self):
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)
        
        with hold_canvas():
            agent_size = max(self.CANVAS_WIDTH/float(self.spawn_area),1)
            target_distance_size = (self.CANVAS_WIDTH/float(self.spawn_area))*self.target_distance
            top_left = (0.0,0.0)
            bottom_right = (self.spawn_area, self.spawn_area)
            self.canvas.clear()

            for agent in self.agents_ids:
                raw_pos = self.agents_pos[agent].to_np_array()
                color = self.agent_colors[agent]
                
                agent_pos_in_frame = [((raw_pos[0]-top_left[0])/(bottom_right[0]-top_left[0]))*self.CANVAS_WIDTH,
                            ((raw_pos[1]-top_left[1])/(bottom_right[1]-top_left[1]))*self.CANVAS_HEIGHT,]

                self.canvas.fill_style = color
                self.canvas.fill_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )
                
                self.canvas.stroke_style = "black"
                self.canvas.stroke_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )

                if self.target_distance > 0:
                    self.canvas.stroke_style = "red"
                    self.canvas.stroke_circle(
                        agent_pos_in_frame[0],
                        agent_pos_in_frame[1],
                        target_distance_size/2.0
                    )

In [4]:
env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=500, speed=1, spawn_area=10)
env = RenderableKeepTheDistance(env_config)

print(env.reset()[0])
#env.render()
simulate_random_episode(env, 10, print_info=False)
#env.step({'agent-1': (1,1,1)})

{'agent-0': array([0.37139067, 0.9284767 , 1.8539773 ], dtype=float32), 'agent-1': array([-0.37139067, -0.9284767 ,  1.8539773 ], dtype=float32)}


CanvasWithBorders(height=300, width=300)

## policy training

In [6]:
import ray
ray.shutdown()

## KeepTheDistance?dst=0&agent=2&spawn_area=100

In [5]:
from ray.tune.registry import register_env

env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&agent=2&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [6]:
algo = load_algo("KeepTheDistance?dst=0&agent=2&spawn_area=100")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-06-24 13:32:39,201	INFO worker.py:1749 -- Started a local Ray instance.
2024-06-24 13:32:49,554	INFO trainable.py:161 -- Trainable.setup took 13.451 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [60]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.99, 
              lr = 0.001,
              train_batch_size = 4096, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&agent=2&spawn_area=100")
    .build()
)
clear_output()

out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    simulate_episode(RenderableKeepTheDistance(env_config), algo, 300, sleep_between_frames=0.03, print_info=True)

iteration [1] => episode_reward_mean: -8.2491505251911, episode_len_mean: 300.0, agent_steps_trained: 8192, env_steps_trained: 4096, entropy: 4.260479648411274, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 26.74030357421306, episode_len_mean: 300.0, agent_steps_trained: 16384, env_steps_trained: 8192, entropy: 4.282991732160251, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: 47.404754154425326, episode_len_mean: 300.0, agent_steps_trained: 24576, env_steps_trained: 12288, entropy: 4.188545287897189, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: 61.9642279714821, episode_len_mean: 300.0, agent_steps_trained: 32768, env_steps_trained: 16384, entropy: 4.28954379533728, learning_rate: 0.0010000000000000005
iteration [5] => episode_reward_mean: 73.0602125192637, episode_len_mean: 300.0, agent_steps_trained: 40960, env_steps_trained: 20480, entropy: 4.296951741228501, learning_rate: 0.0010000000000000005
i

CanvasWithBorders(height=300, width=300)

obs:  {'agent-0': array([-0.48564294,  0.87415725,  4.1395373 ], dtype=float32), 'agent-1': array([ 0.48564294, -0.87415725,  4.1395373 ], dtype=float32)}
action:  {'agent-0': array([ 1.       , -1.       ,  0.5872913], dtype=float32), 'agent-1': array([-0.43972194,  0.21780336,  1.        ], dtype=float32)}
reward:  {'agent-0': 1.2004210590646451, 'agent-1': 1.2004210590646451} 

obs:  {'agent-0': array([-0.47831237,  0.8781898 ,  4.1202292 ], dtype=float32), 'agent-1': array([ 0.47831237, -0.8781898 ,  4.1202292 ], dtype=float32)}
action:  {'agent-0': array([ 1.       , -0.7112257,  0.5463229], dtype=float32), 'agent-1': array([-0.9660263 ,  1.        ,  0.54831016], dtype=float32)}
reward:  {'agent-0': 1.3353318421749094, 'agent-1': 1.3353318421749094} 

obs:  {'agent-0': array([-0.47093028,  0.88217044,  4.098304  ], dtype=float32), 'agent-1': array([ 0.47093028, -0.88217044,  4.098304  ], dtype=float32)}
action:  {'agent-0': array([ 1.       , -0.9942838,  0.9115002], dtype=float3

In [10]:
env_config_2 = EnvironmentConfiguration(n_agents=4, target_distance=0, max_steps=500, speed=1, spawn_area=100)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 100, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [71]:
save_algo(algo, "KeepTheDistance?dst=0&agent=2&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 5.060915484527746, 'cur_kl_coeff': 0.4500000000000001, 'cur_lr': 0.0010000000000000005, 'total_loss': 7.350311240057151, 'policy_loss': -0.008947773230223296, 'vf_loss': 7.352725898722808, 'vf_explained_var': 0.325551925599575, 'kl': 0.01451804825777196, 'entropy': 3.6460072847704095, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 28320.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}}, 'num_env_steps_sampled': 122880, 'num_env_steps_trained': 122880, 'num_agent_steps_sampled': 245760, 'num_agent_steps_trained': 245760}, 's

## KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100

In [72]:
from ray.tune.registry import register_env
env_config = EnvironmentConfiguration(n_agents=3, visible_nbrs=2, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [130]:
algo = load_algo("KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100")

2024-05-28 16:49:20,496	ERROR actor_manager.py:519 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=15562, ip=172.23.87.11, actor_id=b5dbde7ce66578d46964347001000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fb41e9daf90>)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 740, in make
    env_spec = _find_spec(id)
               ^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 519, in _find_spec
    ns, name, version = parse_env_id(env_name)
                        ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/nicolo/anaconda3/envs/tianEnv/lib/python3.11/site-packages/gymnasium/envs/registration.py", line 286, in parse_env_id
    raise error.Error(
gymnasium.error.Error: Malformed

EnvError: The env string you provided ('KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100') is:
a) Not a supported/installed environment.
b) Not a tune-registered environment creator.
c) Not a valid env class string.

Try one of the following:
a) For Atari support: `pip install gym[atari] autorom[accept-rom-license]`.
   For PyBullet support: `pip install pybullet`.
b) To register your custom env, do `from ray import tune;
   tune.register('[name]', lambda cfg: [return env obj from here using cfg])`.
   Then in your config, do `config['env'] = [name]`.
c) Make sure you provide a fully qualified classpath, e.g.:
   `ray.rllib.examples.envs.classes.repeat_after_me_env.RepeatAfterMeEnv`


In [74]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100")
    .build()
)
clear_output()

env_show = RenderableKeepTheDistance(env_config)
for i in range(trainin_steps):
    result = algo.train()
    simulate_episode(env_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))

CanvasWithBorders(height=300, width=300)

iteration [1] => episode_reward_mean: -9.076550823166727, episode_len_mean: 300.0, agent_steps_trained: 12276, env_steps_trained: 4092, entropy: 4.224508220689339, learning_rate: 0.0010000000000000002
iteration [2] => episode_reward_mean: 69.10403511569474, episode_len_mean: 300.0, agent_steps_trained: 24552, env_steps_trained: 8184, entropy: 4.239348649644015, learning_rate: 0.0010000000000000002
iteration [3] => episode_reward_mean: 132.94989005653392, episode_len_mean: 300.0, agent_steps_trained: 36828, env_steps_trained: 12276, entropy: 4.217899766135634, learning_rate: 0.0010000000000000002
iteration [4] => episode_reward_mean: 190.06564283236446, episode_len_mean: 300.0, agent_steps_trained: 49104, env_steps_trained: 16368, entropy: 4.273230808324981, learning_rate: 0.0010000000000000002
iteration [5] => episode_reward_mean: 234.7931710832915, episode_len_mean: 300.0, agent_steps_trained: 61380, env_steps_trained: 20460, entropy: 4.26117914777053, learning_rate: 0.001000000000000

In [80]:
env_config_2 = EnvironmentConfiguration(n_agents=10, visible_nbrs=2, target_distance=0, max_steps=500, speed=1, spawn_area=500)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.01, print_info=False)

CanvasWithBorders(height=300, width=300)

In [75]:
save_algo(algo, "KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&visible_nbrs=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 9.59812976611288, 'cur_kl_coeff': 0.6749999999999998, 'cur_lr': 0.0010000000000000002, 'total_loss': 3.6643962602343474, 'policy_loss': -0.00980367020883581, 'vf_loss': 3.665363485813141, 'vf_explained_var': 0.218065206565355, 'kl': 0.013091016343160103, 'entropy': 3.2479646437628227, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 84075.5, 'diff_num_grad_updates_vs_sampler_policy': 1424.5}}, 'num_env_steps_sampled': 122760, 'num_env_steps_trained': 122760, 'num_agent_steps_sampled': 368280, 'num_agent_steps_trained': 368

## KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100

In [12]:
from ray.tune.registry import register_env

env_config = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [13]:
algo = load_algo("KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100")



KeyboardInterrupt: 

In [17]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()

env_show = RenderableKeepTheDistance(env_config)
for i in range(trainin_steps):
    result = algo.train()
    simulate_episode(env_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))

CanvasWithBorders(height=300, width=300)

iteration [1] => episode_reward_mean: -9.511731059783386, episode_len_mean: 300.0, agent_steps_trained: 16368, env_steps_trained: 4092, entropy: 4.227877699296306, learning_rate: 0.0010000000000000005


KeyboardInterrupt: 

In [92]:
env_config_2 = EnvironmentConfiguration(n_agents=10, visible_nbrs=3, target_distance=0, max_steps=500, speed=1, spawn_area=400)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.01, print_info=False)

CanvasWithBorders(height=300, width=300)

In [87]:
save_algo(algo, "KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 8.819473860895853, 'cur_kl_coeff': 0.6750000000000002, 'cur_lr': 0.0010000000000000005, 'total_loss': 4.865067737543677, 'policy_loss': -0.008388914499832733, 'vf_loss': 4.862613392408126, 'vf_explained_var': 0.12559887126987687, 'kl': 0.01606408571965559, 'entropy': 3.1699658083477673, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 112395.5, 'diff_num_grad_updates_vs_sampler_policy': 1904.5}}, 'num_env_steps_sampled': 122760, 'num_env_steps_trained': 122760, 'num_agent_steps_sampled': 491040, 'num_agent_steps_trained': 

## KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100

In [14]:
from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig

base_algo = load_algo("KeepTheDistance?dst=0&visible_nbrs=3&spawn_area=100")
env_config = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=5, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100", lambda _: KeepTheDistance(env_config))

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()
algo.set_weights(base_algo.get_weights())

In [16]:
algo = load_algo("KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100")

2024-06-24 14:03:27,705	INFO trainable.py:161 -- Trainable.setup took 13.089 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [15]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()

env_show = RenderableKeepTheDistance(env_config)
for i in range(trainin_steps):
    result = algo.train()
    simulate_episode(env_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))



KeyboardInterrupt: 

In [23]:
env_config_2 = EnvironmentConfiguration(n_agents=20, visible_nbrs=3, target_distance=5, max_steps=500, speed=1, spawn_area=500)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 500, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [37]:
save_algo(algo, "KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 10.382454860429439, 'cur_kl_coeff': 1.0124999999999997, 'cur_lr': 0.0010000000000000005, 'total_loss': 5.930729507555173, 'policy_loss': -0.0028303747193086063, 'vf_loss': 5.918511411023578, 'vf_explained_var': 0.23783627534162968, 'kl': 0.014862683368205116, 'entropy': 3.9788712683625107, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 112395.5, 'diff_num_grad_updates_vs_sampler_policy': 1904.5}}, 'num_env_steps_sampled': 122760, 'num_env_steps_trained': 122760, 'num_agent_steps_sampled': 491040, 'num_agent_steps_trained

## KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100

In [6]:
from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig

base_algo = load_algo("KeepTheDistance?dst=5&visible_nbrs=3&spawn_area=100")
env_config = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=10, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100", lambda _: KeepTheDistance(env_config))

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()
algo.set_weights(base_algo.get_weights())

In [8]:
algo = load_algo("KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100")



In [78]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()

env_show = RenderableKeepTheDistance(env_config)
for i in range(trainin_steps):
    result = algo.train()
    simulate_episode(env_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))


KeyboardInterrupt: 

In [9]:
env_config_2 = EnvironmentConfiguration(n_agents=20, visible_nbrs=3, target_distance=10, max_steps=500, speed=1, spawn_area=100)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 500, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [41]:
save_algo(algo, "KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=10&visible_nbrs=3&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 13.893579628899342, 'cur_kl_coeff': 1.0124999999999997, 'cur_lr': 0.0010000000000000005, 'total_loss': 5.905990294521562, 'policy_loss': -0.012485336998277136, 'vf_loss': 5.903962938992057, 'vf_explained_var': 0.29801289379753154, 'kl': 0.014333522689229024, 'entropy': 4.284107104749505, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 112395.5, 'diff_num_grad_updates_vs_sampler_policy': 1904.5}}, 'num_env_steps_sampled': 122760, 'num_env_steps_trained': 122760, 'num_agent_steps_sampled': 491040, 'num_agent_steps_trained'