## Keep average distance

the agents goal is to position close to each others at a distance previously defined

challenges:
- deal with continuous space environment
- limited vision of an agent

### utils

In [1]:
from ipycanvas import Canvas, hold_canvas

class CanvasWithBorders(Canvas):
    def clear(self):
        super().clear()
        border_color = 'black'  # You can customize the border color here
        border_width = 1  # You can customize the border width here
        
        self.fill_style = border_color
        # Draw top border
        self.fill_rect(0, 0, self.width, border_width)
        # Draw bottom border
        self.fill_rect(0, self.height - border_width, self.width, border_width)
        # Draw left border
        self.fill_rect(0, 0, border_width, self.height)
        # Draw right border
        self.fill_rect(self.width - border_width, 0, border_width, self.height)

import os
from ray.rllib.algorithms.algorithm import Algorithm

def save_algo(algo, name):
    base_dir = os.path.join(os.getcwd(), "algos")
    subfolder_path = os.path.join(base_dir, name)
    os.makedirs(subfolder_path, exist_ok=True)
    path_to_checkpoint  = algo.save(subfolder_path)
    print(f"An Algorithm checkpoint has been created inside directory: '{path_to_checkpoint}'.")

def load_algo(name):
    base_dir = os.path.join(os.getcwd(), "algos")
    subfolder_path = os.path.join(base_dir, name)
    if not os.path.exists(subfolder_path):
        raise FileNotFoundError(f"The specified subfolder '{subfolder_path}' does not exist.")
    
    return Algorithm.from_checkpoint(subfolder_path)

#save_algo(algo, "KeepTheDistance_dst=0_agent=2_100x100train")
#algo2 = load_algo("KeepTheDistance_dst=0_agent=2_100x100train")

In [2]:
import random as rnd
import numpy as np
import math

class Vector2D():
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def __str__(self):
        return f"({self.x}, {self.y})"

    def to_np_array(self):
        return np.array([self.x, self.y], dtype=np.float32)

    def get_random_point(max_x, max_y, min_x=0, min_y=0):
        return Vector2D(rnd.randint(min_x, max_x-1), rnd.randint(min_y, max_y-1))
    
    def distance_vector(v1, v2):
        return Vector2D(v1.x-v2.x, v1.y-v2.y)
    
    def distance(v1, v2):
        distance_vector = Vector2D.distance_vector(v1, v2)
        return Vector2D.norm(distance_vector)

    def norm(v):
        return math.sqrt(math.pow(v.x, 2) + math.pow(v.y, 2))

    def unit_vector(v):
        norm = Vector2D.norm(v)
        if norm == 0:
            return Vector2D(0,0)
        return Vector2D(v.x/norm, v.y/norm)

    def similarity(v1, v2):
        x1, y1 = v1.x, v1.y
        x2, y2 = v2.x, v2.y
        v1_value = math.degrees(math.atan2(y1, x1))/180
        v2_value = math.degrees(math.atan2(y2, x2))/180
        diff = abs(v1_value-v2_value)
        similarity = 1 - math.pow(diff, 0.9)
        return similarity
    
    def sum(v1, v2):
        return Vector2D(v1.x+v2.x, v1.y+v2.y)
    
    def mul(v, n):
        return Vector2D(v.x*n, v.y*n)

In [41]:
import time
import numpy as np
import random as rnd

def simulate_episode(env, policy, steps, sleep_between_frames=0.3, print_info=True):
    obs, _ = env.reset()
    env.render()

    for i in range(steps):
        if print_info:
            print(f"obs: ", obs)
        actions = policy.compute_actions(obs)
        #actions = {agent: np.array([rnd.random()*2-1, rnd.random()*2-1, 1.0], np.float32) for agent in obs.keys()}
        #actions = {agent: env.action_space.sample() for agent in obs.keys()}
        obs, reward, _, _, _ = env.step(actions)
        env.render()
        if print_info:
            print(f"action: ", actions)
            print(f"reward: ", reward, "\n")
        time.sleep(sleep_between_frames)

def simulate_random_episode(env, steps, sleep_between_frames=0.3, print_info=True):
    obs, _ = env.reset()
    env.render()

    action_space = env.action_space
    for i in range(steps):
        if print_info:
            print(f"obs: ", obs)
        actions = {agent: action_space.sample() for agent in obs.keys()}
        obs, reward, _, _, _ = env.step(actions)
        env.render()
        if print_info:
            print(f"action: ", actions)
            print(f"reward: ", reward, "\n")
        time.sleep(sleep_between_frames)

def ppo_result_format(result):
    return (f"iteration [{result['training_iteration']}] => " +
          f"episode_reward_mean: {result['sampler_results']['episode_reward_mean']}, " +
          f"episode_len_mean: {result['sampler_results']['episode_len_mean']}, " +
          f"agent_steps_trained: {result['info']['num_agent_steps_trained']}, " +
          f"env_steps_trained: {result['info']['num_env_steps_trained']}, " + 
          f"entropy: {result['info']['learner']['default_policy']['learner_stats']['entropy']}, " +
          f"learning_rate: {result['info']['learner']['default_policy']['learner_stats']['cur_lr']}")

### environment definition

In [51]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas

class EnvironmentConfiguration: 
    def __init__(self, n_agents, target_distance, speed, spawn_area=100, visible_nbrs=1, max_steps=None, spawn_area_schedule=None):
        self.n_agents = n_agents
        self.visible_nbrs = visible_nbrs
        self.target_distance = target_distance
        self.max_steps = max_steps
        self.speed = speed
        self.spawn_area = spawn_area
        self.spawn_area_schedule = spawn_area_schedule

class KeepTheDistance(MultiAgentEnv):

    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0

    def __init__(self, config: EnvironmentConfiguration):
        assert config.n_agents > config.visible_nbrs # just base case implemented 
             
        self.n_agents = config.n_agents
        self.visible_nbrs = config.visible_nbrs
        self.target_distance = config.target_distance
        self.max_steps = config.max_steps
        self.speed = config.speed
        self.spawn_area = config.spawn_area
        self.spawn_area_schedule = config.spawn_area_schedule
        if self.spawn_area_schedule != None:
            self.spawn_area_schedule_index = 0
            self.n_reset = 0
            self.spawn_area = self.spawn_area_schedule[0][1]
        
        self.agents_ids = ['agent-' + str(i) for i in range(self.n_agents)]
        self.agent_colors = {agent: self.rgb_to_hex(rnd.randint(0, 255), rnd.randint(0, 255), rnd.randint(0, 255)) for agent in self.agents_ids}
        self.observation_space = self.observation_space('agent-0')
        self.action_space = self.action_space("")

    def unflatten_observation_space(self, agent):
        #distance_vector = Box(low=-np.inf, high=np.inf, shape=(2,1), dtype=np.float32)
        #obs_space = Dict({"nbr-1": distance_vector})
        direction = Box(low=-1, high=1, shape=(2,1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1,1), dtype=np.float32)
        return Dict({f"nbr-{i}": Dict({'direction': direction, 'distance': distance}) for i in range(self.visible_nbrs)})

    def observation_space(self, agent):
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        direction = Box(low=-1.0, high=1.0, shape=(2,1), dtype=np.float32)
        speed = Box(0.0, 1.0, dtype=np.float32)
        return flatten_space(Tuple([direction, speed]))
    
    def __get_observation(self, agent):
        distance_vectors = [Vector2D.distance_vector(self.agents_pos[agent], self.agents_pos[nbr])  
                            for nbr in self.__get_n_closest_neighbours(agent, self.visible_nbrs)]

        obs = {
            f"nbr-{i}": {
                "direction": Vector2D.unit_vector(distance_vectors[i]).to_np_array(),
                "distance": np.log(1 + Vector2D.norm(distance_vectors[i])) #1 - np.exp(-alpha * x)
            }
            for i in range(len(distance_vectors))
            }
        return flatten(self.unflatten_observation_space(agent), obs)

    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def __total_distance_from_closest_neighbours(self, agent):
        return sum([abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance) for nbr in self.__get_n_closest_neighbours(agent, self.visible_nbrs)])

    def __get_local_reward(self, agent, action):
        last_action = self.last_actions[agent]
        self.last_actions[agent] = action

        # r0: negative of the distance from the closest neighbours
        reward_0 = -self.__total_distance_from_closest_neighbours(agent)

        # r1: improvement of the distance from the closest neighbours
        newDistance = self.__total_distance_from_closest_neighbours(agent)
        reward_1 = self.last_step_distances[agent] - newDistance
        self.last_step_distances[agent] = newDistance

        # r2: bonus if the agent is very close to the target distance
        closest_nbrs = self.__get_n_closest_neighbours(agent, self.visible_nbrs)
        #reward_2 = sum([100 if abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance) < 0.5 else 0 for nbr in closest_nbrs])
        reward_2 = sum([max(0, 1 - (abs(Vector2D.distance(self.agents_pos[agent], self.agents_pos[nbr]) - self.target_distance) / self.target_distance)) for nbr in closest_nbrs])


        # r3: penalize rapid changes of direction
        reward_3 = -(1-Vector2D.similarity(Vector2D(action[0],action[1]), Vector2D(last_action[0],last_action[1])))

        #reward_4 = -action[2]*10
        return reward_1 + reward_2 # + reward_3 #+ reward_3# working for two agents using value for reward_2 equals to one

    def __get_global_reward(self):
        return 0
    
    def __get_other_agents(self, agent):
        return [other for other in self.agents_ids if other != agent]

    def __get_n_closest_neighbours(self, agent, n=1):
        distances = {other: Vector2D.distance(self.agents_pos[agent], self.agents_pos[other]) for other in self.__get_other_agents(agent)}
        return [neighbour[0] for neighbour in sorted(list(distances.items()), key=lambda d: d[1])[:n]]
        # return {neighbour[0]: neighbour[1] for neighbour in sorted(list(dst.items()), key=lambda d: d[0])[:n]}

    def __update_agent_position(self, agent, action):
        unit_movement = Vector2D(action[0], action[1])
        self.agents_pos[agent] = Vector2D.sum(self.agents_pos[agent], Vector2D.mul(unit_movement, action[2]*self.speed))

    def reset(self, seed=None, options=None):
        if self.spawn_area_schedule != None:
            self.n_reset += 1
            if (self.spawn_area_schedule_index < len(self.spawn_area_schedule)-1 and 
                self.n_reset >= self.spawn_area_schedule[self.spawn_area_schedule_index+1][0]):
                self.spawn_area_schedule_index += 1
                self.spawn_area = self.spawn_area_schedule[self.spawn_area_schedule_index][1]

        self.steps = 0
        self.agents_pos = {agent: Vector2D.get_random_point(max_x=self.spawn_area, max_y=self.spawn_area) for agent in self.agents_ids}
        self.last_step_distances = {agent: self.__total_distance_from_closest_neighbours(agent) for agent in self.agents_ids}
        self.last_actions = {agent: [0]*3 for agent in self.agents_ids}
        return {agent: self.__get_observation(agent) for agent in self.agents_ids}, {}
     
    def step(self, actions):
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        for agent, action in actions.items():
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {}

        truncated['__all__'] = False
        if self.max_steps != None and self.steps == self.max_steps:
            terminated['__all__'] = True
        else:
            terminated['__all__'] = False

        return observations, rewards, terminated, truncated, infos
     
    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def render(self):
        pass

    def get_agent_ids(self):
       return self.agents


class RenderableKeepTheDistance(KeepTheDistance):
    def render(self):
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)
        
        with hold_canvas():
            agent_size = max(self.CANVAS_WIDTH/float(self.spawn_area),1)
            top_left = (0.0,0.0)
            bottom_right = (self.spawn_area, self.spawn_area)
            self.canvas.clear()

            for agent in self.agents_ids:
                raw_pos = self.agents_pos[agent].to_np_array()
                color = self.agent_colors[agent]
                
                agent_pos_in_frame = [((raw_pos[0]-top_left[0])/(bottom_right[0]-top_left[0]))*self.CANVAS_WIDTH,
                            ((raw_pos[1]-top_left[1])/(bottom_right[1]-top_left[1]))*self.CANVAS_HEIGHT,]

                self.canvas.fill_style = color
                self.canvas.fill_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )
                
                self.canvas.stroke_style = "black"
                self.canvas.stroke_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )

In [48]:
env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=500, speed=1, spawn_area=10)
env = RenderableKeepTheDistance(env_config)

print(env.reset()[0])
#env.render()
simulate_random_episode(env, 10, print_info=True)
#env.step({'agent-1': (1,1,1)})

{'agent-0': array([0.70710677, 0.70710677, 1.3424541 ], dtype=float32), 'agent-1': array([-0.70710677, -0.70710677,  1.3424541 ], dtype=float32)}


CanvasWithBorders(height=300, width=300)

obs:  {'agent-0': array([-1.       ,  0.       ,  0.6931472], dtype=float32), 'agent-1': array([1.       , 0.       , 0.6931472], dtype=float32)}
action:  {'agent-0': array([-0.45745727, -0.8009053 ,  0.04903754], dtype=float32), 'agent-1': array([0.6539487 , 0.5224657 , 0.59678525], dtype=float32)}
reward:  {'agent-0': -0.45566929521952426, 'agent-1': -0.45566929521952426} 

obs:  {'agent-0': array([-0.9704811 , -0.2411772 ,  0.89839935], dtype=float32), 'agent-1': array([0.9704811 , 0.2411772 , 0.89839935], dtype=float32)}
action:  {'agent-0': array([-0.48482028,  0.10999227,  0.2594751 ], dtype=float32), 'agent-1': array([-0.5568653 , -0.5819053 ,  0.08382403], dtype=float32)}
reward:  {'agent-0': -0.061060199976091134, 'agent-1': -0.061060199976091134} 

obs:  {'agent-0': array([-0.9835766 , -0.18049121,  0.9229602 ], dtype=float32), 'agent-1': array([0.9835766 , 0.18049121, 0.9229602 ], dtype=float32)}
action:  {'agent-0': array([-0.9076892 ,  0.15017396,  0.3761028 ], dtype=float

## policy training

In [6]:
import ray
ray.shutdown()

## KeepTheDistance?dst=0&agent=2&spawn_area=100

In [49]:
from ray.tune.registry import register_env

env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&agent=2&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [9]:
algo = load_algo("KeepTheDistance?dst=0&agent=2&spawn_area=100")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-05-24 16:05:21,873	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-24 16:05:30,131	INFO trainable.py:161 -- Trainable.setup took 11.599 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [52]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 5

algo = (
    PPOConfig()
    .training(gamma = 0.99, 
              lr = 0.001,
              train_batch_size = 4096, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&agent=2&spawn_area=100")
    .build()
)
clear_output()

out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    simulate_episode(RenderableKeepTheDistance(env_config), algo, 300, sleep_between_frames=0.03, print_info=True)

iteration [1] => episode_reward_mean: -6.524227662165874, episode_len_mean: 300.0, agent_steps_trained: 8192, env_steps_trained: 4096, entropy: 4.23718300635616, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 54.45913761079322, episode_len_mean: 300.0, agent_steps_trained: 16384, env_steps_trained: 8192, entropy: 4.34562553614378, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: 50.75709731524839, episode_len_mean: 300.0, agent_steps_trained: 24576, env_steps_trained: 12288, entropy: 4.2624438298245275, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: 134.46876803369832, episode_len_mean: 300.0, agent_steps_trained: 32768, env_steps_trained: 16384, entropy: 4.18227636863788, learning_rate: 0.0010000000000000005
iteration [5] => episode_reward_mean: 266.40354222345053, episode_len_mean: 300.0, agent_steps_trained: 40960, env_steps_trained: 20480, entropy: 4.103445679694414, learning_rate: 0.00100000000000000

CanvasWithBorders(height=300, width=300)

obs:  {'agent-0': array([0.0464614, 0.9989201, 3.7852457], dtype=float32), 'agent-1': array([-0.0464614, -0.9989201,  3.7852457], dtype=float32)}
action:  {'agent-0': array([-0.26440948, -0.32545447,  0.28671905], dtype=float32), 'agent-1': array([-0.10532188, -0.31774217,  0.21871096], dtype=float32)}
reward:  {'agent-0': -43.02027082455359, 'agent-1': -43.02027082455359} 

obs:  {'agent-0': array([0.04526294, 0.9989751 , 3.7846503 ], dtype=float32), 'agent-1': array([-0.04526294, -0.9989751 ,  3.7846503 ], dtype=float32)}
action:  {'agent-0': array([-1.        ,  1.        ,  0.49738595], dtype=float32), 'agent-1': array([-0.51586145, -0.7581216 ,  0.28242958], dtype=float32)}
reward:  {'agent-0': -43.7168073859859, 'agent-1': -43.7168073859859} 

obs:  {'agent-0': array([0.036497  , 0.99933374, 3.8003495 ], dtype=float32), 'agent-1': array([-0.036497  , -0.99933374,  3.8003495 ], dtype=float32)}
action:  {'agent-0': array([1.        , 0.22029948, 0.10762587], dtype=float32), 'agent-

In [212]:
env_config_2 = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=500, speed=1, spawn_area=100)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [207]:
save_algo(algo, "KeepTheDistance?dst=0&agent=2&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 3.2092962741230924, 'cur_kl_coeff': 1.0124999999999997, 'cur_lr': 0.0010000000000000005, 'total_loss': 9.670777402321498, 'policy_loss': -0.010992191499099135, 'vf_loss': 9.667357384165127, 'vf_explained_var': -0.007525691576302051, 'kl': 0.014234306790058408, 'entropy': 4.4639156160255276, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 28320.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}}, 'num_env_steps_sampled': 122880, 'num_env_steps_trained': 122880, 'num_agent_steps_sampled': 245760, 'num_agent_steps_trained': 24576

## KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100

In [41]:
from ray.tune.registry import register_env
env_config = EnvironmentConfiguration(n_agents=3, visible_nbrs=2, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [8]:
algo = load_algo("KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-05-24 10:13:14,302	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-24 10:13:23,121	INFO trainable.py:161 -- Trainable.setup took 12.209 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [39]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 40

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4092, 
              sgd_minibatch_size = 128, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100")
    .build()
)
clear_output()

env_show = RenderableKeepTheDistance(env_config)
for i in range(trainin_steps):
    result = algo.train()
    simulate_episode(env_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))

CanvasWithBorders(height=300, width=300)

iteration [1] => episode_reward_mean: -16085.280054435147, episode_len_mean: 1000.0, agent_steps_trained: 12288, env_steps_trained: 4096, entropy: 4.240592692130142, learning_rate: 0.0010000000000000002
iteration [2] => episode_reward_mean: -16432.439549192502, episode_len_mean: 1000.0, agent_steps_trained: 24576, env_steps_trained: 8192, entropy: 4.213702567583985, learning_rate: 0.0010000000000000002


KeyboardInterrupt: 

In [11]:
env_config_2 = EnvironmentConfiguration(n_agents=3, visible_nbrs=2, target_distance=0, max_steps=500, speed=1, spawn_area=100)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.01, print_info=False)

CanvasWithBorders(height=300, width=300)

In [18]:
save_algo(algo, "KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 26.797518886129062, 'cur_kl_coeff': 5.473673629760742, 'cur_lr': 0.0010000000000000002, 'total_loss': 4.241659853690201, 'policy_loss': 0.0008611866208310757, 'vf_loss': 4.17278758486112, 'vf_explained_var': 0.026051732442445224, 'kl': 0.012425122178001438, 'entropy': 5.3370947420597075, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 56880.5, 'diff_num_grad_updates_vs_sampler_policy': 719.5}}, 'num_env_steps_sampled': 163840, 'num_env_steps_trained': 163840, 'num_agent_steps_sampled': 491520, 'num_agent_steps_tra

## KeepTheDistance?dst=0&agent=4&visible_nbrs=3&spawn_area=100

In [104]:
from ray.tune.registry import register_env

train_batch_size = 4096
reset_per_batch = train_batch_size/300

spawn_area_schedule = [[0,10],[4,30],[9,50],[18,100]]
#spawn_area_schedule = [[0,10],[4,30],[10,50],[18,100]]

env_config = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=0, max_steps=1000, speed=1, spawn_area=20, 
                                      spawn_area_schedule=[[schedule[0]*train_batch_size, schedule[1]] for schedule in spawn_area_schedule])
env_config_show = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=0, max_steps=300, speed=1, spawn_area=20, 
                                      spawn_area_schedule=spawn_area_schedule)
register_env("KeepTheDistance?dst=0&agent=4&visible_nbrs=3&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [101]:
algo = load_algo("KeepTheDistance?dst=0&agent=4&visible_nbrs=3&spawn_area=100")

FileNotFoundError: The specified subfolder '/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=4&visible_nbrs=3&spawn_area=100' does not exist.

In [105]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 40

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = train_batch_size, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&agent=4&visible_nbrs=3&spawn_area=100")
    .build()
)
clear_output()

env_to_show = RenderableKeepTheDistance(env_config_show)
for i in range(trainin_steps):
    result = algo.train()
    #clear_output()
    #print(out)
    simulate_episode(env_to_show, algo, 150, sleep_between_frames=0.03, print_info=False)
    print(ppo_result_format(result))

CanvasWithBorders(height=300, width=300)

iteration [1] => episode_reward_mean: 241.53260623370483, episode_len_mean: 1000.0, agent_steps_trained: 16384, env_steps_trained: 4096, entropy: 4.262171379725138, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 8972.364221824693, episode_len_mean: 1000.0, agent_steps_trained: 32768, env_steps_trained: 8192, entropy: 4.166322618474563, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: 20690.61975089599, episode_len_mean: 1000.0, agent_steps_trained: 49152, env_steps_trained: 12288, entropy: 4.057953937600057, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: 29882.30053861677, episode_len_mean: 1000.0, agent_steps_trained: 65536, env_steps_trained: 16384, entropy: 4.133171391238769, learning_rate: 0.0010000000000000005


KeyboardInterrupt: 

In [174]:
env_config_2 = EnvironmentConfiguration(n_agents=4, visible_nbrs=3, target_distance=0, max_steps=500, speed=1, spawn_area=100)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [None]:
save_algo(algo, "KeepTheDistance?dst=0&agent=3&visible_nbrs=2&spawn_area=100")