## Keep average distance

the agents goal is to position close to each others at a distance previously defined

challenges:
- deal with continuous space environment
- limited vision of an agent

### utils

In [106]:
from ipycanvas import Canvas, hold_canvas

class CanvasWithBorders(Canvas):
    def clear(self):
        super().clear()
        border_color = 'black'  # You can customize the border color here
        border_width = 1  # You can customize the border width here
        
        self.fill_style = border_color
        # Draw top border
        self.fill_rect(0, 0, self.width, border_width)
        # Draw bottom border
        self.fill_rect(0, self.height - border_width, self.width, border_width)
        # Draw left border
        self.fill_rect(0, 0, border_width, self.height)
        # Draw right border
        self.fill_rect(self.width - border_width, 0, border_width, self.height)

import os
from ray.rllib.algorithms.algorithm import Algorithm

def save_algo(algo, name):
    base_dir = os.path.join(os.getcwd(), "algos")
    subfolder_path = os.path.join(base_dir, name)
    os.makedirs(subfolder_path, exist_ok=True)
    path_to_checkpoint  = algo.save(subfolder_path)
    print(f"An Algorithm checkpoint has been created inside directory: '{path_to_checkpoint}'.")

def load_algo(name):
    base_dir = os.path.join(os.getcwd(), "algos")
    subfolder_path = os.path.join(base_dir, name)
    if not os.path.exists(subfolder_path):
        raise FileNotFoundError(f"The specified subfolder '{subfolder_path}' does not exist.")
    
    return Algorithm.from_checkpoint(subfolder_path)

#save_algo(algo, "KeepTheDistance_dst=0_agent=2_100x100train")
#algo2 = load_algo("KeepTheDistance_dst=0_agent=2_100x100train")

In [18]:
import time
import numpy as np
import random as rnd

def simulate_episode(env, policy, steps, sleep_between_frames=0.3, print_info=True):
    obs, _ = env.reset()
    env.render()
    

    for i in range(steps):
        if print_info:
            print(f"obs: ", obs)
        actions = policy.compute_actions(obs)
        #actions = {agent: np.array([rnd.random()*2-1, rnd.random()*2-1, 1.0], np.float32) for agent in obs.keys()}
        #actions = {agent: env.action_space.sample() for agent in obs.keys()}
        obs, reward, _, _, _ = env.step(actions)
        env.render()
        if print_info:
            print(f"action: ", actions)
            print(f"reward: ", reward, "\n")
        time.sleep(sleep_between_frames)

def ppo_result_format(result):
    return (f"iteration [{result['training_iteration']}] => " +
          f"episode_reward_mean: {result['sampler_results']['episode_reward_mean']}, " +
          f"episode_len_mean: {result['sampler_results']['episode_len_mean']}, " +
          f"agent_steps_trained: {result['info']['num_agent_steps_trained']}, " +
          f"env_steps_trained: {result['info']['num_env_steps_trained']}, " + 
          f"entropy: {result['info']['learner']['default_policy']['learner_stats']['entropy']}, " +
          f"learning_rate: {result['info']['learner']['default_policy']['learner_stats']['cur_lr']}")

### environment definition

In [74]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas

class EnvironmentConfiguration: 
    def __init__(self, n_agents, target_distance, speed, spawn_area=100, max_steps=None):
        self.n_agents = n_agents
        self.target_distance = target_distance
        self.max_steps = max_steps
        self.speed = speed
        self.spawn_area = spawn_area

class KeepTheDistance(MultiAgentEnv):

    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0

    def __init__(self, config: EnvironmentConfiguration):
        assert config.n_agents == 2 # just base case implemented 
             
        self.n_agents = config.n_agents
        self.target_distance = config.target_distance
        self.max_steps = config.max_steps
        self.speed = config.speed
        self.spawn_area = config.spawn_area
        
        self.agents_ids = ['agent-' + str(i) for i in range(self.n_agents)]
        self.agent_colors = {agent: self.rgb_to_hex(rnd.randint(0, 255), rnd.randint(0, 255), rnd.randint(0, 255)) for agent in self.agents_ids}
        self.observation_space = self.observation_space('agent-0')
        self.action_space = self.action_space("")

    def unflatten_observation_space(self, agent):
        #distance_vector = Box(low=-np.inf, high=np.inf, shape=(2,1), dtype=np.float32)
        #obs_space = Dict({"nbr-1": distance_vector})
        direction = Box(low=-1, high=1, shape=(2,1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1,1), dtype=np.float32)
        return Dict({"nbr-1": Dict({'direction': direction, 'distance': distance})})

    def observation_space(self, agent):
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        direction = Box(low=-1.0, high=1.0, shape=(2,1), dtype=np.float32)
        speed = Box(0.0, 1.0, dtype=np.float32)
        return flatten_space(Tuple([direction, speed]))
    
    def __get_random_point(self, max_x, max_y, min_x=0, min_y=0):
        return (rnd.randint(min_x, max_x-1), rnd.randint(min_y, max_y-1))
    
    def __get_observation(self, agent):
        nbr = self.__get_other_agents(agent)
        distance_vector = self.__compute_distance_vector(agent, nbr[0])
        obs = {
            "nbr-1": {
                "direction": self.__compute_unit_vector(distance_vector),
                "distance": self.__compute_distance(distance_vector)
            }
        }
        return flatten(self.unflatten_observation_space(agent), obs)

    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def __compute_distance_vector(self, agent1, agent2):
        agent1_pos = self.agents_pos[agent1]
        agent2_pos = self.agents_pos[agent2]
        return (agent1_pos[0]-agent2_pos[0], agent1_pos[1]-agent2_pos[1])

    def __compute_distance(self, distance_vector):
        return math.sqrt(math.pow(distance_vector[0], 2) + math.pow(distance_vector[1], 2))

    def __compute_norm(self, vector):
        return math.sqrt(math.pow(vector[0], 2) + math.pow(vector[1], 2))
    
    def __compute_unit_vector(self, vector):
        norm = self.__compute_norm(vector)
        if norm == 0:
            return [0,0]
        return [vector[0]/norm, vector[1]/norm]

    def __compute_distance_from_closest_neighbours(self, agent):
        obs = [self.__compute_distance_vector(agent, self.__get_other_agents(agent)[0])]
        distance = np.array([abs(self.__compute_distance(distance_vector) - self.target_distance) for distance_vector in obs]).sum()
        return distance

    def __get_local_reward(self, agent, action):
        closest_nbrs = self.__get_n_closest_neighbours(agent, 1)

        newDistance = sum([abs(distance - self.target_distance) for distance in closest_nbrs.values()])
        reward_1 = self.last_step_distances[agent] - newDistance
        self.last_step_distances[agent] = newDistance

        reward_2 = sum([1 if abs(distance - self.target_distance) < 0.5 else 0 for distance in closest_nbrs.values()])

        #reward_3 = - action[2]
        #return -newDistance + reward_2 
        return reward_1 + reward_2 #+ reward_3

    def __get_global_reward(self):
        return 0
    
    def __get_other_agents(self, agent):
        return [other for other in self.agents_ids if other != agent]

    def __get_n_closest_neighbours(self, agent, n=1):
        dst = {other: self.__compute_distance(self.__compute_distance_vector(agent, other)) for other in self.__get_other_agents(agent)}
        return {neighbour[0]: neighbour[1] for neighbour in sorted(list(dst.items()), key=lambda d: d[0])[:n]}

    def __update_agent_position(self, agent, action):
        unit_movement = self.__compute_unit_vector([action[0], action[1]])
        self.agents_pos[agent] = (self.agents_pos[agent][0] + unit_movement[0]*action[2]*self.speed, 
                                 self.agents_pos[agent][1] + unit_movement[1]*action[2]*self.speed)

    def reset(self, seed=None, options=None):
        self.steps = 0
        self.agents_pos = {agent: self.__get_random_point(max_x=self.spawn_area, max_y=self.spawn_area) for agent in self.agents_ids}
        self.last_step_distances = {agent: self.__compute_distance_from_closest_neighbours(agent) for agent in self.agents_ids}
        return {agent: self.__get_observation(agent) for agent in self.agents_ids}, {}
     
    def step(self, actions):
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        for agent, action in actions.items():
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {}

        truncated['__all__'] = False
        if self.max_steps != None and self.steps == self.max_steps:
            terminated['__all__'] = True
        else:
            terminated['__all__'] = False

        return observations, rewards, terminated, truncated, infos
     
    def rgb_to_hex(self, r, g, b):
        return f'#{r:02x}{g:02x}{b:02x}'

    def render(self):
        pass

    def get_agent_ids(self):
       return self.agents


class RenderableKeepTheDistance(KeepTheDistance):
    def render(self):
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)
        
        with hold_canvas():
            agent_size = max(self.CANVAS_WIDTH/float(self.spawn_area),1)
            top_left = (0.0,0.0)
            bottom_right = (self.spawn_area, self.spawn_area)
            self.canvas.clear()

            for agent in self.agents_ids:
                raw_pos = self.agents_pos[agent]
                color = self.agent_colors[agent]
                
                agent_pos_in_frame = [((raw_pos[0]-top_left[0])/(bottom_right[0]-top_left[0]))*self.CANVAS_WIDTH,
                            ((raw_pos[1]-top_left[1])/(bottom_right[1]-top_left[1]))*self.CANVAS_HEIGHT,]

                self.canvas.fill_style = color
                self.canvas.fill_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )
                
                self.canvas.stroke_style = "black"
                self.canvas.stroke_circle(
                    agent_pos_in_frame[0],
                    agent_pos_in_frame[1],
                    agent_size/2.0
                )

In [13]:
import time

env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=100, speed=1, spawn_area=5)
env = RenderableKeepTheDistance(env_config)
obs, _ = env.reset()
env.render()

for i in range(5):
    print(obs)
    #actions = {agent: np.array([rnd.random()*2-1, rnd.random()*2-1, 1.0], np.float32) for agent in obs.keys()}
    actions = {'agent-0': np.array([-1.0, -1.0, 1.0], np.float32),
               'agent-1': np.array([0.0, 0.0, 1], np.float32)}
    #actions = {agent: env.action_space.sample() for agent in obs.keys()}
    obs, reward, _, _, _ = env.step(actions)
    print(actions)
    print(reward, "\n")
    env.render()
    time.sleep(0.3)


CanvasWithBorders(height=300, width=300)

{'agent-0': array([1., 0., 2.], dtype=float32), 'agent-1': array([-1.,  0.,  2.], dtype=float32)}
{'agent-0': array([-1., -1.,  1.], dtype=float32), 'agent-1': array([0., 0., 1.], dtype=float32)}
{'agent-0': 0.5263742417920994, 'agent-1': 0.5263742417920994} 

{'agent-0': array([ 0.8773552, -0.4798415,  1.4736258], dtype=float32), 'agent-1': array([-0.8773552,  0.4798415,  1.4736258], dtype=float32)}
{'agent-0': array([-1., -1.,  1.], dtype=float32), 'agent-1': array([0., 0., 1.], dtype=float32)}
{'agent-0': -0.057107971252458256, 'agent-1': -0.057107971252458256} 

{'agent-0': array([ 0.38268343, -0.9238795 ,  1.5307337 ], dtype=float32), 'agent-1': array([-0.38268343,  0.9238795 ,  1.5307337 ], dtype=float32)}
{'agent-0': array([-1., -1.,  1.], dtype=float32), 'agent-1': array([0., 0., 1.], dtype=float32)}
{'agent-0': -0.5940529953102542, 'agent-1': -0.5940529953102542} 

{'agent-0': array([-0.05709766, -0.9983686 ,  2.1247866 ], dtype=float32), 'agent-1': array([0.05709766, 0.998368

## policy training

In [113]:
import ray
ray.shutdown()

## KeepTheDistance?dst=0&agent=2&spawn_area=100

In [114]:
from ray.tune.registry import register_env

env_config = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=300, speed=1, spawn_area=100)
register_env("KeepTheDistance?dst=0&agent=2&spawn_area=100", lambda _: KeepTheDistance(env_config))

In [None]:
algo = load_algo("KeepTheDistance?dst=0&agent=2&spawn_area=100")

In [115]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

trainin_steps = 30

algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4096, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="KeepTheDistance?dst=0&agent=2&spawn_area=100")
    .build()
)
clear_output()

out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    simulate_episode(RenderableKeepTheDistance(env_config), algo, 50, sleep_between_frames=0.08, print_info=True)

iteration [1] => episode_reward_mean: -4.95664815891495, episode_len_mean: 300.0, agent_steps_trained: 8192, env_steps_trained: 4096, entropy: 4.245799648761749, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 25.736145597755076, episode_len_mean: 300.0, agent_steps_trained: 16384, env_steps_trained: 8192, entropy: 4.2675507590174675, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: 46.87691646133618, episode_len_mean: 300.0, agent_steps_trained: 24576, env_steps_trained: 12288, entropy: 4.28767983019352, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: 63.17620430531192, episode_len_mean: 300.0, agent_steps_trained: 32768, env_steps_trained: 16384, entropy: 4.272764860590299, learning_rate: 0.0010000000000000005
iteration [5] => episode_reward_mean: 83.51466914032467, episode_len_mean: 300.0, agent_steps_trained: 40960, env_steps_trained: 20480, entropy: 4.080347933868567, learning_rate: 0.00100000000000000

CanvasWithBorders(height=300, width=300)

obs:  {'agent-0': array([-0.4472136,  0.8944272, 35.77709  ], dtype=float32), 'agent-1': array([ 0.4472136, -0.8944272, 35.77709  ], dtype=float32)}
action:  {'agent-0': array([ 0.93228984, -0.17839646,  0.80335176], dtype=float32), 'agent-1': array([-1.        ,  0.72599816,  1.        ], dtype=float32)}
reward:  {'agent-0': 1.35772393719202, 'agent-1': 1.35772393719202} 

obs:  {'agent-0': array([-0.41841963,  0.90825385, 34.419365  ], dtype=float32), 'agent-1': array([ 0.41841963, -0.90825385, 34.419365  ], dtype=float32)}
action:  {'agent-0': array([ 0.47593868, -1.        ,  0.8037338 ], dtype=float32), 'agent-1': array([0.15418804, 1.        , 1.        ], dtype=float32)}
reward:  {'agent-0': 1.63307693167814, 'agent-1': 1.63307693167814} 

obs:  {'agent-0': array([-0.43337393,  0.9012142 , 32.786285  ], dtype=float32), 'agent-1': array([ 0.43337393, -0.9012142 , 32.786285  ], dtype=float32)}
action:  {'agent-0': array([ 0.24133325, -1.        ,  1.        ], dtype=float32), 'age

In [116]:
env_config_2 = EnvironmentConfiguration(n_agents=2, target_distance=0, max_steps=500, speed=1, spawn_area=500)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 200, sleep_between_frames=0.03, print_info=False)

CanvasWithBorders(height=300, width=300)

In [117]:
save_algo(algo, "KeepTheDistance?dst=0&agent=2&spawn_area=100")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 5.732406087343891, 'cur_kl_coeff': 0.6750000000000002, 'cur_lr': 0.0010000000000000005, 'total_loss': 2.0669429610628867, 'policy_loss': -0.009641030859590198, 'vf_loss': 2.0618812701043985, 'vf_explained_var': 0.41663303778817257, 'kl': 0.021781805181126438, 'entropy': 3.2973313165207703, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 28320.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}}, 'num_env_steps_sampled': 122880, 'num_env_steps_trained': 122880, 'num_agent_steps_sampled': 245760, 'num_agent_steps_trained': 245760

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/KeepTheDistance?dst=0&agent=2&spawn_area=100), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 5.732406087343891, 'cur_kl_coeff': 0.6750000000000002, 'cur_lr': 0.0010000000000000005, 'total_loss': 2.0669429610628867, 'policy_loss': -0.009641030859590198, 'vf_loss': 2.0618812701043985, 'vf_explained_var': 0.41663303778817257, 'kl': 0.021781805181126438, 'entropy': 3.2973313165207703, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 28320.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}}, 'num_env_steps_sampled': 122880, 'num_env_steps_trained': 122880, 'num_agent_steps_sampled': 245760, 'num_agent_steps_trained': 245760}, 'sampler_results': {'episode_reward_max': 656.34262325767