# Form a Line

One agent is nominated line leader (the design calls for a random choice; the current code always uses `agent-0`), and the other agents have to arrange themselves into a line centered on the leader.

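Observation (for each non-leader agent):
- its position relative to the leader
- its position relative to the leftmost member of the line
- its position relative to the rightmost member of the line

(In the current implementation the leftmost and rightmost entries are zero placeholders; only the leader entry carries information.)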

### utils

In [1]:
from utils.vectors import Vector2D
from utils.canvas import CanvasWithBorders
from utils.algo_utils import (save_algo, load_algo)
from utils.simulations import (simulate_episode, simulate_random_episode, ppo_result_format)

### environment definition

In [2]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas

class EnvironmentConfiguration:
    """Plain container for the settings used to build an environment.

    Attributes:
        n_agents: number of agents in the environment.
        collision_radius: collision radius assigned to every agent.
        spawn_area: value passed as the maximum x/y when spawning agents
            (defaults to 100).
        max_steps: step limit of an episode, or None for no limit.
    """

    def __init__(self, n_agents, collision_radius, spawn_area=100, max_steps=None):
        # Population and agent geometry.
        self.n_agents, self.collision_radius = n_agents, collision_radius
        # World size and episode length.
        self.spawn_area, self.max_steps = spawn_area, max_steps

class FormALine(MultiAgentEnv):
    """Multi-agent environment in which followers align with a leader agent.

    Agents are points on a 2D plane. ``agent-0`` is the leader: ``reset``
    returns observations only for the other agents, and each of them is
    rewarded for reducing the absolute difference between its own y
    coordinate and the leader's.

    Action layout (flattened): ``action[0]`` and ``action[1]`` are the x/y
    components of the movement, ``action[2]`` is the speed that scales them.
    """

    # Drawing surface; never assigned in this class, whose render() does nothing.
    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0
    # Largest |y_agent - y_leader| for which an agent counts as aligned.
    LINE_TOLLERANCE_Y = 1
    # Not referenced anywhere in this class.
    LINE_TOLLERANCE_X = 5

    def __init__(self, config: EnvironmentConfiguration):
        """Copy the settings from ``config`` and build ids, colors and spaces.

        Args:
            config: provides ``n_agents``, ``collision_radius``, ``max_steps``
                and ``spawn_area``.
        """
        self.n_agents = config.n_agents
        self.collision_radius = config.collision_radius
        self.max_steps = config.max_steps
        self.spawn_area = config.spawn_area

        self.agents_ids = ['agent-' + str(i) for i in range(self.n_agents)]
        # One random display color per agent.
        self.agent_colors = {
            agent: self.rgb_to_hex(rnd.randint(0, 255), rnd.randint(0, 255), rnd.randint(0, 255))
            for agent in self.agents_ids
        }
        # From here on the instance attributes hold the space objects and hide
        # the methods of the same name on this instance.
        self.observation_space = self.observation_space('agent-0')
        self.action_space = self.action_space("")

    def reset(self, seed=None, options=None):
        """Start a new episode with randomly placed agents.

        ``seed`` and ``options`` are accepted but not used.

        Returns:
            A tuple ``(observations, infos)``: observations for every agent
            except the leader, and an empty info dict.
        """
        self.steps = 0
        self.agents_pos = {
            agent: Vector2D.get_random_point(max_x=self.spawn_area, max_y=self.spawn_area)
            for agent in self.agents_ids
        }
        self.leader = 'agent-0'
        followers = self.__get_other_agents(self.leader)
        # Vertical distance from the leader, remembered so the next step can
        # reward the improvement.
        leader_y = self.agents_pos[self.leader].y
        self.distance_from_line = {
            agent: abs(leader_y - self.agents_pos[agent].y) for agent in followers
        }

        return {agent: self.__get_observation(agent) for agent in followers}, {}

    def unflatten_observation_space(self, agent):
        """Return the structured (non-flattened) observation space.

        The space has one entry per reference member (``leftmost``, ``leader``,
        ``rightmost``), each holding a ``direction`` and a ``distance`` box.
        ``agent`` is not used: the space is the same for every agent.
        """
        direction = Box(low=-1, high=1, shape=(2, 1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1, 1), dtype=np.float32)
        return Dict({
            member: Dict({'direction': direction, 'distance': distance})
            for member in ['leftmost', 'leader', 'rightmost']
        })

    def observation_space(self, agent):
        """Return the flattened observation space."""
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        """Return the flattened action space: a 2D direction followed by a speed."""
        direction = Box(low=-1.0, high=1.0, shape=(2, 1), dtype=np.float32)
        speed = Box(0.0, 1.0, dtype=np.float32)
        return flatten_space(Tuple([direction, speed]))

    def __get_observation(self, agent):
        """Build the flattened observation of ``agent``.

        Only the ``leader`` entry is filled in (unit direction and
        ``log(1 + distance)``); ``leftmost`` and ``rightmost`` are zeros.
        """
        leader_distance_vector = Vector2D.distance_vector(self.agents_pos[self.leader], self.agents_pos[agent])
        obs = {
            'leftmost': {"direction": Vector2D(0, 0).to_np_array(), "distance": 0},
            'rightmost': {"direction": Vector2D(0, 0).to_np_array(), "distance": 0},
            'leader': {
                "direction": Vector2D.unit_vector(leader_distance_vector).to_np_array(),
                "distance": np.log(1 + Vector2D.norm(leader_distance_vector)),
            },
        }

        return flatten(self.unflatten_observation_space(agent), obs)

    def __get_other_agents(self, agent):
        """Return the ids of every agent except ``agent``."""
        return [other for other in self.agents_ids if other != agent]

    def __is_aligned_with_the_leader(self, agent):
        """Tell whether ``agent`` is within ``LINE_TOLLERANCE_Y`` of the leader's y."""
        leader_pos = self.agents_pos[self.leader]
        agent_pos = self.agents_pos[agent]
        return abs(agent_pos.y - leader_pos.y) <= self.LINE_TOLLERANCE_Y

    def __get_local_reward(self, agent, action):
        """Compute the reward of ``agent`` and refresh its stored line distance.

        The returned value is ``reward_0 + reward_1 + reward_2 / 10 + reward_5``;
        terms 3, 4 and 6 are computed but are not part of the sum.
        """
        # reward 0: how much the agent reduced its y distance from the leader
        old_distance_from_line = self.distance_from_line[agent]
        new_distance_from_line = abs(self.agents_pos[agent].y - self.agents_pos[self.leader].y)
        reward_0 = old_distance_from_line - new_distance_from_line
        self.distance_from_line[agent] = new_distance_from_line

        # reward 1: bonus if aligned with the leader (linear)
        reward_1 = max(0, 1 - new_distance_from_line)

        # reward 2: penalize the movement
        reward_2 = -action[2]

        # reward 3 (inactive): penalize the movement if the agent is very close to the perfect y
        reward_3 = -action[2] if old_distance_from_line <= 0.5 else 0

        # reward 4 (inactive): bonus if the agent is exactly on the perfect y
        reward_4 = new_distance_from_line == 0

        # reward 5: bonus if aligned with the leader (exponential)
        reward_5 = pow(4, -new_distance_from_line)

        # reward 6 (inactive): number of agents within collision range.
        # NOTE(review): this is a positive count; negate it before adding it
        # to the sum if it is meant to punish collisions.
        reward_6 = sum([1
                    if Vector2D.distance(self.agents_pos[agent], self.agents_pos[other]) <= 2*self.collision_radius
                    else 0
                    for other in self.__get_other_agents(agent)])

        return reward_0 + reward_1 + reward_2/10.0 + reward_5

    def __get_global_reward(self):
        """Return the reward shared by all agents (currently always 0)."""
        return 0

    def __update_agent_position(self, agent, action):
        """Move ``agent`` by ``(action[0], action[1])`` scaled by ``action[2]``."""
        unit_movement = Vector2D(action[0], action[1])
        self.agents_pos[agent] = Vector2D.sum(self.agents_pos[agent], Vector2D.mul(unit_movement, action[2]))

    def step(self, actions):
        """Apply ``actions`` and return the per-agent results.

        All positions are updated first, then observations and rewards are
        computed, so every agent sees the others' new positions.

        Args:
            actions: mapping from agent id to its flattened action.

        Returns:
            ``(observations, rewards, terminated, truncated, infos)`` keyed by
            agent id; ``terminated`` and ``truncated`` also carry ``'__all__'``.
        """
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        for agent, action in actions.items():
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = {"dst": self.distance_from_line[agent]}

        # `>=` keeps the episode finished even if step() is called again after
        # the limit was reached.
        # NOTE(review): a step limit is usually reported through `truncated`;
        # it is kept in `terminated` here so training behavior does not change.
        reached_step_limit = self.max_steps is not None and self.steps >= self.max_steps
        truncated['__all__'] = False
        terminated['__all__'] = reached_step_limit

        return observations, rewards, terminated, truncated, infos

    def rgb_to_hex(self, r, g, b):
        """Format three 0-255 integers as a ``#rrggbb`` color string."""
        return f'#{r:02x}{g:02x}{b:02x}'

    def render(self):
        """Do nothing; subclasses may override this to draw the agents."""
        pass

    def get_agent_ids(self):
        """Return the ids of all agents in this environment as a set."""
        # The original returned `self.agents`, which this class never assigns.
        return set(self.agents_ids)
    

class RenderableFormALine(FormALine):
    """FormALine variant whose ``render`` draws the agents on a canvas."""

    def render(self):
        """Draw one frame: a filled, outlined dot per agent plus its collision ring.

        The canvas is created and displayed on the first call. World
        coordinates in ``[0, spawn_area]`` are mapped linearly onto the
        canvas size.
        """
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)

        with hold_canvas():
            # Sizes are derived from the horizontal scale only.
            pixels_per_unit = self.CANVAS_WIDTH/float(self.spawn_area)
            dot_radius = max(pixels_per_unit, 1)/2.0
            ring_radius = pixels_per_unit*self.collision_radius
            min_x, min_y = 0.0, 0.0
            max_x, max_y = self.spawn_area, self.spawn_area
            self.canvas.clear()

            # Outlines drawn after the filled dot, in order: the black border
            # and, when a collision radius is set, the red collision ring.
            outlines = [("black", dot_radius)]
            if self.collision_radius > 0:
                outlines.append(("red", ring_radius))

            for agent in self.agents_ids:
                world_pos = self.agents_pos[agent].to_np_array()
                frame_x = ((world_pos[0]-min_x)/(max_x-min_x))*self.CANVAS_WIDTH
                frame_y = ((world_pos[1]-min_y)/(max_y-min_y))*self.CANVAS_HEIGHT

                self.canvas.fill_style = self.agent_colors[agent]
                self.canvas.fill_circle(frame_x, frame_y, dot_radius)

                for stroke_color, radius in outlines:
                    self.canvas.stroke_style = stroke_color
                    self.canvas.stroke_circle(frame_x, frame_y, radius)

In [4]:
# Smoke test: build a 10-agent renderable environment and run it through
# simulate_random_episode (presumably driven by random actions — see utils.simulations).
env_config = EnvironmentConfiguration(
    n_agents=10,
    collision_radius=5,
    spawn_area=100,
    max_steps=500,
)
env = RenderableFormALine(env_config)

env.reset()
#env.render()
simulate_random_episode(env, 100, sleep_between_frames=0.1, print_info=False)
#env.step({'agent-1': (1,1,1)})

CanvasWithBorders(height=300, width=300)

## policy training

In [3]:
import ray
# Shut down Ray; presumably this clears a runtime left over from earlier cells — confirm.
ray.shutdown()

## FormALine_align_y

Align the agents to the same y coordinate as the leader, without caring about their x coordinate.

In [3]:
from ray.tune.registry import register_env

# Two-agent setup: with agent-0 as leader this leaves a single follower to train.
env_config = EnvironmentConfiguration(
    n_agents=2,
    collision_radius=5,
    spawn_area=100,
    max_steps=500,
)
# Register the environment under the name the PPO config refers to.
register_env("FormALine_align_y", lambda _ctx: FormALine(env_config))

In [None]:
# Optional: load the algorithm stored under "FormALine_align_y" instead of training it
# (presumably the one written by save_algo — see utils.algo_utils).
algo = load_algo("FormALine_align_y")

In [67]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

# Number of training iterations run by the loop below.
trainin_steps = 90

# PPO trainer for the registered "FormALine_align_y" environment:
# one env runner, no GPU.
algo = (
    PPOConfig()
    .training(
        gamma=0.95,
        lr=0.001,
        train_batch_size=4096,
        sgd_minibatch_size=256,
        num_sgd_iter=30,
        #entropy_coeff=0.005,
    )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="FormALine_align_y")
    .build()
)
clear_output()

# `out` accumulates one formatted line per iteration; the whole history is
# re-printed after each clear so earlier iterations stay visible.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out = "".join((out, ppo_result_format(result), "\n"))
    print(out)
    # Show the current policy on a fresh renderable environment.
    simulate_episode(
        RenderableFormALine(env_config),
        algo,
        200,
        print_info=True,
        sleep_between_frames=0.01,
    )

iteration [61] => episode_reward_mean: 807.2520174006892, episode_len_mean: 500.0, agent_steps_trained: 249856, env_steps_trained: 249856, entropy: 2.915201820929845, learning_rate: 0.0010000000000000005
iteration [62] => episode_reward_mean: 806.9810169364544, episode_len_mean: 500.0, agent_steps_trained: 253952, env_steps_trained: 253952, entropy: 3.2331853161255517, learning_rate: 0.0010000000000000005
iteration [63] => episode_reward_mean: 810.176767425222, episode_len_mean: 500.0, agent_steps_trained: 258048, env_steps_trained: 258048, entropy: 2.9921262289086976, learning_rate: 0.0010000000000000005
iteration [64] => episode_reward_mean: 813.4085314334421, episode_len_mean: 500.0, agent_steps_trained: 262144, env_steps_trained: 262144, entropy: 2.8091236571470897, learning_rate: 0.0010000000000000005
iteration [65] => episode_reward_mean: 820.8583668698349, episode_len_mean: 500.0, agent_steps_trained: 266240, env_steps_trained: 266240, entropy: 2.999361576139927, learning_rate: 

CanvasWithBorders(height=300, width=300)

info:  {'agent-1': {'dst': 1.0}}
info:  {'agent-1': {'dst': 0.5948020815849304}}
info:  {'agent-1': {'dst': 0.03444129228591919}}
info:  {'agent-1': {'dst': 0.00013205409049987793}}
info:  {'agent-1': {'dst': 0.00013205409049987793}}
info:  {'agent-1': {'dst': 0.039071228355169296}}
info:  {'agent-1': {'dst': 0.039071228355169296}}
info:  {'agent-1': {'dst': 0.5399658419191837}}
info:  {'agent-1': {'dst': 0.24049431458115578}}
info:  {'agent-1': {'dst': 0.24049431458115578}}
info:  {'agent-1': {'dst': 0.24049431458115578}}
info:  {'agent-1': {'dst': 0.19005734100937843}}
info:  {'agent-1': {'dst': 0.19446153193712234}}
info:  {'agent-1': {'dst': 0.19446153193712234}}
info:  {'agent-1': {'dst': 0.14852000027894974}}
info:  {'agent-1': {'dst': 0.14852000027894974}}
info:  {'agent-1': {'dst': 0.14852000027894974}}
info:  {'agent-1': {'dst': 0.14852000027894974}}
info:  {'agent-1': {'dst': 0.1341201588511467}}
info:  {'agent-1': {'dst': 0.1341201588511467}}
info:  {'agent-1': {'dst': 0.865

In [75]:
# Run the trained algorithm on a larger, 10-agent environment.
env_config_2 = EnvironmentConfiguration(
    n_agents=10,
    collision_radius=5,
    spawn_area=100,
    max_steps=500,
)
simulate_episode(
    RenderableFormALine(env_config_2),
    algo,
    300,
    print_info=False,
    sleep_between_frames=0.01,
)

CanvasWithBorders(height=300, width=300)

In [71]:
# Save the trained algorithm under the name "FormALine_align_y"
# (the captured output reports that a checkpoint directory is created).
save_algo(algo, "FormALine_align_y")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/FormALine_align_y), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 11.9410849476854, 'cur_kl_coeff': 2.1624389648437505, 'cur_lr': 0.0010000000000000005, 'total_loss': 2.2055642472124117, 'policy_loss': -0.0021745497370526815, 'vf_loss': 2.177628695395106, 'vf_explained_var': 0.2545866960038741, 'kl': 0.013924137522894852, 'entropy': 3.4696590453386307, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 42960.5, 'diff_num_grad_updates_vs_sampler_policy': 239.5}}, 'num_env_steps_sampled': 368640, 'num_env_steps_trained': 368640, 'num_agent_steps_sampled': 368640, 'num_agent_steps_trained': 368640}, 'sampler_results': {'episo

## FormALine_align_x

In [7]:
# Load the algorithm stored under "FormALine_align_y"; its weights are copied
# into the new trainer built for the align_x stage.
base_algo = load_algo("FormALine_align_y")



In [8]:
from ray.tune.registry import register_env
from ray.rllib.algorithms.ppo import PPOConfig

# Five-agent setup for the second training stage.
env_config = EnvironmentConfiguration(
    n_agents=5,
    collision_radius=5,
    spawn_area=100,
    max_steps=500,
)
register_env("FormALine_align_x", lambda _ctx: FormALine(env_config))

# NOTE(review): train_batch_size is 4092 here but 4096 in the align_y stage,
# and 4092 is not a multiple of the 128-sample minibatch — confirm it is intended.
algo = (
    PPOConfig()
    .training(
        gamma=0.95,
        lr=0.001,
        train_batch_size=4092,
        sgd_minibatch_size=128,
        num_sgd_iter=30,
        #entropy_coeff=0.005,
    )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="FormALine_align_x")
    .build()
)
clear_output()
# Start from the weights of the algorithm loaded into `base_algo`.
algo.set_weights(base_algo.get_weights())

In [None]:
# Optional: load the algorithm stored under "FormALine_align_x" instead of building it
# (presumably one written earlier by save_algo — see utils.algo_utils).
algo = load_algo("FormALine_align_x")

In [9]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

# Number of training iterations run by the loop below.
trainin_steps = 30

# `out` accumulates one formatted line per iteration; the whole history is
# re-printed after each clear so earlier iterations stay visible.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out = "".join((out, ppo_result_format(result), "\n"))
    print(out)
    # Show the current policy on a fresh renderable environment.
    simulate_episode(
        RenderableFormALine(env_config),
        algo,
        200,
        print_info=True,
        sleep_between_frames=0.01,
    )

iteration [1] => episode_reward_mean: 318.20808245314856, episode_len_mean: 500.0, agent_steps_trained: 16368, env_steps_trained: 4092, entropy: 4.285577949513914, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 493.8949219933791, episode_len_mean: 500.0, agent_steps_trained: 32736, env_steps_trained: 8184, entropy: 4.269192019785483, learning_rate: 0.0010000000000000005



CanvasWithBorders(height=300, width=300)

info:  {'agent-1': {'dst': 26.148744836449623}, 'agent-2': {'dst': 54.70322436094284}, 'agent-3': {'dst': 29.0}, 'agent-4': {'dst': 25.584942162036896}}
info:  {'agent-1': {'dst': 26.050019294023514}, 'agent-2': {'dst': 53.812951028347015}, 'agent-3': {'dst': 29.01897190697491}, 'agent-4': {'dst': 25.397972285747528}}
info:  {'agent-1': {'dst': 25.078371554613113}, 'agent-2': {'dst': 53.812951028347015}, 'agent-3': {'dst': 29.01897190697491}, 'agent-4': {'dst': 24.609527230262756}}
info:  {'agent-1': {'dst': 25.078371554613113}, 'agent-2': {'dst': 53.79117308743298}, 'agent-3': {'dst': 28.682591186836362}, 'agent-4': {'dst': 24.609527230262756}}
info:  {'agent-1': {'dst': 24.997239738702774}, 'agent-2': {'dst': 53.79117308743298}, 'agent-3': {'dst': 28.26979743130505}, 'agent-4': {'dst': 24.609527230262756}}
info:  {'agent-1': {'dst': 24.969401609152555}, 'agent-2': {'dst': 53.894499169662595}, 'agent-3': {'dst': 27.6284326184541}, 'agent-4': {'dst': 25.24173927307129}}
info:  {'agent-

KeyboardInterrupt: 