## Ring Formation

### utils

In [72]:
from utils.vectors import Vector2D
from utils.canvas import CanvasWithBorders
from utils.algo_utils import (save_algo, load_algo)
from utils.simulations import (simulate_episode, simulate_random_episode, ppo_result_format, sac_result_format)

### environment

In [182]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas
from enum import Enum

class EnvironmentConfiguration:
    """Plain container for the settings used to build a ring-formation environment.

    Attributes:
        slave_agents: number of agents (affects the observation space).
        cache_size: number of past observations kept per agent (affects the
            observation space).
        ring_radius: radius of the target ring (affects the agents' behaviour).
        spawn_area: size of the spawn area (shouldn't affect the agents'
            behaviour).
        max_steps: episode length limit, or ``None`` for no limit (shouldn't
            affect the agents' behaviour).
    """

    def __init__(self, slave_agents, ring_radius, spawn_area=100, max_steps=None, cache_size=1):
        # Parameters that affect the observation space.
        self.slave_agents, self.cache_size = slave_agents, cache_size
        # Parameter that affects the agents' behaviour.
        self.ring_radius = ring_radius
        # Parameters that shouldn't affect the agents' behaviour.
        self.spawn_area, self.max_steps = spawn_area, max_steps

class RingFormation(MultiAgentEnv):
    """Multi-agent environment in which agents are rewarded for reaching a ring.

    Agents are named ``agent-0`` .. ``agent-(N-1)``; the "previous" and "next"
    neighbour of an agent are the ones with index -1 / +1 modulo N.

    Observation (per agent, flattened): for each of the last ``cache_size``
    time steps, a unit direction and a log-scaled distance relative to the
    previous neighbour, the next neighbour and the ring centre.

    Action (per agent, flattened): ``[dir_x, dir_y, speed]``. The direction is
    normalised before use, so only its orientation matters.

    The reward depends on ``current_task`` (see ``Task`` and ``set_task``).
    """

    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0

    Task = Enum('Task', ['DISTANCE_FROM_CENTER', 'DISTANCE_FROM_NEIGHBOURS'])

    def __init__(self, config: EnvironmentConfiguration):
        """Build the environment from ``config``; at least 3 agents are required."""
        # NOTE(review): MultiAgentEnv.__init__ is not called here. Adding the
        # call may change how RLlib pre-checks this environment, so it is left
        # as-is -- confirm whether the omission is intentional.
        assert config.slave_agents >= 3

        self.current_task = self.Task.DISTANCE_FROM_CENTER

        # parameters that shouldn't affect the agents' behaviour
        self.spawn_area = config.spawn_area
        self.max_steps = config.max_steps
        # parameters that affect the agents' behaviour
        self.ring_radius = config.ring_radius
        # parameters that affect the observation space
        self.slave_agents = config.slave_agents
        self.cache_size = config.cache_size

        self.agent_ids = [f"agent-{i}" for i in range(self.slave_agents)]
        self.agent_colors = {
            agent: self.rgb_to_hex(rnd.randint(0, 255), rnd.randint(0, 255), rnd.randint(0, 255))
            for agent in self.agent_ids
        }

        # Chord length between adjacent vertices of a regular N-gon inscribed
        # in the ring: the neighbour distance of an evenly spaced formation.
        self.ideal_distance = 2 * self.ring_radius * math.sin(np.pi/self.slave_agents)

        # From here on the instance attributes hold the (shared) spaces and
        # shadow the methods of the same name defined below.
        self.observation_space = self.observation_space('agent-0')
        self.action_space = self.action_space("")

    def unflatten_observation_space(self, agent):
        """Return the structured (non-flattened) observation space.

        One entry ``t[-k]`` per cached time step, each holding direction and
        distance sub-spaces for ``prev``, ``next`` and ``center``. ``agent`` is
        unused: every agent shares the same space.
        """
        direction = Box(low=-1, high=1, shape=(2,1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1,1), dtype=np.float32)

        time_t_obs = Dict({"prev": Dict({'direction': direction, 'distance': distance}),
                           "next": Dict({'direction': direction, 'distance': distance}),
                           "center": Dict({'direction': direction, 'distance': distance})})

        return Dict({f"t[-{t}]": time_t_obs for t in range(0, self.cache_size)})

    def observation_space(self, agent):
        """Return the flattened observation space (same for every agent)."""
        return flatten_space(self.unflatten_observation_space(agent))

    def action_space(self, agent):
        """Return the flattened action space: a 2D direction followed by a speed in [0, 1]."""
        direction = Box(low=-1.0, high=1.0, shape=(2,1), dtype=np.float32)
        speed = Box(0.0, 1.0, dtype=np.float32)
        return flatten_space(Tuple([direction, speed]))

    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observations, infos)``.

        Agent positions and the ring centre are drawn with
        ``Vector2D.get_random_point``. ``seed`` and ``options`` are accepted
        for API compatibility but are not used.
        """
        self.steps = 0
        self.agent_pos = {agent: Vector2D.get_random_point(max_x=self.spawn_area, max_y=self.spawn_area) for agent in self.agent_ids}
        self.agent_old_pos = dict(self.agent_pos)

        self.old_distance_from_nbrs = {agent: self.__get_distance_from_nbrs(agent) for agent in self.agent_ids}

        # Random point drawn with maxima of 2/3 * spawn_area, shifted by
        # 1/6 * spawn_area on both axes -- presumably to keep the centre away
        # from the borders of the spawn area (depends on get_random_point).
        self.center_pos = Vector2D.sum(
            Vector2D.get_random_point(max_x=int(self.spawn_area * 2.0/3), max_y=int(self.spawn_area * 2.0/3)),
            Vector2D(self.spawn_area * 1.0/6, self.spawn_area * 1.0/6)
        )
        self.info = {agent: {} for agent in self.agent_ids}
        self.observation_cache = {agent: [] for agent in self.agent_ids}
        return {agent: self.__get_observation(agent) for agent in self.agent_ids}, {}

    def __update_agent_position(self, agent, action):
        """Move ``agent`` by ``speed`` along the normalised action direction."""
        speed = action[2]
        unit_movement = Vector2D.unit_vector(Vector2D(action[0], action[1]))
        self.agent_old_pos[agent] = self.agent_pos[agent]
        self.agent_pos[agent] = Vector2D.sum(self.agent_pos[agent], Vector2D.mul(unit_movement, speed))

    def __relative_observation(self, distance_vector):
        """Encode a distance vector as a unit direction plus ``log(1 + |norm|)``."""
        return {
            "direction": Vector2D.unit_vector(distance_vector).to_np_array(),
            "distance": np.log(1 + abs(Vector2D.norm(distance_vector))),
        }

    def __get_time_t_observation(self, agent):
        """Build the current single-step observation of ``agent``.

        Neighbours are observed at their *old* position (the one they had
        before the latest move), the centre at its actual position.
        """
        prev_obs = self.__relative_observation(
            Vector2D.distance_vector(self.agent_pos[agent], self.agent_old_pos[self.__prev_agent(agent)])
        )
        next_obs = self.__relative_observation(
            Vector2D.distance_vector(self.agent_pos[agent], self.agent_old_pos[self.__next_agent(agent)])
        )
        center_obs = self.__relative_observation(
            Vector2D.distance_vector(self.agent_pos[agent], self.center_pos)
        )
        return {"prev": prev_obs, "next": next_obs, "center": center_obs}

    def __get_observation(self, agent):
        """Push the current observation into the agent's cache and return the flattened history."""
        current = self.__get_time_t_observation(agent)
        if not self.observation_cache[agent]:
            # First observation of the episode: fill the whole history with it.
            self.observation_cache[agent] = [current]*self.cache_size
        else:
            # Newest first; drop the oldest entry to keep cache_size entries.
            self.observation_cache[agent] = [current] + self.observation_cache[agent]
            self.observation_cache[agent].pop()

        obs = {
            f"t[-{t}]": self.observation_cache[agent][t]
            for t in range(0, self.cache_size)
        }

        return flatten(self.unflatten_observation_space(agent), obs)

    def rgb_to_hex(self, r, g, b):
        """Format three integer colour components as a ``#rrggbb`` string."""
        return f'#{r:02x}{g:02x}{b:02x}'

    def __get_distance_from_nbrs(self, agent):
        """Return the distances from ``agent`` to its previous and next neighbour."""
        return [Vector2D.distance(self.agent_pos[agent], self.agent_pos[nbr]) for nbr in [self.__prev_agent(agent), self.__next_agent(agent)]]

    def __get_local_reward(self, agent, action):
        """Compute the reward of ``agent`` for the latest move and update its info dict.

        Terms in use:
          * progress: how much closer to the ring the agent got this step;
          * proximity bonus: grows linearly from 0 (at 0.05 from the ring) to
            10 (exactly on the ring);
          * neighbour spacing (``DISTANCE_FROM_NEIGHBOURS`` task only): minus
            the mean absolute deviation of the two neighbour distances from
            ``ideal_distance``.

        ``action`` is currently unused (kept for the disabled movement penalty).
        """
        old_distance_from_target = abs(Vector2D.distance(self.agent_old_pos[agent], self.center_pos) - self.ring_radius)
        distance_from_target = abs(Vector2D.distance(self.agent_pos[agent], self.center_pos) - self.ring_radius)

        # bonus if the agent moves toward the ring, malus otherwise
        progress_reward = old_distance_from_target - distance_from_target

        # bonus if very close to the ring
        proximity_reward = max(20*(0.05-distance_from_target), 0) * 10

        # Shaping terms that were tried and are currently disabled:
        #   collision malus: -1 for every other agent closer than 0.5
        #   movement penalty: -action[2]
        #   off-target malus: -1 whenever the proximity bonus is 0
        #   on-target bonus: 100 if distance_from_target <= 0.05
        #   off-position malus: -100 if distance_from_target > 0.5
        #   leaving-target malus: -10 if the agent was within 0.05 and no longer is

        distance_from_nbrs = self.__get_distance_from_nbrs(agent)
        self.old_distance_from_nbrs[agent] = distance_from_nbrs

        # update info
        self.info[agent]["distance_from_target"] = distance_from_target
        self.info[agent]["steps_at_ideal_distance"] = proximity_reward > 0

        if self.current_task == self.Task.DISTANCE_FROM_CENTER:
            return progress_reward + proximity_reward

        spacing_reward = -np.mean([abs(dst - self.ideal_distance) for dst in distance_from_nbrs])
        return progress_reward + proximity_reward + spacing_reward

    def __get_global_reward(self):
        """Return the reward shared by all agents (currently always 0)."""
        return 0

    def __get_other_agents(self, agent):
        """Return the ids of every agent except ``agent``."""
        return [other for other in self.agent_ids if other != agent]

    def __prev_agent(self, agent):
        """Return the id of the agent with index - 1 (wrapping around)."""
        agent_id = int(agent.split("-")[1])
        return f"agent-{(agent_id-1) % self.slave_agents}"

    def __next_agent(self, agent):
        """Return the id of the agent with index + 1 (wrapping around)."""
        agent_id = int(agent.split("-")[1])
        return f"agent-{(agent_id+1) % self.slave_agents}"

    def step(self, actions):
        """Apply ``actions`` and return ``(observations, rewards, terminated, truncated, infos)``.

        Only the agents present in ``actions`` are moved and get an
        observation, reward and info entry. All moves are applied before any
        observation or reward is computed. When ``max_steps`` is set and
        reached, every agent (and ``__all__``) is flagged as terminated;
        ``truncated`` is always False.
        """
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        for agent, action in actions.items():
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = self.info[agent]

        truncated['__all__'] = False
        # `>=` rather than `==` so an episode stepped past its limit still ends.
        episode_over = self.max_steps is not None and self.steps >= self.max_steps
        terminated['__all__'] = episode_over
        if episode_over:
            for agent in self.agent_ids:
                terminated[agent] = True

        return observations, rewards, terminated, truncated, infos

    def render(self):
        """No-op; see ``RenderableRingFormation`` for a drawing implementation."""
        pass

    def get_agent_ids(self) -> Set[str]:
        """Return the ids of all agents in this environment."""
        # The ids are stored in `agent_ids`; this class never sets `agents`.
        return set(self.agent_ids)

    def set_task(self, task):
        """Select which ``Task`` the local reward is computed for."""
        self.current_task = task


class RenderableRingFormation(RingFormation):
    """RingFormation variant whose ``render`` draws the centre, the ring and the agents on a canvas."""

    def __draw_marker(self, position, fill, radius):
        """Draw a filled disc with a black outline at ``position``; return its canvas coordinates."""
        origin, extent = (0.0,0.0), (self.spawn_area, self.spawn_area)
        world = position.to_np_array()
        # Rescale from the spawn area to canvas coordinates.
        x = ((world[0]-origin[0])/(extent[0]-origin[0]))*self.CANVAS_WIDTH
        y = ((world[1]-origin[1])/(extent[1]-origin[1]))*self.CANVAS_HEIGHT

        self.canvas.fill_style = fill
        self.canvas.fill_circle(x, y, radius)

        self.canvas.stroke_style = "black"
        self.canvas.stroke_circle(x, y, radius)
        return x, y

    def render(self):
        """Redraw the whole scene, creating and displaying the canvas on first use."""
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)

        with hold_canvas():
            # Canvas units per world unit; markers are at least 1 unit wide.
            unit = self.CANVAS_WIDTH/float(self.spawn_area)
            marker_radius = max(unit,1)/2.0
            self.canvas.clear()

            # Ring centre (red disc) and the target ring around it.
            center_x, center_y = self.__draw_marker(self.center_pos, "red", marker_radius)
            self.canvas.stroke_style = "red"
            self.canvas.stroke_circle(center_x, center_y, self.ring_radius*unit)

            # One disc per agent, in the agent's own colour.
            for agent in self.agent_ids:
                self.__draw_marker(self.agent_pos[agent], self.agent_colors[agent], marker_radius)

In [107]:
from time import sleep
# Manual smoke test: 3 agents, ring radius 10, spawn area 50, no step limit.
config = EnvironmentConfiguration(3, 10, 50)
env = RenderableRingFormation(config)
env.reset()

for i in range(0,50):
    # Only agent-0 is given an action (direction (-1, -1), speed 1), so only
    # agent-0 is moved and only agent-0 appears in the returned dicts.
    obs, rew, _, _, info = env.step({"agent-0": [-1,-1,1]})
    #print(info["agent-0"])
    # Absolute gap between agent-0 and the ring, followed by its reward.
    print(info["agent-0"]["distance_from_target"])
    print(rew, "\n")    
    env.render()
    sleep(0.1)

20.17625392897625
{'agent-0': 0.9828360591618583} 



CanvasWithBorders(height=300, width=300)

19.19458363264016
{'agent-0': 0.981670296336091} 

18.214200782237064
{'agent-0': 0.980382850403096} 

17.23524441009393
{'agent-0': 0.9789563721431342} 

16.25787406426344
{'agent-0': 0.9773703458304901} 

15.282273684775358
{'agent-0': 0.9756003794880819} 

14.30865637934459
{'agent-0': 0.9736173054307677} 

13.337270347369603
{'agent-0': 0.971386031974987} 

12.368406281257109
{'agent-0': 0.968864066112495} 

11.402406683841619
{'agent-0': 0.9659995974154896} 

10.439677692119176
{'agent-0': 0.9627289917224431} 

9.480704208392474
{'agent-0': 0.9589734837267017} 

8.526069436122874
{'agent-0': 0.9546347722695998} 

7.576480337348887
{'agent-0': 0.9495890987739877} 

6.632801127482345
{'agent-0': 0.9436792098665414} 

5.6960977839571285
{'agent-0': 0.9367033435252168} 

4.767697787424746
{'agent-0': 0.9283999965323826} 

3.8492711085412576
{'agent-0': 0.9184266788834883} 

2.942941031119684
{'agent-0': 0.9063300774215737} 

2.051437044278467
{'agent-0': 0.8915039868412169} 

1.178306

KeyboardInterrupt: 

In [None]:
# Inspect the initial state: 3 agents, ring radius 10, spawn area 100.
config = EnvironmentConfiguration(3, 10, 100)
env = RenderableRingFormation(config)
# reset() returns (flattened observations per agent, empty info dict).
print(env.reset())
env.render()
#simulate_random_episode(env, 100, 0.03, print_info=False)

({'agent-0': array([ 0.7677255 ,  0.6407788 ,  4.0278707 ,  0.99763036,  0.06880209,
        3.4034908 ,  0.8849182 , -0.46574643,  3.1122217 ], dtype=float32), 'agent-1': array([ 0.37139067,  0.9284767 ,  3.6082413 , -0.6401844 , -0.76822126,
        2.8106368 , -0.99763036, -0.06880209,  3.4034908 ], dtype=float32), 'agent-2': array([ 0.45764342,  0.88913584,  3.9509714 , -0.8849182 ,  0.46574643,
        3.1122217 ,  0.6401844 ,  0.76822126,  2.8106368 ], dtype=float32)}, {})


CanvasWithBorders(height=300, width=300)

## Policy training

In [176]:
import ray
# Shut Ray down before training -- presumably to discard any runtime left
# over from earlier cells.
ray.shutdown()

### 3 agents, PPO

In [177]:
from ray.tune.registry import register_env

# PPO setup: 3 agents, ring radius 10, spawn area 50, episodes end after 300
# steps, observations hold the last 2 time steps.
config = EnvironmentConfiguration(3, 10, 50, max_steps=300, cache_size=2)
env = RingFormation(config)
# Register the (non-rendering) environment under the name used by the PPO config.
register_env("ring_formation?agents=3&algo=PPO", lambda _: RingFormation(config))

In [178]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from gymnasium.wrappers.time_limit import TimeLimit

# Number of algo.train() iterations to run in this cell.
trainin_steps = 50

# Build a PPO algorithm on the environment registered as
# "ring_formation?agents=3&algo=PPO"; CPU only (num_gpus=0).
algo = (
    PPOConfig()
    .training(gamma = 0.925, 
            lr = 0.0005,
            train_batch_size = 4096*2, 
            sgd_minibatch_size = 256, 
            num_sgd_iter = 30,
            #entropy_coeff=0.005,
    )
    #.env_runners(num_env_runners=4)
    .resources(num_gpus=0)
    .environment(env="ring_formation?agents=3&algo=PPO")
    #.callbacks(CustomMetricsCallback)
    .build()
)
clear_output()

# Accumulated training log: reprinted in full after every iteration because
# the cell output is cleared each time.
out = ""
for i in range(trainin_steps):
    # The string literal below is a no-op statement: it holds a disabled
    # in-loop switch to the DISTANCE_FROM_NEIGHBOURS task.
    """
    if i == 30:
        out += "updating task to DISTANCE_FROM_NEIGHBOURS\n"
        algo.workers.foreach_worker(
            lambda ev: ev.foreach_env(
                lambda env: env.set_task(RingFormation.Task.DISTANCE_FROM_NEIGHBOURS)))
    """
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    # Replay the current policy on a fresh renderable environment
    # (200 is presumably the number of simulated steps -- confirm in utils.simulations).
    simulate_episode(RenderableRingFormation(config), algo, 200, sleep_between_frames=0.03, print_reward=True, print_action=True)

iteration [1] => episode_reward_mean: 8.086963805695733, episode_len_mean: 300.0, agent_steps_trained: 24576, env_steps_trained: 8192, entropy: 4.208974745372931, learning_rate: 0.0005000000000000001
iteration [2] => episode_reward_mean: 26.956670970055274, episode_len_mean: 300.0, agent_steps_trained: 49152, env_steps_trained: 16384, entropy: 4.1447681092553665, learning_rate: 0.0005000000000000001
iteration [3] => episode_reward_mean: 38.74642361237552, episode_len_mean: 300.0, agent_steps_trained: 73728, env_steps_trained: 24576, entropy: 4.080778366492854, learning_rate: 0.0005000000000000001
iteration [4] => episode_reward_mean: 51.17005199772458, episode_len_mean: 300.0, agent_steps_trained: 98304, env_steps_trained: 32768, entropy: 4.020288323362668, learning_rate: 0.0005000000000000001
iteration [5] => episode_reward_mean: 77.09068909403591, episode_len_mean: 300.0, agent_steps_trained: 122880, env_steps_trained: 40960, entropy: 3.988818285614252, learning_rate: 0.0005000000000

CanvasWithBorders(height=300, width=300)

action:  {'agent-0': array([ 1.       , -1.       ,  0.8208726], dtype=float32), 'agent-1': array([-0.67236066,  0.25055313,  0.24013585], dtype=float32), 'agent-2': array([0.25559998, 1.        , 1.        ], dtype=float32)}
reward:  {'agent-0': 0.8201755031686915, 'agent-1': 0.18025747589430097, 'agent-2': 0.9696277046904154} 

action:  {'agent-0': array([ 1., -1.,  1.], dtype=float32), 'agent-1': array([-1.        ,  1.        ,  0.25777608], dtype=float32), 'agent-2': array([0.28699255, 1.        , 0.49432746], dtype=float32)}
reward:  {'agent-0': 0.9990939454599221, 'agent-1': 0.10270205664080123, 'agent-2': 0.48538157160446005} 

action:  {'agent-0': array([ 1.        , -0.79195297,  1.        ], dtype=float32), 'agent-1': array([-1.      ,  0.311316,  0.37472 ], dtype=float32), 'agent-2': array([-0.18007141,  1.        ,  1.        ], dtype=float32)}
reward:  {'agent-0': 0.9972927196821715, 'agent-1': 0.28895338587054553, 'agent-2': 0.8247858817470703} 

action:  {'agent-0': arr

In [180]:
# Checkpoint the PPO algorithm trained on the DISTANCE_FROM_CENTER task (phase 1).
save_algo(algo, "ring_formation?agents=3&algo=PPO&phase=1")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/ring_formation?agents=3&algo=PPO&phase=1), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 7.685452235324515, 'cur_kl_coeff': 1.0125, 'cur_lr': 0.0005000000000000001, 'total_loss': 4.969661102692286, 'policy_loss': -0.008522659237496556, 'vf_loss': 4.96145134040465, 'vf_explained_var': 0.07497475184500217, 'kl': 0.01652585773659265, 'entropy': 2.784695533083545, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 142560.5, 'diff_num_grad_updates_vs_sampler_policy': 1439.5}}, 'num_env_steps_sampled': 409600, 'num_env_steps_trained': 409600, 'num_agent_steps_sampled': 1228800, 'num_agent_steps_trained': 1228800}, 'sampler_resul

In [183]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print
from gymnasium.wrappers.time_limit import TimeLimit

# Number of additional algo.train() iterations to run in this cell.
trainin_steps = 30

# Resume from the phase-1 checkpoint.
algo = load_algo("ring_formation?agents=3&algo=PPO&phase=1")
clear_output()

# Switch every environment held by the training workers to the
# DISTANCE_FROM_NEIGHBOURS task, which adds the neighbour-spacing reward term.
algo.workers.foreach_worker(
    lambda ev: ev.foreach_env(
        lambda env: env.set_task(RingFormation.Task.DISTANCE_FROM_NEIGHBOURS)))

# Accumulated training log, reprinted in full after every iteration.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    # NOTE(review): this fresh RenderableRingFormation starts with the default
    # DISTANCE_FROM_CENTER task (set_task is never called on it), so the
    # rewards printed during the replay are phase-1 rewards -- confirm intended.
    simulate_episode(RenderableRingFormation(config), algo, 200, sleep_between_frames=0.03, print_reward=True, print_action=True)

iteration [51] => episode_reward_mean: -595.035641479298, episode_len_mean: 300.0, agent_steps_trained: 1253376, env_steps_trained: 417792, entropy: 2.7272518075174754, learning_rate: 0.0005000000000000001
iteration [52] => episode_reward_mean: -580.077361686813, episode_len_mean: 300.0, agent_steps_trained: 1277952, env_steps_trained: 425984, entropy: 2.7143479217257767, learning_rate: 0.0005000000000000001
iteration [53] => episode_reward_mean: -564.2013487637489, episode_len_mean: 300.0, agent_steps_trained: 1302528, env_steps_trained: 434176, entropy: 2.6577868813027936, learning_rate: 0.0005000000000000001
iteration [54] => episode_reward_mean: -590.0828583142396, episode_len_mean: 300.0, agent_steps_trained: 1327104, env_steps_trained: 442368, entropy: 2.6654861602104374, learning_rate: 0.0005000000000000001
iteration [55] => episode_reward_mean: -593.1136960408238, episode_len_mean: 300.0, agent_steps_trained: 1351680, env_steps_trained: 450560, entropy: 2.7420337500671548, lear

CanvasWithBorders(height=300, width=300)

action:  {'agent-0': array([-1., -1.,  1.], dtype=float32), 'agent-1': array([-0.28370523,  1.        ,  1.        ], dtype=float32), 'agent-2': array([1.        , 1.        , 0.37425423], dtype=float32)}
reward:  {'agent-0': 0.850294849427268, 'agent-1': 0.9102400475882142, 'agent-2': -0.16494274732264813} 

action:  {'agent-0': array([-1.        , -0.9384874 ,  0.18040425], dtype=float32), 'agent-1': array([0.25716949, 1.        , 1.        ], dtype=float32), 'agent-2': array([0.13385928, 1.        , 0.96851724], dtype=float32)}
reward:  {'agent-0': 0.14859728167277808, 'agent-1': 0.5593903048434221, 'agent-2': -0.8453717750092808} 

action:  {'agent-0': array([-1., -1.,  1.], dtype=float32), 'agent-1': array([-0.91787934,  1.        ,  1.        ], dtype=float32), 'agent-2': array([-0.15176296,  0.15703273,  0.        ], dtype=float32)}
reward:  {'agent-0': 0.8312757423563681, 'agent-1': 0.9999973665604536, 'agent-2': 0.0} 

action:  {'agent-0': array([ 0.81421185, -1.        ,  1. 

KeyboardInterrupt: 

In [156]:
# Checkpoint the PPO algorithm after the DISTANCE_FROM_NEIGHBOURS training (phase 2).
save_algo(algo, "ring_formation?agents=3&algo=PPO&phase=2")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/ring_formation?agents=3&algo=PPO&phase=2), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 12.446995903137658, 'cur_kl_coeff': 2.2781249999999997, 'cur_lr': 0.0005000000000000001, 'total_loss': 2.6672143841369285, 'policy_loss': -0.007560958999965806, 'vf_loss': 2.646884426019258, 'vf_explained_var': 0.4825471475927366, 'kl': 0.0122429251908923, 'entropy': 3.7913157107101547, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 84960.5, 'diff_num_grad_updates_vs_sampler_policy': 1439.5}}, 'num_env_steps_sampled': 491520, 'num_env_steps_trained': 491520, 'num_agent_steps_sampled': 1474560, 'num_agent_steps_trained': 1474560}, '

In [179]:
# Replay the current `algo` on a fresh renderable environment for 300 steps.
simulate_episode(RenderableRingFormation(config), algo, steps=300, sleep_between_frames=0.03)

CanvasWithBorders(height=300, width=300)

### 3 agents, SAC

In [None]:
from ray.tune.registry import register_env

# SAC setup: 3 agents, ring radius 10, spawn area 50, episodes end after 500
# steps, observations hold the last 2 time steps.
# Note: this rebinds the notebook-wide `config` used by the PPO cells above.
config = EnvironmentConfiguration(3, 10, 50, max_steps=500, cache_size=2)
env = RingFormation(config)
# Register the (non-rendering) environment under the name used by the SAC config.
register_env("ring_formation?agents=3&algo=SAC", lambda _: RingFormation(config))

In [None]:
from ray.rllib.algorithms.sac import SACConfig
from ray.tune.logger import pretty_print
from gymnasium.wrappers.time_limit import TimeLimit

# Number of algo.train() iterations to run in this cell.
trainin_steps = 30

# Build a SAC algorithm on the environment registered as
# "ring_formation?agents=3&algo=SAC"; CPU only (num_gpus=0).
algo = (
    SACConfig()
    .training(gamma = 0.925, 
            lr = 0.0005,
            train_batch_size = 4096*2, 
            #entropy_coeff=0.005,
    )
    #.env_runners(num_env_runners=4)
    .resources(num_gpus=0)
    .environment(env="ring_formation?agents=3&algo=SAC")
    .build()
)
clear_output()

# Accumulated training log, reprinted in full after every iteration.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += sac_result_format(result) + "\n"
    print(out)
    # Replay the current policy on a fresh renderable environment, also
    # printing the observations (print_ob).
    simulate_episode(RenderableRingFormation(config), algo, 200, sleep_between_frames=0.03, print_reward=True, print_action=True, print_ob = True)

iteration [1] => episode_reward_mean: nan, episode_len_mean: nan
iteration [2] => episode_reward_mean: nan, episode_len_mean: nan
iteration [3] => episode_reward_mean: -70.76117451354, episode_len_mean: 500.0
iteration [4] => episode_reward_mean: -70.76117451354, episode_len_mean: 500.0
iteration [5] => episode_reward_mean: -74.599750876038, episode_len_mean: 500.0
iteration [6] => episode_reward_mean: -74.599750876038, episode_len_mean: 500.0



CanvasWithBorders(height=300, width=300)

obs:  {'agent-0': array([-1.2090671,  2.6523044, -2.6928694,  3.4329107, -0.7853982,
        3.2205396, -1.2090671,  2.6523044, -2.6928694,  3.4329107,
       -0.7853982,  3.2205396], dtype=float32), 'agent-1': array([ 0.02104952,  3.48657   , -0.09065989,  3.8106866 ,  0.44872335,
        3.4329107 ,  0.02104952,  3.48657   , -0.09065989,  3.8106866 ,
        0.44872335,  3.4329107 ], dtype=float32), 'agent-2': array([2.7798634, 2.6523044, 2.3561945, 3.2205396, 3.050933 , 3.8106866,
       2.7798634, 2.6523044, 2.3561945, 3.2205396, 3.050933 , 3.8106866],
      dtype=float32)}
action:  {'agent-0': array([2.748429  , 0.07747126], dtype=float32), 'agent-1': array([1.3803537 , 0.35695222], dtype=float32), 'agent-2': array([-1.709255 ,  0.8173977], dtype=float32)}
reward:  {'agent-0': 0.05296291724707025, 'agent-1': -0.07684920888441127, 'agent-2': 0.1565842108373534} 

obs:  {'agent-0': array([-1.2133633,  2.648564 , -2.6947944,  3.4345775, -0.7866336,
        3.2176776, -1.2090671,  2.6

KeyboardInterrupt: 