## Collect the items

The agents' goal is to collect all the items in the environment.

### utils

In [1]:
from utils.vectors import Vector2D
from utils.canvas import CanvasWithBorders
from utils.algo_utils import (save_algo, load_algo)
from utils.simulations import (simulate_episode, simulate_random_episode, ppo_result_format)

### environment definition

In [49]:
from typing import Set
from ray.rllib.env.multi_agent_env import MultiAgentEnv
import random as rnd
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiDiscrete
from gymnasium.spaces.utils import flatten, flatten_space
import numpy as np
from IPython.display import clear_output
import math
from ipycanvas import Canvas, hold_canvas

class EnvironmentConfiguration:
    """Plain container for the settings used to build a `KeepTheDistance` environment.

    Args:
        n_agents: number of agents in the environment.
        n_targets: number of items spawned at every reset.
        agent_range: radius within which an agent collects an item; twice this
            value is the distance below which two agents are penalised.
        spawn_area: side of the square in which agents and items are spawned.
        visible_nbrs: how many of the closest other agents each agent observes.
        visible_targets: how many of the closest items each agent observes.
        max_steps: step count at which the episode is ended, or None for no limit.
        cache_size: how many consecutive per-step observations are stacked into
            the observation returned to an agent.
    """

    def __init__(self, n_agents, n_targets, agent_range, spawn_area=100, visible_nbrs=1, visible_targets=1, max_steps=None, cache_size=1):
        # Settings that determine the size of the observation space.
        self.cache_size = cache_size
        self.visible_targets = visible_targets
        self.visible_nbrs = visible_nbrs

        # Setting that changes what the agents are able to do.
        self.agent_range = agent_range

        # Scenario settings that should not affect the agents' behaviour.
        self.max_steps = max_steps
        self.spawn_area = spawn_area
        self.n_targets = n_targets
        self.n_agents = n_agents

class KeepTheDistance(MultiAgentEnv):
    canvas = None
    CANVAS_WIDTH, CANVAS_HEIGHT = 300.0, 300.0

    def __init__(self, config: EnvironmentConfiguration):
        """Copy the configuration onto the instance and build the spaces.

        Args:
            config: settings of the environment; `n_agents` must be strictly
                greater than `visible_nbrs` so that every agent has enough
                other agents to fill its neighbour slots.
        """
        assert config.n_agents > config.visible_nbrs

        # Mirror every configuration field as an attribute of the environment.
        for field in ("n_agents", "n_targets", "spawn_area", "max_steps",
                      "agent_range", "visible_nbrs", "visible_targets", "cache_size"):
            setattr(self, field, getattr(config, field))

        self.agents_ids = [f"agent-{index}" for index in range(self.n_agents)]

        # One random colour per agent (channels drawn in r, g, b order), used
        # only for rendering.
        self.agent_colors = {}
        for agent_id in self.agents_ids:
            red, green, blue = (rnd.randint(0, 255) for _ in range(3))
            self.agent_colors[agent_id] = self.rgb_to_hex(red, green, blue)

        # From here on the instance attributes `observation_space` and
        # `action_space` hold the space objects and shadow the methods of the
        # same name; the argument passed to the methods is not used by them.
        shared_observation_space = self.observation_space('agent-0')
        shared_action_space = self.action_space("")
        self.observation_space = shared_observation_space
        self.action_space = shared_action_space

    def unflatten_observation_space(self, agent):
        """Return the structured (nested `Dict`) observation space.

        The layout is `t[-k]` -> {"nbrs", "targets"} -> `nbr-i` / `target-i`
        -> {"direction", "distance"}, with `cache_size` time slots,
        `visible_nbrs` neighbour slots and `visible_targets` target slots.

        Args:
            agent: not used; the layout is the same for every agent.
        """
        direction = Box(low=-1, high=1, shape=(2,1), dtype=np.float32)
        distance = Box(low=-np.inf, high=np.inf, shape=(1,1), dtype=np.float32)

        def slots(prefix, count):
            # One {direction, distance} entry per observed entity.
            return Dict({
                f"{prefix}-{i}": Dict({'direction': direction, 'distance': distance})
                for i in range(count)
            })

        nbrs = slots("nbr", self.visible_nbrs)
        targets = slots("target", self.visible_targets)
        single_step = Dict({"nbrs": nbrs, "targets": targets})

        # The same single-step space is repeated once per cached time step.
        return Dict({f"t[-{t}]": single_step for t in range(self.cache_size)})

    def observation_space(self, agent):
        """Return the flattened version of the structured observation space.

        Args:
            agent: forwarded to `unflatten_observation_space`.
        """
        structured_space = self.unflatten_observation_space(agent)
        return flatten_space(structured_space)

    def action_space(self, agent):
        """Return the flattened action space: a 2D direction followed by a speed.

        The environment reads actions as `action[0]`, `action[1]` (direction
        components, each bounded to [-1, 1]) and `action[2]` (speed, bounded
        to [0, 1]).

        Args:
            agent: not used; every agent has the same action space.
        """
        components = (
            Box(low=-1.0, high=1.0, shape=(2,1), dtype=np.float32),  # direction
            Box(0.0, 1.0, dtype=np.float32),  # speed
        )
        return flatten_space(Tuple(components))
    
    def __get_time_t_observation(self, agent):
        """Build the structured observation of `agent` for the current step.

        For each of the closest `visible_nbrs` agents and closest
        `visible_targets` items the observation holds the unit vector of the
        offset between the agent and the entity, and `log(1 + distance)`.
        Target slots that cannot be filled (fewer items left than
        `visible_targets`) get a zero direction and a distance of -1.

        Side effect: the closest-target lookup refreshes
        `self.closest_targets[agent]`.
        """
        own_pos = self.agents_pos[agent]

        def describe(offset):
            # log(1 + norm) keeps large distances on a compressed scale.
            return {
                "direction": Vector2D.unit_vector(offset).to_np_array(),
                "distance": np.log(1 + Vector2D.norm(offset)),
            }

        nbr_offsets = [
            Vector2D.distance_vector(own_pos, self.agents_pos[nbr])
            for nbr in self.__get_n_closest_neighbours(agent, self.visible_nbrs)
        ]
        target_offsets = [
            Vector2D.distance_vector(own_pos, self.targets_pos[target])
            for target in self.__get_n_closest_targets(agent, self.visible_targets)
        ]

        nbrs = {f"nbr-{slot}": describe(offset) for slot, offset in enumerate(nbr_offsets)}
        targets = {f"target-{slot}": describe(offset) for slot, offset in enumerate(target_offsets)}

        # Pad the missing target slots so the observation keeps a fixed size.
        for slot in range(len(target_offsets), self.visible_targets):
            targets[f"target-{slot}"] = {
                "direction": np.array([0,0], dtype=np.int32),
                "distance": -1,
            }

        return {"nbrs": nbrs, "targets": targets}

    def __get_observation(self, agent):
        """Return the flattened observation of `agent`, stacked over time.

        The per-agent cache keeps the last `cache_size` single-step
        observations, newest first. On the first call after a reset the cache
        is filled with copies of the current observation.
        """
        history = self.observation_cache[agent]
        latest = self.__get_time_t_observation(agent)

        if not history:
            history = [latest] * self.cache_size
        else:
            # Push the newest observation in front and drop the oldest one.
            history = [latest] + history
            history.pop()
        self.observation_cache[agent] = history

        stacked = {f"t[-{t}]": history[t] for t in range(self.cache_size)}
        return flatten(self.unflatten_observation_space(agent), stacked)

    def rgb_to_hex(self, r, g, b):
        """Format three integer colour channels as a `#rrggbb` hex string."""
        hex_color = f'#{r:02x}{g:02x}{b:02x}'
        return hex_color

    def __get_local_reward(self, agent, action):
        """Compute the individual reward of `agent` for the current step.

        The returned value is `collision penalty + step penalty + 3 * approach
        term`. The per-term values are also stored in `self.info[agent]`.

        Args:
            agent: id of the agent being rewarded.
            action: not used by the computation.
        """
        # Bonus for having collected an item.
        # NOTE(review): this term is computed but is not part of the returned
        # sum, nor of the info string - confirm whether that is intended.
        collect_bonus = +5 if agent in self.collectors else 0

        # Penalty of -2 for every other agent closer than twice the agent range.
        collision_limit = self.agent_range*2
        collision_penalty = 0
        for other in self.__get_other_agents(agent):
            if Vector2D.distance(self.agents_pos[agent], self.agents_pos[other]) < collision_limit:
                collision_penalty += -2

        # Constant cost paid at every step.
        step_penalty = -1

        # How much closer the agent got to each of its closest targets during
        # this step (negative when it moved away); the best one counts.
        approach_gains = []
        for target in self.closest_targets[agent]:
            before = Vector2D.distance(self.agent_old_pos[agent], self.targets_pos[target])
            after = Vector2D.distance(self.agents_pos[agent], self.targets_pos[target])
            approach_gains.append(before - after)
        approach_reward = max(approach_gains, default=0)

        # The info payload is a one-element set holding the formatted string.
        self.info[agent] = {"info": {f"r2: {collision_penalty}, r3: {step_penalty}, r4: {approach_reward}"}}
        return collision_penalty + step_penalty + approach_reward*3

    def __get_global_reward(self):
        """Return the shared reward: 100 for every item collected in this step."""
        per_item_bonus = 100
        return self.global_reward * per_item_bonus
    
    def __get_other_agents(self, agent):
        """Return the ids of all agents except `agent`, in `agents_ids` order."""
        peers = [candidate for candidate in self.agents_ids if candidate != agent]
        return peers

    def __get_n_closest_neighbours(self, agent, n=1):
        """Return the ids of the `n` other agents closest to `agent`, nearest first.

        Ties keep the order of `agents_ids` because the sort is stable.
        """
        origin = self.agents_pos[agent]
        ranked = sorted(
            self.__get_other_agents(agent),
            key=lambda other: Vector2D.distance(origin, self.agents_pos[other]),
        )
        return ranked[:n]

    def __get_n_closest_targets(self, agent, n=1):
        """Return the ids of the `n` uncollected items closest to `agent`, nearest first.

        At most as many ids as there are items left are returned. The result
        is also stored in `self.closest_targets[agent]`, where the local
        reward reads it.
        """
        limit = min(n, len(self.targets_pos))
        origin = self.agents_pos[agent]
        ranked = sorted(
            self.targets_pos,
            key=lambda target: Vector2D.distance(origin, self.targets_pos[target]),
        )
        nearest = ranked[:limit]
        self.closest_targets[agent] = nearest
        return nearest

    def __update_agent_position(self, agent, action):
        """Move `agent` according to `action` and remember where it came from.

        Args:
            agent: id of the agent to move.
            action: indexable with the direction components at positions 0
                and 1 and the speed at position 2.
        """
        # The direction is used as given: it is not normalised before being
        # scaled by the speed.
        heading = Vector2D(action[0], action[1])
        previous = self.agents_pos[agent]
        self.agent_old_pos[agent] = previous
        displacement = Vector2D.mul(heading, action[2])
        self.agents_pos[agent] = Vector2D.sum(previous, displacement)

    def __collect_items(self):
        """Remove every item that is within `agent_range` of at least one agent.

        After the call `self.targets_pos` holds only the uncollected items and
        `self.collectors` lists the ids of the agents that were in range of a
        collected item (an id appears once per item it was in range of).
        """
        self.collectors = []
        uncollected_targets = {}
        for target, target_pos in self.targets_pos.items():
            # Record agent ids, not positions: the local reward looks agents
            # up in `self.collectors` by id.
            agents_in_range = [
                agent
                for agent, agent_pos in self.agents_pos.items()
                if Vector2D.distance(target_pos, agent_pos) < self.agent_range
            ]
            if agents_in_range:
                self.collectors.extend(agents_in_range)
            else:
                uncollected_targets[target] = target_pos
        self.targets_pos = uncollected_targets

    def __collect_items_and_compute_global_reward(self):
        """Collect the items in range and store how many were collected.

        The count of items removed by this call is written to
        `self.global_reward`.
        """
        items_before = len(self.targets_pos)
        self.__collect_items()
        items_after = len(self.targets_pos)
        self.global_reward = items_before - items_after

    def reset(self, seed=None, options=None):
        """Start a new episode with freshly spawned agents and items.

        Args:
            seed: accepted for API compatibility; not used here.
            options: accepted for API compatibility; not used here.

        Returns:
            A pair `(observations, infos)`: the flattened observation of every
            agent keyed by agent id, and an empty info dict.
        """
        area = self.spawn_area
        self.steps = 0

        # Agents are placed first, then the items.
        self.agents_pos = {
            agent: Vector2D.get_random_point(max_x=area, max_y=area)
            for agent in self.agents_ids
        }
        self.agent_old_pos = dict(self.agents_pos)
        self.targets_pos = {
            f"target-{i}": Vector2D.get_random_point(max_x=area, max_y=area)
            for i in range(self.n_targets)
        }

        # Per-episode bookkeeping.
        self.collectors = []
        self.closest_targets = {}
        self.info = {}
        self.observation_cache = {agent: [] for agent in self.agents_ids}

        observations = {agent: self.__get_observation(agent) for agent in self.agents_ids}
        return observations, {}
     
    def step(self, actions):
        """Advance the environment by one step.

        Args:
            actions: mapping from agent id to that agent's action (direction
                components at positions 0 and 1, speed at position 2).

        Returns:
            The tuple `(observations, rewards, terminated, truncated, infos)`,
            each a dict keyed by the ids found in `actions`; `terminated` and
            `truncated` also carry the `'__all__'` key.
        """
        self.steps += 1
        observations, rewards, terminated, truncated, infos = {}, {}, {}, {}, {}

        # Move every acting agent first, so that collection and rewards are
        # evaluated on the fully updated positions.
        for agent, action in actions.items():
            self.__update_agent_position(agent, action)

        self.__collect_items_and_compute_global_reward()

        for agent, action in actions.items():
            # The observation has to be built before the reward: it refreshes
            # self.closest_targets[agent], which the local reward reads.
            observations[agent] = self.__get_observation(agent)
            rewards[agent] = self.__get_local_reward(agent, action) + self.__get_global_reward()
            terminated[agent] = False
            truncated[agent] = False
            infos[agent] = self.info[agent]

        all_items_collected = len(self.targets_pos) == 0
        # `>=` rather than `==` so the limit cannot be stepped over.
        step_limit_reached = self.max_steps is not None and self.steps >= self.max_steps

        # NOTE(review): reaching the step limit is reported as termination,
        # not truncation; kept as is because callers may rely on
        # terminated['__all__'] to detect the end of the episode.
        terminated['__all__'] = all_items_collected or step_limit_reached
        truncated['__all__'] = False

        return observations, rewards, terminated, truncated, infos
     
    def rgb_to_hex(self, r, g, b):
        """Format three integer colour channels as a `#rrggbb` hex string.

        NOTE(review): a method with the same name and body is defined earlier
        in this class; being the later definition, this is the one in effect.
        """
        hex_color = f'#{r:02x}{g:02x}{b:02x}'
        return hex_color

    def render(self):
        """Do nothing: the base environment has no visual output."""
        return None

    def get_agent_ids(self):
        """Return the set of ids of all the agents in the environment."""
        # The ids are stored in `agents_ids`; this class never sets an
        # `agents` attribute.
        return set(self.agents_ids)


class RenderableKeepTheDistance(KeepTheDistance):
    """`KeepTheDistance` variant that draws its state on a canvas widget."""

    def render(self):
        """Draw the uncollected items and the agents on the canvas.

        The canvas is created and displayed on the first call (this relies on
        `display` being available in the namespace, as it is in a notebook
        kernel) and reused afterwards. Items are small red dots; each agent
        is a filled circle in its own colour with a black outline, plus a red
        circle showing its range.
        """
        if self.canvas is None:
            self.canvas = CanvasWithBorders(width=self.CANVAS_WIDTH, height=self.CANVAS_HEIGHT)
            display(self.canvas)

        # World rectangle that is mapped onto the whole canvas.
        top_left, bottom_right = (0.0,0.0), (self.spawn_area, self.spawn_area)

        def to_frame(world_pos):
            # Rescale world coordinates to canvas pixels.
            frame_x = ((world_pos[0]-top_left[0])/(bottom_right[0]-top_left[0]))*self.CANVAS_WIDTH
            frame_y = ((world_pos[1]-top_left[1])/(bottom_right[1]-top_left[1]))*self.CANVAS_HEIGHT
            return frame_x, frame_y

        with hold_canvas():
            pixels_per_unit = self.CANVAS_WIDTH/float(self.spawn_area)
            # Agents are drawn at least one pixel wide.
            body_radius = max(pixels_per_unit,1)/2.0
            range_radius = pixels_per_unit*self.agent_range
            self.canvas.clear()

            self.canvas.fill_style = "red"
            for target_pos in self.targets_pos.values():
                x, y = to_frame(target_pos.to_np_array())
                self.canvas.fill_circle(x, y, 1)

            for agent in self.agents_ids:
                x, y = to_frame(self.agents_pos[agent].to_np_array())

                # Body of the agent.
                self.canvas.fill_style = self.agent_colors[agent]
                self.canvas.fill_circle(x, y, body_radius)

                # Outline of the body.
                self.canvas.stroke_style = "black"
                self.canvas.stroke_circle(x, y, body_radius)

                # Range of the agent.
                self.canvas.stroke_style = "red"
                self.canvas.stroke_circle(x, y, range_radius)

In [7]:
# Smoke test: build a small renderable environment, print one observation and
# run an episode with random actions.
env_config = EnvironmentConfiguration(
    n_agents = 4,
    n_targets = 2,
    spawn_area = 100,
    max_steps=300,
    agent_range = 5,
    visible_nbrs = 3,
    visible_targets = 3,  # more than n_targets, so the last target slot is always padding
    cache_size=3)

env = RenderableKeepTheDistance(env_config)

# Flattened observation of agent-0 right after the reset.
print(env.reset()[0]['agent-0'])
env.render()
# NOTE(review): the second argument is presumably the number of steps to simulate - confirm in utils.simulations.
simulate_random_episode(env, 100, sleep_between_frames=0.03, print_info=False)
#env.step({'agent-1': (1,1,1)})

[ 0.99991345 -0.01315676  4.3438907   0.9957851   0.09171705  4.3479743
  0.83570546 -0.5491779   4.439842    0.7808688  -0.62469506  3.6742415
  0.2631174  -0.9647638   4.060594    0.          0.         -1.
  0.99991345 -0.01315676  4.3438907   0.9957851   0.09171705  4.3479743
  0.83570546 -0.5491779   4.439842    0.7808688  -0.62469506  3.6742415
  0.2631174  -0.9647638   4.060594    0.          0.         -1.
  0.99991345 -0.01315676  4.3438907   0.9957851   0.09171705  4.3479743
  0.83570546 -0.5491779   4.439842    0.7808688  -0.62469506  3.6742415
  0.2631174  -0.9647638   4.060594    0.          0.         -1.        ]


CanvasWithBorders(height=300, width=300)

-0.0419645466054952
0.7205040169434582
0.05460370904376788
-0.26290442748749143
-0.30536286000073787
0.08478925934286252
0.1818537425983422
0.42006796375359556
0.08425288128137254
-0.07742643100480606
0.11212917881842799
0.6387183065614153
-0.23030367551744746
-0.43747122839783614
0.23745495419001372
0.15847057058822855
-0.27710592470358364
-0.2511834129640178
0.22827457109975313
0.2993045229891962
0.20868433148500998
-0.28684835993841773
0.2118994013233575
-0.0028165990922843775
0.3231193816745339
0.15642170327328486
0.016506790921514636
0.28418283699910774
0.35098751217441304
-0.14934726651630115
0.13513815138038154
0.13359108758311322
0.24069645001193862
-0.1630173257387355
0.06325507467997227
0.13554715500762882
-0.175861156352763
0.14308738973121393
0.17709433001532737
0.03789413259124785
0.524239614272993
0.14746954655723243
0.525759370178017
-0.05347061894926952
-0.37956541360583174
-0.5501720195463093
0.2516971673582269
0.03717497891383914
-0.05723010260259187
0.076804984006336

## policy training

In [32]:
import ray
# Shut down any Ray runtime still attached to this kernel (presumably left over
# from an earlier run of the training cells).
ray.shutdown()

## collect_the_items?visible_nbrs=3&visible_targets=3

In [15]:
from ray.tune.registry import register_env

# Training scenario: 4 agents and 10 items, each agent observing its 3 closest
# neighbours and 3 closest items (cache_size is left at its default of 1).
env_config = EnvironmentConfiguration(
    n_agents = 4,
    n_targets = 10,
    spawn_area = 100,
    max_steps=300,
    agent_range = 5,
    visible_nbrs = 3,
    visible_targets = 3)
# Register the environment under the name used by the PPO configuration below;
# the argument handed to the creator is ignored.
register_env("collect_the_items?visible_nbrs=3&visible_targets=3", lambda _: KeepTheDistance(env_config))

In [31]:
# Restore a previously saved algorithm instead of training from scratch
# (presumably from the location written by save_algo under the same name).
algo = load_algo("collect_the_items?visible_nbrs=3&visible_targets=3")

2024-06-04 10:53:37,419	INFO trainable.py:161 -- Trainable.setup took 10.745 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [16]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

# Number of training iterations (calls to algo.train()).
trainin_steps = 60

# PPO on the registered environment, CPU only, with a single env runner.
algo = (
    PPOConfig()
    .training(gamma = 0.95, 
              lr = 0.001,
              train_batch_size = 4096, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="collect_the_items?visible_nbrs=3&visible_targets=3")
    .build()
)
# Drop the log output produced while building the algorithm.
clear_output()

# Accumulated summary lines, re-printed after every iteration because the cell
# output is cleared each time.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    # Watch the current policy on a fresh renderable environment.
    # NOTE(review): 500 is presumably a cap on simulated steps - confirm in utils.simulations.
    simulate_episode(RenderableKeepTheDistance(env_config), algo, 500, sleep_between_frames=0.01, print_reward=True, print_info=True)

iteration [1] => episode_reward_mean: -54.52017210909764, episode_len_mean: 300.0, agent_steps_trained: 16384, env_steps_trained: 4096, entropy: 4.2294624326129755, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: 202.58213024504346, episode_len_mean: 300.0, agent_steps_trained: 32768, env_steps_trained: 8192, entropy: 4.297891163950165, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: 351.88367417079417, episode_len_mean: 300.0, agent_steps_trained: 49152, env_steps_trained: 12288, entropy: 4.306855703641971, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: 515.5179825586839, episode_len_mean: 300.0, agent_steps_trained: 65536, env_steps_trained: 16384, entropy: 4.3420214214672646, learning_rate: 0.0010000000000000005
iteration [5] => episode_reward_mean: 615.0612015930723, episode_len_mean: 300.0, agent_steps_trained: 81920, env_steps_trained: 20480, entropy: 4.257603951295217, learning_rate: 0.001000000000

CanvasWithBorders(height=300, width=300)

info:  {'agent-0': {'info': {'r2: 0, r3: -1, r4: 0.33403079028189'}}, 'agent-1': {'info': {'r2: 0, r3: -1, r4: 1.0072935966548968'}}, 'agent-2': {'info': {'r2: 0, r3: -1, r4: -0.5727182176185792'}}, 'agent-3': {'info': {'r2: 0, r3: -1, r4: -0.1379721634254416'}}}
reward:  {'agent-0': 100.00209237084567, 'agent-1': 102.02188078996468, 'agent-2': 97.28184534714427, 'agent-3': 98.58608350972368} 

info:  {'agent-0': {'info': {'r2: 0, r3: -1, r4: 0.12978017244862627'}}, 'agent-1': {'info': {'r2: 0, r3: -1, r4: 0.7228598509919415'}}, 'agent-2': {'info': {'r2: 0, r3: -1, r4: 0.5090639321921486'}}, 'agent-3': {'info': {'r2: 0, r3: -1, r4: -0.520222210898492'}}}
reward:  {'agent-0': -0.6106594826541212, 'agent-1': 1.1685795529758245, 'agent-2': 0.5271917965764459, 'agent-3': -2.560666632695476} 

info:  {'agent-0': {'info': {'r2: 0, r3: -1, r4: 0.24104618630396857'}}, 'agent-1': {'info': {'r2: 0, r3: -1, r4: 0.7419611390346113'}}, 'agent-2': {'info': {'r2: 0, r3: -1, r4: -0.4468778789499197'}}

In [32]:
# Run the trained policy on a larger scenario (10 agents, 500x500 spawn area).
# visible_nbrs, visible_targets and the default cache_size are the same as in
# the training configuration, so the observation size is unchanged.
env_config_2 = EnvironmentConfiguration(
    n_agents = 10,
    n_targets = 10,
    spawn_area = 500,
    max_steps=300,
    agent_range = 5,
    visible_nbrs = 3,
    visible_targets = 3)
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 300, sleep_between_frames=0.01, print_info=False)

CanvasWithBorders(height=300, width=300)

In [30]:
# Checkpoint the trained algorithm under the same name as the registered environment.
save_algo(algo, "collect_the_items?visible_nbrs=3&visible_targets=3")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/collect_the_items?visible_nbrs=3&visible_targets=3), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 54.611373374362785, 'cur_kl_coeff': 38.9239013671875, 'cur_lr': 0.0010000000000000005, 'total_loss': 8.893688325832288, 'policy_loss': 0.003270520373916952, 'vf_loss': 8.343666425719857, 'vf_explained_var': 0.10399388348062834, 'kl': 0.01404667480073843, 'entropy': 4.994146442164977, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 114240.5, 'diff_num_grad_updates_vs_sampler_policy': 959.5}}, 'num_env_steps_sampled': 245760, 'num_env_steps_trained': 245760, 'num_agent_steps_sampled': 983040, 'num_agent_steps_trained': 98304

## collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3

In [57]:
from ray.tune.registry import register_env

# Training scenario with observation stacking: 4 agents, 5 items, and the
# observations of the last `cache_size` steps concatenated.
# NOTE(review): cache_size is 2 here, while the registered name and the section
# heading say cache_size=3 - confirm which value is intended.
env_config = EnvironmentConfiguration(
    n_agents = 4,
    n_targets = 5,
    spawn_area = 100,
    max_steps=500,
    agent_range = 5,
    visible_nbrs = 3,
    visible_targets = 3,
    cache_size=2)
# The argument handed to the creator is ignored.
register_env("collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3", lambda _: KeepTheDistance(env_config))

In [None]:
# Restore a previously saved algorithm instead of training from scratch
# (presumably from the location written by save_algo under the same name).
algo = load_algo("collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3")

In [58]:
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.logger import pretty_print

from gymnasium.wrappers.time_limit import TimeLimit

# Number of training iterations (calls to algo.train()).
trainin_steps = 60

# PPO on the environment with stacked observations, CPU only, with a single
# env runner; gamma is 0.925 in this run.
algo = (
    PPOConfig()
    .training(gamma = 0.925, 
              lr = 0.001,
              train_batch_size = 4096, 
              sgd_minibatch_size = 256, 
              num_sgd_iter = 30,
              #entropy_coeff=0.005,
              )
    .env_runners(num_env_runners=1)
    .resources(num_gpus=0)
    .environment(env="collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3")
    .build()
)
# Drop the log output produced while building the algorithm.
clear_output()

# Accumulated summary lines, re-printed after every iteration because the cell
# output is cleared each time.
out = ""
for i in range(trainin_steps):
    result = algo.train()
    clear_output()
    out += ppo_result_format(result) + "\n"
    print(out)
    # Watch the current policy on a fresh renderable environment.
    # NOTE(review): 500 is presumably a cap on simulated steps - confirm in utils.simulations.
    simulate_episode(RenderableKeepTheDistance(env_config), algo, 500, sleep_between_frames=0.01, print_reward=True)

iteration [1] => episode_reward_mean: -739.5492273316224, episode_len_mean: 500.0, agent_steps_trained: 16384, env_steps_trained: 4096, entropy: 4.246371173237761, learning_rate: 0.0010000000000000005
iteration [2] => episode_reward_mean: -477.55517091648153, episode_len_mean: 500.0, agent_steps_trained: 32768, env_steps_trained: 8192, entropy: 4.225632384916147, learning_rate: 0.0010000000000000005
iteration [3] => episode_reward_mean: -300.4690354731436, episode_len_mean: 500.0, agent_steps_trained: 49152, env_steps_trained: 12288, entropy: 4.202773099889358, learning_rate: 0.0010000000000000005
iteration [4] => episode_reward_mean: -91.2652442908371, episode_len_mean: 500.0, agent_steps_trained: 65536, env_steps_trained: 16384, entropy: 4.21546970034639, learning_rate: 0.0010000000000000005
iteration [5] => episode_reward_mean: 99.83375406704977, episode_len_mean: 494.1219512195122, agent_steps_trained: 81920, env_steps_trained: 20480, entropy: 4.237968141088883, learning_rate: 0.00

CanvasWithBorders(height=300, width=300)

reward:  {'agent-0': 0.5441133493377883, 'agent-1': 0.7009293577975981, 'agent-2': 1.8477870083621806, 'agent-3': 0.18192650056783322} 

reward:  {'agent-0': 2.4200859007911326, 'agent-1': 1.0657918038671532, 'agent-2': 1.1947469470405192, 'agent-3': 1.118971699641527} 

reward:  {'agent-0': 1.1699873206680458, 'agent-1': 1.7480914526410931, 'agent-2': 2.402047571515384, 'agent-3': 1.4576047010449464} 

reward:  {'agent-0': 2.5854738273930344, 'agent-1': 0.5565722765386099, 'agent-2': 0.6277600938064269, 'agent-3': 0.6702206649803948} 

reward:  {'agent-0': 101.00752632274484, 'agent-1': 100.94431586187355, 'agent-2': 102.5464559200949, 'agent-3': 100.33231318203542} 

reward:  {'agent-0': 1.986451927807245, 'agent-1': 3.21175423045279, 'agent-2': 2.0132276415023895, 'agent-3': 1.0006724383594303} 

reward:  {'agent-0': 1.5188484804766276, 'agent-1': 3.208671366142397, 'agent-2': 2.967948464109533, 'agent-3': -0.4555973011817791} 

reward:  {'agent-0': -0.014024658909931986, 'agent-1':

In [59]:
# Run the trained policy on a larger scenario (10 agents, 30 items, 300x300
# spawn area). visible_nbrs, visible_targets and cache_size are the same as in
# the training configuration, so the observation size is unchanged.
env_config_2 = EnvironmentConfiguration(
    n_agents = 10,
    n_targets = 30,
    spawn_area = 300,
    max_steps=300,
    agent_range = 5,
    visible_nbrs = 3,
    visible_targets = 3,
    cache_size=2)
# The environment ends the episode at max_steps=300, below the 600 passed here.
simulate_episode(RenderableKeepTheDistance(env_config_2), algo, 600, sleep_between_frames=0.01, print_info=False)

CanvasWithBorders(height=300, width=300)

In [60]:
# Checkpoint the trained algorithm under the same name as the registered environment.
save_algo(algo, "collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3")

An Algorithm checkpoint has been created inside directory: 'TrainingResult(checkpoint=Checkpoint(filesystem=local, path=/mnt/c/Users/nicol/Desktop/Università/tesi/experiments/RL_experiments/algos/collect_the_items?visible_nbrs=3&visible_targets=3&cache_size=3), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 6.850533539305131, 'cur_kl_coeff': 3.417187499999999, 'cur_lr': 0.0010000000000000005, 'total_loss': 8.01003879532218, 'policy_loss': -0.019474794888325656, 'vf_loss': 7.9856410051385565, 'vf_explained_var': 0.17176912014062207, 'kl': 0.012838793924716659, 'entropy': 4.168454428389668, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 256.0, 'num_grad_updates_lifetime': 114240.5, 'diff_num_grad_updates_vs_sampler_policy': 959.5}}, 'num_env_steps_sampled': 245760, 'num_env_steps_trained': 245760, 'num_agent_steps_sampled': 983040, 'num_agent_steps_