# **My custom env - dense reward**

**Environment class pasted here so that it is accessible to all Ray workers.**

It is equivalent to /myenv_6_dense_reward.py

In [1]:
# Experiment tag: it is embedded in the checkpoint file names when saving and
# used to filter the files in the models directory when loading.
under_examination = "looseplay_2agents_densereward"

In [2]:
import sys, os
from pathlib import Path

# Locate the project root by walking up from the current working directory
# until a directory named PROJECT_DIR_NAME is reached.
PROJECT_DIR_NAME = "distributed_ai_project"

current_dir = Path(os.getcwd())
pointed_dir = current_dir

while True:
    # Compare the last path component directly: splitting the string form on
    # "/" only works with POSIX separators.
    if pointed_dir.is_dir() and pointed_dir.name == PROJECT_DIR_NAME:
        project_dir = pointed_dir
        print(f"Project dir found: {project_dir}")
        break

    # The filesystem root is its own parent: there is nothing left to inspect.
    # NOTE: project_dir stays undefined in this case.
    if pointed_dir == pointed_dir.parent:
        print(f"Error while looking for '{PROJECT_DIR_NAME}', project dir")
        break

    pointed_dir = pointed_dir.parent


Project dir found: /home/terra/Desktop/magistrale/distributed_ai_project


In [3]:
import numpy as np
import functools
import gymnasium as gym
from gymnasium import spaces
from pettingzoo import ParallelEnv
import pygame 
import os
from collections import defaultdict
import random

# Directory the environment loads its sprite images from (inside the project root).
SPRITES_DIR = f"{project_dir}/sprites"


class MyGridWorld(ParallelEnv):
    metadata = {"render_modes": ["human"], "name": "custom_grid_v0"}

    def __init__(self, render_mode=None, n_agents = 2, grid_size=15, vision_radius = 5):
        """Build the grid, the fixed components and the per-agent spaces.

        Args:
            render_mode: ``"human"`` to draw with pygame, ``None`` to disable rendering.
            n_agents: number of agents; they are named ``agent1`` ... ``agentN``.
            grid_size: side length of the square grid, outer walls included (>= 8).
            vision_radius: maximum Manhattan distance at which another agent or a
                fixed component is observed.

        Raises:
            ValueError: if ``grid_size`` is smaller than 8.
        """
        if grid_size<8:
            # A smaller grid leaves no room for the layout built below, so fail
            # here instead of printing and continuing with a broken environment.
            raise ValueError("Error, need to insert a grid size greater or equal to 8")
        self.grid_size = grid_size
        self.render_mode = render_mode
        self.vision_radius = vision_radius
        ######## Agents, fixed components, and forbidden positions
        self.possible_agents = [f"agent{i}" for i in range(1, n_agents+1)]
        self.agents = self.possible_agents[:]
        self.gate_open = False

        # Positions are (x, y) cells; "file" is the sprite file name.
        # Both gate entries share one cell: they are the two sprites of the same gate.
        self.fixed_components = {
            "button1":  {"pos": np.array([self.grid_size//2+2, 1+3+2]), "file": "button.png"},
            "button2":  {"pos": np.array([self.grid_size//2+2, 1+1]), "file": "button.png"},

            "gate_open":  {"pos": np.array([self.grid_size//2, 1+3]), "file": "gate_open.png"},
            "gate_close":  {"pos": np.array([self.grid_size//2, 1+3]), "file": "gate_close.png"}   
        }
        self.button1_pos = self.fixed_components["button1"]["pos"]
        self.button2_pos = self.fixed_components["button2"]["pos"]

        self.gate_pos = self.fixed_components["gate_open"]["pos"] 
        # Cell right above the gate: the reference point of the distance penalty in step().
        self.target_final_pos = self.gate_pos+[0, -1]

        # Ranges used to sample the initial positions (below the dividing wall).
        self.x_range = (1, self.grid_size-1-1)
        self.y_range = (1+3+1, self.grid_size-1-1)
        # Cells where no agent may be spawned.
        self.forbidden_position = {tuple(self.button1_pos)}    

        # Episode length limit: step() truncates once current_cycles reaches it.
        self.max_cycles = 100
        self.current_cycles = 0
        ######## Pygame graphic configuration 
        self.window_size = 810
        self.cell_size = self.window_size // self.grid_size
        self.window = None
        self.clock = None
        
        # Walls: 1 marks a wall cell. The four borders plus the row y = 4,
        # which splits the grid into an upper and a bottom area.
        self.grid_map = np.zeros((self.grid_size, self.grid_size))
        self.grid_map[0, :] = 1
        self.grid_map[:, 0] = 1
        self.grid_map[-1, :] = 1
        self.grid_map[:, -1] = 1
        self.grid_map[ :, 1+3] = 1

        # Sprites (filled lazily by render())
        self.agent_sprites = {}
        self.component_sprites = {}

        ######## Action space and observation space
        # Five possible actions in each grid: stay(0), up(1), down(2), left(3), right(4)
        self.action_spaces = {a: spaces.Discrete(5) for a in self.possible_agents}
        # 2 values per other agent + 2 for each of button1, button2 and gate + 1 gate flag
        OBSERVATION_DIM = 2*(len(self.agents)-1)+3*2+1
        # Value reported in place of the offset of a target outside the vision radius.
        self.SENTINEL = (self.grid_size-1)*2
        self.observation_spaces = {
            a: spaces.Box(low=-(self.grid_size-1), high=self.SENTINEL, shape=(OBSERVATION_DIM,), dtype=np.float32) 
            for a in self.possible_agents
        }
        self.state_spaces = {a: None for a in self.possible_agents}
    
    def _visible(self, my_pos, target_pos):
        """Tell whether ``target_pos`` is within the vision radius of ``my_pos``.

        The distance is the Manhattan (L1) norm of ``target_pos - my_pos``.

        Returns:
            ``(is_visible, offset)``: ``offset`` is ``target_pos - my_pos`` when the
            target is visible, otherwise a float32 pair filled with ``self.SENTINEL``.
        """
        offset = target_pos - my_pos
        in_range = np.linalg.norm(offset, ord=1) <= self.vision_radius
        if in_range:
            return (in_range, offset)
        hidden = np.array([self.SENTINEL, self.SENTINEL], dtype=np.float32)
        return (in_range, hidden)

    def state(self):
        """Return the global state: all per-agent observations concatenated.

        Agents are visited in ``possible_agents`` order. An agent with no current
        observation contributes a zero vector as long as its observation space.
        """
        per_agent_obs = self.gather_observations()

        def _vector_for(agent):
            if agent in per_agent_obs:
                return per_agent_obs[agent]
            # The agent is dead/has left the episode: use zeros
            length = self.observation_spaces[agent].shape[0]
            return np.zeros(length, dtype=np.float32)

        return np.concatenate([_vector_for(agent) for agent in self.possible_agents])

    # Boilerplate PettingZoo
    def observation_space(self, agent):
        """Return the observation space registered for ``agent``."""
        return self.observation_spaces[agent]
    def action_space(self, agent):
        """Return the action space registered for ``agent``."""
        return self.action_spaces[agent]

    '''
    Step in the environment
    '''
    def step(self, actions):
        """Advance the environment by one step.

        Args:
            actions: dict mapping agent name to an action in
                {0: stay, 1: up (y-1), 2: down (y+1), 3: left (x-1), 4: right (x+1)}.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)``, each a
            dict keyed by agent name. Five empty dicts when no agent is active.

        Reward per step: +0.5 in the upper area (y < 4), +0.3 more on button2;
        in the bottom area -0.05 times the Euclidean distance to the cell above
        the gate, +0.3 more on button1. When every agent is in the upper area
        each one gets +100, is terminated and has ``is_success`` set to True.
        All agents are truncated once ``max_cycles`` steps have elapsed.
        """
        if not self.agents: return {}, {}, {}, {}, {}
        self.current_cycles += 1
        
        rewards = {a: 0 for a in self.agents}

        terminations = {a: False for a in self.agents}
        truncations = {a: False for a in self.agents}
        infos = {a: {"is_success":False} for a in self.agents}
        
        desired_positions = {}      
        button_pressed = False
        agents_desire_gate = []
        for agent, action in actions.items():
            current_pos = self.agent_positions[agent].copy()
            target_pos = current_pos.copy()

            if action == 1: target_pos[1] -= 1 
            elif action == 2: target_pos[1] += 1
            elif action == 3: target_pos[0] -= 1
            elif action == 4: target_pos[0] += 1

            # Monitor button, gate and walls
            is_wall = self.grid_map[target_pos[0], target_pos[1]] == 1
            is_gate = (target_pos == self.gate_pos).all()
            is_button = (target_pos == self.button1_pos).all() or (target_pos == self.button2_pos).all()

            # A button counts as pressed as soon as an agent targets its cell
            # (this includes staying on it).
            if is_button:
                button_pressed = True
            if is_gate:
                agents_desire_gate.append((agent, current_pos))
            
            # The gate cell is handled separately below, once the button state is known.
            if is_wall and not is_gate:
                desired_positions[agent] = current_pos 
            else:
                desired_positions[agent] = target_pos
                        
        # If the gate is open remove its wall; if it is closed, send back the agents that wanted to cross it
        self.gate_open = button_pressed
        pushed_agent = None

        if self.gate_open:
            self.grid_map[self.gate_pos[0], self.gate_pos[1]] = 0
        else:            
            self.grid_map[self.gate_pos[0], self.gate_pos[1]] = 1

            for a in agents_desire_gate:
                agent = a[0]
                current_pos = a[1]
                desired_positions[agent] = current_pos      # if current pos is not the gate pos

                if (current_pos==self.gate_pos).all():      # if current pos is gate pos, in this case agent needs to be pushed down
                    gate_x, gate_y = self.gate_pos
                    candidate_cells = [
                        np.array([gate_x,     gate_y + 1]),
                        np.array([gate_x - 1, gate_y + 1]),
                        np.array([gate_x + 1, gate_y + 1]),
                    ]

                    # Take the first candidate cell that no agent currently occupies.
                    for cell in candidate_cells:
                        if not any((cell == p).all() for p in self.agent_positions.values()):
                            desired_positions[agent] = cell
                            pushed_agent = (agent, cell)
                            break
                    # Note: if there is not a free cell the agent stays in the gate pos (should not happen)
                  
        # Check for conflicts (more than one agents have the same desired position)
        final_positions = self.agent_positions.copy()
        target_counts = defaultdict(list)         # Key: tuple(x, y) of desired positions, Value: list of agents desiring it
        for agent, pos in desired_positions.items():
            pos_tuple = tuple(pos)
            target_counts[pos_tuple].append(agent)

        # Solve eventual conflicts
        for pos_tuple, agents_at_target in target_counts.items():      
            # Case 1: no contended position
            if len(agents_at_target) == 1:
                agent = agents_at_target[0]
                final_positions[agent] = desired_positions[agent]
            
            # Case 2: contended position
            else:
                # Priority: the agent pushed out of the gate, then an agent
                # already standing on the cell, otherwise a random contender.
                if pushed_agent and tuple(pushed_agent[1]) == pos_tuple:
                    winning_agent = pushed_agent[0]
                else:
                    one_agent_already_here_not_moving = None
                    for agent in agents_at_target:
                        if tuple(self.agent_positions[agent]) == pos_tuple:
                            one_agent_already_here_not_moving = agent
                            break

                    winning_agent = one_agent_already_here_not_moving if one_agent_already_here_not_moving else random.choice(agents_at_target)
                
                final_positions[winning_agent] = desired_positions[winning_agent]
                agents_at_target.remove(winning_agent)
                # Losers stay where they were.
                for losing_agent in agents_at_target[:]:
                    final_positions[losing_agent] = self.agent_positions[losing_agent] 
        self.agent_positions = final_positions

        # Dense reward: bonus in the upper area, distance penalty in the bottom area, bonus on a button
        agents_upper_area = 0
        for a, pos in self.agent_positions.items():
            if pos[1]<4:                                # Agent on upper area
                rewards[a] = +0.5
                agents_upper_area+=1

                if (pos == self.button2_pos).all():
                    rewards[a] +=0.3
            else:
                dist_target = np.linalg.norm(pos - self.target_final_pos)
                rewards[a] -= (dist_target * 0.05)      # Agent on bottom area

                if (pos == self.button1_pos).all():     # Agent on button
                    rewards[a] += 0.3     
       
        # Success: every agent reached the upper area
        if agents_upper_area == len(self.agents):
            rewards = {a: +100 for a in self.agents}
            terminations = {a: True for a in self.agents}
            infos = {a: {"is_success":True} for a in self.agents}

        if self.current_cycles >= self.max_cycles:
            truncations= {a: True for a in self.agents}
            
        
        if self.render_mode == "human":
            self.render()
            
        # Observations are gathered before pruning, so finished agents still get
        # their last observation. An agent leaves the active list when its
        # episode ended either way: by termination or by truncation.
        final_obs = self.gather_observations()
        self.agents = [a for a in self.agents if not (terminations[a] or truncations[a])]
        return final_obs, rewards, terminations, truncations, infos
   
    '''
    Determines initial condition of the simulation
    '''

    def generate_valid_position(self):
        """Sample random ``(x, y)`` cells until one is not in ``forbidden_position``.

        Coordinates are drawn with ``np.random.randint`` from ``x_range`` and
        ``y_range``, so the upper bound of each range is excluded.
        """
        x_low, x_high = self.x_range
        y_low, y_high = self.y_range

        candidate = None
        while candidate is None or candidate in self.forbidden_position:
            # x is drawn before y (tuple items are evaluated left to right)
            candidate = (
                np.random.randint(x_low, x_high),
                np.random.randint(y_low, y_high),
            )
        return candidate
            
    def generate_set_initial_positions(self):
        """Return a set with one distinct valid position per possible agent."""
        required = len(self.possible_agents)
        spawn_cells = set()
        # Duplicates are absorbed by the set, so keep drawing until it is full.
        while len(spawn_cells) < required:
            spawn_cells.add(self.generate_valid_position())
        return spawn_cells
    
    def generate_test_values(self, agent_num):
        """Return a fixed pair: ``[7, 9]`` for agent 1, ``[9, 13]`` for any other."""
        return [7, 9] if agent_num == 1 else [9, 13]

    def reset(self, seed=None, options=None):
        """Start a new episode with random agent positions and a closed gate.

        Args:
            seed: when given, seeds the global ``np.random`` and ``random``
                generators this environment draws from.
            options: accepted for API compatibility; not used.

        Returns:
            ``(observations, infos)``: the initial per-agent observations and one
            empty info dict per agent.
        """
        if seed is not None:
            # Positions are sampled with np.random, ties are broken with random.
            np.random.seed(seed)
            random.seed(seed)

        self.agents = self.possible_agents[:]
        self.current_cycles = 0

        # Close the gate again: without this the first observation of an
        # episode reports the gate state left over from the previous one.
        self.gate_open = False
        self.grid_map[self.gate_pos[0], self.gate_pos[1]] = 1

        set_initial_positions = list(self.generate_set_initial_positions())
        self.agent_positions = {
            agent_id: np.array(set_initial_positions[i])
            for i, agent_id in enumerate(self.agents)
        }
        
        # To visualize the reset position if "human" mode on
        if self.render_mode == "human":
            self.render()         

        return self.gather_observations(), {a: {} for a in self.agents}
    
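    '''
    Observation: [(delta to other agent) x number of other agents, delta to button1, delta to button2, delta to gate, gate_open (1|0, -1 if the gate is not visible)]
    Deltas of targets outside the vision radius are replaced by the sentinel value.
    '''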
    def gather_observations(self):
        """Build the observation vector of every active agent.

        Layout, in order: the offset to each other active agent, the offsets to
        button1, button2 and the gate, then one gate flag. An offset is replaced
        by the sentinel pair when the target is not visible (see ``_visible``).
        The flag is ``int(gate_open)`` when the gate is visible, ``-1`` otherwise.

        Returns:
            dict mapping agent name to a float32 vector.
        """
        gate_flag = np.array([int(self.gate_open)], dtype=np.float32)
        landmarks = (self.button1_pos, self.button2_pos, self.gate_pos)

        observations = {}
        for observer in self.agents:
            origin = self.agent_positions[observer]

            # Other agents first, then the fixed components (the gate is last).
            targets = [self.agent_positions[other] for other in self.agents if other != observer]
            targets.extend(landmarks)

            sightings = [self._visible(origin, target) for target in targets]
            segments = [offset for _, offset in sightings]

            gate_seen = sightings[-1][0]
            if gate_seen:
                segments.append(gate_flag)
            else:
                segments.append(np.array([-1], dtype=np.float32))

            observations[observer] = np.concatenate(segments, dtype=np.float32)
        return observations

    '''
    Functions for Sprite rendering and loading
    '''    
    def _create_missing_sprite(self, text, bg_color, border_color = (0,0,0)):
        """Draw a placeholder sprite: a filled, outlined circle with a text label.

        Args:
            text: label written at the centre of the circle.
            bg_color: RGB fill colour of the circle.
            border_color: RGB colour of the 2-pixel outline.

        Returns:
            A transparent ``cell_size`` x ``cell_size`` surface holding the drawing.
            If writing the label fails, the surface is returned without it.
        """
        side = self.cell_size
        placeholder = pygame.Surface((side, side), pygame.SRCALPHA)
        placeholder.fill((0, 0, 0, 0))

        middle = (side // 2, side // 2)
        disc_radius = int(side * 0.4)
        pygame.draw.circle(placeholder, bg_color, middle, disc_radius)
        pygame.draw.circle(placeholder, border_color, middle, disc_radius, 2)

        # Add text label
        try:
            label_font = pygame.font.Font(None, int(side * 0.22))
            # White text on dark fills, black text on bright ones
            if sum(bg_color) > 300:
                label_color = (0, 0, 0)
            else:
                label_color = (255, 255, 255)
            label = label_font.render(text, True, label_color)
            placeholder.blit(label, label.get_rect(center=middle))

        except Exception:
            print(f"WARNING: Error while writing text inside missing sprite {text}. A missing sprite without text label will be used for this simulation.")

        return placeholder
    
    def _load_and_scale_sprite(self, filename, component_name):
        """Load ``filename`` from ``SPRITES_DIR`` and scale it to one grid cell.

        If the image cannot be loaded, a warning is printed and a placeholder
        sprite labelled with the upper-cased ``component_name`` is returned
        (blue for agents, gray for everything else).
        """
        path = os.path.join(SPRITES_DIR, filename)

        try:
            loaded = pygame.image.load(path).convert_alpha()
            side = int(self.cell_size)
            return pygame.transform.scale(loaded, (side, side))

        except (FileNotFoundError, pygame.error):
            print(f"WARNING! Sprite image {path} not found. Using default sprite for this simulation.")

            label = component_name.upper() if component_name else "ERROR"
            if "AGENT" in label:
                fill = (100, 100, 255)     # Blue agent
            else:
                fill = (128, 128, 128)     # Gray fixed components
            return self._create_missing_sprite(label, fill)

    def render(self):
        """Draw the current grid in a pygame window (no-op when ``render_mode`` is None).

        The window, the sprites and the clock are created on first use. The
        frame rate is capped at 4 frames per second.
        """
        if self.render_mode is None:
            return

        if self.window is None:
            pygame.init()
            pygame.display.init()
            pygame.font.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))
            
            for agent_name in self.agents:
               # NOTE(review): the per-agent file f"{agent_name}.png" is not requested;
               # ".png" is passed instead - presumably to fall back to the labelled
               # placeholder sprite. Confirm this is intended.
               self.agent_sprites[agent_name] = self._load_and_scale_sprite(".png", agent_name)

            for name, data in self.fixed_components.items():
                self.component_sprites[name] = self._load_and_scale_sprite(data["file"], name)

                 
        if self.clock is None:
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((220, 220, 220)) 
        pix_square_size = self.cell_size 
        sprite_offset = (pix_square_size - int(pix_square_size * 0.8)) // 2

        # Render walls
        for x in range(self.grid_size):
            for y in range(self.grid_size):
                if self.grid_map[x, y] == 1:
                    pygame.draw.rect(
                        canvas, 
                        (50, 50, 50),
                        pygame.Rect(x * pix_square_size, y * pix_square_size, pix_square_size, pix_square_size)
                    )

        # Render fixed components and agents.
        # Both gate sprites share one cell: draw only the one matching the
        # current state instead of stacking the two when the gate is closed.
        hidden_gate_sprite = "gate_close" if self.gate_open else "gate_open"
        for name, data in self.fixed_components.items():
            if name == hidden_gate_sprite:
                continue
            pos = data["pos"]
            x_coord = pos[0] * pix_square_size + sprite_offset
            y_coord = pos[1] * pix_square_size + sprite_offset
            sprite = self.component_sprites[name]
            canvas.blit(sprite, (x_coord, y_coord))

        for agent in self.agents:
            pos = self.agent_positions[agent]            
            x_coord = pos[0] * pix_square_size + sprite_offset
            y_coord = pos[1] * pix_square_size + sprite_offset
            
            sprite = self.agent_sprites[agent]
            canvas.blit(sprite, (x_coord, y_coord))

        # Render the grid (horizontal and vertical lines)
        for x in range(self.grid_size + 1):
            pygame.draw.line(
                canvas, 0, (0, pix_square_size * x), (self.window_size, pix_square_size * x), width=2
            )
            pygame.draw.line(
                canvas, 0, (pix_square_size * x, 0), (pix_square_size * x, self.window_size), width=2
            )

        # Display the render
        self.window.blit(canvas, canvas.get_rect())
        pygame.event.pump()
        pygame.display.update()
        self.clock.tick(4) # FPS

    def close(self):
        """Shut down the pygame window if one was opened; safe to call repeatedly."""
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            # Drop the stale handles so a later render() re-creates the window
            # instead of drawing on a display that no longer exists.
            self.window = None
            self.clock = None


  from pkg_resources import resource_stream, resource_exists


**Wrap the environment in RLlib's PettingZoo wrapper (for compatibility with Ray) and register it with Ray**

In [4]:
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.tune.registry import register_env

def env_creator(config):
    """Create the wrapped grid-world environment for RLlib.

    Args:
        config: environment config mapping passed by RLlib. The optional keys
            ``n_agents``, ``grid_size`` and ``vision_radius`` override the
            defaults (2, 15 and 5); an empty or missing config keeps them.

    Returns:
        A ``ParallelPettingZooEnv`` wrapping a ``MyGridWorld`` without rendering.
    """
    config = config or {}
    return ParallelPettingZooEnv(
        MyGridWorld(
            render_mode=None,
            n_agents=config.get("n_agents", 2),
            grid_size=config.get("grid_size", 15),
            vision_radius=config.get("vision_radius", 5),
        )
    )

# Make the environment available to RLlib under the name "my_gridworld".
register_env("my_gridworld", env_creator)

## **IPPO - loose play settings**

Callback to register episode success rate

In [5]:
from ray.rllib.algorithms.callbacks import DefaultCallbacks

class SuccessMetricCallback(DefaultCallbacks):
    """Record, for every finished episode, whether it ended successfully."""

    def on_episode_end(self, *, worker, base_env, policies, episode, **kwargs):
        """Store 1 or 0 in ``episode.custom_metrics["success_rate"]``.

        The flag is read from the ``is_success`` entry of the last info dict of
        one agent (the environment sets the same value for every agent). A
        missing info dict or a missing key counts as a failure instead of
        raising.
        """
        agent_ids = list(episode.get_agents())
        last_info = episode.last_info_for(agent_ids[-1]) if agent_ids else None
        is_success = bool((last_info or {}).get("is_success", False))

        episode.custom_metrics["success_rate"] = int(is_success)

New config

In [6]:
from ray.rllib.algorithms.ppo import PPOConfig

# A throw-away environment, only used to read the agent names and their spaces.
dummy_env = MyGridWorld(n_agents=2)
agent_ids = dummy_env.possible_agents

# One independent policy per agent (IPPO), each with its own LSTM model config.
multi_agent_policies = {}
for agent_id in agent_ids:
    lstm_model = {
        "use_lstm": True,
        "lstm_cell_size": 128,
        "max_seq_len": 100,
        "lstm_use_prev_action": True,
        "lstm_use_prev_reward": True,
    }
    multi_agent_policies[agent_id] = (
        None,  # default model
        dummy_env.observation_spaces[agent_id],
        dummy_env.action_spaces[agent_id],
        {"model": lstm_model},
    )

config = PPOConfig()
config = config.environment(env="my_gridworld")
config = config.env_runners(num_env_runners=0)
config = config.callbacks(callbacks_class=SuccessMetricCallback)
# Every agent is mapped to the policy that carries its own name.
config = config.multi_agent(
    policies=multi_agent_policies,
    policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id
)
config = config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False
)
# Evaluate after every training iteration, on 10 episodes, with exploration on.
config = config.evaluation(
    evaluation_interval=1,
    evaluation_duration=10,
    evaluation_duration_unit="episodes",
    evaluation_num_env_runners=1,
    evaluation_config={"explore": True}
)
# NOTE(review): both entropy_coeff and entropy_coeff_schedule are set;
# presumably the schedule takes precedence - confirm against the RLlib version in use.
config = config.training(
    lr = 1e-4,
    train_batch_size_per_learner=8000,
    num_epochs=10,
    entropy_coeff=0.01,
    entropy_coeff_schedule = [
        [0, 0.03],
        [2e6, 0.005]
    ],
    kl_coeff=0.2,
    kl_target=0.01,
)

dummy_env.close()


Old config

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig

# Previous configuration, kept for reference. Running this cell rebinds
# `config` and `multi_agent_policies`, replacing the values of the "New config" cell.
# It differs from the new one in: max_seq_len (75), train_batch_size_per_learner (4000)
# and a constant entropy_coeff (0.03) with no schedule.
dummy_env = MyGridWorld(n_agents=2)
agent_ids = dummy_env.possible_agents

# One policy per agent, each with its own LSTM model config.
multi_agent_policies = {
    agent_id: (
        None,  # default model
        dummy_env.observation_spaces[agent_id],
        dummy_env.action_spaces[agent_id],
        {
            "model": {
                "use_lstm": True,
                "lstm_cell_size": 128,
                "max_seq_len": 75,
                "lstm_use_prev_action": True,
                "lstm_use_prev_reward": True,
            }
        }
    )
    for agent_id in agent_ids
}

config = (
    PPOConfig()
    .environment(env="my_gridworld")
    .env_runners(num_env_runners=0)
    .callbacks(callbacks_class=SuccessMetricCallback)
    .multi_agent(
        policies=multi_agent_policies,
        # Every agent is mapped to the policy that carries its own name.
        policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id
    )
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False
    )
    # Evaluate after every training iteration, on 10 episodes, with exploration on.
    .evaluation(
        evaluation_interval=1,
        evaluation_duration=10,
        evaluation_duration_unit="episodes",
        evaluation_num_env_runners=1,
        evaluation_config={"explore": True}
    )
    .training(
        lr = 1e-4,
        train_batch_size_per_learner=4000,
        num_epochs=10,
        entropy_coeff=0.03,
        
        kl_coeff=0.2,
        kl_target=0.01,
    )
)

# The dummy environment was only needed to read the spaces.
dummy_env.close()


## **Save the agents model**

In [7]:
import os
def save_model(ppo, file_name):
    """Save ``ppo`` under ``<current working dir>/models/<file_name>``.

    The destination path is handed to ``ppo.save`` and then printed.
    """
    checkpoint_name = os.path.join(os.getcwd(), "models", file_name)
    ppo.save(checkpoint_name)

    print(f"Model saved into {checkpoint_name}")

## **Wandb, 3 training runs to produce estimates**

In [8]:
import wandb
group_name = "dense"

# Each run trains a fresh algorithm for 80 iterations, logs the metrics of every
# iteration to Weights & Biases and saves the final model.
# NOTE(review): only runs 2 and 3 are executed here although the banner says "/3";
# presumably run 1 was produced earlier - confirm.
for run_idx in range(2,4):
    print(f"\n{'='*40}")
    print(f"RUN NUMBER {run_idx}/3")
    print(f"{'='*40}\n")

    ppo = config.build_algo()
    wandb.init(
        project="gridworld_pomdp_looseplay_2agents",
        group=group_name,           
        name=f"run_{run_idx}",    
        reinit=True 
    )
    file_name = f"{under_examination}_run_{run_idx}"
    
    num_total_episodes = 0
    for i in range(80):
        result = ppo.train()
        train_metrics = result['env_runners']
        num_total_episodes += train_metrics['num_episodes']
        
        print(f"ITERATION {i}, num episode {num_total_episodes}".center(100,"-"))
        train_rew = round(train_metrics['episode_return_mean'], 2)
        train_steps_episode = round(train_metrics['episode_len_mean'], 2)
        train_success_rate = round(train_metrics['custom_metrics'].get('success_rate_mean', 0), 2) * 100
        print(f"""TRAINING\nRew:{train_rew} \nSteps per episode:{train_steps_episode} \nSuccess rate:{train_success_rate}%\n""")

        log_data = {
            "Episodes": num_total_episodes,

            "Training_steps_per_episode": train_steps_episode,
            "Training_success_rate_per_episode": train_success_rate,
            "Training_reward_per_episode": train_rew,
        }

        # Testing metrics are logged only for iterations that ran an evaluation:
        # otherwise they would be undefined (first iteration) or stale.
        if 'evaluation' in result:
            eval_metrics = result['evaluation']['env_runners']
            testing_rew = round(eval_metrics['episode_return_mean'], 2)
            testing_steps_episode = round(eval_metrics['episode_len_mean'], 2)
            testing_success_rate = round(eval_metrics['custom_metrics'].get('success_rate_mean', 0), 2) * 100
            print(f"""TESTING\nRew:{testing_rew} \nSteps per episode:{testing_steps_episode} \nSuccess rate:{testing_success_rate}%\n\n""")

            log_data.update({
                "Testing_steps_per_episode": testing_steps_episode,
                "Testing_success_rate_per_episode": testing_success_rate,
                "Testing_reward_per_episode": testing_rew,
            })

        wandb.log(log_data)
    save_model(ppo, file_name)
    wandb.finish()

print("FINISHED")


RUN NUMBER 2/3



`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2026-01-02 22:03:37,577	INFO worker.py:2007 -- Started a local Ray instance.
[2026-01-02 22:03:40,538 E 302279 302279] core_worker.cc:2223: Actor with class name: 'RolloutWorker' and ID: 'd8b5367ca80da5c0b1739e0401000000' has constructor arguments in the object store and max_restarts > 0. If the argume

[36m(pid=gcs_server)[0m [2026-01-02 22:04:06,529 E 302326 302326] (gcs_server) gcs_server.cc:303: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[33m(raylet)[0m [2026-01-02 22:04:07,515 E 302441 302441] (raylet) main.cc:1032: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[36m(RolloutWorker pid=302496)[0m [2026-01-02 22:04:08,354 E 302496 302612] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[2026-01-02 22:04:08,740 E 302279 302494] core_worker_process.cc:842: Failed to establish connection to the metr

------------------------------------ITERATION 0, num episode 80-------------------------------------
TRAINING
Rew:-76.76 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-58.41 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 1, num episode 160------------------------------------
TRAINING
Rew:-65.61 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-47.68 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 2, num episode 240------------------------------------
TRAINING
Rew:-49.72 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-39.89 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 3, num episode 320------------------------------------
TRAINING
Rew:-45.34 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-40.55 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 4, num episode 400------------

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Episodes,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇█
Testing_reward_per_episode,▁▁▂▂▂▂▂▂▂▃▃▂▂▂▃▂▃▄▃▃▄▃▄▃▄▅▅▅▇▅▇▆▇▇▇▅████
Testing_steps_per_episode,██████████▇█████▇▇▇█▆▇▇▇▆▆▇▇▅▆▄▅▃▄▃▅▁▂▁▁
Testing_success_rate_per_episode,▁▁▁▁▁▁▁▁▁▂▃▂▁▁▂▁▂▃▂▂▃▂▃▂▃▅▃▄▆▄▆▅▆▇▆▄████
Training_reward_per_episode,▁▂▂▂▂▃▂▂▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇███
Training_steps_per_episode,███████████████████▇█▇▇▇▇▆▆▆▆▅▅▄▄▄▃▃▃▃▁▁
Training_success_rate_per_episode,▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▁▂▂▂▃▃▃▃▄▅▅▅▆▆▆▇▆▆▇▇███

0,1
Episodes,8211.0
Testing_reward_per_episode,196.52
Testing_steps_per_episode,36.22
Testing_success_rate_per_episode,100.0
Training_reward_per_episode,192.28
Training_steps_per_episode,36.37
Training_success_rate_per_episode,96.0



RUN NUMBER 3/3



`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[36m(RolloutWorker pid=302500)[0m   from pkg_resources import resource_stream, resource_exists
[36m(pid=302495)[0m [2026-01-02 22:04:08,728 E 302495 303041] core_worker_process.cc:842: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status:

------------------------------------ITERATION 0, num episode 80-------------------------------------
TRAINING
Rew:-71.26 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-68.69 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 1, num episode 160------------------------------------
TRAINING
Rew:-63.02 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-47.98 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 2, num episode 240------------------------------------
TRAINING
Rew:-53.66 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-40.63 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 3, num episode 320------------------------------------
TRAINING
Rew:-45.16 
Steps per episode:100.0 
Success rate:0.0%

TESTING
Rew:-34.13 
Steps per episode:100.0 
Success rate:0.0%


------------------------------------ITERATION 4, num episode 400------------

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Episodes,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇█
Testing_reward_per_episode,▁▂▂▂▂▂▂▂▂▂▂▃▃▄▅▆▆▄▄▄▄▄▅▄▅▆▄▇▆▆▇▆▇█▇█████
Testing_steps_per_episode,█████████████▇▆▆▆▇▆█▇▇▆▇▇▆▇▅▆▆▆▅▄▃▅▃▂▃▂▁
Testing_success_rate_per_episode,▁▁▁▁▁▁▁▁▁▁▁▂▂▃▄▅▆▃▃▃▃▃▄▃▃▅▃▆▆▅▆▆▆█▆█████
Training_reward_per_episode,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇█████
Training_steps_per_episode,██████████████▇▇▇▆▇▆▇▇▆▇▇▆▆▆▅▅▅▅▄▃▄▃▃▂▂▁
Training_success_rate_per_episode,▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▃▄▃▄▃▃▄▄▄▄▄▄▆▆▆▆▇▇▇█████

0,1
Episodes,8645.0
Testing_reward_per_episode,198.13
Testing_steps_per_episode,25.22
Testing_success_rate_per_episode,100.0
Training_reward_per_episode,197.03
Training_steps_per_episode,31.03
Training_success_rate_per_episode,100.0


FINISHED


## **Load agents**

In [None]:
from ray.rllib.algorithms.ppo import PPO
import os

def load_model(file_name):
    """Restore a PPO algorithm from ``<current working dir>/models/<file_name>``.

    Returns:
        The algorithm built by ``PPO.from_checkpoint``.
    """
    checkpoint_name = os.path.join(os.getcwd(), "models", file_name)
    return PPO.from_checkpoint(checkpoint_name)

In [9]:
import os

current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "models")

# Pick the "best" checkpoint of the experiment under examination. Entries are
# sorted so that the choice is the same on every run when several files match.
matching_file = next(
    (
        filename
        for filename in sorted(os.listdir(model_dir))
        if "best" in filename and under_examination in filename
    ),
    None,
)
if matching_file is None:
    # Fail with a clear message instead of a NameError on `matching_file`.
    raise FileNotFoundError(
        f"No checkpoint containing 'best' and '{under_examination}' found in {model_dir}"
    )

# load_model joins its argument onto the models directory; an absolute path
# passes through that join unchanged.
file_path = os.path.join(model_dir, matching_file)
ppo = load_model(file_path)


NameError: name 'load_model' is not defined

## **Graphically test the agent**

In [None]:
LSTM_CELL_SIZE = 128      # must match "lstm_cell_size" in the policy model config
NUM_TEST_EPISODES = 3


def _fresh_recurrent_inputs(agent_ids):
    """Return zeroed LSTM states, previous actions and previous rewards per agent."""
    lstm_states = {agent_id: [np.zeros(LSTM_CELL_SIZE), np.zeros(LSTM_CELL_SIZE)] for agent_id in agent_ids}
    prev_actions = {agent_id: 0 for agent_id in agent_ids}      # 0 as default previous action
    prev_rewards = {agent_id: 0.0 for agent_id in agent_ids}    # 0 as default previous reward
    return lstm_states, prev_actions, prev_rewards


env = MyGridWorld(grid_size=15, n_agents=2, render_mode="human")
try:
    obs, _ = env.reset()

    # Initialize lstm, previous actions and rewards
    lstm_states, prev_actions, prev_rewards = _fresh_recurrent_inputs(env.possible_agents)

    num_episodes = 0
    while num_episodes<NUM_TEST_EPISODES:
        actions = {}

        # Each agent acts with its own policy and its own recurrent state.
        for agent_id, agent_obs in obs.items():
            action, new_state, _ = ppo.get_policy(agent_id).compute_single_action(
                obs=agent_obs,
                state=lstm_states[agent_id],
                prev_action=prev_actions[agent_id],
                prev_reward=prev_rewards[agent_id],
                explore=True
            )
            actions[agent_id] = action
            lstm_states[agent_id] = new_state  # update lstm states

        obs, rewards, terminations, truncations, infos = env.step(actions)

        # update prev_actions and prev_rewards
        for agent_id in obs.keys():
            prev_actions[agent_id] = actions[agent_id]
            prev_rewards[agent_id] = rewards[agent_id]

        if any(terminations.values()) or any(truncations.values()):
            obs, _ = env.reset()
            # reset lstm states, previous actions and rewards
            lstm_states, prev_actions, prev_rewards = _fresh_recurrent_inputs(env.possible_agents)
            num_episodes += 1
finally:
    # Close the pygame window even when inference or stepping fails.
    env.close()


{'agent1': array([ 1., -2., 28., 28., 28., 28., 28., 28., -1.], dtype=float32), 'agent2': array([-1.,  2., 28., 28., 28., 28.,  3., -1.,  0.], dtype=float32)}
{'agent1': array([ 2., -2., 28., 28., 28., 28., 28., 28., -1.], dtype=float32), 'agent2': array([-2.,  2.,  4.,  1., 28., 28.,  2., -1.,  0.], dtype=float32)}
{'agent1': array([ 3., -2., 28., 28., 28., 28., 28., 28., -1.], dtype=float32), 'agent2': array([-3.,  2.,  3.,  1., 28., 28.,  1., -1.,  0.], dtype=float32)}
