# Use Centralized PPO from Stable Baselines-3 on Custom Particle Environment

In [None]:
# This is the run-through with random actions
import particle_v1
# `parallel_env` builds the environment through the PettingZoo Parallel API:
# all agents act at once, so it is driven with one action dict per step rather
# than with the AEC helpers `agent_iter()` / `last()`.
env = particle_v1.parallel_env(num_agents=10, num_food_sources=1, flow = 'none', max_cycles=25, render_mode='human')

try:
    env.reset() # Do seed=42 for reproducibility
    # In the Parallel API `env.agents` holds the agents that are still active;
    # it is empty once every agent has terminated or been truncated.
    while env.agents:
        # this is where you would insert your policy
        actions = {agent: env.action_space(agent).sample() for agent in env.agents}
        observations, rewards, terminations, truncations, infos = env.step(actions)
finally:
    # Always release the render window, even if a step raises.
    env.close()

  from pkg_resources import resource_stream, resource_exists


We wrap our environment so that it matches the interface expected by the Stable-Baselines3 algorithms (it needs to be a vectorised environment).

In [3]:
import particle_v1
import supersuit as ss
from stable_baselines3 import PPO

# Create the PettingZoo environment through its Parallel API (the form that
# SuperSuit's vector-env conversion below takes).
# Rendering is switched off for training: drawing a window on every step is
# the likely cause of the very low throughput (~9 fps) of the recorded run.
# The evaluation cells build their own environment with render_mode='human'.
env = particle_v1.parallel_env(num_agents=10, num_food_sources=1, flow='none', max_cycles=25, render_mode=None)

# Convert the multi-agent environment into a vector environment (needed for supersuit + SB3)
env = ss.pettingzoo_env_to_vec_env_v1(env)

# Concatenate copies of that vector environment (here a single copy on one CPU)
# and expose the result through the Stable-Baselines3 VecEnv interface.
# Note: no observation/reward normalisation is applied. It is optional and
# mostly useful with extreme values; per the original note, here we have a window of -1 to 1.
env = ss.concat_vec_envs_v1(env, num_vec_envs=1, num_cpus=1, base_class="stable_baselines3")

Now we train a simple PPO model. Note that this is not true MAPPO (multi-agent PPO), because both the policy AND the execution are centralised.

In [None]:
# Build the PPO learner with the "MlpPolicy" network on the vectorised env.
# n_steps (kept at the default of 2048) is the number of step() calls collected
# from each environment before every policy update.
model = PPO("MlpPolicy", env, n_steps=2048, verbose=1)

# total_timesteps is only a cut-off on the number of samples (env steps),
# counted over all environments and updates.
# The recorded run took about 34 minutes (time_elapsed 2067 s) and logged
# 20480 timesteps after its single iteration, i.e. more than the cut-off.
model.learn(total_timesteps=10000)

# Persist the trained model; the evaluation cells reload it from this path.
model.save("models/mappo_shared")

Using cpu device
------------------------------
| time/              |       |
|    fps             | 9     |
|    iterations      | 1     |
|    time_elapsed    | 2067  |
|    total_timesteps | 20480 |
------------------------------


We evaluate our model on a new environment instance. (Opens visualisation automatically)

In [None]:
# Evaluation of environment
import particle_v1
import supersuit as ss
from stable_baselines3 import PPO

# Fresh environment instance for evaluation, wrapped exactly like the training one.
env = particle_v1.parallel_env(num_agents=10, num_food_sources=1, flow='none', max_cycles=25, render_mode='human')
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, num_vec_envs=1, num_cpus=1, base_class="stable_baselines3")

try:
    obs = env.reset()

    # Reload model
    model = PPO.load("models/mappo_shared", env=env)

    for i in range(10):
        # predict() samples from the policy by default; pass deterministic=True
        # to always take the most likely action instead.
        action, _ = model.predict(obs)
        # obs, rewards, dones and infos are all vectors of size num_agents;
        # dones[k] is 1 (True) once the episode of agent k has ended.
        obs, rewards, dones, infos = env.step(action)
        print(f"Rewards at step {i}:")
        print(rewards)
finally:
    # Release the environment (and its render window) even if a step fails.
    env.close()

Rewards at step 0:
[-1.1371735  -0.13711435 -0.84482586 -0.25118434 -0.9150127  -0.99484783
 -0.5816918  -2.2213273  -1.5276202  -0.82559264]
Rewards at step 1:
[-1.0530511  -0.13102573 -0.7456678  -0.21609263 -0.90993154 -1.0004635
 -0.65373605 -2.2090368  -1.5537815  -0.8217115 ]
Rewards at step 2:
[-0.9965045  -0.15468349 -0.77245975 -0.19054812 -0.810871   -0.91794074
 -0.63505465 -2.2410457  -1.5452732  -0.84965503]
Rewards at step 3:
[-0.8910552  -0.20312041 -0.73831147 -0.16368963 -0.6774763  -0.8070537
 -0.6720824  -2.2410457  -1.6245698  -0.86827207]
Rewards at step 4:
[-0.77923787 -0.2242342  -0.71944207 -0.12690601 -0.57119375 -0.69667387
 -0.6283018  -2.1555922  -1.8042825  -0.88829845]
Rewards at step 5:
[-0.6331607  -0.2202357  -0.69307995 -0.14415388 -1.4545912  -0.6916193
 -1.5362015  -2.0915108  -1.9282336  -0.9469649 ]
Rewards at step 6:
[-0.58991045 -0.25735444 -0.6882543  -0.16100298 -0.29906115 -0.6624637
 -0.52737474 -1.9711088  -1.9340165  -1.0109391 ]
Rewards at

That was a fixed-length loop, but we can also iterate until all agents are flagged as "done":

In [None]:
import particle_v1
import supersuit as ss
from stable_baselines3 import PPO

# Fresh environment instance for evaluation, wrapped exactly like the training one.
env = particle_v1.parallel_env(num_agents=10, num_food_sources=1, flow='none', max_cycles=25, render_mode='human')
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, num_vec_envs=1, num_cpus=1, base_class="stable_baselines3")

try:
    obs = env.reset() # Do seed=42 for reproducibility

    # Reload model
    model = PPO.load("models/mappo_shared", env=env)

    dones = [False] * env.num_envs  # Track done status for each environment

    # Safety net: the recorded run printed far more than max_cycles=25 rows of
    # all-zero `dones`, so never rely on the done flags alone to end the loop.
    max_steps = 500
    step = 0

    while not all(dones) and step < max_steps:
        # Predict actions for all agents
        actions, _ = model.predict(obs)

        # Step through the environment
        obs, rewards, dones, infos = env.step(actions)
        print(dones)
        step += 1

    if not all(dones):
        print(f"Stopped after {max_steps} steps without all agents being done.")
finally:
    # Release the environment (and its render window) even if a step fails.
    env.close()

[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0