In [1]:
from ma_v03 import MA_PARTY
import matplotlib
import supersuit as ss
from stable_baselines3 import PPO
import time

In [2]:
# Demo: play a single rendered episode in which every agent picks a random
# action, restricted by the action mask the environment reports in `infos`.
env = MA_PARTY(render_mode="human")  # , debug_mode=True)
observations, infos = env.reset()

# `env.agents` is used as the loop condition: the episode continues for as
# long as it is non-empty.
while env.agents:
    actions = {
        agent: env.action_space(agent).sample(infos[agent]["action_mask"])
        for agent in env.agents
    }

    observations, rewards, terminations, truncations, infos = env.step(actions)

env.close()

Roll initiative!
Round 1 starts with 4 Goblins alive!
(3) Rogue turn starts ...
... attacks a goblin with a Rapier ...
... hits for 11 damage killing the goblin!
(0) Fighter turn starts ...
... moves to the backline!
Goblin turn starts ...
... attacking rogue ...
... misses!
Goblin turn starts ...
... attacking rogue ...
... hits for 5 damage leaving the rogue at 4 hp!
Goblin turn starts ...
... attacking rogue ...
... hits for 6 damage hitting the rogue unconcious!
(2) Wizard turn starts ...
... casts Firebolt ...
... hits for 2 damage leaving the goblin at 5 hp!
(6) Cleric turn starts ...
... casts Sacred Flame ...
... but misses!
rogue: 
{'hp': 0, 'alive': 0, 'zone': 2, 'spellslots': 0}
fighter: 
{'hp': 13, 'alive': 1, 'zone': 1, 'spellslots': 0}
wizard: 
{'hp': 5, 'alive': 1, 'zone': 1, 'spellslots': 2}
cleric: 
{'hp': 10, 'alive': 1, 'zone': 1, 'spellslots': 2}
goblins:
{'hp': 5, 'alive': 1}
{'hp': 7, 'alive': 1}
{'hp': 0, 'alive': 0}
{'hp': 7, 'alive': 1}
Round 2 starts with 3 Go

In [2]:
def train(env_fn, steps: int = 10_000, seed: int = 0, **env_kwargs):
    """Train a PPO policy on a multi-agent environment and save it to disk.

    Args:
        env_fn: Callable that builds the environment; invoked as
            ``env_fn(**env_kwargs)``.
        steps: Total number of timesteps handed to ``model.learn``.
        seed: Seed passed to the environment's initial ``reset``.
        **env_kwargs: Extra keyword arguments forwarded to ``env_fn``.

    Returns:
        The name the model was saved under (environment name followed by a
        timestamp), so callers can load the model again later.
    """
    env = env_fn(**env_kwargs)

    # Read the name once, before wrapping, so every message and the saved
    # file name use the same value regardless of what the wrapper exposes.
    env_name = str(env.metadata["name"])
    print(f"Preparing training on {env_name}.")

    try:
        env.reset(seed)

        # Convert the multi-agent env into a vectorised env that SB3 accepts.
        env = ss.pettingzoo_env_to_vec_env_v1(env)

        model = PPO(
            "MlpPolicy",
            env,
            verbose=3,
            batch_size=256,
            tensorboard_log="./ppo_ma_party_tensorboard/",
        )

        model.learn(total_timesteps=steps)
        model_name = f"{env_name}_{time.strftime('%Y%m%d-%H%M%S')}"
        model.save(model_name)

        print("Model has been saved.")
        print(f"Finished training on {env_name}.")
        print(model_name)
    finally:
        # Release the environment even when training or saving fails.
        env.close()

    return model_name


# Keep the saved model's name so later cells can refer to it.
model_name = train(env_fn=MA_PARTY, steps=100_000)

Preparing training on marl_heroes_vs_goblins_v01.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_ma_party_tensorboard/PPO_74
-----------------------------
| time/              |      |
|    fps             | 898  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 789        |
|    iterations           | 2          |
|    time_elapsed         | 5          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01023704 |
|    clip_fraction        | 0.0664     |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.19      |
|    explained_variance   | 0.000495   |
|    learning_rate        | 0.0003     |
|    loss                 | 127        |
|    n_updates            |

In [4]:
# Evaluate the saved PPO policy over `runs` episodes and report the average
# final reward and the share of episodes that ended with a reward of 100.
model = PPO.load("marl_heroes_vs_goblins_v01_20240731-121106")
env = MA_PARTY(render_mode="human")  # , debug_mode=True)

reward_total = 0
all_heroes_alive = 0
runs = 1000

# range(runs) plays exactly `runs` episodes, matching the divisor used for
# the averages below (range(1, runs) played one episode too few).
for run in range(runs):
    observations, infos = env.reset()
    while env.agents:
        # NOTE(review): SB3's predict() samples stochastically unless
        # deterministic=True is passed, and the action mask is not applied
        # here - confirm both are intended for this evaluation.
        actions = {
            agent: model.predict(observations[agent])[0].item()
            for agent in env.agents
        }

        observations, rewards, terminations, truncations, infos = env.step(actions)

    # Only the reward from the last step of the episode is counted.
    # Assumes the rogue's final reward reflects the outcome for the whole
    # party (100 == every hero survived) - TODO confirm against MA_PARTY.
    reward_total += rewards["rogue"]
    if rewards["rogue"] == 100:
        all_heroes_alive += 1

# Close once, after all episodes, instead of closing and re-resetting the
# environment on every iteration.
env.close()

reward_per_round = reward_total / runs
all_heroes_alive_average = all_heroes_alive / runs * 100
print(f"Reward per round: {reward_per_round}") # earlier run: 87.51
print(f"Success rate: {all_heroes_alive_average} %") # earlier run: 81.8 %

Reward per round: 88.0
Success rate: 82.6 %


In [5]:
# Baseline: agents act uniformly at random among the actions allowed by the
# environment's action mask. Reports the same metrics as the PPO evaluation.
env = MA_PARTY(render_mode="human")  # , debug_mode=True)
reward_total = 0
all_heroes_alive = 0
runs = 1000

# range(runs) plays exactly `runs` episodes, matching the divisor used for
# the averages below (range(1, runs) played one episode too few).
for run in range(runs):
    observations, infos = env.reset()
    while env.agents:
        actions = {
            agent: env.action_space(agent).sample(infos[agent]["action_mask"])
            for agent in env.agents
        }

        observations, rewards, terminations, truncations, infos = env.step(actions)

    # Only the reward from the last step of the episode is counted.
    # Assumes the rogue's final reward reflects the outcome for the whole
    # party (100 == every hero survived) - TODO confirm against MA_PARTY.
    reward_total += rewards["rogue"]
    if rewards["rogue"] == 100:
        all_heroes_alive += 1

# Close once, after all episodes, instead of closing and re-resetting the
# environment on every iteration.
env.close()

reward_per_round = reward_total / runs
all_heroes_alive_average = all_heroes_alive / runs * 100
print(f"Reward per round: {reward_per_round}") # earlier run: 60.9
print(f"Success rate: {all_heroes_alive_average} %") # earlier run: 48.4 %

Reward per round: 61.26
Success rate: 48.3 %


In [7]:
# Baseline: agents act uniformly at random over the full action space,
# ignoring the action mask. Also counts episodes ending with -100 and -50.
env = MA_PARTY(render_mode="human")  # , debug_mode=True)
reward_total = 0
all_heroes_alive = 0
no_heroes_alive = 0
timeout_times = 0
runs = 1000

# range(runs) plays exactly `runs` episodes, matching the divisor used for
# the averages below (range(1, runs) played one episode too few).
for run in range(runs):
    observations, infos = env.reset()
    while env.agents:
        actions = {
            agent: env.action_space(agent).sample()
            for agent in env.agents
        }

        observations, rewards, terminations, truncations, infos = env.step(actions)

    # Only the reward from the last step of the episode is counted.
    # Assumes the rogue's final reward encodes the episode outcome
    # (100 = all heroes alive, -100 = no heroes alive, -50 = timeout)
    # - TODO confirm these values against MA_PARTY.
    final_reward = rewards["rogue"]
    reward_total += final_reward
    if final_reward == 100:
        all_heroes_alive += 1
    elif final_reward == -100:
        no_heroes_alive += 1
    elif final_reward == -50:
        timeout_times += 1

# Close once, after all episodes, instead of closing and re-resetting the
# environment on every iteration.
env.close()

reward_per_round = reward_total / runs
all_heroes_alive_average = all_heroes_alive / runs * 100
no_heroes_alive_average = no_heroes_alive / runs * 100

print(f"Reward per round: {reward_per_round}") # earlier run: -17.93
print(f"Timeouts: {timeout_times}") # earlier run: 29
print(f"Success rate: {all_heroes_alive_average} %") # earlier run: 13.7 %
print(f"Failure rate: {no_heroes_alive_average} %") # earlier run: 46.6 %

Reward per round: -16.07
Timeouts: 29
Success rate: 16.5 %
Failure rate: 46.6 %
