In [34]:
import numpy as np

from stable_baselines3 import PPO
from PO_grid_world import PO_GridWorld
from notebook_env_wrapper import NotebookEnvWrapper
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
# Value forwarded to NotebookEnvWrapper as `notebook_size`; named here so it can
# be tuned in one place instead of being buried in the constructor call.
NOTEBOOK_SIZE = 8

# Plain partially observable grid world.
env_po = PO_GridWorld(partially_observable=True)

# A second, independent PO_GridWorld instance wrapped in NotebookEnvWrapper, so
# the two environments never share state.
env_notebook = NotebookEnvWrapper(
    PO_GridWorld(partially_observable=True),
    notebook_size=NOTEBOOK_SIZE,
)

In [3]:
# Glyph printed for each discrete action index (0-3).
arrows = ["↑", "↓", "←", "→"]

def print_policy(model, grid_size=6):
    """Print the model's deterministic action for every cell of a square grid.

    Cell ``(row, col)`` is presented to the model as the flat row-major index
    ``row * grid_size + col``; the action it returns selects a glyph from
    ``arrows``. One text line is printed per grid row, with each glyph
    followed by a single space.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` whose
            result holds the chosen action at position 0.
        grid_size: Side length of the grid. Defaults to 6, the value that was
            previously hard-coded, so existing calls behave as before.
    """
    for row in range(grid_size):
        for col in range(grid_size):
            obs = row * grid_size + col
            action = model.predict(obs, deterministic=True)[0]
            # The action may arrive as a NumPy scalar / 0-d array; turn it
            # into a plain int before using it as a list index.
            print(arrows[int(action)], end=" ")
        print()

def print_policy_po(po_model):
    """Print the deterministic action of ``po_model`` over a 6x6 grid.

    Each cell is reduced to the index of the 3x3 block that contains it
    (blocks numbered 0-3 in row-major order), and that block index is what
    gets passed to ``po_model.predict``. Every cell inside a block therefore
    shows the same glyph from ``arrows``.
    """
    for row in range(6):
        # Every cell on this row shares the same vertical block offset.
        block_row_offset = (row // 3) * 2
        for col in range(6):
            block_index = block_row_offset + (col // 3)
            prediction = po_model.predict(block_index, deterministic=True)
            action = prediction[0]
            print(arrows[action], end=" ")
        print()

In [27]:
# Restore the saved PO checkpoint with index 2 and show the deterministic
# action it picks for every grid cell.
model_po = PPO.load("models_cmp/ppo_gridworld_po_2")
print_policy_po(po_model=model_po)

↓ ↓ ↓ ↑ ↑ ↑ 
↓ ↓ ↓ ↑ ↑ ↑ 
↓ ↓ ↓ ↑ ↑ ↑ 
→ → → ↑ ↑ ↑ 
→ → → ↑ ↑ ↑ 
→ → → ↑ ↑ ↑ 


In [28]:
# Number of saved PO checkpoints to score (indices 0 .. n-1).
n = 3

# Per-checkpoint results, collected in checkpoint order for the summary cell.
mean_rewards_po, std_rewards_po = [], []

for i in range(n):
    checkpoint_path = f"models_cmp/ppo_gridworld_po_{i}"
    model = PPO.load(checkpoint_path)
    # evaluate_policy hands back the mean and standard deviation of the
    # per-episode reward over the evaluation episodes.
    mean_reward, std_reward = evaluate_policy(
        model,
        env_po,
        n_eval_episodes=1000,
    )
    mean_rewards_po.append(mean_reward)
    std_rewards_po.append(std_reward)
    print(f"PO Model {i}: {mean_reward:.2f} +/- {std_reward:.2f}")

PO Model 0: 0.74 +/- 0.67
PO Model 1: 0.73 +/- 0.69
PO Model 2: 0.77 +/- 0.63


In [36]:
# Number of saved notebook-wrapper checkpoints to score (indices 0 .. n-1).
n = 3

# Per-checkpoint results, collected in checkpoint order for the summary cell.
mean_rewards_notebook, std_rewards_notebook = [], []

for i in range(n):
    checkpoint_path = f"models_cmp/ppo_gridworld_notebook_{i}"
    model = PPO.load(checkpoint_path)
    # Scored on the NotebookEnvWrapper environment rather than the bare one.
    mean_reward, std_reward = evaluate_policy(
        model,
        env_notebook,
        n_eval_episodes=1000,
    )
    mean_rewards_notebook.append(mean_reward)
    std_rewards_notebook.append(std_reward)
    print(f"Notebook Model {i}: {mean_reward:.2f} +/- {std_reward:.2f}")



Notebook Model 0: 0.88 +/- 0.48
Notebook Model 1: 0.81 +/- 0.58
Notebook Model 2: 0.81 +/- 0.50


In [37]:
def _print_group_summary(label, mean_rewards, std_rewards):
    """Print the overall mean reward and pooled standard deviation of a model group.

    Standard deviations cannot simply be averaged: doing so ignores how much
    the per-model means differ from each other. The spread of all episodes
    combined follows from the law of total variance:

        pooled_var = mean(per-model variance) + variance(per-model means)

    This is exact only when every model contributes the same number of
    episodes and each per-model std is a population std (ddof=0), which is
    assumed here because each checkpoint is evaluated with the same
    ``n_eval_episodes``.

    Args:
        label: Heading printed before the statistics line.
        mean_rewards: Per-model mean episode rewards.
        std_rewards: Per-model standard deviations of the episode reward,
            in the same order as ``mean_rewards``.
    """
    means = np.asarray(mean_rewards, dtype=float)
    stds = np.asarray(std_rewards, dtype=float)
    pooled_std = np.sqrt(np.mean(stds ** 2) + np.var(means))
    print(label)
    print(f"Mean reward: {np.mean(means):.2f} +/- {pooled_std:.2f}")


_print_group_summary("PO models", mean_rewards_po, std_rewards_po)
_print_group_summary("Notebook models", mean_rewards_notebook, std_rewards_notebook)

PO models
Mean reward: 0.75 +/- 0.66
Notebook models
Mean reward: 0.83 +/- 0.52
