In [1]:
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

from PO_grid_world import PO_GridWorld
from notebook_env_wrapper import NotebookEnvWrapper

In [None]:
# Legacy evaluation cell: this code was run before truncation was introduced
# and is no longer relevant. It is disabled by wrapping it in a string
# literal, so none of the statements below execute.
# NOTE(review): model_norm, model_po and model_notenook are not defined in any
# visible cell; the output shown under this cell presumably comes from that
# earlier run - confirm before relying on those numbers.

'''
from stable_baselines3.common.evaluation import evaluate_policy

mean_reward_norm, std_reward_norm = evaluate_policy(model_norm, env, n_eval_episodes=1000)
mean_reward_po, std_reward_po = evaluate_policy(model_po, env_po, n_eval_episodes=1000)
mean_reward_notebook, std_reward_notebook = evaluate_policy(model_notenook, env_notebook, n_eval_episodes=1000)

print(f"Mean reward normal: {mean_reward_norm:.2f} +/- {std_reward_norm:.2f}")
print(f"Mean reward PO: {mean_reward_po:.2f} +/- {std_reward_po:.2f}")
print(f"Mean reward notebook: {mean_reward_notebook:.2f} +/- {std_reward_notebook:.2f}")
'''



Mean reward normal: 0.99 +/- 0.13
Mean reward PO: 0.74 +/- 0.67
Mean reward notebook: 0.89 +/- 0.45


In [2]:
# Size passed to NotebookEnvWrapper as `notebook_size`.
NOTEBOOK_SIZE = 8

# Grid world built with its default arguments.
env = PO_GridWorld()

# Grid world with partial observability switched on.
env_po = PO_GridWorld(partially_observable=True)

# A second, independent partially observable grid world, wrapped by
# NotebookEnvWrapper so it is not shared with `env_po`.
po_env_for_notebook = PO_GridWorld(partially_observable=True)
env_notebook = NotebookEnvWrapper(po_env_for_notebook, notebook_size=NOTEBOOK_SIZE)

In [8]:
# Number of independently trained models per environment; the evaluation
# cell reads this value as well.
n = 3

# Training budget and PPO hyperparameters shared by every run.
TOTAL_TIMESTEPS = 500000
PPO_KWARGS = dict(verbose=0, learning_rate=0.0001, gamma=0.9)

# One row per model family: (label used in progress messages, file-name stem
# under models_cmp/, environment to train in).
train_specs = (
    ("PO", "ppo_gridworld_po", env_po),
    ("Notebook", "ppo_gridworld_notebook", env_notebook),
)

# Train n models per family. The PO family is trained first, then the
# notebook family, matching the order of the progress messages.
for label, file_stem, train_env in train_specs:
    for i in range(n):
        # seed=i makes run i reproducible across kernel restarts while still
        # giving each of the n models a different initialisation.
        model = PPO("MlpPolicy", train_env, seed=i, **PPO_KWARGS)
        model.learn(total_timesteps=TOTAL_TIMESTEPS)
        model.save(f"models_cmp/{file_stem}_{i}")
        print(f"{label} Model {i} trained")

PO Model 0 trained
PO Model 1 trained
PO Model 2 trained
Notebook Model 0 trained
Notebook Model 1 trained
Notebook Model 2 trained


In [39]:
# Drop references to models left over from earlier runs. Popping from
# globals() tolerates names that were never defined, so this cell also works
# on a fresh kernel, where a plain `del` of a missing name raises NameError.
for _stale_name in ("model_norm", "model_po", "model_notenook", "model"):
    globals().pop(_stale_name, None)

In [None]:
# Evaluate the models
# `evaluate_policy` comes from stable_baselines3.common.evaluation (imported
# in the notebook's import cell); `n` is the number of models per family set
# in the training cell.
N_EVAL_EPISODES = 100

# One row per model family: (label used in the printed report, file-name stem
# under models_cmp/, environment to evaluate in).
eval_specs = (
    ("PO", "ppo_gridworld_po", env_po),
    ("Notebook", "ppo_gridworld_notebook", env_notebook),
)

# Per-family lists of the mean and std reward returned for each saved model.
eval_means = {}
eval_stds = {}

for label, file_stem, eval_env in eval_specs:
    eval_means[label] = []
    eval_stds[label] = []
    for i in range(n):
        model = PPO.load(f"models_cmp/{file_stem}_{i}")
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=N_EVAL_EPISODES)
        eval_means[label].append(mean_reward)
        eval_stds[label].append(std_reward)
        print(f"{label} Model {i}: {mean_reward:.2f} +/- {std_reward:.2f}")

# Keep the original result names available for later cells.
mean_rewards_po, std_rewards_po = eval_means["PO"], eval_stds["PO"]
mean_rewards_notebook, std_rewards_notebook = eval_means["Notebook"], eval_stds["Notebook"]

# The "+/-" figure below is the average of the per-model standard deviations,
# not the spread of the per-model means.
print("PO models")
print(f"Mean reward: {np.mean(mean_rewards_po):.2f} +/- {np.mean(std_rewards_po):.2f}")
print("Notebook models")
print(f"Mean reward: {np.mean(mean_rewards_notebook):.2f} +/- {np.mean(std_rewards_notebook):.2f}")

PO Model 1: 0.80 +/- 0.60
PO Model 3: 0.82 +/- 0.57
PO Model 4: 0.90 +/- 0.44
PO Model 5: 0.92 +/- 0.39
PO Model 6: 0.90 +/- 0.44
PO Model 7: 0.76 +/- 0.65
PO Model 8: 0.88 +/- 0.47
PO Model 9: 0.80 +/- 0.60
Notebook Model 0: 0.78 +/- 0.63
Notebook Model 1: 0.72 +/- 0.69
Notebook Model 2: 0.80 +/- 0.60
Notebook Model 3: 0.94 +/- 0.34
Notebook Model 4: 0.92 +/- 0.39
Notebook Model 5: 0.78 +/- 0.63
Notebook Model 6: 0.72 +/- 0.69
Notebook Model 7: 0.82 +/- 0.57
Notebook Model 8: 0.52 +/- 0.85
Notebook Model 9: 0.76 +/- 0.65
PO models
Mean reward: 0.85 +/- 0.52
Notebook models
Mean reward: 0.78 +/- 0.60
