In [None]:
import glob
import numpy as np
import gym
import torch as th

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms import bc

# Gym environment id passed to make_vec_env when the evaluation environments are built.
env_id = "MountainCarContinuous-v0"

In [None]:
from MountainCar_utils import MountainCarContinuousObsWrapper

def wrapper(env):
    """Return ``env`` wrapped in ``MountainCarContinuousObsWrapper``.

    This function is handed to ``make_vec_env`` as ``wrapper_class`` so that
    every sub-environment of the wrapped vectorised env goes through it.
    NOTE(review): presumably the wrapper changes the observation (the related
    variables are named "angle") -- confirm in MountainCar_utils.
    """
    return MountainCarContinuousObsWrapper(env)

In [None]:
def _load_ppo_group(pattern):
    """Load every PPO checkpoint whose path matches the glob ``pattern``."""
    return [PPO.load(path, print_system_info=False) for path in glob.glob(pattern)]


# Populations of trained PPO policies, one list per training configuration.
expert_angle_policy = _load_ppo_group('./Compare/test_expert/expert*.zip')
expert_angle_policy_n32b64 = _load_ppo_group('./Compare/expert_3264_1kk/expert*.zip')
noob_policy = _load_ppo_group('./Compare/test_noob/noob*.zip')
noob_policy_n32b64 = _load_ppo_group('./Compare/noob_3264_1kk/noob*.zip')

# Individually saved checkpoints.
best_expert = PPO.load("./policy/ppo_MountainCarContinuous_expert16921.zip", print_system_info=False)
best_noob = PPO.load("./policy/ppo_MountainCarContinuous_noob32230.zip", print_system_info=False)
# NOTE(review): th.load unpickles the file, so only load checkpoints from a trusted source.
bc_policy = th.load('./policy/bc1.zip')

In [None]:
# Both vectorised environments run ten copies of the same task; only
# `env_angle` routes each copy through `wrapper`.
_vec_kwargs = {"n_envs": 10}
env_angle = make_vec_env(env_id, wrapper_class=wrapper, **_vec_kwargs)
env = make_vec_env(env_id, **_vec_kwargs)

In [None]:
def _seeded_eval(policy, vec_env):
    """Seed ``vec_env`` with 0, evaluate ``policy`` for 100 episodes, return ``[mean, std]``."""
    vec_env.seed(0)
    mean_reward, std_reward = evaluate_policy(policy, vec_env, 100)
    return [mean_reward, std_reward]


# Expert populations are evaluated on the wrapped environment.
expert_rewards = [_seeded_eval(policy, env_angle) for policy in expert_angle_policy]
expert_rewards_n32b64 = [_seeded_eval(policy, env_angle) for policy in expert_angle_policy_n32b64]

# Noob populations are evaluated on the plain environment.
noob_rewards = [_seeded_eval(policy, env) for policy in noob_policy]
noob_rewards_n32b64 = [_seeded_eval(policy, env) for policy in noob_policy_n32b64]

# Single checkpoints: behavioural-cloning policy, then the saved best expert / noob.
bc_rew, bc_std = _seeded_eval(bc_policy, env)
best_expert_rew, best_expert_std = _seeded_eval(best_expert, env_angle)
best_noob_rew, best_noob_std = _seeded_eval(best_noob, env)

In [None]:
def filter_rew_and_std(rewards, trim=2):
    """Trimmed summary of a collection of ``[mean_reward, reward_std]`` pairs.

    The pairs are ordered by mean reward, the ``trim`` lowest and ``trim``
    highest entries are discarded, and the survivors are aggregated.

    Rows are kept intact while ordering. Sorting with ``np.sort(..., axis=0)``
    would sort the reward column and the std column independently, so the
    stds that survive trimming would no longer belong to the surviving rewards.

    Args:
        rewards: sequence of ``[mean_reward, reward_std]`` pairs.
        trim: number of entries dropped from each end of the reward ordering
            (default 2).

    Returns:
        ``(rew, std)`` where ``rew`` is the mean of the kept mean rewards and
        ``std`` is the root-mean-square of the kept standard deviations.
        ``(nan, nan)`` is returned when no entry survives the trimming.

    Raises:
        ValueError: if ``trim`` is negative.
    """
    if trim < 0:
        raise ValueError("trim must be non-negative")

    results = np.asarray(rewards)
    if results.size == 0:
        return np.nan, np.nan

    # Order whole rows by their reward so each std stays with its reward.
    order = np.argsort(results[:, 0], kind="stable")
    kept = results[order][trim:len(results) - trim]
    if len(kept) == 0:
        return np.nan, np.nan

    rew = np.mean(kept[:, 0])
    # ||s|| / sqrt(n) == sqrt(mean(s ** 2)): RMS of the per-policy stds.
    std = np.linalg.norm(kept[:, 1]) / np.sqrt(len(kept))
    return rew, std

def get_best_result(rewards):
    """Return the ``[mean_reward, reward_std]`` pair with the highest mean reward.

    The row is selected as a whole. Sorting with ``np.sort(..., axis=0)`` and
    taking the last row would pair the largest reward with the largest std
    found anywhere in the input, not with the std of the best policy.

    Args:
        rewards: non-empty sequence of ``[mean_reward, reward_std]`` pairs.

    Returns:
        The row (length-2 array) whose mean reward is largest; on ties the
        first such row is returned.

    Raises:
        IndexError: if ``rewards`` is empty.
    """
    results = np.asarray(rewards)
    if results.size == 0:
        raise IndexError("cannot pick the best result from an empty collection")
    return results[np.argmax(results[:, 0])]

In [None]:
# Trimmed mean reward and aggregated std for each policy population.
(
    (expert_rew, expert_std),
    (expert_rew_n32b64, expert_std_n32b64),
    (noob_rew, noob_std),
    (noob_rew_n32b64, noob_std_n32b64),
) = map(
    filter_rew_and_std,
    (expert_rewards, expert_rewards_n32b64, noob_rewards, noob_rewards_n32b64),
)

# Best single result within each n32b64 population.
# NOTE(review): these assignments replace the best_expert_* / best_noob_* values
# obtained earlier by evaluating the `best_expert` / `best_noob` checkpoints --
# confirm which of the two is meant to be reported.
(best_expert_rew, best_expert_std), (best_noob_rew, best_noob_std) = map(
    get_best_result,
    (expert_rewards_n32b64, noob_rewards_n32b64),
)

In [None]:
# TODO: add pandas (present these results as a table).
# Each line prints a label left-aligned in a 30-character column, followed by
# "mean +/- std" with two decimals.
# print(f'{"expert reward":<30} {expert_rew:.2f} +/- {expert_std:.2f}')
print(f'{"expert reward n32b64":<30} {expert_rew_n32b64:.2f} +/- {expert_std_n32b64:.2f}')
print(f'{"best expert reward":<30} {best_expert_rew:.2f} +/- {best_expert_std:.2f}')
# print(f'{"noob reward":<30} {noob_rew:.2f} +/- {noob_std:.2f}')
print(f'{"noob reward n32b64":<30} {noob_rew_n32b64:.2f} +/- {noob_std_n32b64:.2f}')
print(f'{"best noob reward":<30} {best_noob_rew:.2f} +/- {best_noob_std:.2f}')
print(f'{"BC reward":<30} {bc_rew:.2f} +/- {bc_std:.2f}')

expert reward n32b64           94.18 +/- 0.06
noob reward n32b64             93.39 +/- 0.66