In [None]:
import glob
import numpy as np
import gym
import torch as th

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms import bc

import gnwrapper

# Gym environment id handed to make_vec_env when the evaluation environments are built.
env_id = "CarRacing-v0"

In [None]:
from CarRacing_utils import CarRacingGroundTruthXYObsWrapper
def wrapper(env):
    """Wrap ``env`` for the ground-truth XY policies.

    The environment is first wrapped in ``CarRacingGroundTruthXYObsWrapper``
    and the result is then wrapped in ``gnwrapper.Animation``.

    Args:
        env: the environment instance to wrap.

    Returns:
        The doubly wrapped environment.
    """
    return gnwrapper.Animation(CarRacingGroundTruthXYObsWrapper(env))

def wrapper_image(env):
    """Wrap ``env`` for the image-based policies.

    Observations are converted by ``GrayScaleObservation`` with
    ``keep_dim=True`` and the result is then wrapped in ``gnwrapper.Animation``.

    Args:
        env: the environment instance to wrap.

    Returns:
        The doubly wrapped environment.
    """
    grayscale_env = gym.wrappers.gray_scale_observation.GrayScaleObservation(
        env,
        keep_dim=True,
    )
    return gnwrapper.Animation(grayscale_env)

In [None]:
# Seed sweeps: one PPO model per checkpoint archive matched by each glob pattern.
expert_xy_policy = [
    PPO.load(checkpoint, print_system_info=False)
    for checkpoint in glob.glob('./Compare/expert_xy_seed/expert*.zip')
]
noob_policy = [
    PPO.load(checkpoint, print_system_info=False)
    for checkpoint in glob.glob('./Compare/noob_seed/noob*.zip')
]

# Individual checkpoints loaded by explicit path.
best_expert = PPO.load(
    "./policy/ppo_CarRacing_XY_expert33046.zip",
    print_system_info=False,
)
best_noob = PPO.load(
    "./policy/ppo_CarRacing_noob33046.zip",
    print_system_info=False,
)

# NOTE(review): depending on the torch version, th.load may unpickle arbitrary
# objects -- only point it at checkpoint files you trust.
bc_policy = th.load('policy/bc1.zip')

In [None]:
# 10 parallel environments built with `wrapper` (used for the XY expert policies).
env_gt = make_vec_env(
    env_id,
    n_envs=10,
    wrapper_class=wrapper,
    env_kwargs=dict(verbose=False),
)

# 10 parallel environments built with `wrapper_image`, with 2 frames stacked.
env = VecFrameStack(
    make_vec_env(
        env_id,
        n_envs=10,
        wrapper_class=wrapper_image,
        env_kwargs=dict(verbose=False),
    ),
    2,
)

In [None]:
# Each entry is [mean_reward, reward_std] from one 10-episode evaluation.

# Expert checkpoints are scored on the ground-truth environments.
expert_rewards = [
    list(evaluate_policy(policy, env_gt, n_eval_episodes=10))
    for policy in expert_xy_policy
]

# Noob checkpoints are scored on the frame-stacked image environments.
noob_rewards = [
    list(evaluate_policy(policy, env, n_eval_episodes=10))
    for policy in noob_policy
]

# The single BC policy is scored in 10 separate evaluation rounds.
bc_rewards = [
    list(evaluate_policy(bc_policy, env, n_eval_episodes=10))
    for _ in range(10)
]

In [None]:
# Score the two individually loaded checkpoints over 10 episodes each:
# the expert on the ground-truth environments, the noob on the image ones.
best_expert_rew, best_expert_std = evaluate_policy(
    best_expert,
    env_gt,
    n_eval_episodes=10,
)
best_noob_rew, best_noob_std = evaluate_policy(
    best_noob,
    env,
    n_eval_episodes=10,
)

In [None]:
def _trimmed_reward_summary(rewards, trim=2):
    """Summarise repeated evaluations after dropping the best and worst runs.

    Args:
        rewards: sequence of ``[mean_reward, reward_std]`` pairs, one per
            evaluation run.
        trim: number of runs discarded at each end of the reward ranking.

    Returns:
        A tuple ``(kept, mean_reward, pooled_std)`` where ``kept`` is the
        ``(n - 2 * trim, 2)`` array of retained rows ordered by mean reward,
        ``mean_reward`` is the average of its first column and ``pooled_std``
        is the root-mean-square of its second column.

    Raises:
        ValueError: if there are not more than ``2 * trim`` runs, because
            nothing would be left after trimming.
    """
    runs = np.asarray(rewards, dtype=float).reshape(-1, 2)
    if len(runs) <= 2 * trim:
        raise ValueError(
            f"need more than {2 * trim} evaluation runs to trim {trim} "
            f"from each end, got {len(runs)}"
        )

    # Rank whole rows by mean reward. np.sort(runs, axis=0) would sort the two
    # columns independently and detach every std from the run it belongs to.
    ranked = runs[np.argsort(runs[:, 0], kind="stable")]
    kept = ranked[trim:len(ranked) - trim]

    mean_reward = kept[:, 0].mean()
    # norm / sqrt(n) == sqrt(mean(std ** 2)): RMS of the retained runs' stds.
    pooled_std = np.linalg.norm(kept[:, 1]) / np.sqrt(len(kept))
    return kept, mean_reward, pooled_std


expert_rewards_sort, expert_rew, expert_std = _trimmed_reward_summary(expert_rewards)
noob_rewards_sort, noob_rew, noob_std = _trimmed_reward_summary(noob_rewards)
bc_rewards_sort, bc_rew, bc_std = _trimmed_reward_summary(bc_rewards)

In [None]:
# One row per policy group: (label, mean reward, reward std).
summary_rows = (
    ("expert reward", expert_rew, expert_std),
    ("best expert reward", best_expert_rew, best_expert_std),
    ("noob reward", noob_rew, noob_std),
    ("best noob reward", best_noob_rew, best_noob_std),
    ("BC reward", bc_rew, bc_std),
)
for label, mean_reward, reward_std in summary_rows:
    # Label is left-aligned in a 20-character column; values use two decimals.
    print(f'{label:<20} {mean_reward:.2f} +/- {reward_std:.2f}')