In [None]:
import glob
import numpy as np
import gym
import torch as th
from tqdm import tqdm
import time

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms import bc

import gnwrapper

# Gym environment id passed to make_vec_env when the evaluation environments are built.
env_id = "CarRacing-v0"

In [None]:
from CarRacing_utils import CarRacingGroundTruthXYObsWrapper
def wrapper(env):
    """Wrap ``env`` for the ground-truth XY observation setup.

    The environment is first wrapped in ``CarRacingGroundTruthXYObsWrapper``
    and the result is then wrapped in ``gnwrapper.Animation``.

    Args:
        env: the environment instance to wrap.

    Returns:
        The doubly wrapped environment.
    """
    return gnwrapper.Animation(CarRacingGroundTruthXYObsWrapper(env))

def wrapper_image(env):
    """Wrap ``env`` for the image-observation setup.

    Applies gym's ``GrayScaleObservation`` (with ``keep_dim=True``) and then
    wraps the result in ``gnwrapper.Animation``.

    Args:
        env: the environment instance to wrap.

    Returns:
        The doubly wrapped environment.
    """
    gray_env = gym.wrappers.gray_scale_observation.GrayScaleObservation(
        env, keep_dim=True
    )
    return gnwrapper.Animation(gray_env)

In [None]:
# One PPO model per checkpoint archive matched by each glob pattern.
expert_xy_policy = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/expert_xy_seed_1kk/expert*.zip')
]

noob_policy = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_seed_1kk/noob*.zip')
]

# best_expert = PPO.load("./policy/ppo_CarRacing_XY_expert33046.zip", print_system_info=False)
# best_noob = PPO.load("./policy/ppo_CarRacing_noob33046.zip", print_system_info=False)

# NOTE(review): th.load unpickles the file, which can run arbitrary code;
# only load this checkpoint if it comes from a trusted source.
bc_policy = th.load('policy/bc1.zip')

In [None]:
# Ground-truth XY observation environments (10 parallel copies).
env_gt = make_vec_env(
    env_id,
    n_envs=10,
    wrapper_class=wrapper,
    env_kwargs={'verbose': False},
)

# Image observation environments (10 parallel copies), wrapped in a
# VecFrameStack with a stack size of 2.
env = VecFrameStack(
    make_vec_env(
        env_id,
        n_envs=10,
        wrapper_class=wrapper_image,
        env_kwargs={'verbose': False},
    ),
    2,
)

In [None]:
# Each entry collected below is [mean_reward, reward_std] as returned by
# evaluate_policy. The environment is re-seeded with 0 before every
# evaluation so that each policy is evaluated after the same seed call.
print('expert reward')
expert_rewards = []
for policy in tqdm(expert_xy_policy):
    env_gt.seed(0)
    expert_rewards.append(list(evaluate_policy(policy, env_gt, 10)))

print('noob reward')
noob_rewards = []
for policy in tqdm(noob_policy):
    env.seed(0)
    noob_rewards.append(list(evaluate_policy(policy, env, 10)))

# The single behavioural-cloning policy is scored on the image environment.
print('BC reward')
env.seed(0)
bc_rew, bc_std = evaluate_policy(bc_policy, env, 10)

In [None]:
# env_gt.seed(0)
# best_expert_rew, best_expert_std = evaluate_policy(best_expert, env_gt, 10)
# env.seed(0)
# best_noob_rew, best_noob_std = evaluate_policy(best_noob, env, 10)

In [None]:
def filter_rew_and_std(rewards, trim=2):
    """Aggregate per-policy evaluation results after discarding the extremes.

    Rows are sorted by mean reward, then the ``trim`` lowest and ``trim``
    highest rows are dropped before aggregating what is left.

    Args:
        rewards: sequence of ``[mean_reward, reward_std]`` pairs, one per
            evaluated policy.
        trim: number of rows removed from each end of the sorted results.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        ``(rew, std)`` where ``rew`` is the mean of the retained mean rewards
        and ``std`` is the root-mean-square of the retained standard
        deviations.

    Raises:
        ValueError: if ``trim`` is negative, or if ``rewards`` has at most
            ``2 * trim`` rows so that nothing would be left to aggregate
            (previously this silently produced ``nan`` values).
    """
    if trim < 0:
        raise ValueError(f"trim must be non-negative, got {trim}")

    rewards = np.array(rewards)
    n_rows = len(rewards)
    if n_rows <= 2 * trim:
        raise ValueError(
            f"need more than {2 * trim} reward rows to trim {trim} from each "
            f"end, got {n_rows}"
        )

    sort_rewards = rewards[np.argsort(rewards[:, 0])]
    # Explicit end index: a plain ``[trim:-trim]`` would be empty for trim == 0.
    kept = sort_rewards[trim:n_rows - trim]

    rew = np.mean(kept, axis=0)[0]
    # ||std||_2 / sqrt(n) == sqrt(mean(std ** 2)), i.e. the RMS of the stds.
    std = np.linalg.norm(kept[:, 1]) / np.sqrt(len(kept))
    return rew, std

def get_best_result(rewards):
    """Return the ``[mean_reward, reward_std]`` row with the highest mean.

    Args:
        rewards: sequence of ``[mean_reward, reward_std]`` pairs.

    Returns:
        The row (as a NumPy array) that comes last when the rows are ordered
        by their first column.
    """
    table = np.array(rewards)
    ranking = np.argsort(table[:, 0])
    # Last position of the ascending ranking is the row with the largest mean.
    return table[ranking[-1]]

In [None]:
# Expert checkpoints: trimmed aggregate, then the single best checkpoint.
expert_rew, expert_std = filter_rew_and_std(expert_rewards)
best_expert_rew, best_expert_std = get_best_result(expert_rewards)

# Noob checkpoints: same two summaries.
noob_rew, noob_std = filter_rew_and_std(noob_rewards)
best_noob_rew, best_noob_std = get_best_result(noob_rewards)

In [None]:
# One "<label> <mean> +/- <std>" line per result, label left-aligned to 20 columns.
for label, rew, std in (
    ("expert reward", expert_rew, expert_std),
    ("best expert reward", best_expert_rew, best_expert_std),
    ("noob reward", noob_rew, noob_std),
    ("best noob reward", best_noob_rew, best_noob_std),
    ("BC reward", bc_rew, bc_std),
):
    print(f'{label:<20} {rew:.2f} +/- {std:.2f}')