In [None]:
import glob
import numpy as np
import gym
import torch as th

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms import bc

# Gym environment id passed to every make_vec_env call in this notebook.
env_id = "MountainCarContinuous-v0"

In [None]:
from MountainCar_utils.observation_wrapper import MountainCarContinuousObsWrapper, MountainCarContinuousNoVelObsWrapper

def wrapper(env):
    """Return ``env`` wrapped in ``MountainCarContinuousObsWrapper``.

    Used as the ``wrapper_class`` argument of ``make_vec_env``.
    """
    return MountainCarContinuousObsWrapper(env)

def wrapper_no_vel(env):
    """Return ``env`` wrapped in ``MountainCarContinuousNoVelObsWrapper``.

    Used as the ``wrapper_class`` argument of ``make_vec_env``.
    """
    return MountainCarContinuousNoVelObsWrapper(env)

In [None]:
def filter_rew_and_std(rewards, trim=2):
    """Aggregate per-run results after discarding the extreme runs.

    Runs are ordered by mean reward, the ``trim`` lowest and ``trim``
    highest are dropped, and the remainder is summarised.

    Parameters
    ----------
    rewards : sequence of (mean_reward, std_reward) pairs
        One pair per evaluated policy.
    trim : int, optional
        Number of runs removed from each end of the ranking (default 2).

    Returns
    -------
    rew : float
        Mean of the kept mean rewards.
    std : float
        Root-mean-square of the kept per-run standard deviations.

    Raises
    ------
    ValueError
        If ``rewards`` is not a table with at least two columns, if ``trim``
        is negative, or if trimming would leave no runs to aggregate.
    """
    rewards = np.asarray(rewards, dtype=float)
    if rewards.ndim != 2 or rewards.shape[1] < 2:
        raise ValueError(
            "rewards must be a non-empty sequence of (mean, std) pairs, "
            f"got an array of shape {rewards.shape}"
        )
    if trim < 0:
        raise ValueError(f"trim must be non-negative, got {trim}")

    n_runs = len(rewards)
    if n_runs <= 2 * trim:
        # Without this guard the slice below is empty and the result is a silent nan.
        raise ValueError(
            f"need more than {2 * trim} runs to trim {trim} from each end, got {n_runs}"
        )

    # Explicit end index: a plain [trim:-trim] slice would be empty for trim == 0.
    sort_rewards = rewards[np.argsort(rewards[:, 0])][trim:n_runs - trim]
    rew = np.mean(sort_rewards, axis=0)[0]
    # ||std|| / sqrt(n) is the root-mean-square of the per-run stds.
    std = np.linalg.norm(sort_rewards[:, 1]) / np.sqrt(len(sort_rewards))
    return rew, std

def get_best_result(rewards):
    """Return the row of ``rewards`` whose first column (mean reward) is largest.

    ``rewards`` is a sequence of ``[mean_reward, std_reward]`` pairs; the
    selected pair is returned as a NumPy array row.
    """
    table = np.array(rewards)
    ranking = np.argsort(table[:, 0])  # indices in ascending mean-reward order
    best_index = ranking[-1]
    return table[best_index]

In [None]:
# Ten parallel copies of the task seen through MountainCarContinuousObsWrapper.
env_angle = make_vec_env(env_id, n_envs=10, wrapper_class=wrapper)
# Ten parallel copies seen through MountainCarContinuousNoVelObsWrapper.
env_no_vel = make_vec_env(env_id, n_envs=10, wrapper_class=wrapper_no_vel)
# Frame-stacked view (2 frames) layered on the very same env_no_vel instance,
# so both names drive the same underlying environments.
env_no_vel_fs = VecFrameStack(env_no_vel, n_stack=2)
# Ten parallel copies with no observation wrapper applied.
env = make_vec_env(env_id, n_envs=10)

# Full Obs

In [None]:
def _load_ppo_policies(pattern):
    """Load every PPO checkpoint whose path matches the glob ``pattern``.

    Paths are sorted so the returned list has a reproducible order
    (``glob.glob`` makes no ordering guarantee).

    Raises
    ------
    FileNotFoundError
        If nothing matches ``pattern``. Failing here is clearer than the
        indexing error an empty result list causes during aggregation.
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f'no PPO checkpoints match {pattern!r}')
    return [PPO.load(path, print_system_info=False) for path in paths]


expert_angle_policy = _load_ppo_policies('./Compare/test_expert/expert*.zip')
expert_angle_policy_n32b64 = _load_ppo_policies('./Compare/expert_3264_1kk/expert*.zip')

noob_policy = _load_ppo_policies('./Compare/test_noob/noob*.zip')
noob_policy_n32b64 = _load_ppo_policies('./Compare/noob_3264_1kk/noob*.zip')

# NOTE(review): th.load unpickles the file, which can execute arbitrary code;
# only load policy files that come from a trusted source.
# NOTE(review): recent PyTorch releases may default to weights_only=True, which
# would reject a pickled policy object -- confirm against the installed version.
bc_policy = th.load('./policy/bc1.zip')

In [None]:
SEED = 449
NUM_EP = 100


def _evaluate_policies(policies, vec_env):
    """Evaluate each policy on ``vec_env`` and collect its reward statistics.

    ``vec_env`` is re-seeded with ``SEED`` before every evaluation, and each
    policy is run for ``NUM_EP`` episodes.

    Returns
    -------
    list of (mean_reward, std_reward) tuples, one per policy, in input order.
    """
    results = []
    for policy in policies:
        vec_env.seed(SEED)
        # evaluate_policy already returns the (mean, std) pair we store.
        results.append(evaluate_policy(policy, vec_env, NUM_EP))
    return results


# Experts are evaluated on the wrapped observations (env_angle).
expert_rewards = _evaluate_policies(expert_angle_policy, env_angle)
expert_rewards_n32b64 = _evaluate_policies(expert_angle_policy_n32b64, env_angle)

# Noobs are evaluated on the unwrapped environment.
noob_rewards = _evaluate_policies(noob_policy, env)
noob_rewards_n32b64 = _evaluate_policies(noob_policy_n32b64, env)

# NOTE(review): the BC policy is evaluated on the unwrapped `env`; confirm this
# matches the observations it was trained on.
env.seed(SEED)
bc_rew, bc_std = evaluate_policy(bc_policy, env, NUM_EP)

In [None]:
# For each policy family: the trimmed aggregate over all runs, then the single
# run with the highest mean reward.

# Experts
expert_rew, expert_std = filter_rew_and_std(expert_rewards)
best_expert_rew, best_expert_std = get_best_result(expert_rewards)

# Experts (n32b64)
expert_rew_n32b64, expert_std_n32b64 = filter_rew_and_std(expert_rewards_n32b64)
best_expert_rew_n32b64, best_expert_std_n32b64 = get_best_result(expert_rewards_n32b64)

# Noobs
noob_rew, noob_std = filter_rew_and_std(noob_rewards)
best_noob_rew, best_noob_std = get_best_result(noob_rewards)

# Noobs (n32b64)
noob_rew_n32b64, noob_std_n32b64 = filter_rew_and_std(noob_rewards_n32b64)
best_noob_rew_n32b64, best_noob_std_n32b64 = get_best_result(noob_rewards_n32b64)

# NoVel

In [None]:
# Paths are sorted because glob.glob returns matches in arbitrary order and a
# later cell inspects noob_framestack[0]; sorting keeps that index reproducible.
# NOTE(review): the first directory is labelled "lstm" but the checkpoints are
# loaded with PPO.load -- confirm they were saved by a PPO-compatible class.
noob_lstm = [
    PPO.load(path, print_system_info=False)
    for path in sorted(glob.glob('./Compare/noob_lstm_n32b128_1kk/noob*.zip'))
]

noob_framestack = [
    PPO.load(path, print_system_info=False)
    for path in sorted(glob.glob('./Compare/noob_novel_n32b64_1kk/noob*.zip'))
]

In [None]:
# Display the observation space of the first policy in noob_framestack.
noob_framestack[0].observation_space

In [None]:
SEED = 449
NUM_EP = 100

noob_lstm_rewards = []
noob_framestack_rewards = []

# (policies, environment they are evaluated on, list collecting the results)
evaluation_plan = (
    (noob_lstm, env_no_vel, noob_lstm_rewards),
    (noob_framestack, env_no_vel_fs, noob_framestack_rewards),
)
for policies, vec_env, results in evaluation_plan:
    for policy in policies:
        # Re-seed before each evaluation so every policy starts from the same seed.
        vec_env.seed(SEED)
        mean_reward, std_reward = evaluate_policy(policy, vec_env, NUM_EP)
        results.append([mean_reward, std_reward])

In [None]:
# Trimmed aggregate over all runs of each kind.
noob_lstm_rew, noob_lstm_std = filter_rew_and_std(noob_lstm_rewards)
noob_fs_rew, noob_fs_std = filter_rew_and_std(noob_framestack_rewards)

# Single best run of each kind.
best_noob_lstm_rew, best_noob_lstm_std = get_best_result(noob_lstm_rewards)
best_noob_fs_rew, best_noob_fs_std = get_best_result(noob_framestack_rewards)
# The frame-stack best reward was originally stored under the misleading name
# `best_fs_lstm_rew`; keep that name as an alias for code that still reads it.
best_fs_lstm_rew = best_noob_fs_rew

# Results

In [None]:
label_width = 25
# TODO: present these results as a pandas table.
# Each section is (heading or None, rows of (label, mean reward, std)).
report_sections = (
    (None, (
        ("expert reward", expert_rew, expert_std),
        ("best expert reward", best_expert_rew, best_expert_std),
        ("noob reward", noob_rew, noob_std),
        ("best noob reward", best_noob_rew, best_noob_std),
    )),
    ('n32b64', (
        ("expert reward", expert_rew_n32b64, expert_std_n32b64),
        ("best expert reward", best_expert_rew_n32b64, best_expert_std_n32b64),
        ("noob reward", noob_rew_n32b64, noob_std_n32b64),
        ("best noob reward", best_noob_rew_n32b64, best_noob_std_n32b64),
    )),
    ('Imitation Learning', (
        ("BC reward", bc_rew, bc_std),
    )),
)
for heading, rows in report_sections:
    if heading is not None:
        print(heading)
    for label, mean, std in rows:
        print(f'{label:<{label_width}} {mean:.2f} +/- {std:.2f}')

In [None]:
label_width = 25
# TODO: present these results as a pandas table.
# Rows of (label, mean reward, std) for the policies evaluated without velocity.
no_vel_rows = (
    ("noob lstm reward", noob_lstm_rew, noob_lstm_std),
    ("best noob lstm reward", best_noob_lstm_rew, best_noob_lstm_std),
    ("noob fs reward", noob_fs_rew, noob_fs_std),
    ("best noob fs reward", best_fs_lstm_rew, best_noob_fs_std),
)
for label, mean, std in no_vel_rows:
    print(f'{label:<{label_width}} {mean:.2f} +/- {std:.2f}')