In [None]:
import glob
import numpy as np
import gym
import torch as th
from tqdm import tqdm

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms import bc

# Gym environment ID handed to the make_vec_env calls further down.
env_id = "MountainCarContinuous-v0"

In [None]:
from MountainCar_utils.observation_wrapper import MountainCarContinuousObsWrapper, MountainCarContinuousNoVelObsWrapper

def wrapper(env):
    """Return ``env`` wrapped in ``MountainCarContinuousObsWrapper``.

    Single-argument callable, suitable for ``make_vec_env(wrapper_class=...)``.
    """
    return MountainCarContinuousObsWrapper(env)

def wrapper_no_vel(env):
    """Return ``env`` wrapped in ``MountainCarContinuousNoVelObsWrapper``.

    Single-argument callable, suitable for ``make_vec_env(wrapper_class=...)``.
    """
    return MountainCarContinuousNoVelObsWrapper(env)

In [None]:
def filter_rew_and_std(rewards, trim: int = 2):
    """Aggregate per-policy evaluation results after discarding outliers.

    Args:
        rewards: Sequence of ``[mean_reward, reward_std]`` rows, one per policy.
        trim: Number of rows dropped from each end after sorting by mean
            reward. The default of 2 removes the two worst and two best runs.

    Returns:
        ``(rew, std)`` where ``rew`` is the mean of the kept mean rewards and
        ``std`` is the root-mean-square of the kept standard deviations.

    Raises:
        ValueError: If ``trim`` is negative, ``rewards`` is not a 2-D array
            with at least two columns, or no rows would remain after trimming.
    """
    if trim < 0:
        raise ValueError(f"trim must be non-negative, got {trim}")

    rewards = np.asarray(rewards, dtype=float)
    if rewards.ndim != 2 or rewards.shape[1] < 2:
        # An empty list lands here; previously it surfaced as an opaque IndexError.
        raise ValueError(
            "rewards must be a non-empty sequence of [mean, std] rows, "
            f"got array of shape {rewards.shape}"
        )

    n_rows = len(rewards)
    if n_rows <= 2 * trim:
        # Without this guard the trimmed slice is empty and the result is a silent nan.
        raise ValueError(
            f"need more than {2 * trim} results to trim {trim} from each end, got {n_rows}"
        )

    order = np.argsort(rewards[:, 0])
    # Explicit end index so that trim=0 keeps every row (a plain [0:-0] would be empty).
    kept = rewards[order][trim:n_rows - trim]

    rew = np.mean(kept, axis=0)[0]
    # ||s||_2 / sqrt(n) == sqrt(mean(s**2)): RMS of the per-policy stds.
    std = np.linalg.norm(kept[:, 1]) / np.sqrt(len(kept))
    return rew, std

def get_best_result(rewards):
    """Return the row of ``rewards`` that sorts last on column 0.

    ``rewards`` is a sequence of ``[mean_reward, reward_std]`` rows; the
    returned value is the NumPy row holding the largest mean reward.
    """
    table = np.array(rewards)
    ranking = np.argsort(table[:, 0])
    # The last position of the ascending ranking is the top-scoring row.
    return table[ranking[-1]]

def print_rew_std(label: str, expert_rew: float, expert_std: float, latex: bool = True, line_lenght: int = 25) -> str:
    """Format a reward and its standard deviation as one table row fragment.

    Despite the name, nothing is printed; the formatted string is returned.

    Args:
        label: Row label, left-aligned and padded to ``line_lenght`` characters.
        expert_rew: Reward value, rendered with two decimals.
        expert_std: Standard deviation, rendered with two decimals.
        latex: If true, emit a LaTeX tabular cell pair terminated by a row
            break and a horizontal rule; otherwise emit plain ``a +/- b`` text.
        line_lenght: Minimum width of the label column.

    Returns:
        The formatted row fragment.
    """
    if latex:
        # Raw f-string: the LaTeX backslashes are literal, which avoids the
        # invalid-escape SyntaxWarning the non-raw spelling triggers.
        return rf'{label:<{line_lenght}} & ${expert_rew:.2f} \pm {expert_std:.2f}$ \\ \hline'
    return f'{label:<{line_lenght}} {expert_rew:.2f} +/- {expert_std:.2f}'

In [None]:
# Vectorised evaluation environments, 10 environment copies each.
env_angle = make_vec_env(env_id, n_envs=10, wrapper_class=wrapper)
env_no_vel = make_vec_env(env_id, n_envs=10, wrapper_class=wrapper_no_vel)
# NOTE: both frame-stacked views below wrap the *same* env_no_vel instance,
# so they share its underlying state.
env_no_vel_fs2 = VecFrameStack(env_no_vel, n_stack=2)
env_no_vel_fs4 = VecFrameStack(env_no_vel, n_stack=4)
# Environment without any observation wrapper.
env = make_vec_env(env_id, n_envs=10)

# Full Obs

In [None]:
# PPO checkpoints: one list per training run directory, one model per matching zip.
expert_angle_policy = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/expert_eval2/expert*.zip')
]

expert_angle_policy_3264 = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/expert_3264_eval2/expert*.zip')
]

noob_policy = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_eval2/noob*.zip')
]

noob_policy_3264 = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_3264_eval2/noob*.zip')
]

# Behaviour-cloning policies are restored with torch.load rather than PPO.load.
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
bc_policy = th.load('./policy/bc_expert3464.zip')
bc_policy_3264 = th.load('./policy/bc_expert46304_n32b64.zip')

In [None]:
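# Disabled alternative: load each run's logs/best_model/<run>/best_model.zip checkpoint
# instead of the expert*.zip / noob*.zip files loaded in the previous cell.
# expert_angle_policy = []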
# for file in glob.glob('./Compare/expert_eval2/logs/best_model/expert*/best_model.zip'):
#     expert_angle_policy.append(PPO.load(file, print_system_info=False))  
    
# expert_angle_policy_3264 = []
# for file in glob.glob('./Compare/expert_3264_eval2/logs/best_model/expert*/best_model.zip'):
#     expert_angle_policy_3264.append(PPO.load(file, print_system_info=False))

# noob_policy = []
# for file in glob.glob('./Compare/noob_eval2/logs/best_model/noob*/best_model.zip'):
#     noob_policy.append(PPO.load(file, print_system_info=False))

# noob_policy_3264 = []
# for file in glob.glob('./Compare/noob_3264_eval2/logs/best_model/noob*/best_model.zip'):
#     noob_policy_3264.append(PPO.load(file, print_system_info=False))

# # bc_policy = th.load('./policy/bc1.zip')

In [None]:
SEED = 0
NUM_EP = 1000

# Every policy is scored over NUM_EP episodes; the env is re-seeded with SEED
# before each evaluation. Each entry stored is [mean_reward, reward_std].

# Expert policies are evaluated on env_angle.
expert_rewards = []
for policy in tqdm(expert_angle_policy):
    env_angle.seed(SEED)
    expert_rewards.append(list(evaluate_policy(policy, env_angle, n_eval_episodes=NUM_EP)))

expert_rewards_3264 = []
for policy in tqdm(expert_angle_policy_3264):
    env_angle.seed(SEED)
    expert_rewards_3264.append(list(evaluate_policy(policy, env_angle, n_eval_episodes=NUM_EP)))

# Noob policies are evaluated on the unwrapped env.
noob_rewards = []
for policy in tqdm(noob_policy):
    env.seed(SEED)
    noob_rewards.append(list(evaluate_policy(policy, env, n_eval_episodes=NUM_EP)))

noob_rewards_3264 = []
for policy in tqdm(noob_policy_3264):
    env.seed(SEED)
    noob_rewards_3264.append(list(evaluate_policy(policy, env, n_eval_episodes=NUM_EP)))

In [None]:
# Score both behaviour-cloning policies on the unwrapped env, re-seeding before each run.
env.seed(SEED)
bc_rew, bc_std = evaluate_policy(bc_policy, env, n_eval_episodes=NUM_EP)

env.seed(SEED)
bc_rew_n32b64, bc_std_n32b64 = evaluate_policy(bc_policy_3264, env, n_eval_episodes=NUM_EP)

In [None]:
# For each policy family: the outlier-trimmed aggregate, then the single best run.

# Experts
expert_rew, expert_std = filter_rew_and_std(expert_rewards)
best_expert_rew, best_expert_std = get_best_result(expert_rewards)

# Experts (n32b64)
expert_rew_n32b64, expert_std_n32b64 = filter_rew_and_std(expert_rewards_3264)
best_expert_rew_n32b64, best_expert_std_n32b64 = get_best_result(expert_rewards_3264)

# Noobs
noob_rew, noob_std = filter_rew_and_std(noob_rewards)
best_noob_rew, best_noob_std = get_best_result(noob_rewards)

# Noobs (n32b64)
noob_rew_n32b64, noob_std_n32b64 = filter_rew_and_std(noob_rewards_3264)
best_noob_rew_n32b64, best_noob_std_n32b64 = get_best_result(noob_rewards_3264)

In [None]:
line_lenght = 25
# TODO: build the table with pandas and pandas.DataFrame.to_latex instead of manual prints.

# (label, reward, std) for the default configuration, in table order.
default_rows = [
    ("expert reward", expert_rew, expert_std),
    ("best expert reward", best_expert_rew, best_expert_std),
    ("noob reward", noob_rew, noob_std),
    ("best noob reward", best_noob_rew, best_noob_std),
    ("BC reward", bc_rew, bc_std),
]
for row_idx, (row_label, row_rew, row_std) in enumerate(default_rows, start=1):
    print(f'{row_idx} & {print_rew_std(row_label, row_rew, row_std)}')

print('n32b64')

# Same rows for the n32b64 configuration.
n32b64_rows = [
    ("expert reward", expert_rew_n32b64, expert_std_n32b64),
    ("best expert reward", best_expert_rew_n32b64, best_expert_std_n32b64),
    ("noob reward", noob_rew_n32b64, noob_std_n32b64),
    ("best noob reward", best_noob_rew_n32b64, best_noob_std_n32b64),
    ("BC reward", bc_rew_n32b64, bc_std_n32b64),
]
for row_idx, (row_label, row_rew, row_std) in enumerate(n32b64_rows, start=1):
    print(f'{row_idx} & {print_rew_std(row_label, row_rew, row_std)}')

# NoVel

In [None]:
# PPO checkpoints for the NoVel experiments: one model per matching zip file.
noob_tcn = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_tcn_3264_eval2/noob*.zip')
]

noob_fs2 = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_novel_3264_fs2_eval2/noob*.zip')
]

noob_fs4 = [
    PPO.load(path, print_system_info=False)
    for path in glob.glob('./Compare/noob_novel_3264_fs4_eval2/noob*.zip')
]

# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
bc_tcn_policy = th.load('./policy/bc_tcn_expert46304_n32b64.zip')

In [None]:
SEED = 0
NUM_EP = 1000

# Each policy is scored over NUM_EP episodes on a frame-stacked NoVel env that is
# re-seeded with SEED first. Each entry stored is [mean_reward, reward_std].

# TCN policies run on the 4-frame stack.
noob_tcn_rewards = []
for policy in tqdm(noob_tcn):
    env_no_vel_fs4.seed(SEED)
    noob_tcn_rewards.append(list(evaluate_policy(policy, env_no_vel_fs4, n_eval_episodes=NUM_EP)))

# 2-frame stack.
noob_fs2_rewards = []
for policy in tqdm(noob_fs2):
    env_no_vel_fs2.seed(SEED)
    noob_fs2_rewards.append(list(evaluate_policy(policy, env_no_vel_fs2, n_eval_episodes=NUM_EP)))

# 4-frame stack.
noob_fs4_rewards = []
for policy in tqdm(noob_fs4):
    env_no_vel_fs4.seed(SEED)
    noob_fs4_rewards.append(list(evaluate_policy(policy, env_no_vel_fs4, n_eval_episodes=NUM_EP)))

In [None]:
# Score the BC TCN policy on the 4-frame-stacked NoVel env, re-seeded first.
env_no_vel_fs4.seed(SEED)
bc_tcn_rew, bc_tcn_std = evaluate_policy(
    bc_tcn_policy,
    env_no_vel_fs4,
    n_eval_episodes=NUM_EP,
)

In [None]:
# For each NoVel variant: the outlier-trimmed aggregate, then the single best run.

# TCN
noob_tcn_rew, noob_tcn_std = filter_rew_and_std(noob_tcn_rewards)
best_noob_tcn_rew, best_noob_tcn_std = get_best_result(noob_tcn_rewards)

# Frame stack 2
noob_fs2_rew, noob_fs2_std = filter_rew_and_std(noob_fs2_rewards)
best_noob_fs2_rew, best_noob_fs2_std = get_best_result(noob_fs2_rewards)

# Frame stack 4
noob_fs4_rew, noob_fs4_std = filter_rew_and_std(noob_fs4_rewards)
best_noob_fs4_rew, best_noob_fs4_std = get_best_result(noob_fs4_rewards)

# Results

In [None]:
line_lenght = 25
# pm = '+/-'
# Raw string: a plain '\pm' is an invalid escape sequence (SyntaxWarning on Python 3.12+).
pm = r'\pm'
# TODO: build the table with pandas instead of manual prints.

# (label, reward, std) in table order; row numbers are assigned when printing.
result_rows = [
    ("expert reward", expert_rew_n32b64, expert_std_n32b64),
    ("best expert reward", best_expert_rew_n32b64, best_expert_std_n32b64),
    ("noob TCN reward", noob_tcn_rew, noob_tcn_std),
    ("best noob TCN reward", best_noob_tcn_rew, best_noob_tcn_std),
    # fs2 rows are currently left out of the table:
    # ("noob fs2 reward", noob_fs2_rew, noob_fs2_std),
    # ("best noob fs2 reward", best_noob_fs2_rew, best_noob_fs2_std),
    ("noob fs4 reward", noob_fs4_rew, noob_fs4_std),
    ("best noob fs4 reward", best_noob_fs4_rew, best_noob_fs4_std),
    ("BC TCN", bc_tcn_rew, bc_tcn_std),
]
for row_idx, (row_label, row_rew, row_std) in enumerate(result_rows, start=1):
    print(f'{row_idx} & {print_rew_std(row_label, row_rew, row_std)}')
