In [None]:
# Disabled shim: would register gymnasium under the name "gym" in sys.modules,
# so that any later `import gym` resolves to gymnasium instead.
# import sys
# import gymnasium as gym
# sys.modules["gym"] = gym
import gym
# Bare trailing expression: the notebook shows the gym version as cell output.
gym.__version__

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecVideoRecorder
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

In [None]:
import numpy as np
import torch as th
from torch import nn
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
import gnwrapper

In [None]:
# Gym environment id used by every cell of this notebook.
env_id = "MountainCarContinuous-v0"

# Number of environment copies bundled into the rollout vec env.
NUM_CPU = 32

In [None]:
from MountainCar_utils import MountainCarContinuousObsWrapper

def wrapper(env):
    """Return ``env`` wrapped in ``MountainCarContinuousObsWrapper``.

    Kept as a plain one-argument callable so it can be applied directly to an
    environment or handed to helpers that expect a wrapper factory.
    """
    return MountainCarContinuousObsWrapper(env)

In [None]:
# Saved PPO policy that plays the role of the expert demonstrator.
EXPERT_POLICY_PATH = "./policy/ppo_MountainCarContinuous_expert27469_n32b64.zip"
expert = PPO.load(EXPERT_POLICY_PATH, print_system_info=True)

In [None]:
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
import dataclasses

def make_env(env_id):
    """Build a zero-argument factory for one rollout environment.

    Nothing is created until the returned callable is invoked. When it is,
    the environment ``env_id`` is made with ``gym.make`` and wrapped twice:
    ``RolloutInfoWrapper`` first, ``wrapper`` second, so the info wrapper sits
    underneath the observation wrapper.
    """
    def _build():
        base_env = gym.make(env_id)
        # Inner layer: keeps the original observations for later unwrapping.
        recorded_env = RolloutInfoWrapper(base_env)
        # Outer layer: the observation wrapper.
        return wrapper(recorded_env)

    return _build

# One factory per environment copy; DummyVecEnv calls each of them once.
env_fns = [make_env(env_id) for _ in range(NUM_CPU)]
bc_env = DummyVecEnv(env_fns)
bc_env.observation_space

In [None]:
NUM_EPISODES = 500
rng = np.random.default_rng()

# Roll the expert out in the vec env until at least NUM_EPISODES episodes have
# finished. The env is closed in `finally` so it is released even when the
# collection fails part-way through.
try:
    rollouts = rollout.rollout(
        expert,
        bc_env,
        rollout.make_sample_until(min_timesteps=None, min_episodes=NUM_EPISODES),
        rng=rng,
        # Store the original observations kept by RolloutInfoWrapper rather
        # than the wrapped ones.
        unwrap=True,
    )
finally:
    bc_env.close()

In [None]:
# Only episodes whose summed reward is above this bar are used as demonstrations.
MIN_EPISODE_RETURN = 94

# The loop variable is `traj`, not `rollout`, so the imported `rollout` module
# is never shadowed inside the comprehension.
top_rollouts = [traj for traj in rollouts if sum(traj.rews) > MIN_EPISODE_RETURN]

# Fail here, with a clear message, instead of deep inside the flattening or
# training code when the filter leaves nothing to learn from.
if not top_rollouts:
    raise ValueError(
        f"No rollout exceeded a return of {MIN_EPISODE_RETURN}; "
        "there are no demonstrations to flatten."
    )

transitions = rollout.flatten_trajectories(top_rollouts)

In [None]:
# Summary statistics: every collected rollout first, then the filtered subset.
for trajectories in (rollouts, top_rollouts):
    pprint(rollout.rollout_stats(trajectories))

In [None]:
# Report how many filtered rollouts there are, what type they have, and what
# the flattened transitions object contains.
print(
    f"""The `rollout` function generated a list of {len(top_rollouts)} {type(top_rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(transitions.__dict__.keys())}.
"""
)

In [None]:
from imitation.algorithms import bc
from stable_baselines3.common.policies import ActorCriticPolicy

# Single-environment vec env without the observation wrapper; in this cell it
# is only used as the source of the observation and action spaces.
base_env = gym.make(env_id)
env = DummyVecEnv([lambda: base_env])
obs_space = env.observation_space
act_space = env.action_space

# Two hidden layers of 64 units. The lr_schedule always returns the largest
# float32 value.
# NOTE(review): presumably this mirrors imitation's default BC policy, where BC
# trains with its own optimizer and the huge rate makes accidental use of the
# policy's optimizer obvious. Confirm.
bc_policy = ActorCriticPolicy(
    observation_space=obs_space,
    action_space=act_space,
    lr_schedule=lambda _: th.finfo(th.float32).max,
    net_arch=[64, 64],
)

bc_trainer = bc.BC(
    observation_space=obs_space,
    action_space=act_space,
    demonstrations=transitions,
    rng=rng,
    policy=bc_policy,
)

In [None]:
# Number of epochs passed to the BC trainer.
N_EPOCHS = 10
bc_trainer.train(n_epochs=N_EPOCHS)

In [None]:
env = gym.make(env_id)
env = gnwrapper.Animation(env)
# Alternative recording setups (disabled):
# env = gym.wrappers.RecordVideo(env, 'video')
# env = make_vec_env(env_id, wrapper_class=wrapper_image, n_envs=1)
# env = VecVideoRecorder(env, './video', True, 1000)

# Play one episode with the cloned policy (no observation wrapper applied),
# rendering after every step. `finally` guarantees the environment is closed
# even when prediction, stepping or rendering raises.
try:
    obs = env.reset()
    done = False
    while not done:
        action, _states = bc_trainer.policy.predict(np.asarray(obs))
        obs, reward, done, info = env.step(action)
        env.render()
finally:
    env.close()

In [None]:
env = gym.make(env_id)
env = wrapper(env)
env = gnwrapper.Animation(env)
# Alternative setups (disabled):
# env = wrapper(env)
# env = gym.wrappers.RecordVideo(env, 'video')
# env = make_vec_env(env_id, wrapper_class=wrapper, n_envs=1)
# env = VecVideoRecorder(env, './video', lambda x: True, 1000)

# Play one episode with the expert on the observation-wrapped environment,
# rendering after every step. `finally` guarantees the environment is closed
# even when prediction, stepping or rendering raises.
try:
    obs = env.reset()
    done = False
    while not done:
        # The expert gets a copy, so `obs` itself cannot be modified by predict.
        action, _states = expert.predict(obs.copy())
        obs, reward, done, info = env.step(action)
        env.render()
finally:
    env.close()

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
N_EVAL_ENVS = 10
N_EVAL_EPISODES = 100

# The expert is scored on the observation-wrapped environments, the BC policy
# on environments without that wrapper.
test_env_angle = make_vec_env(env_id, n_envs=N_EVAL_ENVS, wrapper_class=wrapper)
test_env = make_vec_env(env_id, n_envs=N_EVAL_ENVS)

expert_reward, expert_reward_std = evaluate_policy(
    expert, test_env_angle, n_eval_episodes=N_EVAL_EPISODES
)
bc_reward, bc_reward_std = evaluate_policy(
    bc_trainer.policy, test_env, n_eval_episodes=N_EVAL_EPISODES
)
# noob_reward, noob_reward_std = evaluate_policy(noob, test_env, 10)

In [None]:
# Mean reward +/- standard deviation, one line per policy.
print(
    f'expert reward {expert_reward:.2f} +/- {expert_reward_std:.2f}',
    f'BC reward {bc_reward:.2f} +/- {bc_reward_std:.2f}',
    sep='\n',
)
# print(f'noob reward {noob_reward} +/- {noob_reward_std}')

In [None]:
# Persist the trained BC policy next to the expert policy.
BC_POLICY_PATH = 'policy/bc1.zip'
bc_trainer.save_policy(BC_POLICY_PATH)