In [None]:
# Optional compatibility shim (disabled): uncommenting the three lines below
# would import gymnasium and register it in sys.modules under the name "gym".
# import sys
# import gymnasium as gym
# sys.modules["gym"] = gym
import gym
# Show the installed gym version as the cell output.
gym.__version__

In [None]:
import gnwrapper

In [None]:
import os
import numpy as np
import torch
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

In [None]:
# Gym environment id used for every environment created in this notebook.
env_id = "MountainCarContinuous-v0"
# NOTE(review): make_vec_env is called without vec_env_cls in this notebook, so
# whether these copies run in separate processes depends on SB3's default
# vectorized env class -- confirm before relying on real parallelism.
NUM_CPU = 32  # Number of environment copies (passed as n_envs to make_vec_env)

# Observation Wrapper
```python
class MountainCarContinuousObsWrapper(gym.ObservationWrapper):
    """Observation wrapper that appends ``cos(3 * obs[0])`` to every observation.

    The observation space is rebuilt as a float32 ``Box`` whose bounds are the
    existing ``low_state`` / ``high_state`` extended by ``-max_angle`` and
    ``max_angle`` (``max_angle`` is 1) for the appended feature.
    """

    def __init__(self, env, num_obs_points: int = 6):
        """Wrap ``env`` and build the extended observation space.

        Args:
            env: Environment to wrap.
            num_obs_points: Not used anywhere in this class.
        """
        super().__init__(env)
        # Bound of the appended feature; a cosine always lies in [-1, 1].
        self.max_angle = 1
        
        # NOTE(review): ``low_state`` / ``high_state`` are read here before this class
        # assigns them; presumably they resolve to the wrapped env's bounds -- confirm.
        self.low_state = np.append(self.low_state, -self.max_angle)
        self.high_state = np.append(self.high_state, self.max_angle)
        
        self.observation_space = gym.spaces.Box(
            low=self.low_state, high=self.high_state, dtype=np.float32
        )
        
    def observation(self, obs):
        """Return ``obs`` with ``cos(3 * obs[0])`` appended as a new last element."""
        return np.append(obs, np.cos(3 * obs[0]))
```

In [None]:
from MountainCar_utils import MountainCarContinuousObsWrapper
def wrapper(env):
    """Wrap ``env`` for visual testing.

    ``MountainCarContinuousObsWrapper`` is applied first and
    ``gnwrapper.Animation`` on top of it; the outermost wrapper is returned.
    """
    return gnwrapper.Animation(MountainCarContinuousObsWrapper(env))

# Training the 'expert' and 'noob' policies
Skip this section if you already have trained policies.

In [None]:
# Vectorized training environments (NUM_CPU copies), each wrapped with the
# custom observation wrapper
train_env_f = make_vec_env(
    env_id,
    wrapper_class=MountainCarContinuousObsWrapper,
    n_envs=NUM_CPU,
)
# Show the resulting observation space as the cell output
train_env_f.observation_space

In [None]:
# Evaluate on a dedicated environment instead of `train_env_f`: EvalCallback
# resets and steps the environment it is given, which would disturb the
# rollouts PPO is collecting if the training environment were reused.
eval_env_f = make_vec_env(env_id,
                          n_envs=1,
                          wrapper_class=MountainCarContinuousObsWrapper)
eval_callback = EvalCallback(eval_env_f,
                             best_model_save_path="./logs/best_model/expert",
                             # Per-policy log directory so this run's evaluation
                             # history is not overwritten by the other training run.
                             log_path="./logs/results/expert",
                             eval_freq=500,
                             deterministic=True, render=False)
model = PPO("MlpPolicy", 
            train_env_f,
            verbose=1, 
            seed=0,
            batch_size=512,
            # ent_coef=0.00429,
            learning_rate=7.77e-05,
            n_epochs=10,
            n_steps=8*NUM_CPU,
            gae_lambda=0.9,
            gamma=0.9999,
            # clip_range=0.1,
            max_grad_norm=5,
            # vf_coef=0.19,
            use_sde=True,
            # policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
            tensorboard_log="./logs/ppo_MountainCar_tensorboard_expert/")

model.learn(total_timesteps=1_000_000, callback=eval_callback, progress_bar=True)
model.save("./logs/ppo_MountainCar_expert")

del model # remove to demonstrate saving and loading
# The evaluation environment is only needed while training runs.
eval_env_f.close()

In [None]:
# Vectorized training environments (NUM_CPU copies) with the unmodified
# observation space
train_env = make_vec_env(env_id, n_envs=NUM_CPU)
# Show the resulting observation space as the cell output
train_env.observation_space

In [None]:
# Evaluate on a dedicated environment instead of `train_env`: EvalCallback
# resets and steps the environment it is given, which would disturb the
# rollouts PPO is collecting if the training environment were reused.
eval_env = make_vec_env(env_id, n_envs=1)
eval_callback = EvalCallback(eval_env,
                             # NOTE(review): the directory is named "expert_noob" although
                             # this is the noob run; kept unchanged in case it is read elsewhere.
                             best_model_save_path="./logs/best_model/expert_noob",
                             # Per-policy log directory so this run's evaluation
                             # history does not overwrite the other training run's.
                             log_path="./logs/results/noob",
                             eval_freq=500,
                             deterministic=True, render=False)
model = PPO("MlpPolicy", 
            train_env,
            verbose=1, 
            seed=0,
            batch_size=512,
            # ent_coef=0.00429,
            learning_rate=7.77e-05,
            n_epochs=10,
            n_steps=8*NUM_CPU,
            gae_lambda=0.9,
            gamma=0.9999,
            # clip_range=0.1,
            max_grad_norm=5,
            # vf_coef=0.19,
            use_sde=True,
            # policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
            tensorboard_log="./logs/ppo_MountainCar_tensorboard_noob/")

model.learn(total_timesteps=1_000_000, callback=eval_callback, progress_bar=True)
model.save("./logs/ppo_MountainCar_noob")

del model # remove to demonstrate saving and loading
# The evaluation environment is only needed while training runs.
eval_env.close()

# Visual testing of policies

In [None]:
# Load the policy first so a missing checkpoint fails before an environment is opened.
expert = PPO.load("./logs/ppo_MountainCar_expert.zip", print_system_info=True)

# Single environment with the observation wrapper and notebook animation applied.
test_env_f = make_vec_env(env_id,
                        wrapper_class=wrapper,
                        n_envs=1)

try:
    obs = test_env_f.reset()

    # Roll out at most 1000 steps, stopping as soon as the episode ends.
    for _ in range(1000):
        action, _states = expert.predict(obs)
        obs, rewards, dones, info = test_env_f.step(action)
        test_env_f.render(mode="rgb_array")

        # `dones` holds one flag per environment; `.any()` stays valid for any n_envs.
        if dones.any():
            break
finally:
    # Release the environment even if the rollout fails or is interrupted.
    test_env_f.close()

In [None]:
# Load the policy first so a missing checkpoint fails before an environment is opened.
noob = PPO.load("./logs/ppo_MountainCar_noob.zip", print_system_info=True)

# Single environment with notebook animation and the unmodified observation space.
test_env = make_vec_env(env_id,
                        wrapper_class=gnwrapper.Animation,
                        n_envs=1)

try:
    obs = test_env.reset()

    # Roll out at most 1000 steps, stopping as soon as the episode ends.
    for _ in range(1000):
        action, _states = noob.predict(obs)
        obs, rewards, dones, info = test_env.step(action)
        test_env.render(mode="rgb_array")

        # `dones` holds one flag per environment; `.any()` stays valid for any n_envs.
        if dones.any():
            break
finally:
    # Release the environment even if the rollout fails or is interrupted.
    test_env.close()

# Imitation Learning
This section uses Behavioral Cloning (BC).

In [None]:
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
import dataclasses
NUM_EPISODES = 100
ROLLOUT_SEED = 0  # seed for the generator handed to the rollout (and later BC) code

# Load the expert here so this cell does not depend on the visual-testing cell
# having been executed first.
expert = PPO.load("./logs/ppo_MountainCar_expert.zip")


def _make_rollout_env():
    """Build one environment for demonstration collection."""
    single_env = gym.make(env_id)
    single_env = RolloutInfoWrapper(single_env)  # Wrapper to save origin obs
    return MountainCarContinuousObsWrapper(single_env)  # Wrapper to add angle to obs


# A named factory avoids `lambda: env`, which captured a name that was rebound
# to the vectorized env on the very same line.
env = DummyVecEnv([_make_rollout_env])  # Vectorized env

# Seeded generator instead of OS entropy so repeated runs draw the same numbers from it.
rng = np.random.default_rng(ROLLOUT_SEED)
try:
    rollouts = rollout.rollout(
        expert,
        env,
        rollout.make_sample_until(min_timesteps=None, min_episodes=NUM_EPISODES),
        rng=rng,
        # NOTE(review): presumably makes the trajectories carry the observations
        # saved by RolloutInfoWrapper rather than the wrapped ones -- confirm.
        unwrap=True,
    )
finally:
    # Release the environment even if rollout collection fails.
    env.close()

transitions = rollout.flatten_trajectories(rollouts)

In [None]:
# Summarize the collected data: number and type of trajectories, and the size and
# attribute names of the flattened transitions object.
# (A stray closing quote at the end of the last sentence has been removed.)
print(
    f"""The `rollout` function generated a list of {len(rollouts)} {type(rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(transitions.__dict__.keys())}.
"""
)

In [None]:
from imitation.algorithms import bc
from stable_baselines3.common.policies import ActorCriticPolicy

# The student is built on the environment without MountainCarContinuousObsWrapper,
# so its spaces are those of the unmodified environment.
env = gnwrapper.Animation(gym.make(env_id))
obs_space, act_space = env.observation_space, env.action_space

student_policy = ActorCriticPolicy(
    observation_space=obs_space,
    action_space=act_space,
    # Schedule that always returns the largest float32 value.
    # NOTE(review): presumably a sentinel because BC does not use the policy's
    # own optimizer -- confirm against the imitation documentation.
    lr_schedule=lambda _: torch.finfo(torch.float32).max,
    net_arch=[64, 64],
)

bc_trainer = bc.BC(
    observation_space=obs_space,
    action_space=act_space,
    demonstrations=transitions,
    rng=rng,
    policy=student_policy,
)

In [None]:
# Run behavioral-cloning training for 2 epochs.
bc_trainer.train(n_epochs=2)

# Policy Comparison

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Reload both PPO policies from the checkpoints saved by the training cells.
expert = PPO.load("./logs/ppo_MountainCar_expert.zip")
noob = PPO.load("./logs/ppo_MountainCar_noob.zip")

In [None]:
N_EVAL_EPISODES = 100  # episodes evaluated per policy

# The expert is evaluated on the wrapped environment; the BC and noob policies
# share the environment with the unmodified observation space.
test_env_f = make_vec_env(env_id, wrapper_class=wrapper, n_envs=1)
test_env = make_vec_env(env_id, n_envs=1)

try:
    expert_reward, expert_reward_std = evaluate_policy(
        expert, test_env_f, n_eval_episodes=N_EVAL_EPISODES
    )
    bc_reward, bc_reward_std = evaluate_policy(
        bc_trainer.policy, test_env, n_eval_episodes=N_EVAL_EPISODES
    )
    noob_reward, noob_reward_std = evaluate_policy(
        noob, test_env, n_eval_episodes=N_EVAL_EPISODES
    )
finally:
    # Release both environments whether or not evaluation succeeded.
    test_env_f.close()
    test_env.close()

print(f'expert reward {expert_reward} +/- {expert_reward_std}')
print(f'BC reward {bc_reward} +/- {bc_reward_std}')
print(f'noob reward {noob_reward} +/- {noob_reward_std}')

## Compare policy architecture

In [None]:
# Display the expert's policy object as the cell output.
expert.policy

In [None]:
# Display the noob's policy object as the cell output.
noob.policy

In [None]:
# Display the BC-trained policy object as the cell output.
bc_trainer.policy