In [None]:
# Disabled compatibility shim: the three commented lines below would register
# gymnasium under the name "gym" in sys.modules, so that later `import gym`
# statements resolve to gymnasium instead of the classic gym package.
# import sys
# import gymnasium as gym
# sys.modules["gym"] = gym
import gym
# Show the installed gym version as the cell output.
gym.__version__

In [None]:
import gnwrapper

In [None]:
import os
import numpy as np
import torch
# Report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())

In [None]:
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

In [None]:
# Gym environment id shared by every environment built in this notebook.
env_id = "MountainCarContinuous-v0"
# Number of environment copies per vectorized training env: passed as n_envs
# to make_vec_env and used to scale eval_freq, gradient_steps and n_steps.
NUM_CPU = 32  # number of vectorized environment copies

In [None]:
# Vectorized training env for the expert: NUM_CPU copies of env_id.
# NOTE(review): no vec_env_cls is passed, so this presumably falls back to
# make_vec_env's default (DummyVecEnv, i.e. a single process) — confirm
# whether true multi-process execution is wanted.
train_env_f = make_vec_env(env_id=env_id, n_envs=NUM_CPU)
# Cell output: the observation space the expert is trained on.
train_env_f.observation_space

In [None]:
# Evaluate on a dedicated environment. Running the evaluation on the training
# env resets it in the middle of data collection, so the transitions gathered
# right after each evaluation no longer match the observations the model
# remembers, and evaluation episodes get mixed into the training statistics.
eval_env_f = make_vec_env(env_id, n_envs=1)

# The callback is invoked once per vectorized step (NUM_CPU timesteps), so
# this evaluates roughly every 2**10 environment timesteps. eval_freq is an
# integer count, hence floor division; max() keeps it >= 1 for large NUM_CPU.
eval_callback = EvalCallback(
    eval_env_f,
    best_model_save_path="./logs/best_model/expert",
    log_path="./logs/results",
    eval_freq=max(2**10 // NUM_CPU, 1),
    deterministic=True,
    render=False,
)

# SAC expert trained on the fully observed environment.
model = SAC(
    "MlpPolicy",
    train_env_f,
    verbose=1,
    seed=0,
    learning_rate=0.0003,
    buffer_size=50000,
    learning_starts=0,
    batch_size=512,
    tau=0.01,
    gamma=0.9999,
    gradient_steps=2 * NUM_CPU,
    use_sde=True,
    tensorboard_log="./logs/sac_MountainCar_tensorboard_expert/",
)

model.learn(total_timesteps=50_000, callback=eval_callback, progress_bar=True)
model.save("./logs/sac_MountainCar_expert")

del model # remove to demonstrate saving and loading

In [None]:
# Vectorized training env for the partially observed learner: NUM_CPU copies
# of env_id, each constructed with full_obs=False (forwarded via env_kwargs).
# NOTE(review): full_obs is not defined in this notebook — presumably a
# locally modified MountainCarContinuous accepts it; confirm.
train_env = make_vec_env(env_id, n_envs=NUM_CPU, env_kwargs={"full_obs": False})
# Cell output: the reduced observation space.
train_env.observation_space

In [None]:
# Evaluate on a dedicated environment built with the same kwargs as the
# training env. Evaluating on the training env itself resets it between PPO
# rollouts, which desynchronizes the collected transitions and mixes
# evaluation episodes into the training statistics.
eval_env = make_vec_env(env_id, n_envs=1, env_kwargs=dict(full_obs=False))

# NOTE: log_path is the same directory the expert's EvalCallback writes to.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/best_model/expert_noob",
    log_path="./logs/results",
    eval_freq=500,
    deterministic=True,
    render=False,
)

# PPO learner trained on the partially observed environment.
model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,
    seed=0,
    batch_size=512,
    # ent_coef=0.00429,
    learning_rate=7.77e-05,
    n_epochs=10,
    n_steps=8 * NUM_CPU,
    gae_lambda=0.9,
    gamma=0.9999,
    # clip_range=0.1,
    max_grad_norm=5,
    # vf_coef=0.19,
    use_sde=True,
    # policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
    tensorboard_log="./logs/ppo_MountainCar_tensorboard_noob/",
)

model.learn(total_timesteps=1_000_000, callback=eval_callback, progress_bar=True)
model.save("./logs/ppo_MountainCar_noob")

del model # remove to demonstrate saving and loading

In [None]:
# test_env = gym.make(env_id, render_mode="human")
# test_env = gnwrapper.Animation(test_env)
# test_env = DummyVecEnv([lambda: test_env])

# Single-environment VecEnv whose inner env is wrapped by gnwrapper.Animation.
test_env = make_vec_env(env_id,
                        # env_kwargs=dict(
                        #     render_mode="rgb_array"
                        # ),
                        wrapper_class=gnwrapper.Animation,
                        n_envs=1)

# The expert in this notebook is trained with SAC and saved to
# ./logs/sac_MountainCar_expert, so it must be loaded with SAC from that
# path; the previous PPO.load of ppo_MountainCar_expert.zip referenced a
# checkpoint this notebook never writes.
expert = SAC.load("./logs/sac_MountainCar_expert.zip", print_system_info=True)

obs = test_env.reset()

# Roll the expert out for at most 1000 steps, rendering every step.
for _ in range(1000):
    action, _states = expert.predict(obs)
    obs, rewards, dones, info = test_env.step(action)
    test_env.render(mode="rgb_array")

    # dones is a per-environment array; with n_envs=1 this stops as soon as
    # the single episode ends.
    if dones.any():
        break

test_env.close()

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Score the expert over 100 evaluation episodes and print the first returned
# value (the second one is discarded).
# NOTE(review): test_env was closed at the end of the previous cell — confirm
# it is still usable for stepping here.
reward, _ = evaluate_policy(expert, test_env, n_eval_episodes=100)
print(reward)

In [None]:
import dataclasses

from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

# Minimum number of expert episodes to collect. Previously this constant was
# defined as 10 but never used while 100 was hard-coded in the call below; it
# now carries the value that was actually in effect.
NUM_EPISODES = 100
full_env = gym.make(env_id)
# Seeded generator so that demonstration sampling (and the BC trainer that
# reuses this rng) is reproducible across kernel restarts.
rng = np.random.default_rng(0)
rollouts = rollout.rollout(
    expert,
    DummyVecEnv([lambda: RolloutInfoWrapper(full_env)]),
    rollout.make_sample_until(min_timesteps=None, min_episodes=NUM_EPISODES),
    rng=rng,
)

# Remove observation column index 2 from every trajectory.
# NOTE(review): presumably this is the feature hidden from the
# full_obs=False learner — confirm against the environment definition.
# The loop variable is named `traj` so it does not read like the imported
# `rollout` module.
rollouts = [
    dataclasses.replace(traj, obs=np.delete(traj.obs, 2, axis=1))
    for traj in rollouts
]
transitions = rollout.flatten_trajectories(rollouts)

In [None]:
# Summarize what was collected: number and type of trajectories, number of
# flattened transitions, and the attribute names stored on the transitions
# object. (A stray double quote that used to be printed after the final
# period has been removed from the message.)
print(
    f"""The `rollout` function generated a list of {len(rollouts)} {type(rollouts[0])}.
After flattening, this list is turned into a {type(transitions)} object containing {len(transitions)} transitions.
The transitions object contains arrays for: {', '.join(vars(transitions).keys())}.
"""
)

In [None]:
from imitation.algorithms import bc
from stable_baselines3.common.policies import ActorCriticPolicy

# Environment built with full_obs=False and wrapped by gnwrapper.Animation;
# it supplies the spaces for the BC trainer below.
env = gnwrapper.Animation(gym.make(env_id, full_obs=False))

# Behavioural-cloning trainer fed with the flattened expert transitions.
# No explicit policy is passed; the commented block shows a manual
# ActorCriticPolicy alternative.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
    rng=rng,
    # policy=ActorCriticPolicy(observation_space=env.observation_space,
    #                          action_space=env.action_space,
    #                          lr_schedule=lambda _: torch.finfo(torch.float32).max,
    #                          net_arch=[64, 64]
    #                          )
)

In [None]:
# Train the behavioural-cloning policy on the demonstrations for 2 epochs.
bc_trainer.train(n_epochs=2)

In [None]:
# Load the saved PPO learner here so this cell works on a fresh
# "Restart & Run All": `noob` was otherwise only assigned in a later cell,
# which made this line raise NameError.
noob = PPO.load("./logs/ppo_MountainCar_noob.zip")
# Cell output: the learner's policy object.
noob.policy

In [None]:
# Cell output: the policy object held by the BC trainer.
bc_trainer.policy

In [None]:
# Cell output: observation space of the full_obs=False environment.
env.observation_space

In [None]:
# Score the behaviour-cloned policy on the full_obs=False environment over
# 100 evaluation episodes; only the first returned value is reported.
reward, _ = evaluate_policy(bc_trainer.policy, env, n_eval_episodes=100)
print(f"BC reward: {reward}")

In [None]:
# Reload the PPO learner saved by the training cell and score it on the
# same full_obs=False environment over 100 evaluation episodes.
noob = PPO.load("./logs/ppo_MountainCar_noob.zip")
reward, _ = evaluate_policy(noob, env, n_eval_episodes=100)
print(f"PPO reward: {reward}")

In [None]:
# Environment built with default arguments (no full_obs override), wrapped
# by gnwrapper.Animation; used to score the expert.
env_f = gnwrapper.Animation(gym.make(env_id))
# Cell output: its observation space.
env_f.observation_space

In [None]:
# The expert is trained with SAC and saved to ./logs/sac_MountainCar_expert
# earlier in this notebook, so load it with SAC from that path; the previous
# PPO.load of ppo_MountainCar_expert.zip pointed at a checkpoint this
# notebook never produces.
expert = SAC.load("./logs/sac_MountainCar_expert.zip")
# Score the expert on the default-argument environment over 100 episodes.
reward, _ = evaluate_policy(expert, env_f, n_eval_episodes=100)
print(f"Expert reward: {reward}")