# Collect trajectories from a trained agent

In [1]:
from stable_baselines3 import PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import numpy as np
import pickle

from imitation.data import types, rollout

# Build the environment the expert acts in: a single MarioEnv whose
# observations are resized (size argument 84), wrapped as a one-env VecEnv,
# frame-stacked (n_stack=4) and passed through VecTransposeImage.
base_env = ResizeObservation(
    MarioEnv(render=False, horizons=True, sticky=True),
    84,
)
env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([lambda: base_env]), n_stack=4),
    skip=False,
)

# Load the trained PPO agent and attach it to the wrapped environment.
model = PPO.load("saved_agents/ppo_resize_sticky_500000_steps.zip", env=env)

num_episodes = 100
trajectories = []

# Loop on the number of *kept* trajectories rather than a fixed episode
# count, because episodes that fail the filter below are discarded.
while len(trajectories) < num_episodes:
    ep_acts, ep_obs, ep_rews, ep_infos = [], [], [], []
    obs = env.reset()
    ep_obs.append(obs[0])  # index 0: the VecEnv wraps exactly one environment
    done = False
    print(len(trajectories), end="\r")
    while not done:
        # predict() returns (actions, states); only the actions go to step().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, dones, info = env.step(action)
        done = bool(dones[0])

        # A VecEnv resets itself when an episode ends, so on the final step
        # `obs` is already the first frame of the *next* episode. The true
        # last frame is reported under info["terminal_observation"].
        if done:
            next_obs = info[0].get("terminal_observation", obs[0])
        else:
            next_obs = obs[0]

        ep_acts.append(action[0])
        ep_obs.append(next_obs)
        ep_rews.append(reward[0])
        ep_infos.append({})

    # Only keep trajectories that made it to the end flag (assume agent is
    # not perfect); a final-step reward of 15.0 is used as the marker.
    if ep_rews[-1] == 15.0:
        trajectories.append(
            types.TrajectoryWithRew(
                acts=np.array(ep_acts),
                obs=np.array(ep_obs),
                rews=np.array(ep_rews),
                infos=ep_infos,
                terminal=True,
            )
        )

# Close once, after all episodes; closing inside the loop would reset an
# already-closed environment on the next iteration.
env.close()

# Persist the collected expert trajectories so the training cells can
# load them later.
with open("expert_horizons_sticky_100.pkl", "wb") as out_file:
    pickle.dump(trajectories, out_file)

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: '_logs_3/ppo_resize_sticky_500000_steps.zip.zip'

In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import pickle
from imitation.data import types, rollout



# Build the wrapped environment; in this cell it is only used to supply the
# observation and action spaces to the BC trainer.
base_env = ResizeObservation(MarioEnv(render=False), 84)
env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([lambda: base_env]), n_stack=4),
    skip=False,
)

# Read back the pickled expert trajectories.
# NOTE(review): pickle.load executes arbitrary code from the file; only
# load files produced by this notebook.
with open("expert_horizons_sticky_100.pkl", "rb") as demo_file:
    rollouts = pickle.load(demo_file)

# Flatten the episodes into individual (obs, action, next_obs) transitions.
transitions = rollout.flatten_trajectories(rollouts)

# Behavioural-cloning trainer fitted on the expert transitions, using the
# spaces of the wrapped environment.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
)

# Train for 100 epochs, then write the resulting policy to disk.
# NOTE(review): the replay cell loads "saved_agents/bc_policy_100epoch_expert",
# so the file saved here presumably has to be moved there by hand - confirm.
bc_trainer.train(n_epochs=100)
bc_trainer.save_policy("bc_policy_100epoch_expert")

In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv


from imitation.algorithms.adversarial.gail import GAIL
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
import pickle

from gym.wrappers import ResizeObservation

import gym
from stable_baselines3.common.callbacks import CheckpointCallback
from imitation.data import rollout, types



# Build the environment GAIL trains in: a single resized MarioEnv wrapped as
# a one-env VecEnv, frame-stacked (n_stack=4) and passed through
# VecTransposeImage.
base_env = ResizeObservation(
    MarioEnv(render=False, horizons=True, starts=False),
    84,
)
env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([lambda: base_env]), n_stack=4),
    skip=False,
)

# Read back the pickled expert trajectories.
# NOTE(review): pickle.load executes arbitrary code from the file; only
# load files produced by this notebook.
with open("expert_horizons_sticky_100.pkl", "rb") as demo_file:
    rollouts = pickle.load(demo_file)

# Flatten the episodes into individual transitions for the discriminator.
transitions = rollout.flatten_trajectories(rollouts)


# Generator: a PPO agent with a CNN policy acting in the wrapped env.
learner = PPO(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.0003,
    batch_size=64,
    n_epochs=10,
    ent_coef=0.0,
)

# Reward network for GAIL, built over the env's spaces with RunningNorm as
# its input-normalisation layer.
reward_net = BasicRewardNet(
    env.observation_space,
    env.action_space,
    normalize_input_layer=RunningNorm,
)

# GAIL trainer tying together the expert transitions, the PPO generator and
# the reward network.
gail_trainer = GAIL(
    venv=env,
    gen_algo=learner,
    reward_net=reward_net,
    demonstrations=transitions,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
)

# Baseline: per-episode rewards of the untrained learner over 100 episodes
# (the second return value is discarded).
learner_rewards_before_training, _ = evaluate_policy(
    learner,
    env,
    n_eval_episodes=100,
    return_episode_rewards=True,
)

# Train in 12 rounds of 25000 timesteps, checkpointing the generator after
# each round; the file name carries the cumulative timestep count.
for round_idx in range(1, 4 * 3 + 1):
    gail_trainer.train(25000)  # Note: set to 300000 for better results
    gail_trainer.gen_algo.save(f"agents/gail_expert_PC_{round_idx * 25000}")

In [None]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import time
import os
from imitation.algorithms import bc

# Build a rendering environment for watching the agent play, wrapped the
# same way as above: resize, one-env VecEnv, frame stack, VecTransposeImage.
base_env = ResizeObservation(
    MarioEnv(render=True, starts=False, sticky=False),
    84,
)
env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([lambda: base_env]), n_stack=4),
    skip=False,
)

model = bc.reconstruct_policy("saved_agents/bc_policy_100epoch_expert") # Load BC agent
# model = PPO.load("saved_agents/gail_expert_PC_5450000.zip") # Load GAIL agent

# Play the loaded policy for a fixed number of episodes.
episodes = 1
for _ in range(episodes):
    obs = env.reset()
    done = False
    while not done:
        # predict() returns (actions, states); only the actions go to step().
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)

# Close once, after all episodes; closing inside the loop would reset an
# already-closed environment whenever episodes > 1.
env.close()

In [None]:
# TODO: The code that connected to PythonController.java and ran sample() to collect trajectories from a human player was accidentally deleted. Rewrite it.
