# Collect trajectories from a trained agent

In [None]:
from stable_baselines3 import PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import numpy as np
import pickle

from imitation.data import types, rollout

# Generate environment and wrap it
env = MarioEnv(render=False, horizons=True, sticky=True)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

model = PPO.load("saved_agents/ppo_resize_sticky_500000_steps.zip", env=env) # Load agent

num_episodes = 100
trajectories = []

# for i in range(num_episodes):
while len(trajectories) < num_episodes: # in case we discard bad samples
    ep_acts, ep_obs, ep_rews, ep_dones, ep_infos = [], [], [], [], []
    obs = env.reset()
    ep_obs.append(obs[0])
    done = False
    print(len(trajectories), end="\r")
    while not done:
        action = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)

        ep_acts.append(action[0][0])
        ep_obs.append(obs[0])
        ep_rews.append(reward[0])
        ep_dones.append(done)
        ep_infos.append({})

    if ep_rews[-1] == 15.0: # Only keep trajectories that made it to the end flag (assume agent is not perfect)
        trajectories.append(types.TrajectoryWithRew(acts=np.array(ep_acts),
                                                    obs=np.array(ep_obs),
                                                    rews=np.array(ep_rews),
                                                    infos=ep_infos,
                                                    terminal=True))
    env.close()

with open("expert_horizons_sticky_100.pkl", "wb") as f:
    pickle.dump(trajectories, f)

# CODE TO COLLECT TRAJECTORIES FROM HUMAN

In [None]:
import pickle

with open("human_demo_1.pkl", "rb") as f:
# with open('expert_horizons_sticky_100.pkl', 'rb') as f:
    rollouts = pickle.load(f)

ep = rollouts[0]
for i in ep.obs:
    screen = i[0]
    for y in screen[::]:
        for x in y[::]:
            print(x, end="\t")
        print("")
# print(ep.obs.shape)
# print(ep.obs.dtype)
# screen = ep.obs[0][0]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")
# print(ep.acts.shape)
# print(len(ep.infos))

In [None]:
# TODO: Allow execution to come from python side OR purely through Java
# Do this with filewriter IF fps isnt affected
import numpy as np 
from imitation.data import types, rollout
import pickle
import cv2


ep_obs = []
with open('src/Data/human/lvl-1/1/obs.txt') as f:
    arr = (f.readline().strip().split(", "))
    # print(arr)

    stacked_obs = np.zeros([4, 84, 84])
    # obs = []
    # for j in range((int)(arr[1])):
    #     row = []
    #     for k in range((int)(arr[2])):
    #         row.append((int)(f.readline().strip()))
    #     obs.append(row)
    #     # print(obs)
    # obs = np.array(obs, dtype=np.uint8)

    #     # self.obs[self.obs == 0] = 0 # sky 
    # obs[obs == 17] = 25 # ground
    # obs[obs == 18] = 50 # stair block

    # obs[obs == 56] = 75 # flag pole   
    # obs[obs == 55] = 75 # flag top

    # obs[obs == 34] = 100 # pipe
    # obs[obs == 35] = 100
    # obs[obs == 36] = 100
    # obs[obs == 37] = 100

    # obs[obs == 22] = 125 # break block
    # obs[obs == 24] = 150 # coint block

    # obs[obs == 2] = 175 # enemy 
    # obs[obs == 22] = 125 # break block
    # obs[obs == 24] = 150 # coint block
      
    # obs[obs == 2] = 175 # enemy 

    # obs[obs == 12] = 200 # mushroom
    # obs[obs == 30] = 225 # coin

    # obs[obs == 99] = 255 # mario
    # obs[obs == 97] = 245 # mario left


    # obs = np.moveaxis(obs, -1, 0) # Swap x and y for readability
    # # print(obs)


    # obs = cv2.resize(obs, [80, 80], interpolation=cv2.INTER_AREA)
    # obs = np.pad(obs, [[2,2],[2,2]])
    # obs = obs.reshape([1, 84, 84])

    # s = np.zeros([4, 84, 84])

    # s[0] = stacked_obs[1]
    # s[1] = stacked_obs[2]
    # s[2] = stacked_obs[3]
    # s[3] = obs

    # ep_obs.append(s)
    # stacked_obs = s

    stride = 0
    print(arr)

    for i in range((int)(arr[0])):
        obs = []
        for j in range((int)(arr[1])):
            row = []
            for k in range((int)(arr[2])):
                row.append((int)(f.readline().strip()))
            obs.append(row)
        # print(obs)
        obs = np.array(obs, dtype=np.uint8)

        # self.obs[self.obs == 0] = 0 # sky 
        obs[obs == 17] = 25 # ground
        obs[obs == 18] = 50 # stair block

        obs[obs == 56] = 75 # flag pole   
        obs[obs == 55] = 75 # flag top

        obs[obs == 34] = 100 # pipe
        obs[obs == 35] = 100
        obs[obs == 36] = 100
        obs[obs == 37] = 100

        obs[obs == 22] = 125 # break block
        obs[obs == 24] = 150 # coint block

        obs[obs == 2] = 175 # enemy 
        obs[obs == 22] = 125 # break block
        obs[obs == 24] = 150 # coint block
        
        obs[obs == 2] = 175 # enemy 

        obs[obs == 12] = 200 # mushroom
        obs[obs == 30] = 225 # coin

        obs[obs == 99] = 255 # mario
        obs[obs == 97] = 245 # mario left


        obs = np.moveaxis(obs, -1, 0) # Swap x and y for readability
        # print(obs)


        obs = cv2.resize(obs, [80, 80], interpolation=cv2.INTER_AREA)
        obs = np.pad(obs, [[2,2],[2,2]])
        obs = obs.reshape([1, 84, 84])

        s = np.zeros([4, 84, 84])

        s[0] = stacked_obs[1]
        s[1] = stacked_obs[2]
        s[2] = stacked_obs[3]
        s[3] = obs

        if stride % 2 == 0 and stride > 0:
            ep_obs.append(s)
            stacked_obs = s

        stride += 1

    print(stacked_obs)


ep_acts = []
with open('src/Data/human/lvl-1/1/acts.txt') as f:
    arr = (f.readline().strip().split(", "))
    stride = 0
    potential_acts = []
    # print(arr)
    for i in range((int)(arr[0])):
        
        act = []
        for j in range((int)(arr[1])):
            act.append((int)(f.readline().strip()))
        act = (np.array(act)==1)

        # convert for action array to action 
        action = 0
        if np.all(act == [False, False, False, False, True]):
            action = 1 # Jump
        elif np.all(act == [True, False, False, False, False]):
            action = 2 # Left
        elif np.all(act == [True, False, False, True, False]):
            action = 3 # Left Run
        elif np.all(act == [True, False, False, False, True]):
            action = 4 # Left Jump
        elif np.all(act == [True, False, False, True, True]):
            action = 5 # Left Run Jump
        elif np.all(act == [False, True, False, False, False]):
            action = 6 # Right
        elif np.all(act == [False, True, False, True, False]):
            action = 7 # Right Run
        elif np.all(act == [False, True, False, False, True]):
            action = 8 # Right Jump
        elif np.all(act == [False, True, False, True, True]):
            action = 9 # Right Run Jump

        potential_acts.append(action)

        if stride % 2  == 0 and stride > 0:
            
            if 1 in potential_acts:
                ep_acts.append(1)
            elif 4 in potential_acts:
                ep_acts.append(4)
            elif 5 in potential_acts:
                ep_acts.append(5)
            elif 8 in potential_acts:
                ep_acts.append(8)
            elif 9 in potential_acts:
                ep_acts.append(9)
            else:
                ep_acts.append(np.bincount(potential_acts).argmax())
            print(potential_acts, ep_acts[-1])
            potential_acts = []
        stride += 1


ep_infos = []
ep_rews = []
with open('src/Data/human/lvl-1/1/infos.txt') as f:
    arr = (f.readline().strip().split(", "))
    stride = 0

    # print(arr)
    for i in range((int)(arr[0])):
        info = []en(ep_acts) == 0 or
        for j in range((int)(arr[1])):
            info.append((float)(f.readline().strip()))
        # print(info)

        if stride % 2 == 0 and stride > 0:
            ep_infos.append(info)
            ep_rews.append(0.0) # for now discard reward as it is not used in BC or GAIL
        stride += 1
   
if len(ep_acts) == len(ep_obs):
    ep_acts.pop()
    ep_rews.pop()
    ep_infos.pop()

print(len(ep_obs))
print(len(ep_acts))
print(len(ep_infos))
print(len(ep_rews))

trajectories = []
trajectories.append(types.TrajectoryWithRew(acts=np.array(ep_acts),
                                            obs=np.array(ep_obs, dtype=np.uint8),
                                            rews=np.array(ep_rews),
                                            infos=ep_infos,
                                            terminal=True))
                                    
with open("human_demo_1_2skip.pkl", "wb") as f:
    pickle.dump(trajectories, f)


# # print(ep_acts.shape)
# # # print(ep_infos)
# # print(ep_obs)

In [None]:
with open("human_demo_1_4skip.pkl", "rb") as f:
    rollouts = pickle.load(f)



In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import pickle
from imitation.data import types, rollout



# Generate environment and wrap it
env = MarioEnv(render=False)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

# Load trajectories
# with open("expert_horizons_sticky_100.pkl", "rb") as f:
with open("human_demo_1_2skip.pkl", "rb") as f:
    rollouts = pickle.load(f)

transitions = rollout.flatten_trajectories(rollouts) # flatten into unordered obs, action, next_obs tuples

# Set up BC trainer model
bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
)

# Train agent
for i in range(50):
    bc_trainer.train(n_epochs=10000)
    bc_trainer.save_policy("agents/bc_human/bc_2skip_3_" + str((i+1)*10000))

In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv


from imitation.algorithms.adversarial.gail import GAIL
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
import pickle

from gym.wrappers import ResizeObservation

import gym
from stable_baselines3.common.callbacks import CheckpointCallback
from imitation.data import rollout, types



# Generate environment and wrap it
env = MarioEnv(render=False, horizons=True, starts=False)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

with open("human_demo_1_2skip.pkl", "rb") as f:
    rollouts = pickle.load(f)

transitions = rollout.flatten_trajectories(rollouts)


learner = PPO(
    env=env,
    policy="CnnPolicy",
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
    
)

reward_net = BasicRewardNet(
    env.observation_space, env.action_space, normalize_input_layer=RunningNorm
)
gail_trainer = GAIL(
    demonstrations=transitions,
    demo_batch_size=512, # EDITED normally 1024
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=env,
    gen_algo=learner,
    reward_net=reward_net,
)

# learner_rewards_before_training, _ = evaluate_policy(
#     learner, env, 100, return_episode_rewards=True
# )

# gail_trainer.train(3000)

for i in range(4*1000):
    gail_trainer.train(25000)  # Note: set to 300000 for better results
    gail_trainer.gen_algo.save("agents/gail/gail_expert_PC_2skip_" + str( (i+1) * 25000))

In [2]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import time
import os
from imitation.algorithms import bc
import numpy as np
import pickle 

# Generate environment and wrap it
env = MarioEnv(render=True, starts = False, sticky=False, timer=45)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

# model = bc.reconstruct_policy("saved_agents/bc_policy_100epoch_expert") # Load BC agent
# model = bc.reconstruct_policy("agents/bc_human/bc_4skip_2_90000")
# model = bc.reconstruct_policy("agents/bc_human/bc_0skip_30000")
# model = PPO.load("saved_agents/gail_expert_PC_5450000.zip") # Load GAIL agent
model = PPO.load("agents/gail/gail_expert_PC_2skip_1500000.zip")

# model = bc.reconstruct_policy("agents/bc_human/bc_2skip_3_220000")
# obs = env.reset()

# screen = obs[0][3]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")

# print(obs.shape)
# print(np.sum(obs[0][0]))
# print(np.sum(obs[0][1]))
# print(np.sum(obs[0][2]))
# print(np.sum(obs[0][3]))

# action = model.predict(obs)
# obs, reward, done, info = env.step(action)

# screen = obs[0][3]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")


# print(np.sum(obs[0][0]))
# print(np.sum(obs[0][1]))
# print(np.sum(obs[0][2]))
# print(np.sum(obs[0][3]))

# action = model.predict(obs)
# obs, reward, done, info = env.step(action)

# screen = obs[0][3]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")


# print(np.sum(obs[0][0]))
# print(np.sum(obs[0][1]))
# print(np.sum(obs[0][2]))
# print(np.sum(obs[0][3]))

# action = model.predict(obs)
# obs, reward, done, info = env.step(action)

# screen = obs[0][3]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")


# print(np.sum(obs[0][0]))
# print(np.sum(obs[0][1]))
# print(np.sum(obs[0][2]))
# print(np.sum(obs[0][3]))

# action = model.predict(obs)
# obs, reward, done, info = env.step(action)

# print(np.sum(obs[0][0]))
# print(np.sum(obs[0][1]))
# print(np.sum(obs[0][2]))
# print(np.sum(obs[0][3]))

# screen = obs[0][3]
# for y in screen[::]:
#     for x in y[::]:
#         print(x, end="\t")
#     print("")

# env.close()
with open("human_demo_1_0skip.pkl", "rb") as f:
# with open('expert_horizons_sticky_100.pkl', 'rb') as f:
    rollouts = pickle.load(f)
ep = rollouts[0]

episodes = 1
for i in range(episodes):
    obs = env.reset()
    done = False
    # env.step([6])
    # env.step([0])

    s = 0
    while not done:
        # action = model.predict(obs, deterministic=True)
        action = [ep.acts[s]]
        obs, reward, done, info = env.step(action)
        s+=1
    env.close()

IndexError: index 1107 is out of bounds for axis 0 with size 1107

In [None]:
import pickle


with open("human_demo_1_4skip.pkl", "rb") as f:
# with open('expert_horizons_sticky_100.pkl', 'rb') as f:
    rollouts = pickle.load(f)

for i in range(100):

    ep = rollouts[0]
    print(ep.obs.shape)
    screen = ep.obs[i][3]
    for y in screen[::]:
        for x in y[::]:
            print(x, end="\t")
        print("")
    print(ep.acts[i])
    

    # print(ep.obs.shape)

    # print(np.sum(ep.obs[2][0]))
    # print(np.sum(ep.obs[2][1]))
    # print(np.sum(ep.obs[2][2]))
    # print(np.sum(ep.obs[2][3]))
    # print(ep.acts.shape)
    # print(len(ep.infos))

    # with open("human_demo_1.pkl", "rb") as f:
    # with open('expert_horizons_sticky_100.pkl', 'rb') as f:
    #     rollouts = pickle.load(f)

    # ep = rollouts[0]
    # # print(ep.obs.shape)

    # # print(np.sum(ep.obs[2][0]))
    # # print(np.sum(ep.obs[2][1]))
    # # print(np.sum(ep.obs[2][2]))
    # # print(np.sum(ep.obs[2][3]))

    # screen = ep.obs[i][3]
    # for y in screen[::]:
    #     for x in y[::]:
    #         print(x, end="\t")
    #     print("")

    # print(ep.acts[i])

    print("----------------------" + str(i) + "-----------------")
    # print(ep.acts.shape)
    # print(len(ep.infos))

In [None]:
# TODO: Accidentally deleted code to connect to PythonController.java and run sample() to collect trajectories from human. Remake this code eventually!
