# Pre-process trajectories collected by a human


In [25]:
import numpy as np 
from imitation.data import types, rollout
import pickle
import cv2

# One directory per recorded human episode: runs 2 through 21 of level 1.
examples = [f"src/Data/human/lvl-1/{run}" for run in range(2, 22)]

# Keep every `subsample_rate`-th recorded step (1 keeps every step).
subsample_rate = 1
# When True, episodes shorter than the horizon are padded with filler steps.
horizons = True
# Horizon used by the padding logic.
max_timestep = 1025

# Receives one trajectory per entry of `examples`.
trajectories = []

# Shape of one stacked observation: the 4 most recent 84x84 frames.
_STACK_SHAPE = (4, 84, 84)

# Pixel value that fills the padding frames appended to short episodes.
_PAD_PIXEL = 85

# Recorded tile id -> grey value. Like in the environment, the tile encoding is
# manually replaced with our state representation. Sky (0) is left untouched.
_TILE_TO_PIXEL = {
    17: 25,   # ground
    18: 50,   # stair block
    56: 75,   # flag pole
    55: 75,   # flag top
    34: 100,  # pipe
    35: 100,
    36: 100,
    37: 100,
    22: 125,  # break block
    24: 150,  # coin block
    2: 175,   # enemy
    12: 200,  # mushroom
    30: 225,  # coin
    99: 255,  # mario
    97: 245,  # mario left
}

# Recorded button pattern -> discrete action id. Any other pattern maps to 0.
_BUTTONS_TO_ACTION = {
    (False, False, False, False, True): 1,  # Jump
    (True, False, False, False, False): 2,  # Left
    (True, False, False, True, False): 3,   # Left Run
    (True, False, False, False, True): 4,   # Left Jump
    (True, False, False, True, True): 5,    # Left Run Jump
    (False, True, False, False, False): 6,  # Right
    (False, True, False, True, False): 7,   # Right Run
    (False, True, False, False, True): 8,   # Right Jump
    (False, True, False, True, True): 9,    # Right Run Jump
}

# Jump actions, in the order in which they win within a window of skipped frames.
_JUMP_ACTIONS = (1, 4, 5, 8, 9)


def _read_header(f):
    """Return the first line of `f` split on ", " (the dumped array's dimensions)."""
    return f.readline().strip().split(", ")


def _read_values(f, count, cast):
    """Read `count` lines from `f` and convert each stripped line with `cast`."""
    return [cast(f.readline().strip()) for _ in range(count)]


def _push_frame(previous, frame):
    """Return a new stack holding the last 3 frames of `previous` followed by `frame`.

    `frame` may be an 84x84 array or a scalar that fills the whole frame.
    """
    stacked = np.zeros(_STACK_SHAPE)
    stacked[:3] = previous[1:]
    stacked[3] = frame
    return stacked


def _parse_observations(path, subsample_rate):
    """Parse the observation dump at `path` into a list of stacked 4x84x84 frames."""
    ep_obs = []
    with open(path) as f:
        header = _read_header(f)  # first line gives the lengths of the arrays
        n_frames, n_rows, n_cols = int(header[0]), int(header[1]), int(header[2])
        stacked = np.zeros(_STACK_SHAPE)

        for index in range(n_frames):
            # Always consume the frame's lines, even when it is subsampled away;
            # otherwise every later frame would be read from the wrong offset.
            values = _read_values(f, n_rows * n_cols, int)
            if index % subsample_rate != 0:  # index 0 is always kept
                continue

            frame = np.array(values, dtype=np.uint8).reshape(n_rows, n_cols)
            for tile, pixel in _TILE_TO_PIXEL.items():
                frame[frame == tile] = pixel

            frame = np.moveaxis(frame, -1, 0)  # swap x and y for readability

            # Resize to 80x80, then zero-pad 2 pixels per side to reach 84x84.
            frame = cv2.resize(frame, (80, 80), interpolation=cv2.INTER_AREA)
            frame = np.pad(frame, [[2, 2], [2, 2]])

            stacked = _push_frame(stacked, frame)
            ep_obs.append(stacked)
    return ep_obs


def _parse_actions(path, subsample_rate):
    """Parse the action dump at `path` into a list of discrete action ids."""
    ep_acts = []
    with open(path) as f:
        header = _read_header(f)  # first line gives the lengths of the arrays
        n_steps, n_buttons = int(header[0]), int(header[1])
        window = []  # actions seen since the last kept step

        for index in range(n_steps):
            pressed = tuple(v == 1 for v in _read_values(f, n_buttons, int))
            window.append(_BUTTONS_TO_ACTION.get(pressed, 0))

            if index % subsample_rate != 0:  # index 0 is always kept
                continue

            # Jump actions take precedence within the skipped frames;
            # otherwise take the most frequent action of the window.
            jump = next((a for a in _JUMP_ACTIONS if a in window), None)
            if jump is not None:
                ep_acts.append(jump)
            else:
                ep_acts.append(int(np.bincount(window).argmax()))
            window = []
    return ep_acts


def _parse_infos(path, subsample_rate):
    """Parse the info dump at `path` into a list of per-step float lists."""
    ep_infos = []
    with open(path) as f:
        header = _read_header(f)
        n_steps, n_fields = int(header[0]), int(header[1])

        for index in range(n_steps):
            # Consume the record before deciding whether to keep it, so that
            # skipped records do not shift the file position of later ones.
            info = _read_values(f, n_fields, float)
            if index % subsample_rate == 0:
                ep_infos.append(info)
    return ep_infos


def _pad_to_horizon(ep_obs, ep_acts, ep_infos, ep_rews, max_timestep):
    """Append filler steps (action 0, reward 0.0, empty info) to a short episode, in place."""
    if len(ep_acts) >= max_timestep - 1:
        return

    # Four transition steps: each one shifts the newest stack by one frame, so
    # the padding frame gradually replaces the last real frames.
    for _ in range(4):
        ep_obs.append(_push_frame(ep_obs[-1], _PAD_PIXEL))
        ep_acts.append(0)
        ep_infos.append([])
        ep_rews.append(0.0)

    # NOTE(review): this count is computed after the four transition steps were
    # appended, so padded episodes end with max_timestep - 4 actions. Confirm
    # this is the intended horizon.
    for _ in range(max_timestep - len(ep_acts) - 4):
        ep_obs.append(np.ones(_STACK_SHAPE) * _PAD_PIXEL)
        ep_acts.append(0)
        ep_infos.append([])
        ep_rews.append(0.0)


# Convert every recorded episode directory into one trajectory.
for example in examples:
    ep_obs = _parse_observations(example + '/obs.txt', subsample_rate)
    ep_acts = _parse_actions(example + '/acts.txt', subsample_rate)
    # Infos are not used for training but are kept for debugging.
    ep_infos = _parse_infos(example + '/infos.txt', subsample_rate)
    # Rewards are discarded for now as they are not used in BC or GAIL.
    ep_rews = [0.0] * len(ep_infos)

    # NOTE: len(obs) == len(acts) + 1 == len(infos) + 1 == len(rews) + 1
    if len(ep_acts) == len(ep_obs):
        ep_acts.pop()
        ep_rews.pop()
        ep_infos.pop()

    if horizons:
        _pad_to_horizon(ep_obs, ep_acts, ep_infos, ep_rews, max_timestep)

    trajectories.append(types.TrajectoryWithRew(acts=np.array(ep_acts),
                                                obs=np.array(ep_obs, dtype=np.uint8),
                                                rews=np.array(ep_rews),
                                                infos=ep_infos,
                                                terminal=True))

# Report how many trajectories were built, then pickle the whole list.
print(len(trajectories))

with open("human_20ep_0skip_horizons.pkl", "wb") as out_file:
    pickle.dump(trajectories, out_file)




20


# Test trajectory by running it through the env sequentially

In [2]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import time
import os
from imitation.algorithms import bc
import numpy as np
import pickle 

# Generate the rendering environment and wrap it: resize to 84, vectorise,
# stack 4 frames and transpose the image.
env = MarioEnv(render=True, starts = False, sticky=False, timer=45, skip=1)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

# NOTE: pickle.load executes arbitrary code; only load trajectory files you created yourself.
with open("human_20ep_0skip.pkl", "rb") as f:
    rollouts = pickle.load(f)

# Replay the recorded actions of the first trajectory step by step.
try:
    for episode in [rollouts[0]]:
        env.reset()
        for recorded_action in episode.acts:
            env.step([recorded_action])  # the vectorised env expects one action per sub-env
finally:
    # Close once, after all episodes: closing inside the episode loop would
    # leave any later episode stepping an already closed environment.
    env.close()

# Training an Agent

In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import pickle
from imitation.data import types, rollout



# Headless environment, wrapped the same way as everywhere else in this file:
# resize to 84, vectorise, stack 4 frames, transpose the image.
env = ResizeObservation(MarioEnv(render=False, skip=1), 84)
env = DummyVecEnv([lambda: env])
env = VecTransposeImage(VecFrameStack(env, n_stack=4), skip=False)

# Load trajectories
# with open("expert_horizons_sticky_100.pkl", "rb") as f:
with open("human_20ep_0skip.pkl", "rb") as demo_file:
    rollouts = pickle.load(demo_file)

# Flatten into unordered (obs, action, next_obs) tuples.
transitions = rollout.flatten_trajectories(rollouts)

# Behavioural-cloning trainer over the flattened demonstrations.
bc_trainer = bc.BC(
    demonstrations=transitions,
    observation_space=env.observation_space,
    action_space=env.action_space,
)

# Train in 10 rounds of 100 epochs (100 epochs ~4 mins), saving a policy
# checkpoint named after the cumulative epoch count after every round.
for round_number in range(1, 11):
    bc_trainer.train(n_epochs=100)
    bc_trainer.save_policy(f"agents/bc_human_20ep/bc_0skip_{round_number * 100}")

In [None]:
from imitation.algorithms import bc
from mario_env import MarioEnv


from imitation.algorithms.adversarial.gail import GAIL
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
import pickle

from gym.wrappers import ResizeObservation

import gym
from stable_baselines3.common.callbacks import CheckpointCallback
from imitation.data import rollout, types


# Headless environment built with horizons=True and max_timestep=1000, wrapped
# like the others: resize to 84, vectorise, stack 4 frames, transpose the image.
env = ResizeObservation(
    MarioEnv(render=False, horizons=True, starts=False, skip=1, max_timestep=1000),
    84,
)
env = DummyVecEnv([lambda: env])
env = VecTransposeImage(VecFrameStack(env, n_stack=4), skip=False)

# Demonstrations produced by the pre-processing cell (padded-horizon variant).
with open("human_20ep_0skip_horizons.pkl", "rb") as demo_file:
    rollouts = pickle.load(demo_file)

transitions = rollout.flatten_trajectories(rollouts)

# Generator: PPO with a CNN policy.
learner = PPO(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.0003,
    batch_size=64,
    n_epochs=10,
    ent_coef=0.0,
)

# Discriminator / reward network with running input normalisation.
reward_net = BasicRewardNet(
    env.observation_space,
    env.action_space,
    normalize_input_layer=RunningNorm,
)

gail_trainer = GAIL(
    venv=env,
    gen_algo=learner,
    reward_net=reward_net,
    demonstrations=transitions,
    demo_batch_size=512, # EDITED normally 1024
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
)

# Single training run, then save the generator.
gail_trainer.train(25000)
gail_trainer.gen_algo.save("agents/gail_human_20ep/gail")

# for i in range(4*10):
#     gail_trainer.train(25000) 
#     gail_trainer.gen_algo.save("agents/gail_human_20ep/gail_0skip_" + str( (i+1) * 25000))

# Testing agents

In [None]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
import time
import os
from imitation.algorithms import bc
import numpy as np
import pickle 

# Generate the rendering environment and wrap it: resize to 84, vectorise,
# stack 4 frames and transpose the image.
env = MarioEnv(render=True, starts = False, sticky=False, timer=45, skip=1)
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

model = bc.reconstruct_policy("agents/bc_human_20ep/bc_0skip_400") # Load BC agent
# model = PPO.load("saved_agents/gail_expert_PC_5450000.zip") # Load GAIL agent

episodes = 1
try:
    for i in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            # predict() returns (action, state); only the action goes to the env.
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)

            # Dump the newest frame of the stack (index 3) as a tab-separated grid.
            screen = obs[0][3]
            for y in screen:
                for x in y:
                    print(x, end="\t")
                print("")
            print("---")
finally:
    # Close once, after all episodes: closing inside the episode loop would
    # leave any later episode resetting an already closed environment.
    env.close()