# Random initial setup tests


In [None]:
# Test that the jpype and Java environment are set up properly

import os
import sys
from pathlib import Path
import jpype

# Start the JVM only once per kernel; "-ea" turns on Java assertions.
if not jpype.isJVMStarted():
    jpype.startJVM(jpype.getDefaultJVMPath(), "-ea")

# Directory that holds the compiled Mario-AI-Framework classes.
# The location can be overridden per machine through the MARIO_AI_SRC
# environment variable; when it is unset, the original developer path is used.
# The value is made absolute (relative to the current working directory) before
# it is handed to jpype, so a relative override such as
# "Mario-AI-Framework/src" also works.
mario_src_dir = os.environ.get(
    "MARIO_AI_SRC",
    "/home/mikeg/Documents/CMPUT652FinalProject/CMPUT652FinalProject/Mario-AI-Framework/src",
)
jpype.addClassPath(os.path.abspath(mario_src_dir))
main = jpype.JClass('PythonController')

# Run a sample to test jpype is set up correctly:
# reset the controller, then advance it by one step with a fixed button vector.
main.reset(False)
result = main.step([True, False, False, False, True])

# Print out the screen observation, one row of the 2-D Java array per line.
for i in range(result.observation.length):
    for j in range(result.observation[i].length):
        print(result.observation[i][j], end=" ")
    print("")



In [None]:
# Test if current environment correctly implements OpenAI Gym standards

from stable_baselines3.common.env_checker import check_env
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation

# Build the Mario environment with rendering switched off for the check.
env = MarioEnv(render=False)

# Validate the custom environment; check_env emits additional warnings
# when something about the environment looks wrong.
check_env(env=env)

In [None]:
# Sample 20 random actions (right-moving inputs only) in the environment

from mario_env import MarioEnv
import numpy as np

env = MarioEnv()

# One episode of at most 20 random steps. The episode and step counters use
# distinct names so the inner loop no longer overwrites the outer index.
for episode in range(1):
    env.reset()
    for step in range(20):
        # sample randomly from right-inputs only (action ids 6..9)
        obs, reward, done, info = env.step(6 + np.random.randint(4))

        # Dump the first channel of every cell, one observation row per line.
        for row in obs:
            for cell in row:
                print(cell[0], end="\t")
            print("")
        print("---" + str(step) + "---")
        # print(info)

        # Stepping an environment after it reported `done` is not defined by
        # the Gym API, so stop this episode as soon as it ends.
        if done:
            break


# --- Training an Agent ---

In [1]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
from stable_baselines3.common.callbacks import CheckpointCallback

# Checkpointing: write a snapshot of the model every 25000 callback steps
# under ./agents/, using the given file-name prefix.
checkpoint_callback = CheckpointCallback(
    save_freq=25000,
    save_path="./agents/",
    name_prefix="lvl2/ppo_resize_baseline_framemax",
    save_replay_buffer=True,
    save_vecnormalize=True,
)

# Base environment: headless, on original level 2.
env = MarioEnv(render=False, sticky=False, starts=False, level='levels/original/lvl-2.txt')
print(env.observation_space.shape)

# Wrapping pipeline, applied in order. The observation-space shape is printed
# after every stage so the effect of each wrapper is visible.
wrapping_stages = [
    lambda inner: ResizeObservation(inner, 84),          # resize observations to 84x84
    lambda inner: DummyVecEnv([lambda: inner]),          # turn into vectorized environment
    lambda inner: VecFrameStack(inner, n_stack=4),       # stack 4 consecutive frames
    lambda inner: VecTransposeImage(inner, skip=False),  # needed for images to be processed properly by CnnPolicy
]
for apply_stage in wrapping_stages:
    env = apply_stage(env)
    print(env.observation_space.shape)


# Train a fresh PPO agent with a CNN policy on the wrapped environment.
model = PPO("CnnPolicy", env=env, verbose=1)
# To resume from a checkpoint instead:
# model = PPO.load("agents/lvl2/ppo_resize_baseline_250000_steps.zip")
# model.set_env(env)
model.learn(total_timesteps=200000, callback=checkpoint_callback)
# model.learn(total_timesteps=250000) # Train without checkpointing

# Save the model a second time to model.zip so it's easy to test new models right away.
model.save("model")

  from .autonotebook import tqdm as notebook_tqdm


(84, 84, 1)
(84, 84, 1)
(84, 84, 1)
(84, 84, 4)
(4, 84, 84)
Using cuda device
-----------------------------
| time/              |      |
|    fps             | 277  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 262       |
|    iterations           | 2         |
|    time_elapsed         | 15        |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 5.9192553 |
|    clip_fraction        | 0.879     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.982    |
|    explained_variance   | 0.00206   |
|    learning_rate        | 0.0003    |
|    loss                 | 464       |
|    n_updates            | 10        |
|    policy_gradient_loss | 0.316     |
|    value_loss           | 1.7e+03   |
--------------------------------

# --- Testing an Agent ---

In [1]:
from stable_baselines3 import DQN, A2C, PPO
from mario_env import MarioEnv
from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage, DummyVecEnv
from gym.wrappers import ResizeObservation
from imitation.algorithms import bc
import time
import os
import numpy as np

# Generate the rendered evaluation environment and wrap it with the same
# wrapper chain (resize -> vectorize -> 4-frame stack -> channel-first
# transpose) that the training cell uses.
env = MarioEnv(render=True, starts=False, horizons=False, sticky=False, astar = False, level='levels/original/lvl-2.txt')
env = ResizeObservation(env, 84)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
env = VecTransposeImage(env, skip=False)

# model = PPO.load("model") # Load last trained model

model = PPO.load("agents/lvl2/ppo_resize_baseline_adjrew_75000_steps.zip")

# Load current best agents:
# model = PPO.load("saved_agents/ppo_resize_sticky_500000_steps.zip") # Current best PPO agent (can do with sticky actions)
# model = PPO.load("saved_agents/ppo_tile_4skip_fwd_goomba_200000_steps.zip") # goomba agent
# model = bc.reconstruct_policy("saved_agents/bc_policy_100epoch_expert") # Load BC agent
# model = PPO.load("saved_agents/gail_expert_PC_5450000.zip") # best GAIL so far (horizon on training examples)


episodes = 1
try:
    for i in range(episodes):
        obs = env.reset()
        done = False
        t = 0
        while not done:
        # for i in range(12):
            # predict() returns an (action, state) pair; only the action may be
            # passed on to env.step(). Passing the whole tuple made the
            # vectorized env hand a 1-element array (instead of the scalar
            # action used during training) to the underlying environment.
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)

            # Most recent frame of the 4-frame stack for the single env.
            screen = obs[0][3]
            for y in screen[::]:
                for x in y[::]:
                    print(x, end="\t")
                print("")

            print(action)
            print(reward)
            print(info)
            # time.sleep(0.5)
            print("------" + str(t)+"-----")
            t+=1
finally:
    # Close the environment exactly once, after all episodes have finished
    # (or if evaluation is interrupted). Closing inside the episode loop would
    # break every episode after the first when `episodes` > 1.
    env.close()


  from .autonotebook import tqdm as notebook_tqdm


0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	
0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

In [None]:
# SAMPLE OF CUSTOM CNN POLICY (NOT USED ANYMORE)

# import gym
# import torch as th
# import torch.nn as nn

# from stable_baselines3 import DQN
# from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# from mario_env import MarioEnv
# from stable_baselines3.common.vec_env import VecFrameStack, VecTransposeImage
# from  stable_baselines3.common.vec_env import DummyVecEnv


# class CustomCNN(BaseFeaturesExtractor):
#     """
#     :param observation_space: (gym.Space)
#     :param features_dim: (int) Number of features extracted.
#         This corresponds to the number of units for the last layer.
#     """

#     def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
#         super(CustomCNN, self).__init__(observation_space, features_dim)
#         # We assume CxHxW images (channels first)
#         # Re-ordering will be done by preprocessing or wrapper
#         n_input_channels = observation_space.shape[0]
#         self.cnn = nn.Sequential(
#             nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
#             nn.ReLU(),
#             nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=1),
#             nn.ReLU(),
#             nn.Flatten(),
#         )

#         # Compute shape by doing one forward pass
#         with th.no_grad():
#             n_flatten = self.cnn(
#                 th.as_tensor(observation_space.sample()[None]).float()
#             ).shape[1]

#         self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

#     def forward(self, observations: th.Tensor) -> th.Tensor:
#         return self.linear(self.cnn(observations))


# env = MarioEnv(render=False)
# print(env.observation_space.shape)
# env = DummyVecEnv([lambda: env])
# print(env.observation_space.shape)
# env = VecFrameStack(env, n_stack=4)
# print(env.observation_space.shape)
# env = VecTransposeImage(env, skip=True)
# print(env.observation_space.shape)

# policy_kwargs = dict(
#     features_extractor_class=CustomCNN,
#     features_extractor_kwargs=dict(features_dim=128),
# )
# model = DQN("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1)
# model.learn(50000)
# model.save('cnn_50k_timesteps')