In [21]:
%load_ext tensorboard

In [2]:
#standard packages
import gym
import numpy as np
import cv2
import torch as th
from torch import nn
import os

# mario packages
import gym_super_mario_bros
from gym_super_mario_bros import SuperMarioBrosEnv
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import *

# Import Frame Stacker Wrapper and GrayScaling Wrapper
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrappers
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import algo
from stable_baselines3 import A2C, PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv

In [4]:
tensorboard_logdir = os.path.abspath("./mario/cc_model")
reward_log_path = os.path.join(tensorboard_logdir, 'reward_log.csv')

In [5]:
%tensorboard --logdir $tensorboard_logdir

Reusing TensorBoard on port 6006 (pid 15792), started 19:08:40 ago. (Use '!kill 15792' to kill it.)

# Regular Environment

In [6]:
tensorboard_logdir = os.path.abspath("./mario/reg_model")
reward_log_path = os.path.join(tensorboard_logdir, 'reward_log.csv')

In [7]:
%tensorboard --logdir $tensorboard_logdir --port=6008

Reusing TensorBoard on port 6008 (pid 20216), started 19:05:05 ago. (Use '!kill 20216' to kill it.)

# Check reward

In [5]:
import pandas as pd

In [17]:
reward_ccmodel = pd.read_csv("mario/cc_model/reward_log.csv")
reward_ccmodel = reward_ccmodel.loc[reward_ccmodel.timesteps!="timesteps", :].reset_index(drop=True)
reward_ccmodel["timesteps"] = reward_ccmodel["timesteps"].astype("float")
reward_ccmodel["best_reward"] = reward_ccmodel["best_reward"].astype("float")
reward_ccmodel = reward_ccmodel.sort_values(by="best_reward", ascending=False)
print(reward_ccmodel.head(10))
print(reward_ccmodel.sort_values(by="timesteps", ascending=False).head(5))
# reward_ccmodel.loc[reward_ccmodel.timesteps=="timesteps", :]

      timesteps     reward  best_reward
2328  2328000.0   1857.85        3080.0
2120  2120000.0   2141.05        3079.0
2370  2370000.0    1960.9        3077.0
1418  1418000.0    2113.7        3077.0
1749  1749000.0    2303.9        3077.0
1375  1375000.0    1837.3        3076.0
2255  2255000.0   1991.75        3076.0
2091  2091000.0   2333.65        3075.0
2289  2289000.0    1766.2        3075.0
2480  2480000.0    2027.0        3075.0
      timesteps     reward  best_reward
2484  2484000.0    1609.5        3063.0
2483  2483000.0    2313.6        3065.0
2482  2482000.0    2507.5        3062.0
2481  2481000.0   2356.75        3069.0
2480  2480000.0    2027.0        3075.0


In [15]:
reward_regmodel = pd.read_csv("mario/reg_model/reward_log.csv").sort_values(by="best_reward", ascending=False)
print(reward_regmodel.head(10))
print(reward_regmodel.sort_values(by="timesteps", ascending=False).head(5))

      timesteps   reward  best_reward
2272    2273000  2289.75       3068.0
2447    2448000  2649.50       3068.0
2345    2346000  2287.05       3068.0
2346    2347000  2334.40       3068.0
2351    2352000  2586.05       3068.0
2443    2444000  2557.75       3068.0
2352    2353000  2543.20       3068.0
2425    2426000  2628.05       3068.0
2519    2520000  2150.35       3068.0
2424    2425000  2517.30       3068.0
      timesteps   reward  best_reward
2523    2524000  2163.10       3062.0
2522    2523000  2032.20       3062.0
2521    2522000  2332.50       3063.0
2520    2521000  2584.80       3062.0
2519    2520000  2150.35       3068.0


# Check results

In [2]:
#standard packages
import gym
import numpy as np
import cv2
import torch as th
from torch import nn
import os

# mario packages
import gym_super_mario_bros
from gym_super_mario_bros import SuperMarioBrosEnv
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import *
from gym.wrappers import RecordVideo

# Import Frame Stacker Wrapper and GrayScaling Wrapper
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrappers
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import algo
from stable_baselines3 import A2C, PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv

class CoinCollectorSuperMarioBrosEnv(SuperMarioBrosEnv):
    #score btn 2 time frames can maybe go upto 8000 so we can just divide by 100 (reference https://www.mariowiki.com/Point)
    reward_range = (-15, 100)

    def __init__(self, rom_mode='vanilla', lost_levels=False, target=None):
        super().__init__(rom_mode=rom_mode, lost_levels=lost_levels, target=target)

        # variable to keep track of score deltas
        self._score_last = 0

    @property
    def _score_reward(self):
        _reward = self._score - self._score_last
        self._score_last = self._score
        return _reward/100

    # This should override the parent function
    def _get_reward(self):
        return self._x_reward + self._score_reward + self._time_penalty + self._death_penalty

'''
The code below registers this new environment in gym for us to reference later. Code borrowed from _registration.py of gym_super_mario_bros
'''
def _register_coin_collector_mario_stage_env(id, **kwargs):
    """
    Register a Super Mario Bros. (1/2) stage environment with OpenAI Gym.

    Args:
        id (str): id for the env to register
        kwargs (dict): keyword arguments for the SuperMarioBrosEnv initializer

    Returns:
        None

    """
    # register the environment
    gym.envs.registration.register(
        id=id,
        # entry_point='.:CoinCollectorSuperMarioBrosEnv',
        entry_point=CoinCollectorSuperMarioBrosEnv,
        max_episode_steps=9999999,
        reward_threshold=9999999,
        kwargs=kwargs,
        nondeterministic=True,
    )

def _register_all_coin_collector_envs():
    # a template for making individual stage environments
    _ID_TEMPLATE = 'CoinCollectorSuperMarioBrosEnv-{}-{}-v{}'
    # A list of ROM modes for each level environment
    _ROM_MODES = [
        'vanilla',
        'downsample',
        'pixel',
        'rectangle'
    ]

    # iterate over all the rom modes, worlds (1-8), and stages (1-4)
    for version, rom_mode in enumerate(_ROM_MODES):
        for world in range(1, 9):
            for stage in range(1, 5):
                # create the target
                target = (world, stage)
                # setup the frame-skipping environment
                env_id = _ID_TEMPLATE.format(world, stage, version)
                print(f"Registering Coin Collector {env_id} in gym for use later on.")
                _register_coin_collector_mario_stage_env(env_id, rom_mode=rom_mode, target=target)
                print(f"Successfully registered coin collector env {env_id}!")

def create_gym_env_from_level(world, stage, version, use_coin_collector_env):
    level_suffix = f"{world}-{stage}-v{version}"
    if not use_coin_collector_env:
        level = f"SuperMarioBros-{level_suffix}"
        env = gym_super_mario_bros.make(level)
    else:
        env_set = set(gym.envs.registration.registry.env_specs.copy().keys())
        level = f"CoinCollectorSuperMarioBrosEnv-{level_suffix}"
        if level not in env_set:
            # register all these custom environments for the first time
            _register_all_coin_collector_envs()

        assert level in set(
            gym.envs.registration.registry.env_specs.copy().keys()
        ), f"Looks like {level} was not registered correctly!"
        env = gym.make(level)

    return env

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

class ResizeEnv(gym.ObservationWrapper):
    def __init__(self, env, size):
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (size, size, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame

def create_mario_env(world, stage, version, use_coin_collector_env):
    env = create_gym_env_from_level(world, stage, version, use_coin_collector_env)
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env, keep_dim=True)
    env = ResizeEnv(env, size=84)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4, channels_order='last')
    return env

class MarioNet(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim):
        super(MarioNet, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


In [3]:
def env_setup(world=1, stage=1, version=3, use_coin_collector_env=True):

    env = create_mario_env(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)
    env.reset()
    state, reward, done, info = env.step([0])
    print('state:', state.shape) #Color scale, height, width, num of stacks

    env = create_mario_env(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)
    return env

In [4]:
def record_plays(best_epoch=600000, plays=3, world=1, stage=1, version=3, use_coin_collector_env=True):
    if use_coin_collector_env:
        model_folder = 'cc_model'
    else:
        model_folder = 'reg_model'

    best_model_path = f'mario/{model_folder}/best_model_{best_epoch}.zip'

    env = env_setup(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)
    model = PPO.load(best_model_path)
    record_env = RecordVideo(env, f"mario_plays/{model_folder}/{best_epoch}", name_prefix=f"mario_ppo_{model_folder}_{world}-{stage}-{version}")

    for ep in range(plays):
        state = record_env.reset()
        done = False
        while not done:
            action, _ = model.predict(state)
            state, reward, done, info = record_env.step(action)
            record_env.render()
    print(f"Episode {ep} done")

    return done

In [5]:
best_cc_model_timestep = 2328000
best_reg_model_timestep = 2273000

record_plays(best_epoch=best_cc_model_timestep, plays=3, use_coin_collector_env=True)

Registering Coin Collector CoinCollectorSuperMarioBrosEnv-1-1-v0 in gym for use later on.
Successfully registered coin collector env CoinCollectorSuperMarioBrosEnv-1-1-v0!
Registering Coin Collector CoinCollectorSuperMarioBrosEnv-1-2-v0 in gym for use later on.
Successfully registered coin collector env CoinCollectorSuperMarioBrosEnv-1-2-v0!
Registering Coin Collector CoinCollectorSuperMarioBrosEnv-1-3-v0 in gym for use later on.
Successfully registered coin collector env CoinCollectorSuperMarioBrosEnv-1-3-v0!
Registering Coin Collector CoinCollectorSuperMarioBrosEnv-1-4-v0 in gym for use later on.
Successfully registered coin collector env CoinCollectorSuperMarioBrosEnv-1-4-v0!
Registering Coin Collector CoinCollectorSuperMarioBrosEnv-2-1-v0 in gym for use later on.
Successfully registered coin collector env CoinCollectorSuperMarioBrosEnv-2-1-v0!
Registering Coin Collector CoinCollectorSuperMarioBrosEnv-2-2-v0 in gym for use later on.
Successfully registered coin collector env CoinCol

  logger.warn(


Episode 2 done


array([ True])

In [11]:
record_plays(best_epoch=best_reg_model_timestep, plays=3, use_coin_collector_env=False)

state: (1, 84, 84, 4)
Episode 2 done


array([ True])

# 3 success 3 failures

In [21]:
best_cc_model_timestep = 2328000
best_reg_model_timestep = 2273000

# record_plays(best_epoch=best_cc_model_timestep, plays=3, world=1, stage=1, version=3, use_coin_collector_env=True)
record_plays(best_epoch=best_reg_model_timestep, plays=3, world=1, stage=1, version=3, use_coin_collector_env=False)

state: (1, 84, 84, 4)


  logger.warn(


Episode 2 done


array([ True])

## Evaluate generalizability

In [26]:
from tqdm import tqdm

In [27]:
def pass_rate(model=None, plays=1000, world=1, stage=1, version=3, use_coin_collector_env=True):
    get_count = 0
    coin_count = 0
    coin_collection = []

    env = env_setup(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)

    for _ in tqdm(range(plays)):
        state = env.reset()
        done = False
        while not done:
            action, _ = model.predict(state)
            state, reward, done, info = env.step(action)
        if info[0]['flag_get'] == True:
            get_count += 1
        coin_count_current_play = info[0]['coins']
        coin_collection.append(coin_count_current_play)
        coin_count += info[0]['coins']
    # print(f'flag get count: {get_count}')
    # print(f'total coin count: {coin_count}')

    return get_count, coin_count, coin_collection

def env_setup(world=1, stage=1, version=3, use_coin_collector_env=True):

    env = create_mario_env(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)
    env.reset()
    state, reward, done, info = env.step([0])
    print('state:', state.shape) #Color scale, height, width, num of stacks

    env = create_mario_env(world=world, stage=stage, version=version, use_coin_collector_env=use_coin_collector_env)
    return env

In [14]:
best_cc_model_timestep = 2328000
best_reg_model_timestep = 2273000

record_plays(best_epoch=best_cc_model_timestep, plays=3, world=3, stage=1, version=3, use_coin_collector_env=True)
record_plays(best_epoch=best_reg_model_timestep, plays=3, world=3, stage=1, version=3, use_coin_collector_env=False)

state: (1, 84, 84, 4)


  logger.warn(


Episode 2 done
state: (1, 84, 84, 4)


  logger.warn(


Episode 2 done


array([ True])

In [29]:
best_cc_model = 'mario/cc_model/best_model_{}.zip'.format(best_cc_model_timestep)
best_cc_model = PPO.load(best_cc_model)

pass_count_cc, coin_count_cc, coin_collected_cc = pass_rate(best_cc_model, plays=3000, world=1, stage=1, version=3, use_coin_collector_env=True)
print(pass_count_cc, coin_count_cc, coin_collected_cc)

state: (1, 84, 84, 4)


100%|██████████| 3000/3000 [2:04:27<00:00,  2.49s/it]  

1100 3064 [1, 2, 1, 0, 1, 1, 2, 0, 0, 2, 2, 1, 3, 2, 1, 2, 1, 0, 2, 1, 1, 1, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 0, 2, 1, 3, 2, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 3, 0, 0, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 4, 1, 2, 1, 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 2, 0, 1, 3, 1, 1, 2, 1, 1, 1, 0, 1, 1, 2, 0, 1, 2, 1, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 1, 1, 2, 1, 2, 1, 1, 0, 1, 0, 1, 1, 3, 2, 1, 1, 0, 2, 1, 2, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 0, 0, 0, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 2, 0, 1, 2, 0, 2, 0, 2, 1, 0, 2, 1, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 0, 0, 2, 1, 1, 1, 1, 1, 3, 0, 1, 1, 0, 2, 0, 3, 1, 1, 1, 0, 2, 0, 2, 2, 1, 1, 1, 0, 1, 1, 2, 2, 0, 1, 1, 2, 2, 1, 1, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 0,




In [30]:
best_cc_model = 'mario/cc_model/best_model_{}.zip'.format(best_cc_model_timestep)
best_cc_model = PPO.load(best_cc_model)

pass_count_cc, coin_count_cc, coin_collected_cc_stage2 = pass_rate(best_cc_model, plays=3000, world=1, stage=2, version=3, use_coin_collector_env=True)
print(pass_count_cc, coin_count_cc)

state: (1, 84, 84, 4)


100%|██████████| 3000/3000 [1:52:51<00:00,  2.26s/it]  

0 802



