In [2]:
%load_ext tensorboard

In [1]:
#standard packages
import gym
import numpy as np
import cv2
import torch as th
from torch import nn
import os

# mario packages
import gym_super_mario_bros
from gym_super_mario_bros import SuperMarioBrosEnv
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import *

# Import Frame Stacker Wrapper and GrayScaling Wrapper
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrappers
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import algo
from stable_baselines3 import A2C, PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv

In [3]:
tensorboard_logdir = os.path.abspath("./mario/cc_model")
reward_log_path = os.path.join(tensorboard_logdir, 'reward_log.csv')

In [4]:
%tensorboard --logdir $tensorboard_logdir

Reusing TensorBoard on port 6006 (pid 15792), started 0:42:58 ago. (Use '!kill 15792' to kill it.)

# Regular Environment

In [5]:
tensorboard_logdir = os.path.abspath("./mario/reg_model")
reward_log_path = os.path.join(tensorboard_logdir, 'reward_log.csv')

In [6]:
%tensorboard --logdir $tensorboard_logdir --port=6008

Reusing TensorBoard on port 6008 (pid 20216), started 0:39:17 ago. (Use '!kill 20216' to kill it.)

# Check reward

In [15]:
import pandas as pd

In [73]:
reward_ccmodel = pd.read_csv("mario/cc_model/reward_log.csv")
reward_ccmodel = reward_ccmodel.loc[reward_ccmodel.timesteps!="timesteps", :].reset_index(drop=True)
reward_ccmodel["best_reward"] = reward_ccmodel["best_reward"].astype("float")
reward_ccmodel = reward_ccmodel.sort_values(by="best_reward", ascending=False)
print(reward_ccmodel.head(10))
# reward_ccmodel.loc[reward_ccmodel.timesteps=="timesteps", :]

    timesteps     reward  best_reward
615   616000     1931.4        3067.0
785   786000    1727.35        3066.0
635   636000    1434.65        3066.0
770   771000     1630.1        3065.0
709   710000    1527.95        3065.0
783   784000     1710.4        3065.0
701   702000     2153.4        3064.0
836   836000    1707.05        3064.0
780   781000     1941.0        3064.0
813   813000     1711.0        3063.0


In [39]:
reward_ccmodel = pd.read_csv("mario/reg_model/reward_log.csv").sort_values(by="best_reward", ascending=False)
print(reward_ccmodel.head(10))

     timesteps   reward  best_reward
673     674000  1706.35       3059.0
606     607000  1682.90       3059.0
779     780000  1447.25       3056.0
710     711000  1976.70       3056.0
765     766000  1431.55       3056.0
543     544000  1383.35       3056.0
548     549000  1373.75       3056.0
731     732000  1695.05       3056.0
679     680000  1662.95       3055.0
790     791000  1806.10       3055.0


# Check results

In [8]:
#standard packages
import gym
import numpy as np
import cv2
import torch as th
from torch import nn
import os

# mario packages
import gym_super_mario_bros
from gym_super_mario_bros import SuperMarioBrosEnv
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import *
from gym.wrappers import RecordVideo

# Import Frame Stacker Wrapper and GrayScaling Wrapper
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrappers
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import algo
from stable_baselines3 import A2C, PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv

class CoinCollectorSuperMarioBrosEnv(SuperMarioBrosEnv):
    #score btn 2 time frames can maybe go upto 8000 so we can just divide by 100 (reference https://www.mariowiki.com/Point)
    reward_range = (-15, 100)

    def __init__(self, rom_mode='vanilla', lost_levels=False, target=None):
        super().__init__(rom_mode=rom_mode, lost_levels=lost_levels, target=target)

        # variable to keep track of score deltas
        self._score_last = 0

    @property
    def _score_reward(self):
        _reward = self._score - self._score_last
        self._score_last = self._score
        return _reward/100

    # This should override the parent function
    def _get_reward(self):
        return self._x_reward + self._score_reward + self._time_penalty + self._death_penalty

'''
The code below registers this new environment in gym for us to reference later. Code borrowed from _registration.py of gym_super_mario_bros
'''
def _register_coin_collector_mario_stage_env(id, **kwargs):
    """
    Register a Super Mario Bros. (1/2) stage environment with OpenAI Gym.

    Args:
        id (str): id for the env to register
        kwargs (dict): keyword arguments for the SuperMarioBrosEnv initializer

    Returns:
        None

    """
    # register the environment
    gym.envs.registration.register(
        id=id,
        # entry_point='.:CoinCollectorSuperMarioBrosEnv',
        entry_point=CoinCollectorSuperMarioBrosEnv,
        max_episode_steps=9999999,
        reward_threshold=9999999,
        kwargs=kwargs,
        nondeterministic=True,
    )

def _register_all_coin_collector_envs():
    # a template for making individual stage environments
    _ID_TEMPLATE = 'CoinCollectorSuperMarioBrosEnv-{}-{}-v{}'
    # A list of ROM modes for each level environment
    _ROM_MODES = [
        'vanilla',
        'downsample',
        'pixel',
        'rectangle'
    ]

    # iterate over all the rom modes, worlds (1-8), and stages (1-4)
    for version, rom_mode in enumerate(_ROM_MODES):
        for world in range(1, 9):
            for stage in range(1, 5):
                # create the target
                target = (world, stage)
                # setup the frame-skipping environment
                env_id = _ID_TEMPLATE.format(world, stage, version)
                print(f"Registering Coin Collector {env_id} in gym for use later on.")
                _register_coin_collector_mario_stage_env(env_id, rom_mode=rom_mode, target=target)
                print(f"Successfully registered coin collector env {env_id}!")

def create_gym_env_from_level(world, stage, version, use_coin_collector_env):
    level_suffix = f"{world}-{stage}-v{version}"
    if not use_coin_collector_env:
        level = f"SuperMarioBros-{level_suffix}"
        env = gym_super_mario_bros.make(level)
    else:
        env_set = set(gym.envs.registration.registry.env_specs.copy().keys())
        level = f"CoinCollectorSuperMarioBrosEnv-{level_suffix}"
        if level not in env_set:
            # register all these custom environments for the first time
            _register_all_coin_collector_envs()

        assert level in set(
            gym.envs.registration.registry.env_specs.copy().keys()
        ), f"Looks like {level} was not registered correctly!"
        env = gym.make(level)

    return env

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

class ResizeEnv(gym.ObservationWrapper):
    def __init__(self, env, size):
        gym.ObservationWrapper.__init__(self, env)
        (oldh, oldw, oldc) = env.observation_space.shape
        newshape = (size, size, oldc)
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=newshape, dtype=np.uint8)

    def observation(self, frame):
        height, width, _ = self.observation_space.shape
        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
        if frame.ndim == 2:
            frame = frame[:,:,None]
        return frame

def create_mario_env(world, stage, version, use_coin_collector_env):
    env = create_gym_env_from_level(world, stage, version, use_coin_collector_env)
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env, keep_dim=True)
    env = ResizeEnv(env, size=84)
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, 4, channels_order='last')
    return env

class MarioNet(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim):
        super(MarioNet, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


In [78]:
# <world> is a number in {1, 2, 3, 4, 5, 6, 7, 8} indicating the world
world = 1
# <stage> is a number in {1, 2, 3, 4} indicating the stage within a world
stage = 1
version = 3
use_coin_collector_env = True

env = create_mario_env(world, stage, version, use_coin_collector_env)

env.reset()
state, reward, done, info = env.step([0])
print('state:', state.shape) #Color scale, height, width, num of stacks


env = create_mario_env(world, stage, version, use_coin_collector_env)
plays = 4

state: (1, 84, 84, 4)


In [80]:
best_epoch = 845000 #change as per the values inferred from the graph
best_model_path = 'mario/cc_model/best_model_{}.zip'.format(best_epoch)

# Load the best model
model = PPO.load(best_model_path)
record_env = RecordVideo(env, f"mario_plays/cc_model/{best_epoch}", name_prefix="mario_ppo")

for ep in range(plays):
    state = record_env.reset()
    done = False
    while not done:
        action, _ = model.predict(state)
        state, reward, done, info = record_env.step(action)
        record_env.render()
        print(f"Episode {ep} done")

  logger.warn(


Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 

In [81]:
# <world> is a number in {1, 2, 3, 4, 5, 6, 7, 8} indicating the world
world = 1
# <stage> is a number in {1, 2, 3, 4} indicating the stage within a world
stage = 1
version = 3
use_coin_collector_env = False

env = create_mario_env(world, stage, version, use_coin_collector_env)

env.reset()
state, reward, done, info = env.step([0])
print('state:', state.shape) #Color scale, height, width, num of stacks


env = create_mario_env(world, stage, version, use_coin_collector_env)
plays = 4

state: (1, 84, 84, 4)


In [85]:
best_epoch = 848000 #change as per the values inferred from the graph
best_model_path = 'mario/reg_model/best_model_{}.zip'.format(best_epoch)

# Load the best model
model = PPO.load(best_model_path)
record_env = RecordVideo(env, f"mario_plays/reg_model/{best_epoch}", name_prefix="mario_ppo")

for ep in range(plays):
    state = record_env.reset()
    done = False
    while not done:
        action, _ = model.predict(state)
        state, reward, done, info = record_env.step(action)
        record_env.render()
        print(f"Episode {ep} done")

Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 done
Episode 0 