## 1. Setup Mario

In [None]:
# Install the Mario environment (pinned to 7.3.0) and nes_py, which provides the JoypadSpace wrapper imported below.
# NOTE(review): nes_py is not version-pinned here — consider pinning it for reproducibility.
%pip install gym_super_mario_bros==7.3.0 nes_py

In [1]:
# Import game
import gym_super_mario_bros
# import joypad wrapper
from nes_py.wrappers import JoypadSpace
# import simplified controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [None]:
# Display the simplified action list that will be used to restrict the controller
SIMPLE_MOVEMENT

Simplify the environment as much as possible so that it is easier to learn: restrict the controller to 7 actions.

In [None]:
# Create the game, then wrap it so the agent chooses only among the
# SIMPLE_MOVEMENT actions (7 actions instead of the 256 raw button combinations).
raw_env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(raw_env, SIMPLE_MOVEMENT)

In [None]:
env.action_space # action space: the inputs the agent can send to the environment

In [None]:
env.observation_space # observation space: what the environment returns to the agent (an image)

In [None]:
# Drive the game with randomly sampled actions as a smoke test of the environment.
N_RANDOM_STEPS = 100000  # number of frames to play before stopping

done = True  # forces a reset on the very first iteration
try:
    for step in range(N_RANDOM_STEPS):
        if done:
            # Start (or restart) the game once an episode has ended
            env.reset()

        # Send a random action to the game and draw the resulting frame
        state, reward, done, info = env.step(env.action_space.sample())
        env.render()
finally:
    # Always release the environment, even if the loop is interrupted or a step raises
    env.close()

See https://pypi.org/project/gym-super-mario-bros/ for the documentation (reward function, etc.).

## 2. Preprocess environment

In [None]:
# CUDA 11.3 toolkit, plus PyTorch packages fetched from the matching cu113 wheel index
%conda install cudatoolkit=11.3
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# Plotting, the RL framework (stable-baselines3) and OpenCV (used later to display frames).
# NOTE(review): these packages are not version-pinned — consider pinning them for reproducibility.
%pip install matplotlib stable-baselines3 opencv-python
# %conda install freetype=2.10.4

Stable-Baselines3 is a reinforcement learning framework.

https://stable-baselines3.readthedocs.io/en/master/

In [2]:
# Import grayscale observation wrapper
from gym.wrappers import GrayScaleObservation
# Import Vectorization wrapper
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import matplotlib (show impact of frame stacker)
# from matplotlib import pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def make_mario_env():
    """Build one preprocessed Mario environment.

    Returns the game wrapped with the SIMPLE_MOVEMENT controls (7 actions
    instead of the 256 raw button combinations) and converted to grayscale.
    ``keep_dim=True`` keeps the trailing channel axis so that frames can be
    stacked along it afterwards.
    """
    mario_env = gym_super_mario_bros.make('SuperMarioBros-v0')
    mario_env = JoypadSpace(mario_env, SIMPLE_MOVEMENT)
    return GrayScaleObservation(mario_env, keep_dim=True)


# Vectorize using a named factory. A `lambda: env` closure would look up the
# global `env` only when called, and `env` is rebound on this very line, so it
# would only work as long as the factory happens to be invoked eagerly.
env = DummyVecEnv([make_mario_env])
# Stack the 4 most recent frames along the last (channel) axis
env = VecFrameStack(env, 4, channels_order='last')

In [22]:
# Reset the vectorized environment and inspect the shape of the stacked observation
state = env.reset()
state.shape

(1, 240, 256, 4)

In [28]:
# Take one random step; the action is wrapped in a list because the env is vectorized (one entry per sub-environment)
state, reward, done, info = env.step([env.action_space.sample()])

In [30]:
import cv2

# Drop the leading batch axis; the stacked frames sit on the last axis
stacked_frames = state[0]
num_frames = stacked_frames.shape[-1]

# Show every stacked frame in its own window, titled by its index in the stack
for frame_idx in range(num_frames):
    frame = stacked_frames[..., frame_idx]
    print(frame.shape)
    cv2.imshow(str(frame_idx), frame)

# Keep the windows up for at most 10 seconds, or until a key is pressed
cv2.waitKey(10000)

(240, 256)
(240, 256)
(240, 256)
(240, 256)


-1

In [31]:
# Close the OpenCV preview windows opened by cv2.imshow
cv2.destroyAllWindows()

## 3. Build the model and train

In [3]:
import os
# Import algorithm
from stable_baselines3 import PPO
# Save callback
from stable_baselines3.common.callbacks import BaseCallback

The tutorial provides a callback that saves the model every N steps. It is defined below and passed to `model.learn`.

In [4]:
CHECKPOINT_DIR = './mario/train' # directory where model checkpoints are saved
LOG_DIR = './mario/log' # directory passed to PPO as tensorboard_log

In [6]:
class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Args:
        check_freq: A checkpoint is written whenever ``self.n_calls`` is a
            multiple of this value.
        save_path: Directory the checkpoints are written to. When ``None``,
            no directory is created and no checkpoint is saved.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if one was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Write a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``.
        """
        # Mirror the None check of _init_callback: os.path.join(None, ...)
        # would raise TypeError when no save directory was configured.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            # The 'best_model_' prefix is kept as is because the loading cell
            # builds the same file name; these are periodic checkpoints.
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [7]:
# Checkpoint every 50,000 callback calls into CHECKPOINT_DIR
callback = TrainAndLoggingCallback(50000, save_path=CHECKPOINT_DIR)

In [8]:
# PPO agent with the 'CnnPolicy' policy on the preprocessed env; logs go to LOG_DIR via tensorboard_log
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR,
            learning_rate=0.000001, n_steps=512)

Using cuda device
Wrapping the env in a VecTransposeImage.


**TODO:** Which policy network does `CnnPolicy` use?

In [9]:
# Train for 500,000 timesteps; the callback writes periodic checkpoints along the way
model.learn(total_timesteps=500000, callback=callback)
# model.save('name')

Logging to ./log\PPO_1


  return (self.ram[0x86] - self.ram[0x071c]) % 256


----------------------------
| time/              |     |
|    fps             | 65  |
|    iterations      | 1   |
|    time_elapsed    | 7   |
|    total_timesteps | 512 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 2            |
|    time_elapsed         | 18           |
|    total_timesteps      | 1024         |
| train/                  |              |
|    approx_kl            | 1.518894e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.95        |
|    explained_variance   | -0.00152     |
|    learning_rate        | 1e-06        |
|    loss                 | 150          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000158    |
|    value_loss           | 334          |
------------------------------------------
-----------------------

<stable_baselines3.ppo.ppo.PPO at 0x2b0bcefaec0>

## 4. Test model

In [6]:
import time

In [10]:
# Load the model
step = 50000 * 10
model = PPO.load(CHECKPOINT_DIR + f'/best_model_{step}')

In [11]:
# start game
state = env.reset()

while True:
    action, _state = model.predict(state)
    state, reward, done, info = env.step(action)
    env.render()
    time.sleep(1/50)

  return (self.ram[0x86] - self.ram[0x071c]) % 256


KeyboardInterrupt: 