In [1]:
import stable_baselines3
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO
from gym.wrappers import GrayScaleObservation
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from nes_py.wrappers import JoypadSpace
import os

In [2]:
# Directory where the Monitor wrapper writes its episode logs.
monitor_dir = r'./monitor_log/'
os.makedirs(monitor_dir, exist_ok=True)


def _make_mario_env():
    """Build one Super Mario Bros env: simplified controls, grayscale frames, Monitor logging."""
    mario = gym_super_mario_bros.make('SuperMarioBros-v0')
    # Restrict the joypad to the SIMPLE_MOVEMENT action list.
    mario = JoypadSpace(mario, SIMPLE_MOVEMENT)
    # keep_dim=True keeps a trailing channel axis on the grayscale frames.
    mario = GrayScaleObservation(mario, keep_dim=True)
    return Monitor(mario, monitor_dir)


# Single-env vectorised wrapper, then stack the last 4 frames along the channel axis.
env = VecFrameStack(DummyVecEnv([_make_mario_env]), 4, channels_order='last')

In [3]:
import os

import numpy as np

from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.results_plotter import load_results, ts2xy
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that writes a checkpoint of the model every ``check_freq`` calls.

    Despite the class name, no reward comparison is performed here: a snapshot
    is saved unconditionally each time the call counter reaches a multiple of
    ``check_freq``. ``best_mean_reward`` is initialised for compatibility but is
    not read or updated by this class. (For evaluation-based saving, the
    original note recommends ``EvalCallback``.)

    :param check_freq: (int) Number of callback calls between two saves.
      Must be strictly positive.
    :param save_model_dir: (str) Parent folder for checkpoints. Models are
      written to ``<save_model_dir>/best_model/model_<n_calls>``.
    :param verbose: (int) When greater than 0, the call counter is printed
      each time a checkpoint is written.
    :raises ValueError: if ``check_freq`` is not strictly positive.
    """

    def __init__(self, check_freq, save_model_dir, verbose=1):
        super().__init__(verbose)
        # Fail fast: check_freq == 0 would otherwise raise ZeroDivisionError
        # on the first training step, and a negative period is meaningless.
        if check_freq <= 0:
            raise ValueError(
                "check_freq must be a positive number of steps, got {!r}".format(check_freq)
            )
        self.check_freq = check_freq
        self.save_path = os.path.join(save_model_dir, "best_model/")
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # save_path is always a string built in __init__, so create it unconditionally.
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            if self.verbose > 0:
                print('self.n_calls: ', self.n_calls)
            # One file per checkpoint, tagged with the call counter at save time.
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)
        # Returning True tells the training loop to continue.
        return True

In [4]:
# Training hyperparameters and TensorBoard output location.
learning_rate = 1e-6
n_steps = 128
tensorboard_log = r'./tensorboard_log/'

# Checkpoint callback: saves under <save_model_dir>/best_model/ every 1000 calls.
save_model_dir = r'./monitor_log/'
callback1 = SaveOnBestTrainingRewardCallback(
    save_model_dir=save_model_dir,
    check_freq=1000,
)

# PPO agent with a CNN policy acting on the stacked-frame environment.
model = PPO(
    "CnnPolicy",
    env,
    learning_rate=learning_rate,
    n_steps=n_steps,
    tensorboard_log=tensorboard_log,
    verbose=1,
)

# Short training run; learn() returns the model, which the cell displays.
model.learn(callback=callback1, total_timesteps=5000)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./tensorboard_log/PPO_1
----------------------------
| time/              |     |
|    fps             | 39  |
|    iterations      | 1   |
|    time_elapsed    | 3   |
|    total_timesteps | 128 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 46           |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 2.742745e-07 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.95        |
|    explained_variance   | -0.00226     |
|    learning_rate        | 1e-06        |
|    loss                 | 210          |
|    n_updates            | 10           |
|    policy_gradient_loss | -2.57e-05    |
|    value_loss

<stable_baselines3.ppo.ppo.PPO at 0x163ea7157c0>

In [9]:
# Load the checkpoint saved at call 5000 by the training callback.
# os.path.join keeps the path portable (it yields the same string on Windows).
model = PPO.load(os.path.join(".", "monitor_log", "best_model", "model_5000.zip"))

obs = env.reset()
# NOTE(review): the copy is presumably needed so the policy receives a plain,
# contiguous array — confirm before removing.
obs = obs.copy()

# Play indefinitely; interrupt the kernel to stop.
# No manual reset on `done`: `env` is a DummyVecEnv-based vectorised env, which
# resets a finished episode automatically and returns the fresh observation
# from step(). The previous manual reset stored its result in an unused
# variable and only caused a redundant extra reset.
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    obs = obs.copy()
    env.render()