In [5]:


from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3 import PPO

from gym.wrappers import GrayScaleObservation

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
import numpy as np
import os
from stable_baselines3.common.callbacks import BaseCallback

import optuna

from stable_baselines3.common.evaluation import evaluate_policy

import os


env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)



log_dir = './log_dir2/'
os.makedirs(log_dir, exist_ok=True)

env = Monitor(env, log_dir)

env = GrayScaleObservation(env,keep_dim=True)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env,4,channels_order='last')



# PPO主要超参数

In [6]:
def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 2048, 8192),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

# 超参数调优

In [7]:
def optimize_agent(trial):
    
    try:
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)

        log_dir = './log_dir2/'
        os.makedirs(log_dir, exist_ok=True)

        env = Monitor(env, log_dir)

        env = GrayScaleObservation(env,keep_dim=True)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env,4,channels_order='last')

    
        model_params = optimize_ppo(trial) 
    

        tensorboard_log = r'./logs/'
        model = PPO("CnnPolicy", env, verbose=0,tensorboard_log=tensorboard_log,**model_params)
        # model.learn(total_timesteps=1000)
        model.learn(total_timesteps=200000)
    
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    
    
        env.close()
    
        OPT_DIR  = r'./best_model'
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
    
        return mean_reward    

    except Exception as e:
        return -1000
    
    



In [8]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=100)

[32m[I 2022-08-26 18:42:04,910][0m A new study created in memory with name: no-name-e014bdd3-8967-4a2a-ba08-2f655898b425[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6516 and n_envs=1)
[32m[I 2022-08-26 19:25:19,585][0m Trial 0 finished with value: 741.0 and parameters: {'n_steps': 6516, 'gamma': 0.9456268741567587, 'learning_rate': 1.7190861040103122e-05, 'clip_range': 0.23792182144322943, 'gae_lambda': 0.8877650617671565}. Best is trial 0 with value: 741.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2653 and n_envs=1)
[32m[I 2022-08-26 20:07:43,267][0m Trial 1 finished with value: 684.0 and parameters: {'n_steps': 2653, 'gamma': 0.9103471772173634, 'learning_rate': 8.830111369215116e-05, 'clip_range': 0.3814701853146206, 'gae_lambda': 0.9450640015440726}. Best is trial 0 with value: 741.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=

KeyboardInterrupt: 

In [None]:
dir(study)

In [None]:
study.best_params

In [None]:
study.best_trial