In [2]:
import stable_baselines3
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO
from gym.wrappers import GrayScaleObservation
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import time
from matplotlib import pyplot as plt
from nes_py.wrappers import JoypadSpace
import os


In [3]:
monitor_dir = r'./monitor_log/'
os.makedirs(monitor_dir, exist_ok=True)


def make_mario_env():
    """Build a single monitored, grayscale Super Mario Bros environment.

    Used as the factory handed to ``DummyVecEnv``. A named factory is used
    instead of ``lambda: env`` because that lambda closed over the global
    ``env``, which the very same statement rebinds to the vectorized wrapper;
    it only worked because the factory happens to be invoked eagerly.

    Returns:
        The ``Monitor``-wrapped environment, logging under ``monitor_dir``.
    """
    mario_env = gym_super_mario_bros.make('SuperMarioBros-v0')
    # Restrict the controller to the SIMPLE_MOVEMENT action list.
    mario_env = JoypadSpace(mario_env, SIMPLE_MOVEMENT)
    # keep_dim=True is passed so the observation keeps its channel axis.
    mario_env = GrayScaleObservation(mario_env, keep_dim=True)
    return Monitor(mario_env, monitor_dir)


# Vectorize the single environment, then stack 4 frames on the last axis.
env = DummyVecEnv([make_mario_env])
env = VecFrameStack(env, 4, channels_order='last')

# 直接设置参数

In [5]:
tensorboard_log = r'./tensorboard_log/'

# Build a PPO agent without overriding any hyperparameter; only the
# TensorBoard log directory is configured. Training and saving are done
# in a later cell.
model = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [8]:
# Show the hyperparameters the agent ended up with
# (dir(model) lists every available attribute).
print(model.learning_rate, model.n_steps, sep="\n")

0.0003
2048


In [9]:
# Set hyperparameters directly through PPO's keyword arguments.
tensorboard_log = r'./tensorboard_log/'
learning_rate, n_steps = 1e-6, 128

model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
    learning_rate=learning_rate,
    n_steps=n_steps,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [10]:
# Confirm the explicitly passed values were picked up by the agent.
print(model.learning_rate, model.n_steps, sep="\n")

1e-06
128


# 通过字典设置参数

In [15]:
# Hyperparameters collected in a dict so they can be unpacked into PPO(...).
model_parameter_1 = dict(learning_rate=1e-8, n_steps=1024)

In [16]:
# Unpack the hyperparameter dict into keyword arguments with the ** operator.
model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
    **model_parameter_1,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


[python \*/\*\* 用法](https://blog.csdn.net/u013250861/article/details/120557601)

In [17]:
# Check that the learning rate supplied through the unpacked dict was applied.
print(model.learning_rate)

1e-08


# 训练并保存模型

In [18]:
# Build a fresh agent, train it briefly, and write it to disk.
tensorboard_log = r'./tensorboard_log/'
learning_rate, n_steps = 1e-6, 128

model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
    learning_rate=learning_rate,
    n_steps=n_steps,
)

# The recorded log reports 128 total timesteps for this 100-step budget,
# presumably because PPO always completes a full n_steps rollout.
model.learn(total_timesteps=100)
model.save("mario_model")

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to ./tensorboard_log/PPO_1
----------------------------
| time/              |     |
|    fps             | 29  |
|    iterations      | 1   |
|    time_elapsed    | 4   |
|    total_timesteps | 128 |
----------------------------


# 加载模型修改超参数接着训练

In [22]:
# Drop the trained agent and construct a new one; its hyperparameters are
# chosen here, independently of whatever is loaded into it afterwards.
del model

tensorboard_log = r'./tensorboard_log/'
learning_rate, n_steps = 1e-6, 128

model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=tensorboard_log,
    learning_rate=learning_rate,
    n_steps=n_steps,
)



Using cuda device
Wrapping the env in a VecTransposeImage.


In [25]:
# Load the parameters saved as "mario_model" into the rebuilt agent, then
# display the hyperparameters the agent is currently using.
model.set_parameters("mario_model")
print(model.learning_rate, model.n_steps, sep="\n")

1e-06
128


In [27]:
# Continue training from the loaded parameters, then save under a new name
# so the earlier "mario_model" checkpoint is left untouched.
model.learn(total_timesteps=200)
model.save("mario_model1")

Logging to ./tensorboard_log/PPO_4
----------------------------
| time/              |     |
|    fps             | 140 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 128 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 84           |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 1.821667e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.95        |
|    explained_variance   | 0.0348       |
|    learning_rate        | 1e-06        |
|    loss                 | 227          |
|    n_updates            | 30           |
|    policy_gradient_loss | -0.000163    |
|    value_loss           | 505          |
-------------------------------

In [28]:
# Test whether the overwrite succeeded.
# NOTE(review): learning_rate printed the same value before this load as well,
# so this print probably does not prove the weights were replaced; comparing
# get_parameters() output would be a stronger check - TODO confirm.
model.set_parameters("mario_model1")
print(model.learning_rate)

1e-06


In [32]:
# Grab the agent's parameters; the recorded output shows items such as
# ('policy', OrderedDict(...)) holding the network tensors.
param1 = model.get_parameters() # dict
# Optional inspection helpers (printing produces very long tensor dumps):
# print(param1)
# dir(param1)
# for i in param1.items():
#     print(i)

('policy', OrderedDict([('features_extractor.cnn.0.weight', tensor([[[[-3.1032e-02,  1.6414e-02,  9.5126e-03,  ...,  4.2515e-02,
           -1.9870e-02, -1.5401e-01],
          [ 5.9201e-03, -1.8485e-02, -2.7850e-02,  ..., -6.2494e-02,
            2.1353e-02, -5.0945e-02],
          [ 1.1558e-01,  4.7228e-02, -1.4742e-01,  ...,  1.6333e-01,
           -1.1438e-01, -3.0629e-02],
          ...,
          [-8.3781e-02, -2.2278e-02,  5.4914e-02,  ...,  8.1573e-02,
            1.0179e-01,  4.8654e-02],
          [ 7.5787e-02,  9.0592e-02,  1.9072e-02,  ..., -8.0385e-02,
           -3.1447e-02, -3.2967e-03],
          [ 8.0033e-02,  4.6258e-02, -4.7584e-02,  ..., -3.6222e-02,
            3.9934e-02, -7.2287e-02]],

         [[ 4.9802e-02, -6.0619e-02, -3.9076e-02,  ..., -6.5100e-02,
           -7.8353e-02,  1.1993e-01],
          [-7.5444e-02, -2.9129e-02, -1.6797e-02,  ..., -5.2708e-02,
            7.5203e-02,  2.1820e-02],
          [ 2.0531e-02, -1.3889e-01,  9.2155e-02,  ..., -1.1174e-01