In [1]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback
from huggingface_sb3 import load_from_hub
from stable_baselines3.common.evaluation import evaluate_policy


In [5]:
def see_game(model):
    
    """ Функція, що показує нам як грає модель"""
    
    # Создание среды
    env = make_vec_env("LunarLander-v2", n_envs=1, seed=42)
    
    # Оценка модели
    print("Оценка модели")
    mean_reward, std_reward = evaluate_policy(
        model,
        env,
        n_eval_episodes=20,
        deterministic=True,
    )
    print(f"Средняя награда = {mean_reward:.2f} +/- {std_reward:.2f}")

    
    # Начало новой эпизода
    obs = env.reset()
    try:
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = env.step(action)
            env.render(mode='human')  # Рендеринг окружения
    except KeyboardInterrupt:
        pass


Спробуємо навчити модель та побачити що вона буде робити, коли буде не донавченною

In [6]:
# Create the environment
env_id = "LunarLander-v2"
n_envs = 16
env = make_vec_env(env_id, n_envs=n_envs, seed=42)

# Create the evaluation envs
eval_envs = make_vec_env(env_id, n_envs=5, seed=42)

# Adjust evaluation interval depending on the number of envs
eval_freq = int(1e5)
eval_freq = max(eval_freq // n_envs, 1)

# Create evaluation callback to save best model
# and monitor agent performance
eval_callback = EvalCallback(
    eval_envs,
    best_model_save_path="./logs/",
    eval_freq=eval_freq,
    n_eval_episodes=10,
)

# Instantiate the agent
# Hyperparameters from https://github.com/DLR-RM/rl-baselines3-zoo
model = PPO(
    "MlpPolicy",
    env,
    n_steps=1024,
    batch_size=64,
    gae_lambda=0.98,
    gamma=0.999,
    n_epochs=4,
    ent_coef=0.01,
    verbose=1,
)

# Train the agent (you can kill it before using ctrl+c)
env.reset()
try:
    model.learn(total_timesteps=100000, callback=eval_callback)
except KeyboardInterrupt:
    pass

# Save
model.save('logs/model')

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.2     |
|    ep_rew_mean     | -183     |
| time/              |          |
|    fps             | 7805     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 91.9         |
|    ep_rew_mean          | -140         |
| time/                   |              |
|    fps                  | 4760         |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0043834154 |
|    clip_fraction        | 0.0244       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.00188      

Подивимось як граэ недонавченна наша модель

In [7]:
my_model = model.load('logs/model.zip')

In [8]:
checkpoint = load_from_hub("araffin/ppo-LunarLander-v2", "ppo-LunarLander-v2.zip")
best_model = PPO.load(checkpoint)

Exception: code expected at least 16 arguments, got 15
Exception: code expected at least 16 arguments, got 15


In [9]:
see_game(my_model)

Оценка модели
Средняя награда = -146.10 +/- 36.31


In [7]:
see_game(best_model)

Оценка модели
Средняя награда = 281.78 +/- 17.37
