In [2]:
# Put the parent directory first on the import path so that `bug_lib` can be
# imported (presumably it lives one level above this notebook — TODO confirm).
import sys
sys.path.insert(0, '..')
import bug_lib as BL


# Previously tried bug configurations, kept for reference only (not executed).
# bug_version_list = [
#     [],
#     [5],
#     [6],
#     [7],
#     [8],
#     [10],
#     [11],
#     [12],
#     [13],
#     [14],
#     [15]
# ]

# List of bug identifiers handed to bug_lib for this run.
bug_version = [39]

# NOTE(review): judging by its name, this restores the target library sources
# and then injects the bugs listed in `bug_version` — confirm against bug_lib.
BL.cover_then_inject_bugs(bug_version)

from typing import Any, ClassVar, Dict, Optional, Type, TypeVar, Union

import numpy as np
import torch as th
from gymnasium import spaces
from torch.nn import functional as F

from stable_baselines3.common.buffers import RolloutBuffer
from stable_baselines3.common.on_policy_algorithm import OnPolicyAlgorithm
from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, BasePolicy, MultiInputActorCriticPolicy
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule
from stable_baselines3.common.utils import explained_variance, get_schedule_fn

SelfPPO = TypeVar("SelfPPO", bound="PPO")


class PPO(OnPolicyAlgorithm):
    """
    Proximal Policy Optimization algorithm (PPO) (clip version)

    Paper: https://arxiv.org/abs/1707.06347
    Code: This implementation borrows code from OpenAI Spinning Up (https://github.com/openai/spinningup/)
    https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail and
    Stable Baselines (PPO2 from https

After injecting the bugs, restart the IPython kernel before running the cells below.

In [1]:
from stable_baselines3 import PPO, DQN, A2C
import gymnasium as gym
import imageio
import numpy as np
import os

def generate_gif(model, output_path, num_steps=350, fps=20):
    """Roll out ``model`` in its own environment and save the frames as a GIF.

    Args:
        model: Object exposing ``env`` (with ``reset``, ``render`` and ``step``)
            and ``predict``. ``env.step`` must return a 4-tuple.
        output_path: Destination passed to ``imageio.mimsave``.
        num_steps: Number of frames to record (one per environment step).
        fps: Frame rate forwarded to ``imageio.mimsave``.
    """
    frames = []
    observation = model.env.reset()
    frame = model.env.render(mode="rgb_array")
    for _ in range(num_steps):
        # Store the frame rendered *before* acting, so the first image is the
        # reset state and the frame rendered after the final step is dropped.
        frames.append(frame)
        action, _state = model.predict(observation)
        observation, _reward, _done, _info = model.env.step(action)
        frame = model.env.render(mode="rgb_array")

    as_arrays = list(map(np.array, frames))
    imageio.mimsave(output_path, as_arrays, fps=fps)

def train_frozen_lake(testbed='Frozenlake', model_type='ppo', total_timesteps=3000, tb_log_name="run"):
    """Train an agent on a test environment and record before/after GIFs.

    Despite the name, the function is not limited to FrozenLake: any
    ``testbed`` other than ``'Frozenlake'`` selects MountainCarContinuous.

    Args:
        testbed: ``'Frozenlake'`` for the deterministic 4x4 FrozenLake-v1;
            any other value selects ``MountainCarContinuous-v0``.
        model_type: One of ``'ppo'``, ``'dqn'`` or ``'a2c'``.
        total_timesteps: Training budget forwarded to ``model.learn``.
        tb_log_name: Run name; used for the TensorBoard run and for the
            ``./tensorboard_logs/<tb_log_name>/`` output directory.

    Returns:
        The trained model instance.

    Raises:
        ValueError: If ``model_type`` is not a supported algorithm name.
    """
    algorithms = {'ppo': PPO, 'dqn': DQN, 'a2c': A2C}
    # Validate first: an unknown model_type used to fall through the if/elif
    # chain and fail later with UnboundLocalError, after side effects.
    if model_type not in algorithms:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(algorithms)}"
        )

    # NOTE(review): not every algorithm/testbed pair is necessarily valid
    # (e.g. DQN with a continuous-action env) — confirm before combining.
    if testbed == 'Frozenlake':
        env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False, render_mode="rgb_array")
    else:
        env = gym.make("MountainCarContinuous-v0", render_mode='rgb_array')

    # Single directory for both the TensorBoard logs and the GIFs; create it
    # if it does not exist yet (exist_ok avoids a check-then-create race).
    log_dir = f"./tensorboard_logs/{tb_log_name}/"
    os.makedirs(log_dir, exist_ok=True)

    model = algorithms[model_type]("MlpPolicy", env, tensorboard_log=log_dir)

    # Record the untrained policy before training starts.
    generate_gif(model, f"{log_dir}before_training.gif")

    model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name)

    # Record the policy again once training has finished.
    generate_gif(model, f"{log_dir}after_training.gif")

    return model

# IMPORTANT: change tb_log_name before every run of this cell; it names the
# TensorBoard run for this experiment.
model_run1 = train_frozen_lake(
    testbed='Mountaincar',
    model_type='ppo',
    total_timesteps=140000,
    tb_log_name="Mountaincar-ppo-[39]-140000",
)
# Alternative configuration:
# model_run1 = train_frozen_lake(testbed='Frozenlake', model_type='dqn', total_timesteps=60000, tb_log_name="Frozenlake-dqn-[6]-60000")

Open a terminal and run the following command to launch TensorBoard:

In [None]:
# tensorboard --logdir=./tensorboard_logs/