# 自作のPPOノートブック

In [189]:
from dataclasses import dataclass, field, asdict, is_dataclass

import sys
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import gymnasium as gym

from myActivator import tanhAndScale
from myFunction import make_squashed_gaussian

In [190]:
# Route INFO-and-above log records to stdout with a short HH:MM:SS prefix.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

In [191]:
# Create the Pendulum environment, then log every attribute of its spec
# followed by every attribute of the unwrapped environment object.
env = gym.make("Pendulum-v1")
for source in (env.spec, env.unwrapped):
    for key, value in vars(source).items():
        logging.info('%s: %s', key, value)

22:54:26 [INFO] id: Pendulum-v1
22:54:26 [INFO] entry_point: gymnasium.envs.classic_control.pendulum:PendulumEnv
22:54:26 [INFO] reward_threshold: None
22:54:26 [INFO] nondeterministic: False
22:54:26 [INFO] max_episode_steps: 200
22:54:26 [INFO] order_enforce: True
22:54:26 [INFO] disable_env_checker: False
22:54:26 [INFO] kwargs: {}
22:54:26 [INFO] additional_wrappers: ()
22:54:26 [INFO] vector_entry_point: None
22:54:26 [INFO] namespace: None
22:54:26 [INFO] name: Pendulum
22:54:26 [INFO] version: 1
22:54:27 [INFO] max_speed: 8
22:54:27 [INFO] max_torque: 2.0
22:54:27 [INFO] dt: 0.05
22:54:27 [INFO] g: 10.0
22:54:27 [INFO] m: 1.0
22:54:27 [INFO] l: 1.0
22:54:27 [INFO] render_mode: None
22:54:27 [INFO] screen_dim: 500
22:54:27 [INFO] screen: None
22:54:27 [INFO] clock: None
22:54:27 [INFO] isopen: True
22:54:27 [INFO] action_space: Box(-2.0, 2.0, (1,), float32)
22:54:27 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
22:54:27 [INFO] spec: EnvSpec(id='Pendulum-

In [192]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [193]:
@dataclass
class Config:
    """Hyperparameters for PPOAgent and train_ppo on Pendulum-v1.

    New fields are appended at the end so that positional construction of
    the existing fields keeps working.
    """

    # ===== Pendulum-v1 specs =====
    obs_dim: int = 3
    act_dim: int = 1

    # Action bounds; PPOAgent reads Config.u_llim / Config.u_ulim.
    u_llim: list[float] = field(default_factory=lambda: [-2.0])
    u_ulim: list[float] = field(default_factory=lambda: [ 2.0])

    # ===== Network architecture =====
    V_net_in: int = 3
    P_net_in: int = 3

    V_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    P_net_sizes: list[int] = field(default_factory=lambda: [64, 64])

    V_net_out: int = 1
    P_net_out: int = 1  # = act_dim

    # ===== Optimizer =====
    V_lr: float = 1e-3
    P_lr: float = 3e-4

    # ===== GAE / discount =====
    gamma: float = 0.99
    lam: float = 0.97

    # ===== PPO hyperparameters =====
    clip_ratio: float = 0.2
    policy_train_iters: int = 50
    target_kl: float = 0.01
    reward_scaling: float = 0.01

    # ===== Value function training =====
    value_train_iters: int = 50
    value_l2_reg: float = 1e-3
    v_clip_epsilon: float = 0.2

    # ===== Rollout collection =====
    # train_ppo looks these up on the config when its own arguments are None;
    # the defaults here match the fallbacks train_ppo used before they were
    # declared, so behaviour is unchanged.
    batch_steps: int = 2048
    bootstrap_on_timeout: bool = False

In [194]:
class PPOAgent:
    def __init__(self, Config, device=None):
        if Config is None:
            raise ValueError("No Config!!")
        self.Config = Config

        # device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        self.u_low = torch.tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.tensor(Config.u_ulim, dtype=torch.float32, device=self.device)

        # networks
        self.V_net = self.build_net(Config.V_net_in, Config.V_net_sizes, Config.V_net_out).to(self.device)
        self.P_net = self.build_net(Config.P_net_in, Config.P_net_sizes, Config.P_net_out).to(self.device)

        self.V_net.train()
        self.P_net.train()

        # log_std は状態に依存しないパラメータ
        action_dim = Config.P_net_out
        self.log_std = nn.Parameter(torch.zeros(action_dim, device=self.device))

        # optimizer（baselineはAdam）
        self.V_optim = optim.Adam(self.V_net.parameters(), lr=Config.V_lr)
        self.P_optim = optim.Adam(
            list(self.P_net.parameters()) + [self.log_std], 
            lr=Config.P_lr
        )

        # hyperparams
        self.gamma = float(Config.gamma)
        self.tau = float(Config.lam)  # baselineの TAU (= GAE lambda)
        self.target_kl = float(getattr(Config, "target_kl", 0.01))
        self.reward_scaling = float(getattr(Config, "reward_scaling", 0.01))

        self.policy_train_iters = int(getattr(Config, "policy_train_iters", 80))
        self.value_train_iters = int(getattr(Config, "value_train_iters", 5))
        self.value_l2_reg = float(getattr(Config, "value_l2_reg", 1e-3))
        self.v_clip_epsilon = float(getattr(Config, "v_clip_epsilon", 0.2))

    def build_net(self, input_size, hidden_sizes, output_size):
        layers = []
        in_size = input_size
        for h_size in hidden_sizes:
            layers.append(nn.Linear(in_size, h_size))
            layers.append(nn.ReLU())
            in_size = h_size
        layers.append(nn.Linear(in_size, output_size))
        net = nn.Sequential(*layers)
        return net
    
    @torch.no_grad()
    def get_action_and_log_prob(self, state, deterministic=False):
        """
        deterministic: Trueなら平均値(mean)を返す。Falseならサンプリング。
        """
        s = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if s.dim() == 1:
            s = s.unsqueeze(0)  # (1, obs_dim)

        dist = self._policy_dist(s)  # Normal(mean, std)

        if deterministic:
            a = dist.mean  # (1, act_dim)
            logp = None
        else:
            a = dist.sample()
            logp = dist.log_prob(a).sum(axis=-1)  # (1,)

        # 返すactionはclip前
        # envに入れるときにnp.clipする
        a = a.squeeze(0)
        if logp is not None:
            logp = logp.squeeze(0)
        return a, logp
    
    @torch.no_grad()
    def step(self, state):
        """
        推論時に使う用のwrapper関数
        """
        a, _ = self.get_action_and_log_prob(state, deterministic=True)
        return a.cpu().numpy()
    
    def _policy_mean(self, states):
        """
        方策ネットから行動平均を計算するラッパー関数
        states: (batch_size, obs_dim)
        return: (batch_size, act_dim)
        """
        mean = self.P_net(states)  # (batch_size, act_dim)
        return mean
    
    def _policy_dist(self, states):
        """
        方策ネットから平均を計算し、パラメータから分散を計算して、正規分布を返すラッパー関数
        states: (batch_size, obs_dim)
        return: Normal distribution
        """
        mean = self._policy_mean(states)  # (batch_size, act_dim)
        std = torch.exp(self.log_std)  # (act_dim,)
        std = std.unsqueeze(0).expand_as(mean)  # (batch_size, act_dim)
        dist = Normal(mean, std)
        return dist
    
    @torch.no_grad()
    def _compute_gae(self, rewards, values, next_values, dones):
        """
        GAEを計算する関数
        rewards: (batch_size,)
        values: (batch_size,)
        next_values: (batch_size,)
        dones: (batch_size,)
        return: advantages: (batch_size,), returns: (batch_size,)
        """
        batch_size = rewards.shape[0]
        adv = torch.zeros_like(rewards, device=self.device)
        gae = 0.0

        for t in reversed(range(batch_size)):
            if t == batch_size - 1:
                nv = next_values[t]
            else:
                nv = values[t + 1]
            delta = rewards[t] + self.gamma * nv * (1 - dones[t]) - values[t]
            gae = delta + self.gamma * self.tau * (1 - dones[t]) * gae
            adv[t] = gae

        ret = adv + values
        return adv, ret
    
    def _ppo_step(self, states, actions, old_log_probs, advantages):
        """
        PPOの方策ネット更新を行う関数
        states: (batch_size, obs_dim)
        actions: (batch_size, act_dim)
        old_log_probs: (batch_size,)
        advantages: (batch_size,)
        return: policy_loss
        """

        for _ in range(self.policy_train_iters):
            dist = self._policy_dist(states)  # Normal(mean, std)
            log_probs = dist.log_prob(actions).sum(axis=-1)  # (batch_size,)

            ratios = torch.exp(log_probs - old_log_probs)  # (batch_size,)

            surr1 = ratios * advantages  # (batch_size,)
            surr2 = torch.clamp(ratios, 1.0 - self.Config.clip_ratio, 1.0 + self.Config.clip_ratio) * advantages  # (batch_size,)

            policy_loss = -torch.min(surr1, surr2).mean()
            self.P_optim.zero_grad()
            policy_loss.backward()
            self.P_optim.step()

        # どれくらい変化したかを確認する
        change = (old_log_probs - log_probs).mean()

        return policy_loss, change
    
    def _update_value_function(self, states, returns, old_values):
        """
        価値関数ネットワークの更新を行う関数
        states: (batch_size, obs_dim)
        returns: (batch_size,)
        return: value_loss
        """
        for _ in range(self.value_train_iters):
            values = self.V_net(states).squeeze(-1)  # (batch_size,)
            value_loss = F.mse_loss(values, returns)

            # クリッピング版の価値関数損失（Vの計算が暴走するのを防ぐため）
            v_clip = old_values + torch.clamp(values - old_values, self.v_clip_epsilon, self.v_clip_epsilon)
            v_clip_loss = F.mse_loss(v_clip, returns)

            # L2正則化
            l2_reg = torch.tensor(0., device=self.device)
            for param in self.V_net.parameters():
                l2_reg += torch.norm(param)**2
            loss = torch.max(value_loss, v_clip_loss) + self.value_l2_reg * l2_reg
            # loss = value_loss + self.value_l2_reg * l2_reg

            self.V_optim.zero_grad()
            loss.backward()
            self.V_optim.step()

        return loss
    
    def update_net(self, states, actions, log_probs, rewards, next_states, dones):
        """
        ネットワークを更新する関数
        states: (batch_size, obs_dim)
        actions: (batch_size, act_dim)
        log_probs: (batch_size,)
        rewards: (batch_size,)
        next_states: (batch_size, obs_dim)
        dones: (batch_size,)
        return: dict of losses
        """
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device)
        log_probs = torch.as_tensor(log_probs, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # 一応shapeを揃えておく
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device).view(-1, self.Config.act_dim)

        # old_log_probs も念のためshapeをそろえてdetachしておく
        old_log_probs = torch.as_tensor(log_probs, dtype=torch.float32, device=self.device).view(-1).detach()

        # GAEの計算
        with torch.no_grad():
            values = self.V_net(states).squeeze(-1)  # (batch_size,)
            next_values = self.V_net(next_states).squeeze(-1)  # (batch_size,)

            advantages, returns = self._compute_gae(rewards, values, next_values, dones)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)  # 正規化

        # 方策ネットワークの更新
        policy_loss, kl_change = self._ppo_step(states, actions, old_log_probs, advantages)

        # 変化量に応じて学習率を調整
        if kl_change > 1.5 * self.target_kl:
            for param_group in self.P_optim.param_groups:
                param_group['lr'] = max(param_group['lr'] / 1.5, 1e-5)
                logging.info(f"Decreased policy learning rate to {param_group['lr']}")
        elif kl_change < self.target_kl / 1.5:
            for param_group in self.P_optim.param_groups:
                param_group['lr'] = min(param_group['lr'] * 1.5, 1e-2)
                logging.info(f"Increased policy learning rate to {param_group['lr']}")

        # 価値観数ネットワークの更新
        value_loss = self._update_value_function(states, returns, values)

        return {"policy_loss": policy_loss.item(), "value_loss": value_loss.item()}
    
    def to(self, device):
        self.device = torch.device(device)
        self.V_net.to(self.device)
        self.P_net.to(self.device)
        self.log_std.data = self.log_std.data.to(self.device)
        self.u_low = self.u_low.to(self.device)
        self.u_high = self.u_high.to(self.device)
        return self

    def mode2eval(self):
        self.V_net.eval()
        self.P_net.eval()

    def mode2train(self):
        self.V_net.train()
        self.P_net.train()

    def save_all(self, path: str, extra: dict | None = None):
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config
        save_dict = {
            "config": cfg,
            "V_net_state_dict": self.V_net.state_dict(),
            "P_net_state_dict": self.P_net.state_dict(),
            "log_std": self.log_std.data,
        }
        if extra is not None:
            save_dict.update(extra)
        torch.save(save_dict, path)
        
    def load_all(self, path: str, map_location=None):
        load_dict = torch.load(path, map_location=map_location)
        self.V_net.load_state_dict(load_dict["V_net_state_dict"])
        self.P_net.load_state_dict(load_dict["P_net_state_dict"])
        self.log_std.data = load_dict["log_std"].to(self.device)

In [195]:
import numpy as np
import torch
import logging

def train_ppo(
    env,
    agent,
    total_step: int = 200_000,
    batch_steps: int | None = None,
    random_steps: int = 0,
    bootstrap_on_timeout: bool | None = None,
    log_interval_updates: int = 1,
):
    """
    PPO Agentのための学習ループ
    """
    if batch_steps is None:
        batch_steps = int(getattr(agent.Config, "batch_steps", 2048)) # PPOはバッチサイズ大きめが一般的
    if bootstrap_on_timeout is None:
        bootstrap_on_timeout = bool(getattr(agent.Config, "bootstrap_on_timeout", False))

    print(f"Start PPO Training: Device={agent.device}, Batch={batch_steps}")

    # 変数名修正に伴う変更
    low_np = agent.u_low.detach().cpu().numpy()
    high_np = agent.u_high.detach().cpu().numpy()

    # 履歴保存用
    loss_history = {"policy_loss": [], "value_loss": []}
    train_reward_history = [] # ノイズあり（学習中の報酬）

    # rollout buffer
    rollout = {"obs": [], "act": [], "logp": [], "rew": [], "obs_next": [], "done": []}

    def rollout_clear():
        for k in rollout:
            rollout[k].clear()

    obs, info = env.reset()
    ep_return = 0.0
    episode_num = 0
    update_num = 0

    for t in range(total_step):
        # --- (1) 行動選択 ---
        # ランダムステップ期間、もしくは学習初期
        if t < random_steps:
            action_raw = np.atleast_1d(env.action_space.sample()).astype(np.float32)
            # ランダム行動の場合のlogpは適当(0.0)あるいは計算不要だが、
            # PPOの更新で使うなら整合性を取るためにAgentからサンプリングした方が無難。
            # ここではあくまで「完全ランダム」として扱うため、logp=0としてUpdateに使わない手もあるが、
            # 実装を単純にするため、random_steps期間はバッファに入れないか、
            # もしくはagentを使ってサンプリングする形が推奨されます。
            # 今回は「agentを使う」形に倒します。
            with torch.no_grad():
                a_t, logp_t = agent.get_action_and_log_prob(obs, deterministic=False)
                logp_val = logp_t.item()
        else:
            with torch.no_grad():
                # deterministic=False (確率的方策に従って探索)
                a_t, logp_t = agent.get_action_and_log_prob(obs, deterministic=False)
            
            action_raw = np.atleast_1d(a_t.detach().cpu().numpy()).astype(np.float32)
            logp_val = logp_t.item()

        # --- (2) env step ---
        # クリップして環境に入力
        action_env = np.clip(action_raw, low_np, high_np)
        obs_next, reward, terminated, truncated, info = env.step(action_env)

        ep_return += float(reward)

        # GAE計算用のdoneフラグ (TimeLimitによる打ち切りはFalse扱いにする場合が多い)
        if bootstrap_on_timeout:
            done_for_gae = float(terminated)
        else:
            done_for_gae = float(terminated or truncated)
        
        # 環境リセット用のdoneフラグ
        done_for_reset = (terminated or truncated)

        # 【AI用】学習用の変数は「スケーリングした reward」を作る
        scaled_reward = float(reward) * agent.Config.reward_scaling  # Pendulum用に 1/100 にする
        # scaled_reward = float(reward)

        # --- (3) バッファに保存 ---
        rollout["obs"].append(np.asarray(obs, dtype=np.float32))
        rollout["act"].append(np.asarray(action_raw, dtype=np.float32))
        rollout["logp"].append(logp_val)
        # rollout["rew"].append(float(reward))
        rollout["rew"].append(float(scaled_reward))
        rollout["obs_next"].append(np.asarray(obs_next, dtype=np.float32))
        rollout["done"].append(float(done_for_gae))

        # --- (4) Reset判定 ---
        if done_for_reset:
            train_reward_history.append(ep_return)
            episode_num += 1
            ep_return = 0.0
            obs, info = env.reset()
        else:
            obs = obs_next

        # --- (5) Update ---
        # バッチサイズ分たまったら更新
        if len(rollout["obs"]) >= batch_steps:
            update_num += 1

            # list -> numpy
            states      = np.stack(rollout["obs"], axis=0)
            actions     = np.stack(rollout["act"], axis=0)
            log_probs   = np.array(rollout["logp"], dtype=np.float32)
            rewards     = np.array(rollout["rew"], dtype=np.float32)
            states_next = np.stack(rollout["obs_next"], axis=0)
            dones       = np.array(rollout["done"], dtype=np.float32)

            # Update実行
            loss_dict = agent.update_net(states, actions, log_probs, rewards, states_next, dones)
            
            # バッファクリア
            rollout_clear()

            # ログ記録
            loss_history["policy_loss"].append(loss_dict["policy_loss"])
            loss_history["value_loss"].append(loss_dict["value_loss"])

            # 評価とログ出力
            if (update_num % log_interval_updates) == 0:
                # 決定論的モードで評価
                eval_score = evaluate(env, agent, n_episodes=3)
                
                logging.info(
                    f"Update {update_num:4d} | Step {t:6d} | "
                    f"Eval: {eval_score:8.2f} | "
                    f"P_Loss: {loss_dict['policy_loss']:.4f} | "
                    f"V_Loss: {loss_dict['value_loss']:.4f}"
                )

    return loss_history, train_reward_history

# 評価用関数（元のコードと同じものでOKですが、念のため再掲）
def evaluate(env, agent, n_episodes=3):
    """Run `n_episodes` episodes with agent.step() and return the mean return.

    Each action returned by agent.step() is clipped to [agent.u_low,
    agent.u_high] before being passed to env.step(). An episode ends when
    the environment reports terminated or truncated.
    """
    act_min = agent.u_low.detach().cpu().numpy()
    act_max = agent.u_high.detach().cpu().numpy()

    episode_returns = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        total = 0.0
        while True:
            clipped = np.clip(agent.step(obs), act_min, act_max)
            obs, rew, term, trunc, _ = env.step(clipped)
            total += rew
            if term or trunc:
                break
        episode_returns.append(total)

    return np.mean(episode_returns)

In [196]:
# Train a fresh agent built from the default Config for one million
# environment steps; lh is the loss history, rh the training episode returns.
total_step = 1_000_000
agent = PPOAgent(Config=Config(), device=device)

lh, rh = train_ppo(env=env, agent=agent, total_step=total_step, batch_steps=2048)

Start PPO Training: Device=cuda, Batch=2048


22:54:32 [INFO] Increased policy learning rate to 0.00045
22:54:33 [INFO] Update    1 | Step   2047 | Eval: -1066.38 | P_Loss: -0.0055 | V_Loss: 2.4054
22:54:37 [INFO] Increased policy learning rate to 0.000675
22:54:38 [INFO] Update    2 | Step   4095 | Eval: -1107.56 | P_Loss: -0.0076 | V_Loss: 2.2814
22:54:43 [INFO] Increased policy learning rate to 0.0010125
22:54:43 [INFO] Update    3 | Step   6143 | Eval: -1207.60 | P_Loss: -0.0088 | V_Loss: 2.5116
22:54:49 [INFO] Update    4 | Step   8191 | Eval: -1150.87 | P_Loss: -0.0119 | V_Loss: 2.3314
22:54:53 [INFO] Increased policy learning rate to 0.00151875
22:54:54 [INFO] Update    5 | Step  10239 | Eval: -1072.44 | P_Loss: -0.0110 | V_Loss: 2.6153
22:54:57 [INFO] Increased policy learning rate to 0.0022781249999999998
22:54:58 [INFO] Update    6 | Step  12287 | Eval: -1173.62 | P_Loss: -0.0118 | V_Loss: 2.4338
22:55:03 [INFO] Update    7 | Step  14335 | Eval: -1307.88 | P_Loss: -0.0127 | V_Loss: 2.2117
22:55:08 [INFO] Increased policy

In [197]:
# Close the environment created earlier in the notebook.
env.close()

In [198]:
from pathlib import Path
from datetime import datetime

def make_unique_path(path: str | Path) -> Path:
    """
    path が既に存在する場合、末尾に _1, _2, ... を付けて未使用のパスを返す。
    例: ddpg_final_20251221_235959.pth -> ddpg_final_20251221_235959_1.pth -> ...
    """
    p = Path(path)

    # 存在しないならそのまま使う
    if not p.exists():
        return p

    parent = p.parent
    stem = p.stem      # 拡張子抜きファイル名
    suffix = p.suffix  # ".pth"

    i = 1
    while True:
        cand = parent / f"{stem}_{i}{suffix}"
        if not cand.exists():
            return cand
        i += 1


# Switch the networks to eval mode for later inference (saving itself also
# works while they are in train mode).
agent.mode2eval()

# Make sure the output directory exists.
models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)

# Timestamped file name, made unique if a file of that name already exists.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_path = models_dir / f"ppo_final_{stamp}.pth"
save_path = make_unique_path(base_path)

# Store the step budget and the training reward history next to the weights.
agent.save_all(
    save_path.as_posix(),
    extra={"total_step": int(total_step), "reward_history": rh},
)

print(f"saved to {save_path}")

saved to models/ppo_final_20260201_234103.pth
