# 自作のPPOノートブック

In [1]:
from dataclasses import dataclass, field, asdict, is_dataclass

import sys
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import gymnasium as gym

from myActivator import tanhAndScale
from myFunction import make_squashed_gaussian

In [2]:
# Send INFO-and-above log records to stdout with an HH:MM:SS timestamp prefix.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    stream=sys.stdout, datefmt="%H:%M:%S")

In [3]:
env = gym.make("Pendulum-v1")
# Dump every attribute of the registered spec and of the unwrapped environment
# to the log. Iterating .items() avoids rebuilding the attribute dict and
# looking each key up a second time.
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.unwrapped).items():
    logging.info('%s: %s', key, value)

22:40:57 [INFO] id: Pendulum-v1
22:40:57 [INFO] entry_point: gymnasium.envs.classic_control.pendulum:PendulumEnv
22:40:57 [INFO] reward_threshold: None
22:40:57 [INFO] nondeterministic: False
22:40:57 [INFO] max_episode_steps: 200
22:40:57 [INFO] order_enforce: True
22:40:57 [INFO] disable_env_checker: False
22:40:57 [INFO] kwargs: {}
22:40:57 [INFO] additional_wrappers: ()
22:40:57 [INFO] vector_entry_point: None
22:40:57 [INFO] namespace: None
22:40:57 [INFO] name: Pendulum
22:40:57 [INFO] version: 1
22:40:57 [INFO] max_speed: 8
22:40:57 [INFO] max_torque: 2.0
22:40:57 [INFO] dt: 0.05
22:40:57 [INFO] g: 10.0
22:40:57 [INFO] m: 1.0
22:40:57 [INFO] l: 1.0
22:40:57 [INFO] render_mode: None
22:40:57 [INFO] screen_dim: 500
22:40:57 [INFO] screen: None
22:40:57 [INFO] clock: None
22:40:57 [INFO] isopen: True
22:40:57 [INFO] action_space: Box(-2.0, 2.0, (1,), float32)
22:40:57 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
22:40:57 [INFO] spec: EnvSpec(id='Pendulum-

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
@dataclass
class Config:
    """Hyperparameters and environment dimensions for the PPO agent (Pendulum-v1).

    NOTE: a later cell in this notebook defines another ``Config`` dataclass
    with the same fields but different defaults (``lam``,
    ``policy_train_iters``, ``target_kl``, ``value_train_iters``). Because it
    is defined afterwards, that later class is the one bound to the name
    ``Config`` when the agent is constructed.
    """

    # ===== Pendulum-v1 specs =====
    obs_dim: int = 3
    act_dim: int = 1

    # Action bounds; PPOAgent reads Config.u_llim / Config.u_ulim.
    u_llim: list[float] = field(default_factory=lambda: [-2.0])
    u_ulim: list[float] = field(default_factory=lambda: [ 2.0])

    # ===== Network architecture =====
    V_net_in: int = 3
    P_net_in: int = 3

    V_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    P_net_sizes: list[int] = field(default_factory=lambda: [64, 64])

    V_net_out: int = 1
    P_net_out: int = 1  # = act_dim

    # ===== Optimizer =====
    V_lr: float = 1e-3
    P_lr: float = 3e-4

    # ===== GAE / discount =====
    gamma: float = 0.99
    lam: float = 0.97

    # ===== PPO hyperparameters =====
    clip_ratio: float = 0.2
    policy_train_iters: int = 5
    target_kl: float = 0.01
    reward_scaling: float = 0.01

    # ===== Value function training =====
    value_train_iters: int = 5
    value_l2_reg: float = 1e-3
    v_clip_epsilon: float = 0.2

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from dataclasses import dataclass, field, asdict, is_dataclass
import logging

# --- 1. Obs Norm 用のモジュール ---
class EmpiricalNormalization(nn.Module):
    """Running mean/variance normalizer for observations.

    The statistics live in registered buffers (``running_mean``,
    ``running_var``, ``count``), so they follow ``.to(device)`` and are part
    of ``state_dict()``.

    Args:
        shape: Shape of a single observation (the feature dimensions).
        epsilon: Constant added to the variance before the square root in
            ``forward`` to avoid division by zero.
    """

    def __init__(self, shape, epsilon=1e-4):
        super().__init__()
        self.register_buffer("running_mean", torch.zeros(shape))
        self.register_buffer("running_var", torch.ones(shape))
        self.register_buffer("count", torch.tensor(0.0))
        self.epsilon = epsilon

    def update(self, x):
        """Fold a batch of observations into the running statistics.

        Args:
            x: Tensor whose last dimension is the feature dimension; all
                leading dimensions are flattened into one batch dimension.
                An empty tensor is ignored.
        """
        with torch.no_grad():
            # An empty batch has NaN mean/variance; merging it would poison
            # the running statistics for good, so skip it.
            if x.numel() == 0:
                return

            # reshape (not view) so non-contiguous inputs are accepted too.
            x = x.reshape(-1, x.shape[-1])
            batch_mean = x.mean(dim=0)
            batch_var = x.var(dim=0, unbiased=False)
            batch_count = x.shape[0]

            if self.count == 0:
                # First batch: adopt its statistics directly.
                self.running_mean = batch_mean
                self.running_var = batch_var
                self.count = torch.tensor(float(batch_count), device=x.device)
            else:
                delta = batch_mean - self.running_mean
                total_count = self.count + batch_count

                # Batched merge of two (mean, variance, count) summaries
                # (the parallel-variance formula of Chan et al.).
                new_mean = self.running_mean + delta * (batch_count / total_count)
                m_a = self.running_var * self.count
                m_b = batch_var * batch_count
                m_2 = m_a + m_b + delta**2 * self.count * batch_count / total_count
                new_var = m_2 / total_count

                self.running_mean = new_mean
                self.running_var = new_var
                self.count = total_count

    def forward(self, x):
        """Return ``(x - mean) / sqrt(var + epsilon)`` clipped to [-10, 10]."""
        # The clamp rejects outliers, e.g. while the statistics are still
        # based on very few samples.
        return torch.clamp(
            (x - self.running_mean) / torch.sqrt(self.running_var + self.epsilon),
            -10.0, 10.0
        )

# --- 2. 重み初期化用の関数 ---
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialize a layer in place with orthogonal weights and a constant bias.

    Args:
        layer: Module exposing a ``weight`` tensor (e.g. ``nn.Linear``).
        std: Gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: Value every bias element is set to.

    Returns:
        The same ``layer`` object, so the call can be nested in a layer list.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers built with bias=False expose bias=None; there is nothing to fill.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer

@dataclass
class Config:
    """Hyperparameters and environment dimensions for the PPO agent (Pendulum-v1)."""

    # ===== Pendulum-v1 specs =====
    obs_dim: int = 3
    act_dim: int = 1
    # Lower / upper action bounds; PPOAgent turns these into tensors.
    u_llim: list[float] = field(default_factory=lambda: [-2.0])
    u_ulim: list[float] = field(default_factory=lambda: [ 2.0])

    # ===== Network architecture =====
    V_net_in: int = 3
    P_net_in: int = 3
    # Two hidden layers of 64 units are sufficient here.
    V_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    P_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    V_net_out: int = 1
    P_net_out: int = 1

    # ===== Optimizer =====
    # 3e-4 is the usual learning rate for PPO.
    V_lr: float = 1e-3 # a slightly higher rate is fine for the value network
    P_lr: float = 3e-4 

    # ===== GAE / discount =====
    gamma: float = 0.99
    lam: float = 0.95

    # ===== PPO hyperparameters =====
    clip_ratio: float = 0.2
    # Important: keep the number of policy epochs per batch small.
    policy_train_iters: int = 10
    target_kl: float = 0.015  # slightly relaxed
    reward_scaling: float = 0.01

    # ===== Value function training =====
    value_train_iters: int = 10
    value_l2_reg: float = 1e-3
    v_clip_epsilon: float = 0.2


class PPOAgent:
    def __init__(self, Config, device=None):
        if Config is None: raise ValueError("No Config!!")
        self.Config = Config
        
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        self.u_low = torch.tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.tensor(Config.u_ulim, dtype=torch.float32, device=self.device)

        # ★追加: 入力正規化モジュール
        self.obs_norm = EmpiricalNormalization(shape=(Config.obs_dim,)).to(self.device)

        # networks
        self.V_net = self.build_net(Config.V_net_in, Config.V_net_sizes, Config.V_net_out, is_value=True).to(self.device)
        self.P_net = self.build_net(Config.P_net_in, Config.P_net_sizes, Config.P_net_out, is_value=False).to(self.device)

        self.V_net.train()
        self.P_net.train()

        # log_std
        action_dim = Config.P_net_out
        self.log_std = nn.Parameter(torch.zeros(action_dim, device=self.device))

        # optimizer (eps=1e-5 は数値安定性のために推奨)
        self.V_optim = optim.Adam(self.V_net.parameters(), lr=Config.V_lr, eps=1e-5)
        self.P_optim = optim.Adam(
            list(self.P_net.parameters()) + [self.log_std], 
            lr=Config.P_lr, 
            eps=1e-5
        )

        # hyperparams
        self.gamma = float(Config.gamma)
        self.tau = float(Config.lam)
        self.target_kl = float(getattr(Config, "target_kl", 0.015))
        
        self.policy_train_iters = int(getattr(Config, "policy_train_iters", 10))
        self.value_train_iters = int(getattr(Config, "value_train_iters", 10))
        self.value_l2_reg = float(getattr(Config, "value_l2_reg", 1e-3))
        self.v_clip_epsilon = float(getattr(Config, "v_clip_epsilon", 0.2))

    def build_net(self, input_size, hidden_sizes, output_size, is_value=False):
        layers = []
        in_size = input_size
        for h_size in hidden_sizes:
            # 中間層は Tanh + Orthogonal Init (gain=sqrt(2))
            layers.append(layer_init(nn.Linear(in_size, h_size), std=np.sqrt(2)))
            layers.append(nn.Tanh()) 
            in_size = h_size
        
        # 出力層の初期化（これが超重要）
        if is_value:
            # Valueは gain=1.0
            layers.append(layer_init(nn.Linear(in_size, output_size), std=1.0))
        else:
            # Policyは gain=0.01 (初期行動を0付近にして、ランダム探索を促進)
            layers.append(layer_init(nn.Linear(in_size, output_size), std=0.01))
            
        return nn.Sequential(*layers)
    
    @torch.no_grad()
    def get_action_and_log_prob(self, state, deterministic=False, update_rms=False):
        """
        update_rms=True のときだけ Obs Norm を更新する
        """
        s = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if s.dim() == 1:
            s = s.unsqueeze(0)

        # ★正規化の更新と適用
        if update_rms:
            self.obs_norm.update(s)
        s = self.obs_norm(s)

        dist = self._policy_dist(s)

        if deterministic:
            a = dist.mean
            logp = None
        else:
            a = dist.sample()
            logp = dist.log_prob(a).sum(axis=-1)

        a = a.squeeze(0)
        if logp is not None:
            logp = logp.squeeze(0)
        return a, logp
    
    @torch.no_grad()
    def step(self, state):
        # 推論時は RMS 更新しない (update_rms=False)
        a, _ = self.get_action_and_log_prob(state, deterministic=True, update_rms=False)
        return a.cpu().numpy()
    
    def _policy_mean(self, states):
        return self.P_net(states)
    
    def _policy_dist(self, states):
        mean = self._policy_mean(states)
        std = torch.exp(self.log_std)
        std = std.unsqueeze(0).expand_as(mean)
        return Normal(mean, std)
    
    @torch.no_grad()
    def _compute_gae(self, rewards, values, next_values, dones):
        batch_size = rewards.shape[0]
        adv = torch.zeros_like(rewards, device=self.device)
        gae = 0.0
        for t in reversed(range(batch_size)):
            if t == batch_size - 1:
                nv = next_values[t]
            else:
                nv = values[t + 1]
            delta = rewards[t] + self.gamma * nv * (1 - dones[t]) - values[t]
            gae = delta + self.gamma * self.tau * (1 - dones[t]) * gae
            adv[t] = gae
        ret = adv + values
        return adv, ret
    
    def _ppo_step(self, states, actions, old_log_probs, advantages):
        for _ in range(self.policy_train_iters):
            dist = self._policy_dist(states)
            log_probs = dist.log_prob(actions).sum(axis=-1)
            ratios = torch.exp(log_probs - old_log_probs)

            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1.0 - self.Config.clip_ratio, 1.0 + self.Config.clip_ratio) * advantages
            
            policy_loss = -torch.min(surr1, surr2).mean()
            
            self.P_optim.zero_grad()
            policy_loss.backward()
            # 勾配クリップ (事故防止)
            nn.utils.clip_grad_norm_(self.P_net.parameters(), max_norm=0.5)
            self.P_optim.step()

        # KL計算
        with torch.no_grad():
            dist = self._policy_dist(states)
            log_probs = dist.log_prob(actions).sum(axis=-1)
            change = (old_log_probs - log_probs).mean() # 近似KL
        return policy_loss, change
    
    def _update_value_function(self, states, returns, old_values):
        for _ in range(self.value_train_iters):
            values = self.V_net(states).squeeze(-1)
            value_loss = F.mse_loss(values, returns)
            
            # Value Clip
            v_clip = old_values + torch.clamp(values - old_values, -self.v_clip_epsilon, self.v_clip_epsilon)
            v_clip_loss = F.mse_loss(v_clip, returns)
            
            # L2 reg
            l2_reg = torch.tensor(0., device=self.device)
            for param in self.V_net.parameters():
                l2_reg += torch.norm(param)**2
            
            loss = torch.max(value_loss, v_clip_loss) + self.value_l2_reg * l2_reg

            self.V_optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.V_net.parameters(), max_norm=0.5)
            self.V_optim.step()
        return loss
    
    def update_net(self, states, actions, log_probs, rewards, next_states, dones):
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device).view(-1, self.Config.act_dim)
        log_probs = torch.as_tensor(log_probs, dtype=torch.float32, device=self.device).view(-1).detach()
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # ★追加: バッチ全体に正規化を適用
        states_norm = self.obs_norm(states)
        next_states_norm = self.obs_norm(next_states)

        with torch.no_grad():
            # 正規化された state を使う
            values = self.V_net(states_norm).squeeze(-1)
            next_values = self.V_net(next_states_norm).squeeze(-1)
            
            advantages, returns = self._compute_gae(rewards, values, next_values, dones)
            # Advantage Normalization (学習速度向上に必須)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Update (正規化データを使用)
        policy_loss, kl_change = self._ppo_step(states_norm, actions, log_probs, advantages)

        # Adaptive LR
        if kl_change > 1.5 * self.target_kl:
            for param_group in self.P_optim.param_groups:
                # 下限を少し緩く (1e-6 -> 1e-5)
                param_group['lr'] = max(param_group['lr'] / 1.5, 1e-5)
                # logging.info(f"Decreased LR to {param_group['lr']}")
        elif kl_change < self.target_kl / 1.5:
            for param_group in self.P_optim.param_groups:
                param_group['lr'] = min(param_group['lr'] * 1.5, 1e-2)
                # logging.info(f"Increased LR to {param_group['lr']}")

        value_loss = self._update_value_function(states_norm, returns, values)

        return {"policy_loss": policy_loss.item(), "value_loss": value_loss.item()}
    
    # ... (to, save_all, load_all はそのまま) ...
    def to(self, device):
        self.device = torch.device(device)
        self.V_net.to(self.device)
        self.P_net.to(self.device)
        self.log_std.data = self.log_std.data.to(self.device)
        self.u_low = self.u_low.to(self.device)
        self.u_high = self.u_high.to(self.device)
        self.obs_norm.to(self.device) # 忘れずに
        return self
    
    def mode2eval(self):
        self.V_net.eval()
        self.P_net.eval()

    def mode2train(self):
        self.V_net.train()
        self.P_net.train()

    def save_all(self, path: str, extra: dict | None = None):
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config
        save_dict = {
            "config": cfg,
            "V_net_state_dict": self.V_net.state_dict(),
            "P_net_state_dict": self.P_net.state_dict(),
            "log_std": self.log_std.data,
            # Normの統計情報も保存する
            "obs_norm_state_dict": self.obs_norm.state_dict() 
        }
        if extra is not None:
            save_dict.update(extra)
        torch.save(save_dict, path)
        
    def load_all(self, path: str, map_location=None):
        load_dict = torch.load(path, map_location=map_location)
        self.V_net.load_state_dict(load_dict["V_net_state_dict"])
        self.P_net.load_state_dict(load_dict["P_net_state_dict"])
        self.log_std.data = load_dict["log_std"].to(self.device)
        # Normのロード
        if "obs_norm_state_dict" in load_dict:
            self.obs_norm.load_state_dict(load_dict["obs_norm_state_dict"])


# 評価用関数（元のコードと同じものでOKですが、念のため再掲）
def evaluate(env, agent, n_episodes=3):
    """Run ``n_episodes`` deterministic episodes and return the mean return.

    Each action comes from ``agent.step`` and is clipped to
    ``[agent.u_low, agent.u_high]`` before being sent to ``env``. An episode
    ends when the environment reports termination or truncation.

    Args:
        env: Environment exposing ``reset()`` and a five-value ``step()``.
        agent: Agent exposing ``step``, ``u_low`` and ``u_high``.
        n_episodes: Number of episodes to average over.

    Returns:
        ``np.mean`` of the undiscounted episode returns.
    """
    lower = agent.u_low.detach().cpu().numpy()
    upper = agent.u_high.detach().cpu().numpy()

    def run_episode():
        # Play a single episode to completion and return its total reward.
        observation, _ = env.reset()
        total = 0.0
        while True:
            clipped = np.clip(agent.step(observation), lower, upper)
            observation, reward, terminated, truncated, _ = env.step(clipped)
            total += reward
            if terminated or truncated:
                return total

    return np.mean([run_episode() for _ in range(n_episodes)])

In [7]:
def train_ppo(
    env,
    agent,
    total_step: int = 200_000,
    batch_steps: int = 2048,
    random_steps: int = 0,
    bootstrap_on_timeout: bool = False,
    log_interval_updates: int = 1,
    eval_env=None,
):
    """Collect rollouts from ``env`` and update ``agent`` every ``batch_steps`` steps.

    Args:
        env: Training environment (Gymnasium-style ``reset``/``step``).
        agent: Agent exposing ``get_action_and_log_prob``, ``update_net``,
            ``obs_norm``, ``_policy_dist``, ``u_low``, ``u_high``, ``device``
            and ``Config.reward_scaling``.
        total_step: Total number of environment steps to collect.
        batch_steps: Rollout length that triggers an update.
        random_steps: Number of initial steps that use uniformly sampled
            actions from ``env.action_space`` instead of the policy.
        bootstrap_on_timeout: When True, only ``terminated`` cuts the value
            bootstrap; when False, ``truncated`` cuts it as well.
        log_interval_updates: Evaluate and log every this many updates.
        eval_env: Optional separate environment used for evaluation. When
            omitted, evaluation runs on ``env`` and the interrupted training
            episode is abandoned (``env`` is reset afterwards).

    Returns:
        ``(loss_history, train_reward_history)``: a dict of per-update losses
        and the list of completed training-episode returns (unscaled).
    """
    print(f"Start PPO Training: Device={agent.device}, Batch={batch_steps}")
    low_np = agent.u_low.detach().cpu().numpy()
    high_np = agent.u_high.detach().cpu().numpy()

    loss_history = {"policy_loss": [], "value_loss": []}
    train_reward_history = []

    rollout = {"obs": [], "act": [], "logp": [], "rew": [], "obs_next": [], "done": []}

    obs, info = env.reset()
    ep_return = 0.0
    update_num = 0

    for t in range(total_step):
        # --- (1) action selection ---
        if t < random_steps:
            action_raw = np.atleast_1d(env.action_space.sample()).astype(np.float32)
            with torch.no_grad():
                s = torch.as_tensor(obs, dtype=torch.float32, device=agent.device)
                if s.dim() == 1:
                    s = s.unsqueeze(0)
                # Keep feeding the observation normalizer during warm-up.
                agent.obs_norm.update(s)
                dist = agent._policy_dist(agent.obs_norm(s))
                # Store the log-prob of the action that is actually executed
                # and saved, not of a separately sampled policy action.
                a_rand = torch.as_tensor(action_raw, dtype=torch.float32, device=agent.device).unsqueeze(0)
                logp_val = dist.log_prob(a_rand).sum(dim=-1).item()
        else:
            with torch.no_grad():
                # update_rms=True so the normalizer learns from visited states.
                a_t, logp_t = agent.get_action_and_log_prob(obs, deterministic=False, update_rms=True)

            action_raw = np.atleast_1d(a_t.detach().cpu().numpy()).astype(np.float32)
            logp_val = logp_t.item()

        # --- (2) env step (the clipped action goes to the env, the raw one to the buffer) ---
        action_env = np.clip(action_raw, low_np, high_np)
        obs_next, reward, terminated, truncated, info = env.step(action_env)

        ep_return += float(reward)

        if bootstrap_on_timeout:
            done_for_gae = float(terminated)
        else:
            done_for_gae = float(terminated or truncated)
        done_for_reset = (terminated or truncated)

        # Reward scaling applies to the training signal only.
        scaled_reward = float(reward) * agent.Config.reward_scaling

        # --- (3) store the transition ---
        rollout["obs"].append(np.asarray(obs, dtype=np.float32))
        rollout["act"].append(np.asarray(action_raw, dtype=np.float32))
        rollout["logp"].append(logp_val)
        rollout["rew"].append(float(scaled_reward))
        rollout["obs_next"].append(np.asarray(obs_next, dtype=np.float32))
        rollout["done"].append(float(done_for_gae))

        # --- (4) episode bookkeeping ---
        if done_for_reset:
            train_reward_history.append(ep_return)
            ep_return = 0.0
            obs, info = env.reset()
        else:
            obs = obs_next

        # --- (5) update ---
        if len(rollout["obs"]) >= batch_steps:
            update_num += 1
            states      = np.stack(rollout["obs"], axis=0)
            actions     = np.stack(rollout["act"], axis=0)
            log_probs   = np.array(rollout["logp"], dtype=np.float32)
            rewards     = np.array(rollout["rew"], dtype=np.float32)
            states_next = np.stack(rollout["obs_next"], axis=0)
            dones       = np.array(rollout["done"], dtype=np.float32)

            loss_dict = agent.update_net(states, actions, log_probs, rewards, states_next, dones)
            for buffer in rollout.values():
                buffer.clear()

            loss_history["policy_loss"].append(loss_dict["policy_loss"])
            loss_history["value_loss"].append(loss_dict["value_loss"])

            if (update_num % log_interval_updates) == 0:
                if eval_env is not None:
                    eval_score = evaluate(eval_env, agent, n_episodes=3)
                else:
                    eval_score = evaluate(env, agent, n_episodes=3)
                    # Evaluation reset and stepped the training env, so the
                    # held `obs` no longer matches its state. Start a fresh
                    # training episode instead of stepping a finished one.
                    obs, info = env.reset()
                    ep_return = 0.0
                logging.info(
                    f"Update {update_num:4d} | Step {t:6d} | "
                    f"Eval: {eval_score:8.2f} | "
                    f"P_Loss: {loss_dict['policy_loss']:.4f} | "
                    f"V_Loss: {loss_dict['value_loss']:.4f}"
                )

    return loss_history, train_reward_history

In [8]:
# Build the agent from the default Config and train it on the environment
# created earlier. lh receives the per-update loss history and rh the list of
# completed training-episode returns.
agent = PPOAgent(Config=Config(),device=device)
total_step= 1000000

lh, rh = train_ppo(
    env=env,
    agent=agent,
    total_step=total_step,
    batch_steps=2048,
)

Start PPO Training: Device=cuda, Batch=2048


Consider using tensor.detach() first. (Triggered internally at /pytorch/aten/src/ATen/native/Scalar.cpp:22.)
  return {"policy_loss": policy_loss.item(), "value_loss": value_loss.item()}


22:41:12 [INFO] Update    1 | Step   2047 | Eval: -1135.39 | P_Loss: -0.0015 | V_Loss: 0.7993
22:41:21 [INFO] Update    2 | Step   4095 | Eval: -1112.23 | P_Loss: -0.0006 | V_Loss: 0.8726
22:41:27 [INFO] Update    3 | Step   6143 | Eval: -1102.30 | P_Loss: -0.0035 | V_Loss: 0.6703
22:41:33 [INFO] Update    4 | Step   8191 | Eval: -1285.84 | P_Loss: -0.0008 | V_Loss: 0.7049
22:41:40 [INFO] Update    5 | Step  10239 | Eval:  -893.76 | P_Loss: -0.0046 | V_Loss: 0.5726
22:41:48 [INFO] Update    6 | Step  12287 | Eval: -1800.23 | P_Loss: -0.0022 | V_Loss: 0.5199
22:41:55 [INFO] Update    7 | Step  14335 | Eval: -1239.03 | P_Loss: -0.0021 | V_Loss: 0.4523
22:42:01 [INFO] Update    8 | Step  16383 | Eval: -1617.89 | P_Loss: -0.0029 | V_Loss: 0.5012
22:42:07 [INFO] Update    9 | Step  18431 | Eval:  -915.50 | P_Loss: -0.0032 | V_Loss: 0.5521
22:42:15 [INFO] Update   10 | Step  20479 | Eval: -1217.77 | P_Loss: -0.0040 | V_Loss: 0.5993
22:42:22 [INFO] Update   11 | Step  22527 | Eval: -1191.29 |

KeyboardInterrupt: 

In [None]:
# Close the environment created in the earlier cell.
env.close()

In [None]:
from pathlib import Path
from datetime import datetime

def make_unique_path(path: str | Path) -> Path:
    """
    path が既に存在する場合、末尾に _1, _2, ... を付けて未使用のパスを返す。
    例: ddpg_final_20251221_235959.pth -> ddpg_final_20251221_235959_1.pth -> ...
    """
    p = Path(path)

    # 存在しないならそのまま使う
    if not p.exists():
        return p

    parent = p.parent
    stem = p.stem      # 拡張子抜きファイル名
    suffix = p.suffix  # ".pth"

    i = 1
    while True:
        cand = parent / f"{stem}_{i}{suffix}"
        if not cand.exists():
            return cand
        i += 1


# Switch to eval mode for later inference (saving would also work in train mode).
agent.mode2eval()

# Timestamp used to build the checkpoint file name.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)

# Avoid overwriting an existing checkpoint that has the same timestamp.
base_path = models_dir / f"ppo_final_{stamp}.pth"
save_path = make_unique_path(base_path)

agent.save_all(
    save_path.as_posix(),
    extra={
        "total_step": int(total_step),
        "reward_history": rh,  # stored as-is; keep if needed
    }
)

print(f"saved to {save_path}")

saved to models/ppo_final_20260201_222738.pth
