# 自作のTRPOノートブック

In [49]:
import numpy as np
import copy
from dataclasses import dataclass, field, asdict, is_dataclass

import sys
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

import gymnasium as gym

from myActivator import tanhAndScale
from myFunction import make_squashed_gaussian

In [50]:
# Send INFO-and-above log records to stdout, prefixed with an HH:MM:SS
# timestamp and the level name.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    stream=sys.stdout, datefmt="%H:%M:%S")

In [51]:
env = gym.make("Pendulum-v1")

# Log every attribute of the environment spec, then every attribute of the
# unwrapped environment, one "name: value" record per attribute.
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.unwrapped).items():
    logging.info('%s: %s', key, value)

04:10:35 [INFO] id: Pendulum-v1
04:10:35 [INFO] entry_point: gymnasium.envs.classic_control.pendulum:PendulumEnv
04:10:35 [INFO] reward_threshold: None
04:10:35 [INFO] nondeterministic: False
04:10:35 [INFO] max_episode_steps: 200
04:10:35 [INFO] order_enforce: True
04:10:35 [INFO] disable_env_checker: False
04:10:35 [INFO] kwargs: {}
04:10:35 [INFO] additional_wrappers: ()
04:10:35 [INFO] vector_entry_point: None
04:10:35 [INFO] namespace: None
04:10:35 [INFO] name: Pendulum
04:10:35 [INFO] version: 1
04:10:35 [INFO] max_speed: 8
04:10:35 [INFO] max_torque: 2.0
04:10:35 [INFO] dt: 0.05
04:10:35 [INFO] g: 10.0
04:10:35 [INFO] m: 1.0
04:10:35 [INFO] l: 1.0
04:10:35 [INFO] render_mode: None
04:10:35 [INFO] screen_dim: 500
04:10:35 [INFO] screen: None
04:10:35 [INFO] clock: None
04:10:35 [INFO] isopen: True
04:10:35 [INFO] action_space: Box(-2.0, 2.0, (1,), float32)
04:10:35 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
04:10:35 [INFO] spec: EnvSpec(id='Pendulum-

In [52]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [53]:
from dataclasses import dataclass, field

@dataclass
class Config:
    """Hyperparameters and environment dimensions for the Pendulum-v1 TRPO run."""

    # ===== Pendulum-v1 specs =====
    obs_dim: int = 3  # observation vector length
    act_dim: int = 1  # action vector length; update_net reshapes actions to (-1, act_dim)

    # Action bounds; TRPOAgent reads Config.u_llim / Config.u_ulim.
    u_llim: list[float] = field(default_factory=lambda: [-2.0])
    u_ulim: list[float] = field(default_factory=lambda: [ 2.0])

    # ===== Network architecture =====
    V_net_in: int = 3  # value-network input width
    P_net_in: int = 3  # policy-network input width

    # Changed from [256, 256] to [64, 64].
    # Author's note: with too many parameters on a simple system such as
    # Pendulum, TRPO's conjugate-gradient step becomes unstable.
    V_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    P_net_sizes: list[int] = field(default_factory=lambda: [64, 64])

    V_net_out: int = 1
    P_net_out: int = 1  # = act_dim

    # ===== Optimizer =====
    V_lr: float = 1e-3   # Adam learning rate for the value network
    P_lr: float = 3e-4   # unused

    # ===== GAE / discount =====
    gamma: float = 0.99  # discount factor
    lam: float = 0.97    # GAE lambda (stored as `tau` on the agent)

    # ===== TRPO hyperparameters =====
    max_kl: float = 1e-2      # KL budget used to scale the policy step
    cg_iters: int = 10        # conjugate-gradient iterations
    cg_damping: float = 0.1   # damping added to the Fisher-vector product

    ls_max_steps: int = 10    # maximum number of line-search backtracking steps
    ls_backtrack: float = 0.8  # step-size multiplier applied per backtrack
    # NOTE(review): ls_accept_ratio is not referenced anywhere in the visible
    # code — confirm whether it is still needed.
    ls_accept_ratio: float = 0.1

    # ===== Value function training =====
    value_train_iters: int = 80  # full-batch Adam steps per update
    value_l2_reg: float = 1e-3   # coefficient of the L2 penalty on value-net parameters

    # ===== Rollout / training loop side =====
    batch_steps: int = 5000  # transitions per update when train_trpo is not given batch_steps
    # When False, time-limit truncation is treated as a terminal "done" for GAE;
    # when True, only true termination is.
    bootstrap_on_timeout: bool = False

In [54]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from dataclasses import asdict, is_dataclass

class TRPOAgent:
    """
    Baseline（動いてる版）に寄せた TRPO 実装（修正済み完全版）。

    修正点:
    1. step() メソッドをデフォルトで「決定論的（平均値）」に変更し、評価時のスコアを安定化。
    2. update_net() で actions の形状を (Batch, act_dim) に強制し、Broadcasting事故を防止。
    3. update_net() で log_probs がリスト(float)かTensorか判別し、確実に計算グラフから切断。
    """

    def __init__(self, Config, device=None):
        if Config is None:
            raise ValueError("No Config!!")
        self.Config = Config

        # device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # action bounds (for env clip only)
        self.u_low = torch.as_tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.as_tensor(Config.u_ulim, dtype=torch.float32, device=self.device)

        # networks
        self.V_net = self.build_net(Config.V_net_in, Config.V_net_sizes, Config.V_net_out).to(self.device)
        self.P_net = self.build_net(Config.P_net_in, Config.P_net_sizes, Config.P_net_out).to(self.device)

        self.V_net.train()
        self.P_net.train()

        # log_std は状態に依存しないパラメータ
        action_dim = Config.P_net_out
        self.log_std = nn.Parameter(torch.zeros(action_dim, device=self.device))

        # critic optimizer（baselineはAdam）
        self.V_optim = optim.Adam(self.V_net.parameters(), lr=Config.V_lr)

        # hyperparams
        self.gamma = float(Config.gamma)
        self.tau = float(Config.lam)  # baselineの TAU (= GAE lambda)
        self.max_kl = float(Config.max_kl)
        self.cg_iters = int(Config.cg_iters)
        self.cg_damping = float(Config.cg_damping)

        self.value_train_iters = int(getattr(Config, "value_train_iters", 5))
        self.value_l2_reg = float(getattr(Config, "value_l2_reg", 1e-3))

        self.backtrack_coeff = float(getattr(Config, "ls_backtrack", 0.8))
        self.backtrack_iters = int(getattr(Config, "ls_max_steps", 10))

    def build_net(self, input_size, hidden_sizes, output_size):
        layers = []
        prev = input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.Tanh())
            prev = h
        layers.append(nn.Linear(prev, output_size))
        return nn.Sequential(*layers)

    # ------------------------------------------------------------
    # Policy helpers (baseline-style)
    # ------------------------------------------------------------
    def _policy_mean(self, states: torch.Tensor) -> torch.Tensor:
        # (T, obs_dim) -> (T, act_dim)
        return self.P_net(states)

    def _policy_dist(self, states: torch.Tensor) -> Normal:
        mean = self._policy_mean(states)
        std = torch.exp(self.log_std)  # (act_dim,)
        # broadcasting: (T, act_dim) with (act_dim,)
        return Normal(mean, std)

    def _log_prob(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        # returns shape (T,)
        dist = self._policy_dist(states)
        return dist.log_prob(actions).sum(dim=-1)

    @torch.no_grad()
    def get_action_and_log_prob(self, state, deterministic=False):
        """
        deterministic: Trueなら平均値(mean)を返す。Falseならサンプリング。
        """
        s = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if s.dim() == 1:
            s = s.unsqueeze(0)  # (1, obs_dim)

        dist = self._policy_dist(s)  # Normal(mean, std)

        if deterministic:
            a = dist.mean  # (1, act_dim)
            logp = None
        else:
            a = dist.sample()
            logp = dist.log_prob(a).sum(dim=-1)  # (1,)

        # 返す action は “clip前” を返す（baselineと同じ）
        # envに入れる直前で clip してください
        a = a.squeeze(0)
        if logp is not None:
            logp = logp.squeeze(0)
        return a, logp

    @torch.no_grad()
    def step(self, state):
        """
        【重要修正】推論時はデフォルトで「決定論的（平均値）」を使用する。
        これにより、評価時のスコアが安定して高くなる。
        """
        a, _ = self.get_action_and_log_prob(state, deterministic=True)
        return a.cpu().numpy()

    # ------------------------------------------------------------
    # Flat params / grads (baseline-style)
    # ------------------------------------------------------------
    def _flat_params(self) -> torch.Tensor:
        return torch.cat([p.data.view(-1) for p in list(self.P_net.parameters()) + [self.log_std]])

    def _set_flat_params(self, flat: torch.Tensor):
        idx = 0
        # P_net params
        for p in self.P_net.parameters():
            n = p.numel()
            p.data.copy_(flat[idx:idx+n].view_as(p))
            idx += n
        # log_std
        n = self.log_std.numel()
        self.log_std.data.copy_(flat[idx:idx+n].view_as(self.log_std))
        idx += n

    def _flat_grad(self, scalar: torch.Tensor, retain_graph=False, create_graph=False) -> torch.Tensor:
        params = list(self.P_net.parameters()) + [self.log_std]
        grads = torch.autograd.grad(
            scalar, params,
            retain_graph=retain_graph,
            create_graph=create_graph,
            allow_unused=False,
        )
        return torch.cat([g.contiguous().view(-1) for g in grads])

    # ------------------------------------------------------------
    # GAE (baseline-style)
    # ------------------------------------------------------------
    @torch.no_grad()
    def _compute_gae(self, rewards, values, next_values, dones):
        """
        rewards, dones: (T,)
        values, next_values: (T,)
        """
        T = rewards.shape[0]
        adv = torch.zeros_like(rewards)
        gae = 0.0

        for t in reversed(range(T)):
            if t == T - 1:
                nv = next_values[t]
            else:
                nv = values[t + 1]
            delta = rewards[t] + self.gamma * nv * (1.0 - dones[t]) - values[t]
            gae = delta + self.gamma * self.tau * (1.0 - dones[t]) * gae
            adv[t] = gae

        ret = adv + values
        return adv, ret

    # ------------------------------------------------------------
    # TRPO core (baseline-style)
    # ------------------------------------------------------------
    def _conjugate_gradient(self, Avp_fn, b, n_iters=10, residual_tol=1e-10):
        x = torch.zeros_like(b)
        r = b.clone()
        p = b.clone()
        rdotr = torch.dot(r, r)

        for _ in range(n_iters):
            Ap = Avp_fn(p)
            alpha = rdotr / (torch.dot(p, Ap) + 1e-8)
            x = x + alpha * p
            r = r - alpha * Ap
            new_rdotr = torch.dot(r, r)
            if new_rdotr < residual_tol:
                break
            beta = new_rdotr / (rdotr + 1e-12)
            p = r + beta * p
            rdotr = new_rdotr
        return x

    def _fisher_vector_product(self, states: torch.Tensor, v: torch.Tensor):
        dist_new = self._policy_dist(states)

        # detach old
        mean_old = dist_new.mean.detach()
        std_old = dist_new.stddev.detach()
        dist_old = Normal(mean_old, std_old)

        # KL(old||new)
        kl = torch.distributions.kl_divergence(dist_old, dist_new).sum(dim=-1).mean()

        # grad KL
        kl_grad = self._flat_grad(kl, retain_graph=True, create_graph=True)

        # (grad KL)^T v
        kl_grad_v = torch.dot(kl_grad, v)

        # Hessian-vector product
        hvp = self._flat_grad(kl_grad_v, retain_graph=True, create_graph=False)

        return hvp + self.cg_damping * v

    def _surrogate_loss(self, states, actions, advantages, old_log_probs):
        new_logp = self._log_prob(states, actions)
        ratio = torch.exp(new_logp - old_log_probs)
        return -(ratio * advantages).mean()

    def _trpo_step(self, states, actions, advantages, old_log_probs):
        # 1) policy gradient of surrogate loss
        loss = self._surrogate_loss(states, actions, advantages, old_log_probs)
        g = self._flat_grad(loss, retain_graph=True, create_graph=False)

        # 2) CG: solve F x = -g
        def Fvp(v):
            return self._fisher_vector_product(states, v)

        step_dir = self._conjugate_gradient(Fvp, -g, n_iters=self.cg_iters)

        # 3) scale to satisfy KL constraint
        shs = 0.5 * torch.dot(step_dir, Fvp(step_dir))
        if shs.item() <= 0.0:
            return False

        lm = torch.sqrt(shs / self.max_kl)
        full_step = step_dir / (lm + 1e-8)

        # 4) line search
        old_params = self._flat_params()
        old_loss = loss.item()

        step_frac = 1.0
        for _ in range(self.backtrack_iters):
            new_params = old_params + step_frac * full_step
            self._set_flat_params(new_params)

            with torch.no_grad():
                new_loss = self._surrogate_loss(states, actions, advantages, old_log_probs).item()

            if new_loss < old_loss:
                return True

            step_frac *= self.backtrack_coeff

        # fail: revert
        self._set_flat_params(old_params)
        return False

    # ------------------------------------------------------------
    # Value update (baseline-style)
    # ------------------------------------------------------------
    def _update_value_function(self, states, returns):
        last_loss = None
        for _ in range(self.value_train_iters):
            v_pred = self.V_net(states).squeeze(-1)
            v_loss = (v_pred - returns).pow(2).mean()

            # L2 reg
            l2 = 0.0
            for p in self.V_net.parameters():
                l2 = l2 + p.pow(2).sum()
            v_loss = v_loss + self.value_l2_reg * l2

            self.V_optim.zero_grad()
            v_loss.backward()
            self.V_optim.step()

            last_loss = v_loss.item()
        return last_loss

    # ------------------------------------------------------------
    # Public update API (Fixed Version)
    # ------------------------------------------------------------
    def update_net(self, states, actions, log_probs, rewards, states_next, dones):
        """
        学習ループから呼ばれるメイン更新関数。

        Fixes:
        1. actions.view(-1, act_dim) で形状不一致バグを修正。
        2. old_log_probs の detach 漏れを修正。
        """
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        states_next = torch.as_tensor(states_next, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device).view(-1)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device).view(-1)

        # 【修正】Actionの次元を明示的に整形 (Broadcasting事故防止)
        # 入力が (Batch,) だと broadcasting で (Batch, Batch) になり計算が壊れるため、(Batch, 1) 等に強制する
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device).view(-1, self.Config.act_dim)

        # 【修正】old_log_probs を確実に計算グラフから切断 (.detach())
        if isinstance(log_probs, (list, tuple)):
            # floatのリストが来た場合 (推奨) -> Tensor化
            old_log_probs = torch.tensor(log_probs, dtype=torch.float32, device=self.device).view(-1)
        else:
            # Tensorが来た場合 -> detachしてコピー
            old_log_probs = torch.as_tensor(log_probs, dtype=torch.float32, device=self.device).view(-1).detach()

        # 1) values
        with torch.no_grad():
            values = self.V_net(states).squeeze(-1)
            next_values = self.V_net(states_next).squeeze(-1)

        # 2) GAE
        with torch.no_grad():
            advantages, returns = self._compute_gae(rewards, values, next_values, dones)
            # normalize advantage
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # 3) TRPO policy update
        trpo_ok = self._trpo_step(states, actions, advantages, old_log_probs)

        # 4) Value update
        v_loss = self._update_value_function(states, returns)

        return {"V_loss": v_loss, "trpo_ok": trpo_ok}

    # ------------------------------------------------------------
    # misc
    # ------------------------------------------------------------
    def to(self, device):
        self.device = torch.device(device)
        self.V_net.to(self.device)
        self.P_net.to(self.device)
        self.log_std.data = self.log_std.data.to(self.device)
        self.u_low = self.u_low.to(self.device)
        self.u_high = self.u_high.to(self.device)
        return self

    def mode2eval(self):
        self.V_net.eval()
        self.P_net.eval()

    def mode2train(self):
        self.V_net.train()
        self.P_net.train()

    def save_all(self, path: str, extra: dict | None = None):
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config
        save_dict = {
            "Config": cfg,
            "V_net_state_dict": self.V_net.state_dict(),
            "P_net_state_dict": self.P_net.state_dict(),
            "log_std": self.log_std.data,
        }
        if extra is not None:
            save_dict.update(extra)
        torch.save(save_dict, path)
        
    def load_all(self, path: str, map_location=None):
        load_dict = torch.load(path, map_location=map_location)
        self.V_net.load_state_dict(load_dict["V_net_state_dict"])
        self.P_net.load_state_dict(load_dict["P_net_state_dict"])
        self.log_std.data = load_dict["log_std"].to(self.device)

In [55]:
import numpy as np
import torch
import logging

def train_trpo(
    env,
    agent,
    total_step: int = 200_000,
    batch_steps: int | None = None,
    random_steps: int = 0,
    bootstrap_on_timeout: bool | None = None,
    log_interval_updates: int = 1,
):
    if batch_steps is None:
        batch_steps = int(getattr(agent.Config, "batch_steps", 5000))
    if bootstrap_on_timeout is None:
        bootstrap_on_timeout = bool(getattr(agent.Config, "bootstrap_on_timeout", False))

    print(f"Start Training: Device={agent.device}, Batch={batch_steps}")

    low_np = agent.u_low.detach().cpu().numpy()
    high_np = agent.u_high.detach().cpu().numpy()

    V_loss_history = []
    trpo_ok_history = []
    
    # 評価用に学習中のReward推移（ノイズあり）と評価用Reward（ノイズなし）を分ける
    train_reward_history = []

    # rollout buffer
    rollout = {"obs": [], "act": [], "logp": [], "rew": [], "obs_next": [], "done": []}

    def rollout_clear():
        for k in rollout:
            rollout[k].clear()

    obs, info = env.reset()
    ep_return = 0.0
    episode_num = 1
    update_num = 0

    for t in range(total_step):
        # --- (1) 行動選択 ---
        if t < random_steps:
            action_raw = np.atleast_1d(env.action_space.sample()).astype(np.float32)
            logp_val = 0.0
            use_for_update = False
        else:
            use_for_update = True
            with torch.no_grad():
                # 学習用データ取集なので deterministic=False (探索する)
                a_t, logp_t = agent.get_action_and_log_prob(obs, deterministic=False)
            
            action_raw = np.atleast_1d(a_t.detach().cpu().numpy()).astype(np.float32)
            # ★修正: メモリ節約のため item() で float にして保持する
            logp_val = logp_t.item()

        # --- (2) env step ---
        action_env = np.clip(action_raw, low_np, high_np)
        obs_next, reward, terminated, truncated, info = env.step(action_env)

        ep_return += float(reward)

        if bootstrap_on_timeout:
            done_for_gae = float(terminated)
        else:
            done_for_gae = float(terminated or truncated)
        
        done_for_reset = (terminated or truncated)

        # --- (3) 保存 ---
        if use_for_update:
            rollout["obs"].append(np.asarray(obs, dtype=np.float32))
            rollout["act"].append(np.asarray(action_raw, dtype=np.float32))
            rollout["logp"].append(logp_val)  # float
            rollout["rew"].append(float(reward))
            rollout["obs_next"].append(np.asarray(obs_next, dtype=np.float32))
            rollout["done"].append(float(done_for_gae))

        # --- (4) Reset判定 ---
        if done_for_reset:
            train_reward_history.append(ep_return)
            episode_num += 1
            ep_return = 0.0
            obs, info = env.reset()
        else:
            obs = obs_next

        # --- (5) Update ---
        if len(rollout["obs"]) >= batch_steps:
            update_num += 1

            # numpy/list -> tensor conversion
            states      = np.stack(rollout["obs"], axis=0)
            actions     = np.stack(rollout["act"], axis=0)
            log_probs   = rollout["logp"] # list of floats
            rewards     = np.asarray(rollout["rew"], dtype=np.float32)
            states_next = np.stack(rollout["obs_next"], axis=0)
            dones       = np.asarray(rollout["done"], dtype=np.float32)

            # Update
            out = agent.update_net(states, actions, log_probs, rewards, states_next, dones)
            rollout_clear()

            if isinstance(out, dict):
                V_loss_history.append(float(out.get("V_loss", 0.0)))
                trpo_ok_history.append(bool(out.get("trpo_ok", False)))

            # --- ★修正: 評価フェーズ (Evaluation) ---
            # ノイズなしでプレイして真の実力を測る
            if (update_num % log_interval_updates) == 0:
                eval_score = evaluate(env, agent, n_episodes=3)
                vloss = V_loss_history[-1] if V_loss_history else 0.0
                trpo_res = "OK" if trpo_ok_history[-1] else "NG"
                
                logging.info(
                    f"Update {update_num:4d} | Eval Score: {eval_score:8.2f} | V_Loss: {vloss:.4f} | TRPO: {trpo_res}"
                )

    return V_loss_history, trpo_ok_history, train_reward_history

# ★追加: 評価用関数
def evaluate(env, agent, n_episodes=3):
    scores = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        score = 0.0
        while not done:
            # step() は内部で deterministic=True に修正済み
            action = agent.step(obs)
            # Env側でclip
            action = np.clip(action, agent.u_low.cpu().numpy(), agent.u_high.cpu().numpy())
            
            obs, rew, term, trunc, _ = env.step(action)
            score += rew
            done = term or trunc
        scores.append(score)
    return np.mean(scores)

In [56]:
# Build the agent from the default Config on the selected device.
agent = TRPOAgent(Config=Config(),device=device)
# Total number of environment steps to train for.
total_step=2500000

# Returns (V_loss_history, trpo_ok_history, train_reward_history).
# batch_steps=2048 overrides Config.batch_steps for this run.
Vh, ph, rh = train_trpo(
    env=env,
    agent=agent,
    total_step=total_step,
    batch_steps=2048,
)

Start Training: Device=cuda, Batch=2048
04:10:41 [INFO] Update    1 | Eval Score: -1507.51 | V_Loss: 13777.2090 | TRPO: OK
04:10:45 [INFO] Update    2 | Eval Score: -1229.02 | V_Loss: 20503.7539 | TRPO: OK
04:10:50 [INFO] Update    3 | Eval Score:  -995.58 | V_Loss: 15797.1523 | TRPO: OK
04:10:54 [INFO] Update    4 | Eval Score: -1050.86 | V_Loss: 14221.4336 | TRPO: OK
04:10:59 [INFO] Update    5 | Eval Score:  -965.41 | V_Loss: 13971.7393 | TRPO: OK
04:11:03 [INFO] Update    6 | Eval Score:  -979.62 | V_Loss: 18486.4316 | TRPO: OK
04:11:07 [INFO] Update    7 | Eval Score:  -984.24 | V_Loss: 12865.2773 | TRPO: OK
04:11:11 [INFO] Update    8 | Eval Score: -1416.60 | V_Loss: 13844.6689 | TRPO: OK
04:11:16 [INFO] Update    9 | Eval Score:  -990.24 | V_Loss: 16324.8418 | TRPO: OK
04:11:20 [INFO] Update   10 | Eval Score: -1237.56 | V_Loss: 11457.9531 | TRPO: OK
04:11:25 [INFO] Update   11 | Eval Score: -1196.27 | V_Loss: 13341.6514 | TRPO: OK
04:11:29 [INFO] Update   12 | Eval Score: -1286

In [57]:
# Close the training environment now that training has finished.
env.close()

In [58]:
from pathlib import Path
from datetime import datetime

def make_unique_path(path: str | Path) -> Path:
    """
    path が既に存在する場合、末尾に _1, _2, ... を付けて未使用のパスを返す。
    例: ddpg_final_20251221_235959.pth -> ddpg_final_20251221_235959_1.pth -> ...
    """
    p = Path(path)

    # 存在しないならそのまま使う
    if not p.exists():
        return p

    parent = p.parent
    stem = p.stem      # 拡張子抜きファイル名
    suffix = p.suffix  # ".pth"

    i = 1
    while True:
        cand = parent / f"{stem}_{i}{suffix}"
        if not cand.exists():
            return cand
        i += 1


# Switch to eval mode for inference (saving would also work in train mode).
agent.mode2eval()

# Timestamp that goes into the checkpoint file name.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Create the output directory if it does not exist yet.
models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)

# Pick a file name that does not collide with an existing checkpoint.
base_path = models_dir / f"trpo_final_{stamp}.pth"
save_path = make_unique_path(base_path)

agent.save_all(
    save_path.as_posix(),
    extra={
        "total_step": int(total_step),
        "reward_history": rh,  # per-episode training returns from train_trpo, stored as-is
    }
)

print(f"saved to {save_path}")

saved to models/trpo_final_20260129_053609.pth
