In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from dataclasses import dataclass, field, asdict, is_dataclass

@dataclass
class Config:
    """Hyperparameters and environment specs for the TRPO agent.

    The defaults target Pendulum-v1. List-valued fields use
    ``default_factory`` so every instance owns its own list.
    """

    # ----- Environment specs (Pendulum-v1) -----
    obs_dim: int = 3  # observation vector length
    act_dim: int = 1  # action vector length
    u_llim: list[float] = field(default_factory=lambda: [-2.0])  # lower action bound
    u_ulim: list[float] = field(default_factory=lambda: [ 2.0])  # upper action bound

    # ----- Network architecture -----
    V_net_in: int = 3  # value-network input width
    P_net_in: int = 3  # policy-network input width

    # Change 1: hidden layers shrunk for Pendulum (256 -> 64) so that learning
    # ramps up sooner and the setup is easier to verify.
    V_net_sizes: list[int] = field(default_factory=lambda: [64, 64])
    P_net_sizes: list[int] = field(default_factory=lambda: [64, 64])

    V_net_out: int = 1  # value-network output width
    P_net_out: int = 1  # policy-network output width (action mean)

    # ----- Optimizer -----
    V_lr: float = 1e-3  # critic learning rate
    P_lr: float = 3e-4  # not used by TRPO

    # ----- GAE / discounting -----
    gamma: float = 0.99  # discount factor
    lam: float = 0.97    # GAE lambda

    # ----- TRPO hyperparameters -----
    max_kl: float = 1e-2
    cg_iters: int = 10
    cg_damping: float = 0.1

    # Backtracking line-search settings.
    ls_max_steps: int = 10
    ls_backtrack: float = 0.8
    ls_accept_ratio: float = 0.1

    # ----- Value-function training -----
    value_train_iters: int = 20
    value_l2_reg: float = 1e-3

    # ----- Training loop -----
    batch_steps: int = 5000
    bootstrap_on_timeout: bool = False


class TRPOAgent:
    """
    v3/v4 検証用修正版
    
    変更点:
    1. step() を deterministic=True (平均値使用) に変更
    2. 念の為 update_net で actions の view(-1, 1) を追加 (事故防止)
    """

    def __init__(self, Config, device=None):
        if Config is None:
            raise ValueError("No Config!!")
        self.Config = Config

        # device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # action bounds
        self.u_low = torch.as_tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.as_tensor(Config.u_ulim, dtype=torch.float32, device=self.device)

        # networks
        self.V_net = self.build_net(Config.V_net_in, Config.V_net_sizes, Config.V_net_out).to(self.device)
        self.P_net = self.build_net(Config.P_net_in, Config.P_net_sizes, Config.P_net_out).to(self.device)

        self.V_net.train()
        self.P_net.train()

        # log_std (State-Independent Parameter)
        action_dim = Config.P_net_out
        self.log_std = nn.Parameter(torch.zeros(action_dim, device=self.device))

        # critic optimizer
        self.V_optim = optim.Adam(self.V_net.parameters(), lr=Config.V_lr)

        # hyperparams
        self.gamma = float(Config.gamma)
        self.tau = float(Config.lam)
        self.max_kl = float(Config.max_kl)
        self.cg_iters = int(Config.cg_iters)
        self.cg_damping = float(Config.cg_damping)

        self.value_train_iters = int(getattr(Config, "value_train_iters", 5))
        self.value_l2_reg = float(getattr(Config, "value_l2_reg", 1e-3))
        self.backtrack_coeff = float(getattr(Config, "ls_backtrack", 0.8))
        self.backtrack_iters = int(getattr(Config, "ls_max_steps", 10))

    def build_net(self, input_size, hidden_sizes, output_size):
        layers = []
        prev = input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.Tanh())
            prev = h
        layers.append(nn.Linear(prev, output_size))
        return nn.Sequential(*layers)

    # ------------------------------------------------------------
    # Policy helpers
    # ------------------------------------------------------------
    def _policy_mean(self, states: torch.Tensor) -> torch.Tensor:
        return self.P_net(states)

    def _policy_dist(self, states: torch.Tensor) -> Normal:
        mean = self._policy_mean(states)
        std = torch.exp(self.log_std)
        return Normal(mean, std)

    def _log_prob(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        dist = self._policy_dist(states)
        return dist.log_prob(actions).sum(dim=-1)

    @torch.no_grad()
    def get_action_and_log_prob(self, state, deterministic=False):
        s = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if s.dim() == 1:
            s = s.unsqueeze(0)

        dist = self._policy_dist(s)

        if deterministic:
            a = dist.mean
            logp = None
        else:
            a = dist.sample()
            logp = dist.log_prob(a).sum(dim=-1)

        a = a.squeeze(0)
        if logp is not None:
            logp = logp.squeeze(0)
        return a, logp

    @torch.no_grad()
    def step(self, state):
        """
        ★変更点2: ここを deterministic=True に変更。
        これで「実力（平均値）」が出力されるようになります。
        """
        a, _ = self.get_action_and_log_prob(state, deterministic=True)
        return a.cpu().numpy()

    # ------------------------------------------------------------
    # Flat params / grads
    # ------------------------------------------------------------
    def _flat_params(self) -> torch.Tensor:
        return torch.cat([p.data.view(-1) for p in list(self.P_net.parameters()) + [self.log_std]])

    def _set_flat_params(self, flat: torch.Tensor):
        idx = 0
        for p in self.P_net.parameters():
            n = p.numel()
            p.data.copy_(flat[idx:idx+n].view_as(p))
            idx += n
        n = self.log_std.numel()
        self.log_std.data.copy_(flat[idx:idx+n].view_as(self.log_std))
        idx += n

    def _flat_grad(self, scalar: torch.Tensor, retain_graph=False, create_graph=False) -> torch.Tensor:
        params = list(self.P_net.parameters()) + [self.log_std]
        grads = torch.autograd.grad(
            scalar, params,
            retain_graph=retain_graph,
            create_graph=create_graph,
            allow_unused=False,
        )
        return torch.cat([g.contiguous().view(-1) for g in grads])

    # ------------------------------------------------------------
    # GAE
    # ------------------------------------------------------------
    @torch.no_grad()
    def _compute_gae(self, rewards, values, next_values, dones):
        T = rewards.shape[0]
        adv = torch.zeros_like(rewards)
        gae = 0.0

        for t in reversed(range(T)):
            if t == T - 1:
                nv = next_values[t]
            else:
                nv = values[t + 1]
            delta = rewards[t] + self.gamma * nv * (1.0 - dones[t]) - values[t]
            gae = delta + self.gamma * self.tau * (1.0 - dones[t]) * gae
            adv[t] = gae

        ret = adv + values
        return adv, ret

    # ------------------------------------------------------------
    # TRPO core
    # ------------------------------------------------------------
    def _conjugate_gradient(self, Avp_fn, b, n_iters=10, residual_tol=1e-10):
        x = torch.zeros_like(b)
        r = b.clone()
        p = b.clone()
        rdotr = torch.dot(r, r)

        for _ in range(n_iters):
            Ap = Avp_fn(p)
            alpha = rdotr / (torch.dot(p, Ap) + 1e-8)
            x = x + alpha * p
            r = r - alpha * Ap
            new_rdotr = torch.dot(r, r)
            if new_rdotr < residual_tol:
                break
            beta = new_rdotr / (rdotr + 1e-12)
            p = r + beta * p
            rdotr = new_rdotr
        return x

    def _fisher_vector_product(self, states: torch.Tensor, v: torch.Tensor):
        dist_new = self._policy_dist(states)
        mean_old = dist_new.mean.detach()
        std_old = dist_new.stddev.detach()
        dist_old = Normal(mean_old, std_old)

        kl = torch.distributions.kl_divergence(dist_old, dist_new).sum(dim=-1).mean()
        kl_grad = self._flat_grad(kl, retain_graph=True, create_graph=True)
        kl_grad_v = torch.dot(kl_grad, v)
        hvp = self._flat_grad(kl_grad_v, retain_graph=True, create_graph=False)

        return hvp + self.cg_damping * v

    def _surrogate_loss(self, states, actions, advantages, old_log_probs):
        new_logp = self._log_prob(states, actions)
        ratio = torch.exp(new_logp - old_log_probs)
        return -(ratio * advantages).mean()

    def _trpo_step(self, states, actions, advantages, old_log_probs):
        loss = self._surrogate_loss(states, actions, advantages, old_log_probs)
        g = self._flat_grad(loss, retain_graph=True, create_graph=False)

        def Fvp(v):
            return self._fisher_vector_product(states, v)

        step_dir = self._conjugate_gradient(Fvp, -g, n_iters=self.cg_iters)

        shs = 0.5 * torch.dot(step_dir, Fvp(step_dir))
        if shs.item() <= 0.0:
            return False

        lm = torch.sqrt(shs / self.max_kl)
        full_step = step_dir / (lm + 1e-8)

        old_params = self._flat_params()
        old_loss = loss.item()

        step_frac = 1.0
        for _ in range(self.backtrack_iters):
            new_params = old_params + step_frac * full_step
            self._set_flat_params(new_params)

            with torch.no_grad():
                new_loss = self._surrogate_loss(states, actions, advantages, old_log_probs).item()

            if new_loss < old_loss:
                return True

            step_frac *= self.backtrack_coeff

        self._set_flat_params(old_params)
        return False

    def _update_value_function(self, states, returns):
        last_loss = None
        for _ in range(self.value_train_iters):
            v_pred = self.V_net(states).squeeze(-1)
            v_loss = (v_pred - returns).pow(2).mean()

            l2 = 0.0
            for p in self.V_net.parameters():
                l2 = l2 + p.pow(2).sum()
            v_loss = v_loss + self.value_l2_reg * l2

            self.V_optim.zero_grad()
            v_loss.backward()
            self.V_optim.step()

            last_loss = v_loss.item()
        return last_loss

    # ------------------------------------------------------------
    # Update API
    # ------------------------------------------------------------
    def update_net(self, states, actions, log_probs, rewards, states_next, dones):
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        states_next = torch.as_tensor(states_next, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device).view(-1)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device).view(-1)

        # ★念の為の保険: Broadcasting事故防止
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device).view(-1, self.Config.act_dim)

        if isinstance(log_probs, (list, tuple)):
            # 【ここを修正】torch.stack ではなく torch.tensor を使う
            # torch.stack(log_probs) -> エラー (中身がfloatだから)
            # torch.tensor(log_probs) -> 正解
            old_log_probs = torch.tensor(log_probs, dtype=torch.float32, device=self.device).view(-1)
        else:
            old_log_probs = torch.as_tensor(log_probs, dtype=torch.float32, device=self.device).view(-1)

        with torch.no_grad():
            values = self.V_net(states).squeeze(-1)
            next_values = self.V_net(states_next).squeeze(-1)

        with torch.no_grad():
            advantages, returns = self._compute_gae(rewards, values, next_values, dones)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        trpo_ok = self._trpo_step(states, actions, advantages, old_log_probs)
        v_loss = self._update_value_function(states, returns)

        return {"V_loss": v_loss, "trpo_ok": trpo_ok}

    # ------------------------------------------------------------
    # Misc
    # ------------------------------------------------------------
    def to(self, device):
        self.device = torch.device(device)
        self.V_net.to(self.device)
        self.P_net.to(self.device)
        self.log_std.data = self.log_std.data.to(self.device)
        return self

    def mode2eval(self):
        self.V_net.eval()
        self.P_net.eval()

    def mode2train(self):
        self.V_net.train()
        self.P_net.train()

    def save_all(self, path: str, extra: dict | None = None):
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config
        save_dict = {
            "Config": cfg,
            "V_net_state_dict": self.V_net.state_dict(),
            "P_net_state_dict": self.P_net.state_dict(),
            "log_std": self.log_std.data,
        }
        if extra is not None:
            save_dict.update(extra)
        torch.save(save_dict, path)
        
    def load_all(self, path: str, map_location=None):
        load_dict = torch.load(path, map_location=map_location)
        self.V_net.load_state_dict(load_dict["V_net_state_dict"])
        self.P_net.load_state_dict(load_dict["P_net_state_dict"])
        self.log_std.data = load_dict["log_std"].to(self.device)

In [6]:
import numpy as np
import torch
import gymnasium as gym
import logging
import sys

# Logging setup: timestamped, readable records written to standard output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

def evaluate(env, agent, n_episodes=3):
    """
    Measure the policy's noise-free performance.

    Runs ``n_episodes`` episodes in ``env`` using ``agent.step`` (which is
    expected to return the deterministic action) and returns the mean
    undiscounted episode score.

    Args:
        env: environment exposing ``reset()`` and a five-tuple ``step()``.
        agent: object exposing ``step(obs)``.
        n_episodes: number of episodes to average over.

    Returns:
        Mean episode score; NaN when no episode was run.
    """
    # Clip to the environment's own action bounds; fall back to Pendulum's
    # [-2, 2] range when the space does not expose low/high.
    space = getattr(env, "action_space", None)
    low = getattr(space, "low", -2.0)
    high = getattr(space, "high", 2.0)

    scores = []
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        score = 0.0
        while not done:
            action = np.clip(agent.step(obs), low, high)
            obs, rew, term, trunc, _ = env.step(action)
            score += rew
            done = term or trunc
        scores.append(score)

    if not scores:
        return float("nan")
    return np.mean(scores)

def train_trpo(
    env_name,
    agent,
    total_steps=100_000,  # 100k steps is reported to be plenty for Pendulum
    log_interval=1        # log every this many updates (0/None disables logging)
):
    """
    Collect on-policy rollouts and update ``agent`` every ``Config.batch_steps`` steps.

    Actions are sampled stochastically during collection; evaluation uses
    ``evaluate`` on a separate environment instance.

    Args:
        env_name: Gymnasium environment id.
        agent: agent exposing ``Config``, ``device``,
            ``get_action_and_log_prob`` and ``update_net``.
        total_steps: number of environment steps to collect.
        log_interval: evaluate and log every this many updates.

    Time-limit handling follows ``Config.bootstrap_on_timeout``: when False
    (the default) a truncated step is stored as done, so no advantage or value
    estimate is carried across the reset; when True only true terminations are
    stored as done.
    """
    from contextlib import closing

    # Separate instances for training and evaluation; both are closed on exit,
    # including when an exception escapes the loop.
    with closing(gym.make(env_name)) as env, closing(gym.make(env_name)) as eval_env:
        batch_steps = agent.Config.batch_steps
        bootstrap_on_timeout = bool(getattr(agent.Config, "bootstrap_on_timeout", False))

        # Use the environment's own action bounds, falling back to Pendulum's.
        act_low = getattr(env.action_space, "low", -2.0)
        act_high = getattr(env.action_space, "high", 2.0)

        logging.info(f"Start Training: Env={env_name}, Batch={batch_steps}, Device={agent.device}")

        # Rollout buffer
        rollout = {"obs": [], "act": [], "logp": [], "rew": [], "obs_next": [], "done": []}

        obs, _ = env.reset()
        episode_reward = 0.0
        updates_completed = 0

        # Raw (noisy) episode scores seen while collecting data
        train_score_history = []

        for t in range(1, total_steps + 1):

            # --- (1) Action selection (data collection) ---
            # Sampling is intentional here: the policy gradient needs
            # stochastic actions together with their log-probabilities.
            action_tensor, logp_tensor = agent.get_action_and_log_prob(obs, deterministic=False)

            action_np = action_tensor.cpu().numpy()
            logp_val = logp_tensor.item()

            # Only the action sent to the environment is clipped.
            action_env = np.clip(action_np, act_low, act_high)

            # --- (2) Environment step ---
            obs_next, reward, terminated, truncated, _ = env.step(action_env)
            done = terminated or truncated
            episode_reward += reward

            # --- (3) Store the transition ---
            rollout["obs"].append(obs)
            rollout["act"].append(action_np)  # unclipped, matching logp_val
            rollout["logp"].append(logp_val)
            rollout["rew"].append(reward)
            rollout["obs_next"].append(obs_next)
            stored_done = terminated or (truncated and not bootstrap_on_timeout)
            rollout["done"].append(float(stored_done))

            # --- (4) Episode bookkeeping ---
            if done:
                train_score_history.append(episode_reward)
                episode_reward = 0.0
                obs, _ = env.reset()
            else:
                obs = obs_next

            # --- (5) TRPO update once a full batch is available ---
            if len(rollout["obs"]) >= batch_steps:
                updates_completed += 1

                states = np.array(rollout["obs"], dtype=np.float32)
                actions = np.array(rollout["act"], dtype=np.float32)
                # Plain list of floats; the agent converts it to a tensor.
                log_probs = rollout["logp"]
                rewards = np.array(rollout["rew"], dtype=np.float32)
                states_next = np.array(rollout["obs_next"], dtype=np.float32)
                dones = np.array(rollout["done"], dtype=np.float32)

                result = agent.update_net(states, actions, log_probs, rewards, states_next, dones)

                for buf in rollout.values():
                    buf.clear()

                # --- (6) Evaluation and logging ---
                if log_interval and updates_completed % log_interval == 0:
                    # Evaluate with the noise-free deterministic policy.
                    eval_score = evaluate(eval_env, agent, n_episodes=3)

                    # Mean of the most recent noisy training scores
                    train_score_avg = np.mean(train_score_history[-10:]) if train_score_history else -9999

                    logging.info(
                        f"Update {updates_completed:3d} | "
                        f"Eval Score: {eval_score:8.2f} (True Skill) | "
                        f"Train Score: {train_score_avg:8.2f} (Noisy) | "
                        f"V_Loss: {result['V_loss']:.4f}"
                    )

                    # The author treats a score above -200 as roughly solved on Pendulum-v1.
                    if eval_score > -200:
                        logging.info(">>> Solved! (Verification Successful)")

# ============================================================
# Entry point
# ============================================================
if __name__ == "__main__":
    # Default configuration (network sizes already reduced for verification).
    cfg = Config()

    # Build the TRPO agent from that configuration.
    agent = TRPOAgent(Config=cfg)

    # Train on Pendulum-v1 for 100k environment steps.
    train_trpo(env_name="Pendulum-v1", agent=agent, total_steps=100_000)

03:10:15 [INFO] Start Training: Env=Pendulum-v1, Batch=5000, Device=cuda


03:10:25 [INFO] Update   1 | Eval Score: -1272.93 (True Skill) | Train Score: -1526.99 (Noisy) | V_Loss: 33155.3477
03:10:36 [INFO] Update   2 | Eval Score: -1308.81 (True Skill) | Train Score: -1255.34 (Noisy) | V_Loss: 26219.9844
03:10:46 [INFO] Update   3 | Eval Score:  -998.92 (True Skill) | Train Score: -1168.16 (Noisy) | V_Loss: 25213.6934
03:10:57 [INFO] Update   4 | Eval Score:  -868.22 (True Skill) | Train Score: -1210.66 (Noisy) | V_Loss: 19499.9453
03:11:07 [INFO] Update   5 | Eval Score: -1047.40 (True Skill) | Train Score: -1190.00 (Noisy) | V_Loss: 21980.7754
03:11:16 [INFO] Update   6 | Eval Score: -1106.48 (True Skill) | Train Score: -1070.02 (Noisy) | V_Loss: 16576.6719
03:11:27 [INFO] Update   7 | Eval Score: -1019.76 (True Skill) | Train Score:  -996.95 (Noisy) | V_Loss: 16764.0293
03:11:37 [INFO] Update   8 | Eval Score: -1120.99 (True Skill) | Train Score: -1083.86 (Noisy) | V_Loss: 18692.4316
03:11:48 [INFO] Update   9 | Eval Score:  -901.69 (True Skill) | Train S