In [None]:
import copy
import glob
import logging
import os
import sys
import time
from dataclasses import dataclass, asdict, field, is_dataclass
from types import SimpleNamespace

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gymnasium as gym

from myActivator import tanhAndScale

@dataclass
class Config:
    """Hyperparameters and environment limits used to build the DDPG agent.

    Every attribute is type-annotated so that ``@dataclass`` actually registers
    it as a field. Without annotations the decorator sees no fields at all:
    ``asdict()`` returns an empty dict and keyword overrides such as
    ``Config(gamma=0.9)`` are rejected. List-valued defaults are created per
    instance through ``default_factory`` so instances never share one list.
    """

    # Neural-network settings
    Q_net_sizes: list = field(default_factory=lambda: [6, 12, 6])  # hidden widths, Q network
    P_net_sizes: list = field(default_factory=lambda: [6, 12, 6])  # hidden widths, policy network
    Q_net_in: int = 4
    P_net_in: int = 3
    Q_net_out: int = 1
    P_net_out: int = 1

    # Environment constraints (upper / lower action limits)
    u_ulim: float = 2.0
    u_llim: float = -2.0

    # Learning-related parameters
    Q_lr: float = 1e-2
    P_lr: float = 1e-2
    gamma: float = 0.95  # discount factor
    sig: float = 1.0     # standard deviation of the exploration noise
    tau: float = 5e-3    # step size of the target-network update


# -----------------------------
# 2) DDPG Agent（推論に必要な部分 + load_all）
# -----------------------------
class DDPGAgent:
    """Inference-side DDPG agent: builds the networks, loads a checkpoint, acts.

    Only the pieces needed for evaluation are present: network construction,
    a noise-free ``step`` and ``load_all``. Target networks are still created
    so that a checkpoint containing their weights can be loaded as saved.
    """

    def __init__(self, Config, device=None):
        """Build the online and target networks on the chosen device.

        Args:
            Config: Object exposing ``Q_net_in``, ``Q_net_sizes``, ``Q_net_out``,
                ``P_net_in``, ``P_net_sizes``, ``P_net_out``, ``u_llim`` and
                ``u_ulim`` as attributes.
            device: Anything accepted by ``torch.device``. When ``None``, CUDA
                is used if available, otherwise the CPU.

        Raises:
            ValueError: If ``Config`` is ``None``.
        """
        if Config is None:
            raise ValueError("No Config!!")
        self.Config = Config

        # Resolve the device.
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # Action bounds, kept as tensors on the same device as the networks.
        self.u_low = torch.as_tensor(self.Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.as_tensor(self.Config.u_ulim, dtype=torch.float32, device=self.device)

        # Online networks.
        self.Q_net = self.build_net(
            self.Config.Q_net_in,
            self.Config.Q_net_sizes,
            self.Config.Q_net_out,
        ).to(self.device)

        # NOTE(review): tanhAndScale is a project helper; presumably it squashes
        # the output into [a_low, a_high] -- confirm in myActivator.
        self.P_net = self.build_net(
            self.Config.P_net_in,
            self.Config.P_net_sizes,
            self.Config.P_net_out,
            output_activator=tanhAndScale(a_high=self.u_high, a_low=self.u_low),
        ).to(self.device)

        # Target networks: created only so checkpoint loading stays consistent.
        # deepcopy keeps the copies on the device of the originals.
        self.Q_target_net = copy.deepcopy(self.Q_net)
        self.P_target_net = copy.deepcopy(self.P_net)

        self.mode2eval()

    def build_net(self, input_size, hidden_sizes, output_size=1, output_activator=None):
        """Create a fully connected network with ReLU between linear layers.

        Args:
            input_size: Width of the input layer.
            hidden_sizes: Sequence of hidden-layer widths (list, tuple, ...);
                may be empty, giving a single linear layer.
            output_size: Width of the output layer.
            output_activator: Optional module appended after the last linear
                layer.

        Returns:
            An ``nn.Sequential`` holding the layers in order.
        """
        # Copy into a list so tuples and other sequences work too; plain
        # ``[input_size] + hidden_sizes`` raises TypeError for a tuple.
        widths = [input_size, *list(hidden_sizes), output_size]

        layers = []
        for in_sz, out_sz in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_sz, out_sz))
            layers.append(nn.ReLU())
        layers.pop()  # no ReLU after the final linear layer
        if output_activator is not None:
            layers.append(output_activator)
        return nn.Sequential(*layers)

    def mode2eval(self):
        """Put all four networks into evaluation mode."""
        self.P_net.eval()
        self.Q_net.eval()
        self.P_target_net.eval()
        self.Q_target_net.eval()

    @torch.no_grad()
    def step(self, observation) -> np.ndarray:
        """Return the noise-free (evaluation) action for ``observation``.

        Args:
            observation: A single observation (1-D) or a batch of observations
                (2-D, batch first); anything ``torch.as_tensor`` accepts.

        Returns:
            The action as a NumPy array, clamped to ``[u_low, u_high]``. A 1-D
            observation yields an array without a batch axis; a batched
            observation keeps its batch axis, including a batch of one.
        """
        obs_t = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
        added_batch_axis = obs_t.dim() == 1
        if added_batch_axis:
            obs_t = obs_t.unsqueeze(0)  # (obs_dim,) -> (1, obs_dim)

        action = self.P_net(obs_t)
        action = torch.clamp(action, self.u_low, self.u_high)

        # Only undo the batch axis added above. Squeezing unconditionally would
        # also strip the batch axis from a caller-supplied batch of size one.
        if added_batch_axis:
            action = action.squeeze(0)
        return action.cpu().numpy()

    def load_all(self, path: str, map_location=None):
        """Load the four network state dicts from a checkpoint file.

        The checkpoint is expected to hold the keys ``"P_net"``, ``"Q_net"``,
        ``"P_target_net"`` and ``"Q_target_net"`` (as written by ``save_all()``,
        which is not part of this file).

        SECURITY: the file is loaded with ``weights_only=False`` to side-step
        the ``weights_only=True`` default of PyTorch 2.6+. That unpickles
        arbitrary objects, so only load checkpoints from a trusted source.

        Args:
            path: Checkpoint file path.
            map_location: Passed to ``torch.load``. When ``None``, the agent's
                own device is used so that a checkpoint saved on a GPU can still
                be loaded on a CPU-only machine.

        Returns:
            The value stored under ``"extra"`` in the checkpoint, or ``None``
            when that key is absent.

        Raises:
            KeyError: If one of the four network keys is missing.
        """
        if map_location is None:
            map_location = self.device

        try:
            # PyTorch 2.6+: weights_only must be disabled explicitly.
            ckpt = torch.load(path, map_location=map_location, weights_only=False)
        except TypeError:
            # Older PyTorch without the weights_only argument.
            ckpt = torch.load(path, map_location=map_location)

        self.P_net.load_state_dict(ckpt["P_net"])
        self.Q_net.load_state_dict(ckpt["Q_net"])
        self.P_target_net.load_state_dict(ckpt["P_target_net"])
        self.Q_target_net.load_state_dict(ckpt["Q_target_net"])

        return ckpt.get("extra", None)


# -----------------------------
# 3) 非ブロッキング key 入力（q で終了）
#   - Windows: msvcrt
#   - Unix端末: termios + select
#   - Jupyter等: 無効（Ctrl+C で停止）
# -----------------------------
def make_nonblocking_key_reader():
    """Create a non-blocking single-key reader for the current platform.

    Returns:
        A pair ``(read_key, restore_fn)``. ``read_key()`` returns one pending
        character, or ``None`` when nothing is waiting. ``restore_fn`` is a
        callable that puts the terminal back into its previous mode, or
        ``None`` when no terminal mode was changed.

    Three cases are handled:
        * Windows (``os.name == "nt"``): polls with ``msvcrt``.
        * stdin is a TTY: switches the terminal to cbreak mode and polls stdin
          with ``select``; ``restore_fn`` reinstates the saved attributes.
        * Anything else (e.g. a notebook): ``read_key`` always returns ``None``.
    """
    # Windows console
    if os.name == "nt":
        import msvcrt

        def poll_console():
            return msvcrt.getwch() if msvcrt.kbhit() else None

        return poll_console, None

    # No interactive terminal: key input is disabled.
    if not sys.stdin.isatty():
        def poll_nothing():
            return None

        return poll_nothing, None

    # Unix-like terminal
    import select
    import termios
    import tty

    stdin_fd = sys.stdin.fileno()
    saved_attrs = termios.tcgetattr(stdin_fd)  # remembered for the restore hook
    tty.setcbreak(stdin_fd)

    def restore_terminal():
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_attrs)

    def poll_terminal():
        # A zero timeout makes select() return immediately.
        readable, _, _ = select.select([sys.stdin], [], [], 0.0)
        if not readable:
            return None
        return sys.stdin.read(1)

    return poll_terminal, restore_terminal


# -----------------------------
# 4) モデルファイル選択（最新を自動で拾う）
# -----------------------------
# Gather every final checkpoint and keep the one that sorts last by name.
# NOTE(review): this equals "newest" only if the file names embed a sortable
# timestamp, which the ddpg_final_* naming appears to do -- confirm.
MODEL_GLOB = "./models/ddpg_final_*.pth"
candidates = glob.glob(MODEL_GLOB)
candidates.sort()
if not candidates:
    raise FileNotFoundError(f"No checkpoint found: {MODEL_GLOB}")
*_, MODEL_PATH = candidates
print("Using checkpoint:", MODEL_PATH)


# -----------------------------
# 5) checkpoint から Config を復元して agent を作る
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the checkpoint up front to recover the config stored with the weights.
# SECURITY: weights_only=False unpickles arbitrary objects; only use
# checkpoints from a trusted source.
try:
    ckpt_for_cfg = torch.load(MODEL_PATH, map_location=device, weights_only=False)
except TypeError:
    # Older PyTorch without the weights_only argument (same fallback as load_all).
    ckpt_for_cfg = torch.load(MODEL_PATH, map_location=device)
cfg_dict = ckpt_for_cfg["config"]  # config saved by save_all()

# dict -> attribute access (cfg.Q_net_sizes, cfg.u_ulim, ...)
cfg = SimpleNamespace(**cfg_dict)

# Build the agent from the restored settings rather than bare defaults:
# start from Config() so that any key missing from the checkpoint keeps its
# default, then overlay every value that was saved. Using the defaults alone
# would mismatch the stored weights whenever training used other settings.
agent_config = Config()
for key, value in cfg_dict.items():
    setattr(agent_config, key, value)

agent = DDPGAgent(Config=agent_config, device=device)
agent.load_all(MODEL_PATH, map_location=device)

print("cuda available:", torch.cuda.is_available())
print("agent device:", agent.device)
print("P_net device:", next(agent.P_net.parameters()).device)


# -----------------------------
# 6) 環境を作って推論ループ（描画あり）
# -----------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    stream=sys.stdout,
    datefmt="%H:%M:%S",
)

env = gym.make("Pendulum-v1", render_mode="human")

read_key, restore_key_state = make_nonblocking_key_reader()

try:
    episode = 0
    # Run episodes until the user quits ('q' keypress or Ctrl+C).
    while True:
        obs, info = env.reset()
        terminated = False
        truncated = False
        ep_return = 0.0
        ep_steps = 0
        episode += 1

        while True:
            # 'q' stops the run (key polling may only work in a real terminal).
            ch = read_key()
            if ch is not None and str(ch).lower() == "q":
                print("\nQuit requested by keypress.")
                # Reuse the Ctrl+C path so both ways of quitting share cleanup.
                raise KeyboardInterrupt

            # Noise-free inference.
            action = agent.step(obs)

            # One environment step. With render_mode="human" the environment
            # draws the frame inside step(), so no explicit env.render() call
            # is made here; calling it again would render every frame twice.
            obs, reward, terminated, truncated, info = env.step(action)

            ep_return += float(reward)
            ep_steps += 1

            if terminated or truncated:
                print(f"Episode {episode:4d} | return = {ep_return: .3f} | steps = {ep_steps}")
                break

            # Optional: add time.sleep(1.0 / 60.0) here to slow playback down.

except KeyboardInterrupt:
    pass

finally:
    if restore_key_state is not None:
        try:
            restore_key_state()
        except Exception:
            # Best effort: report the failure, but still close the environment.
            logging.warning("Failed to restore terminal settings", exc_info=True)
    env.close()
    print("Evaluation finished.")


Using checkpoint: ./models/ddpg_final_20251221_232411.pth
cuda available: True
agent device: cuda
P_net device: cuda:0
Episode    1 | return = -130.414 | steps = 200
Episode    2 | return = -298.774 | steps = 200
Episode    3 | return = -117.570 | steps = 200
Episode    4 | return = -115.005 | steps = 200
Episode    5 | return = -131.969 | steps = 200
Episode    6 | return = -265.686 | steps = 200
Episode    7 | return = -273.540 | steps = 200
Episode    8 | return = -2.761 | steps = 200
Episode    9 | return = -379.624 | steps = 200
Episode   10 | return = -266.667 | steps = 200
Episode   11 | return = -265.828 | steps = 200
Episode   12 | return = -269.782 | steps = 200
Episode   13 | return = -133.725 | steps = 200
Episode   14 | return = -273.527 | steps = 200
Episode   15 | return = -132.737 | steps = 200
Episode   16 | return = -391.015 | steps = 200
Episode   17 | return = -4.551 | steps = 200
Episode   18 | return = -130.086 | steps = 200
Episode   19 | return = -2.303 | steps 