# 自作のDDPGノートブック

In [13]:
import copy
from dataclasses import dataclass, asdict, is_dataclass, field
import sys
import logging

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gymnasium as gym

from myActivator import tanhAndScale
from myReplayBuffer import ReplayBuffer

In [14]:
# Configure the root logger: INFO and above, written to stdout (rather than
# the default stderr), with an HH:MM:SS time stamp in front of each record.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    stream=sys.stdout, datefmt="%H:%M:%S")

In [15]:
# Create the Pendulum environment with an on-screen renderer, then dump every
# attribute of its spec and of the unwrapped environment to the log so the
# physical constants and the action/observation spaces are on record.
env = gym.make("Pendulum-v1", render_mode="human")

for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

for key, value in vars(env.unwrapped).items():
    logging.info('%s: %s', key, value)

23:56:51 [INFO] id: Pendulum-v1
23:56:51 [INFO] entry_point: gymnasium.envs.classic_control.pendulum:PendulumEnv
23:56:51 [INFO] reward_threshold: None
23:56:51 [INFO] nondeterministic: False
23:56:51 [INFO] max_episode_steps: 200
23:56:51 [INFO] order_enforce: True
23:56:51 [INFO] disable_env_checker: False
23:56:51 [INFO] kwargs: {'render_mode': 'human'}
23:56:51 [INFO] additional_wrappers: ()
23:56:51 [INFO] vector_entry_point: None
23:56:51 [INFO] namespace: None
23:56:51 [INFO] name: Pendulum
23:56:51 [INFO] version: 1
23:56:51 [INFO] max_speed: 8
23:56:51 [INFO] max_torque: 2.0
23:56:51 [INFO] dt: 0.05
23:56:51 [INFO] g: 10.0
23:56:51 [INFO] m: 1.0
23:56:51 [INFO] l: 1.0
23:56:51 [INFO] render_mode: human
23:56:51 [INFO] screen_dim: 500
23:56:51 [INFO] screen: None
23:56:51 [INFO] clock: None
23:56:51 [INFO] isopen: True
23:56:51 [INFO] action_space: Box(-2.0, 2.0, (1,), float32)
23:56:51 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
23:56:51 [INFO] spec

In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
@dataclass
class Config:
    """Hyper-parameters consumed by the DDPG agent."""

    # ---------------------------------------------------------------
    # Neural-network layout
    # ---------------------------------------------------------------
    # Hidden-layer widths of the critic (Q) network.
    Q_net_sizes: list[int] = field(default_factory=lambda: [6, 12, 6])
    # Hidden-layer widths of the actor (policy) network.
    P_net_sizes: list[int] = field(default_factory=lambda: [6, 12, 6])
    # Input width of the critic; the agent feeds it state and action
    # concatenated along the feature axis.
    Q_net_in: int = 4
    # Input width of the actor (the observation).
    P_net_in: int = 3
    # Output width of the critic.
    Q_net_out: int = 1
    # Output width of the actor (the action).
    P_net_out: int = 1

    # ---------------------------------------------------------------
    # Environment constraints
    # ---------------------------------------------------------------
    # Upper bound of the action.
    u_ulim: float = 2.0
    # Lower bound of the action.
    u_llim: float = -2.0

    # ---------------------------------------------------------------
    # Learning parameters
    # ---------------------------------------------------------------
    # Learning rate of the critic optimizer.
    Q_lr: float = 1e-2
    # Learning rate of the actor optimizer.
    P_lr: float = 1e-2
    # Discount factor.
    gamma: float = 0.95
    # Standard deviation of the exploration noise.
    sig: float = 1.0
    # Target-network update step (Polyak coefficient).
    tau: float = 5e-3

In [18]:
class DDPGAgent:
    def __init__(self,Config,device=None):
        if Config:
            self.Config = Config
        else:
            raise ValueError("No Config!!")
        
        # ---- device 決定（指定がなければ CUDA があれば CUDA）----
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # ---- action bounds（device を揃えるため Tensor 化）----
        self.u_low = torch.as_tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.as_tensor(Config.u_ulim, dtype=torch.float32, device=self.device)
        
        self.Q_net = self.build_net(
            Config.Q_net_in,
            Config.Q_net_sizes,
            Config.Q_net_out,
            ).to(self.device)
        self.Q_net.train()

        self.P_net = self.build_net(
            Config.P_net_in,
            Config.P_net_sizes,
            Config.P_net_out,
            tanhAndScale(a_high=self.u_high,a_low=self.u_low),
            ).to(self.device)
        self.P_net.train()

        # ---- Target nets（重要：deepcopy で別物を作る）----
        self.Q_target_net = copy.deepcopy(self.Q_net).to(self.device)
        self.P_target_net = copy.deepcopy(self.P_net).to(self.device)
        self.Q_target_net.eval()
        self.P_target_net.eval()

        self.Q_optim = optim.Adam(self.Q_net.parameters(),lr=Config.Q_lr)
        self.P_optim = optim.Adam(self.P_net.parameters(),lr=Config.P_lr)

    def to(self, device):
        """エージェント内部のネットと必要Tensorを指定 device に移す。"""
        self.device = torch.device(device)
        self.Q_net.to(self.device)
        self.P_net.to(self.device)
        self.Q_target_net.to(self.device)
        self.P_target_net.to(self.device)
        self.u_low = self.u_low.to(self.device)
        self.u_high = self.u_high.to(self.device)
        return self


    def build_net(self, input_size, hidden_sizes, output_size=1, output_activator=None):
        layers = []
        for input_size, output_size in zip([input_size]+hidden_sizes, hidden_sizes+[output_size]):
            layers.append(nn.Linear(input_size, output_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # 最後のReLUだけ取り除く
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net
    

    @torch.no_grad()
    def step(self, observation) -> np.ndarray:
        """ノイズなし（評価用）。環境に渡す行動を返す。"""
        obs_t = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
        if obs_t.dim() == 1:
            obs_t = obs_t.unsqueeze(0)

        action = self.P_net(obs_t)
        action = torch.clamp(action, self.u_low, self.u_high)
        return action.squeeze(0).cpu().numpy()
    

    @torch.no_grad()
    def step_with_noise(self, observation):
        # 1) observation を Tensor にし、ネットと同じ device に載せる
        obs = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
        if obs.dim() == 1:
            obs = obs.unsqueeze(0)  # (obs_dim,) -> (1, obs_dim)

        # 2) 決定論的行動 a = μθ(s)
        action = self.P_net(obs)  # shape: (1, act_dim) を想定

        # 3) ε ~ N(0, σ^2 I) を生成して加算（探索）
        eps = float(self.Config.sig) * torch.randn_like(action)
        action = action + eps

        # 4) 出力制約 [u_low, u_high] に収める（安全弁）
        action = torch.clamp(action, self.u_low, self.u_high)

        # 5) 環境に渡すならバッチ次元を落として返す（numpy が必要なら .cpu().numpy()）
        return action.squeeze(0).cpu().numpy()
    

    def save_all(self, path: str, extra: dict | None = None):
        """
        Actor/Critic + target nets をまとめて保存（最終モデル用）。
        """
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config

        ckpt = {
            "config": cfg,
            "P_net": self.P_net.state_dict(),
            "Q_net": self.Q_net.state_dict(),
            "P_target_net": self.P_target_net.state_dict(),
            "Q_target_net": self.Q_target_net.state_dict(),
        }
        if extra is not None:
            ckpt["extra"] = extra

        torch.save(ckpt, path)


    def load_all(self, path: str, map_location=None):
        """
        save_all() で保存したチェックポイントをロード。

        PyTorch 2.6 以降:
        torch.load() のデフォルトが weights_only=True になったため、
        config/extra を含むチェックポイントはそのままだと UnpicklingError になり得る。
        その回避として「信頼できるチェックポイントに限り」 weights_only=False を明示する。

        ※ map_location は "cpu" や device を指定可。
        """
        # PyTorch 2.6+ では weights_only 引数がある
        try:
            ckpt = torch.load(path, map_location=map_location, weights_only=False)
        except TypeError:
            # 古い PyTorch（weights_only 引数が無い）向け
            ckpt = torch.load(path, map_location=map_location)

        self.P_net.load_state_dict(ckpt["P_net"])
        self.Q_net.load_state_dict(ckpt["Q_net"])
        self.P_target_net.load_state_dict(ckpt["P_target_net"])
        self.Q_target_net.load_state_dict(ckpt["Q_target_net"])

        return ckpt.get("extra", None)
    

    def mode2eval(self):
        self.P_net.eval()
        self.Q_net.eval()


    def mode2train(self):
        self.P_net.train()
        self.Q_net.train()
    

    @torch.no_grad()
    def soft_update(self, target_net, online_net, tau):
        """
        Polyak averaging:
          θ' ← (1-τ) θ' + τ θ
        """
        for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * online_param)

    
    def update_net(self,states,actions,rewards,states_next,dones=None):
        """
        1回の更新（Critic→Actor→Target soft update）
        戻り値： (q_loss, p_loss) のスカラー
        """
        # ---- minibatch を device 上 Tensor に統一 ----
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        states_next = torch.as_tensor(states_next, dtype=torch.float32, device=self.device)

        if rewards.dim() == 1:
            rewards = rewards.unsqueeze(1)

        if dones is None:
            dones = torch.zeros((states.shape[0], 1), dtype=torch.float32, device=self.device)
        else:
            dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)
            if dones.dim() == 1:
                dones = dones.unsqueeze(1)

        with torch.no_grad():
            actions_next_for_target = self.P_target_net(states_next)
            y_targets = rewards + self.Config.gamma*(1-dones)*self.Q_target_net(torch.cat([states_next, actions_next_for_target], dim=1))
        
        Q_values = self.Q_net(torch.cat([states,actions],dim=1))
        Q_loss = F.mse_loss(y_targets,Q_values)
        self.Q_optim.zero_grad()
        Q_loss.backward()
        self.Q_optim.step()

        # ---- Actor update ----
        # Actor 更新では Q_net を通すが、Q_net 自体は更新しないので凍結（計算の節約＋安全）
        for p in self.Q_net.parameters():
            p.requires_grad_(False)
        
        actions_for_Ploss = self.P_net(states)
        P_loss = -self.Q_net(torch.cat([states,actions_for_Ploss],dim=1)).mean()
        self.P_optim.zero_grad()
        P_loss.backward()
        self.P_optim.step()

        for p in self.Q_net.parameters():
            p.requires_grad_(True)

        self.soft_update(
            target_net=self.Q_target_net,
            online_net=self.Q_net,
            tau=self.Config.tau
            )
        self.soft_update(
            target_net=self.P_target_net,
            online_net=self.P_net,
            tau=self.Config.tau
            )
        
        return float(Q_loss.item()), float(P_loss.item())

In [19]:
def train(
    env,
    agent,
    buffer,
    total_step=40000,
    warmup_steps=1000,
    batch_num=512,
    log_interval=100,
):
    """Run the DDPG training loop for a fixed number of environment steps.

    Until the buffer holds ``warmup_steps`` transitions, actions are sampled
    from ``env.action_space`` and no update is performed. Afterwards the
    agent acts with exploration noise and is updated once per environment
    step from a minibatch of ``batch_num`` transitions.

    Args:
        env: Environment with the Gymnasium ``reset``/``step`` interface.
        agent: Object providing ``step_with_noise`` and ``update_net``.
        buffer: Replay buffer supporting ``len()``, ``add`` and ``sample``;
            ``sample`` must return a mapping with the keys ``"obs"``,
            ``"act"``, ``"rew"``, ``"obs_next"`` and ``"done"``.
        total_step: Number of environment steps to run.
        warmup_steps: Buffer size required before learning starts.
        batch_num: Minibatch size passed to ``buffer.sample``.
        log_interval: The losses are recorded on steps whose index is a
            multiple of this value.

    Returns:
        ``(Q_loss_history, P_loss_history, reward_history)``. The reward
        history holds one total per finished episode; an episode still in
        progress when the loop ends is not included.

    Raises:
        ValueError: If ``log_interval`` is smaller than 1.
    """
    if log_interval < 1:
        raise ValueError("log_interval must be >= 1")

    print("cuda available:", torch.cuda.is_available())
    print("agent device:", agent.device)
    print("P_net device:", next(agent.P_net.parameters()).device)
    print("Q_net device:", next(agent.Q_net.parameters()).device)

    # Histories returned to the caller.
    Q_loss_history = []
    P_loss_history = []
    reward_history = []
    episode_num = 1
    reward_log = 0  # running reward total of the current episode

    # 1) Reset the environment and take the first observation.
    obs, info = env.reset()

    # 2) Step the environment total_step times.
    for t in range(total_step):

        # ---- (A) action selection: random during warm-up, policy + noise after ----
        if len(buffer) < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.step_with_noise(obs)

        # 3) Advance the environment by one step.
        obs_next, reward, terminated, truncated, info = env.step(action)
        reward_log += reward

        # 4) Done flag used for the learning target. Only a true termination
        #    stops bootstrapping; a time-limit truncation keeps bootstrapping,
        #    so `truncated` is deliberately left out of the stored flag.
        done = float(terminated)

        # ---- (B) store the transition ----
        buffer.add(obs, action, reward, obs_next, done)

        # 5) Move on to the next observation. The episode itself ends on
        #    either termination or truncation, hence the `or` here.
        if terminated or truncated:
            logging.info('train episode %d: reward = %.2f',
                         episode_num, reward_log)
            episode_num += 1
            obs, info = env.reset()
            reward_history.append(reward_log)
            reward_log = 0
        else:
            obs = obs_next

        # 6) Skip learning until the buffer holds enough transitions.
        if len(buffer) < warmup_steps:
            continue

        # 7) Sample a minibatch and update the agent.
        minibatch = buffer.sample(batch_num)
        Q_loss, P_loss = agent.update_net(
            minibatch["obs"],
            minibatch["act"],
            minibatch["rew"],
            minibatch["obs_next"],
            minibatch["done"],
        )

        # 8) Record the losses only every log_interval steps.
        if t % log_interval == 0:
            Q_loss_history.append(Q_loss)
            P_loss_history.append(P_loss)

    return Q_loss_history, P_loss_history, reward_history

In [20]:
# Build the agent with the default hyper-parameters on the selected device.
agent = DDPGAgent(Config=Config(),device=device)
# Replay buffer for 3-dimensional observations and 1-dimensional actions.
# NOTE(review): `size` is presumably the buffer capacity -- confirm in myReplayBuffer.
DDPGReplayBuffer = ReplayBuffer(obs_dim=3,act_dim=1,size=2000,device=device)
# Number of environment steps; also stored in the checkpoint later on.
total_step = 40000

# Run training; returns the critic-loss, actor-loss and per-episode reward histories.
Qh, Ph, rh = train(
    env=env,
    agent=agent,
    buffer=DDPGReplayBuffer,
    total_step=total_step,
)

cuda available: True
agent device: cuda
P_net device: cuda:0
Q_net device: cuda:0
23:56:58 [INFO] train episode 1: reward = -1157.44
23:57:05 [INFO] train episode 2: reward = -1277.94
23:57:12 [INFO] train episode 3: reward = -1543.43
23:57:18 [INFO] train episode 4: reward = -1077.82
23:57:25 [INFO] train episode 5: reward = -909.27
23:57:32 [INFO] train episode 6: reward = -1325.22
23:57:38 [INFO] train episode 7: reward = -944.54
23:57:45 [INFO] train episode 8: reward = -1583.53
23:57:52 [INFO] train episode 9: reward = -1614.20
23:57:58 [INFO] train episode 10: reward = -1548.72
23:58:05 [INFO] train episode 11: reward = -1452.78
23:58:12 [INFO] train episode 12: reward = -1240.03
23:58:18 [INFO] train episode 13: reward = -1541.84
23:58:25 [INFO] train episode 14: reward = -1206.86
23:58:32 [INFO] train episode 15: reward = -1348.74
23:58:38 [INFO] train episode 16: reward = -1067.25
23:58:45 [INFO] train episode 17: reward = -781.26
23:58:52 [INFO] train episode 18: reward = -71

In [21]:
# Close the environment now that training has finished.
env.close()

In [22]:
from pathlib import Path
from datetime import datetime

def make_unique_path(path: str | Path) -> Path:
    """
    path が既に存在する場合、末尾に _1, _2, ... を付けて未使用のパスを返す。
    例: ddpg_final_20251221_235959.pth -> ddpg_final_20251221_235959_1.pth -> ...
    """
    p = Path(path)

    # 存在しないならそのまま使う
    if not p.exists():
        return p

    parent = p.parent
    stem = p.stem      # 拡張子抜きファイル名
    suffix = p.suffix  # ".pth"

    i = 1
    while True:
        cand = parent / f"{stem}_{i}{suffix}"
        if not cand.exists():
            return cand
        i += 1


# Switch to eval mode for inference (saving would also work in train mode).
agent.mode2eval()

# Time stamp that makes the checkpoint file name distinct per run.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Make sure the output directory exists.
models_dir = Path("./models")
models_dir.mkdir(parents=True, exist_ok=True)

# Avoid overwriting an existing checkpoint that has the same time stamp.
base_path = models_dir / f"ddpg_final_{stamp}.pth"
save_path = make_unique_path(base_path)

# Save all networks together with the run length and the per-episode rewards.
agent.save_all(
    save_path.as_posix(),
    extra={
        "total_step": int(total_step),
        "reward_history": rh,  # stored as returned by train()
    }
)

print(f"saved to {save_path}")

saved to models/ddpg_final_20251222_001903.pth
