# 自作のDDPGノートブック

In [None]:
import copy
import itertools
import logging
import math
import random
import sys
from collections import deque, namedtuple
from dataclasses import asdict, dataclass, field, is_dataclass

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.distributions import Categorical, Normal

from myActivator import tanhAndScale

In [None]:
# Route INFO-and-above log records to stdout (so they appear in the cell
# output) with a short "HH:MM:SS [LEVEL] message" layout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
)

In [None]:
# Create the Pendulum environment with on-screen rendering, then log every
# attribute of its spec followed by every attribute of the unwrapped env.
env = gym.make("Pendulum-v1", render_mode="human")
for inspected in (env.spec, env.unwrapped):
    for key, value in vars(inspected).items():
        logging.info('%s: %s', key, value)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
@dataclass
class Config:
    """Hyperparameters consumed by ``DDPGAgent``.

    Every attribute carries a type annotation on purpose: ``@dataclass`` only
    registers *annotated* names as fields. Without annotations ``asdict()``
    returns an empty dict and keyword overrides such as ``Config(gamma=0.99)``
    are rejected.
    """

    # --- Network architecture ---
    # Hidden-layer widths. Lists need a default_factory so that each Config
    # instance gets its own list instead of sharing one mutable default.
    Q_net_sizes: list[int] = field(default_factory=lambda: [6, 12, 12, 6])
    P_net_sizes: list[int] = field(default_factory=lambda: [6, 12, 6])
    Q_net_in: int = 4   # critic input width (state and action are concatenated)
    P_net_in: int = 3   # actor input width (state)
    Q_net_out: int = 1  # critic output width
    P_net_out: int = 1  # actor output width (action)

    # --- Environment constraints ---
    u_ulim: float = 2.0   # upper action bound
    u_llim: float = -2.0  # lower action bound

    # --- Learning parameters ---
    Q_lr: float = 1e-3   # critic learning rate
    P_lr: float = 1e-3   # actor learning rate
    gamma: float = 0.9   # discount factor
    sig: float = 0.3     # standard deviation of the exploration noise
    tau: float = 5e-3    # soft-update step size for the target networks

In [None]:
class DDPGAgent:
    def __init__(self,Config,device=None):
        if Config:
            self.Config = Config
        else:
            raise ValueError("No Config!!")
        
        # ---- device 決定（指定がなければ CUDA があれば CUDA）----
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        # ---- action bounds（device を揃えるため Tensor 化）----
        self.u_low = torch.as_tensor(Config.u_llim, dtype=torch.float32, device=self.device)
        self.u_high = torch.as_tensor(Config.u_ulim, dtype=torch.float32, device=self.device)
        
        self.Q_net = self.build_net(
            Config.Q_net_in,
            Config.Q_net_sizes,
            Config.Q_net_out,
            ).to(self.device)
        self.Q_net.train()

        self.P_net = self.build_net(
            Config.P_net_in,
            Config.P_net_sizes,
            Config.P_net_out,
            tanhAndScale(a_high=self.u_high,a_low=self.u_low),
            ).to(self.device)
        self.P_net.train()

        # ---- Target nets（重要：deepcopy で別物を作る）----
        self.Q_target_net = copy.deepcopy(self.Q_net).to(self.device)
        self.P_target_net = copy.deepcopy(self.P_net).to(self.device)
        self.Q_target_net.eval()
        self.P_target_net.eval()

        self.Q_optim = optim.Adam(self.Q_net.parameters(),lr=Config.Q_lr)
        self.P_optim = optim.Adam(self.P_net.parameters(),lr=Config.P_lr)

    def to(self, device):
        """エージェント内部のネットと必要Tensorを指定 device に移す。"""
        self.device = torch.device(device)
        self.Q_net.to(self.device)
        self.P_net.to(self.device)
        self.Q_target_net.to(self.device)
        self.P_target_net.to(self.device)
        self.u_low = self.u_low.to(self.device)
        self.u_high = self.u_high.to(self.device)
        return self


    def build_net(self, input_size, hidden_sizes, output_size=1, output_activator=None):
        layers = []
        for input_size, output_size in zip([input_size]+hidden_sizes, hidden_sizes+[output_size]):
            layers.append(nn.Linear(input_size, output_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # 最後のReLUだけ取り除く
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net
    

    @torch.no_grad()
    def step(self, observation) -> np.ndarray:
        """ノイズなし（評価用）。環境に渡す行動を返す。"""
        obs_t = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
        if obs_t.dim() == 1:
            obs_t = obs_t.unsqueeze(0)

        action = self.P_net(obs_t)
        action = torch.clamp(action, self.u_low, self.u_high)
        return action.squeeze(0).cpu().numpy()
    

    @torch.no_grad()
    def step_with_noise(self, observation):
        # 1) observation を Tensor にし、ネットと同じ device に載せる
        obs = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
        if obs.dim() == 1:
            obs = obs.unsqueeze(0)  # (obs_dim,) -> (1, obs_dim)

        # 2) 決定論的行動 a = μθ(s)
        action = self.P_net(obs)  # shape: (1, act_dim) を想定

        # 3) ε ~ N(0, σ^2 I) を生成して加算（探索）
        eps = float(self.Config.sig) * torch.randn_like(action)
        action = action + eps

        # 4) 出力制約 [u_low, u_high] に収める（安全弁）
        action = torch.clamp(action, self.u_low, self.u_high)

        # 5) 環境に渡すならバッチ次元を落として返す（numpy が必要なら .cpu().numpy()）
        return action.squeeze(0).cpu().numpy()
    

    def save_all(self, path: str, extra: dict | None = None):
        """
        Actor/Critic + target nets をまとめて保存（最終モデル用）。
        """
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config

        ckpt = {
            "config": cfg,
            "P_net": self.P_net.state_dict(),
            "Q_net": self.Q_net.state_dict(),
            "P_target_net": self.P_target_net.state_dict(),
            "Q_target_net": self.Q_target_net.state_dict(),
        }
        if extra is not None:
            ckpt["extra"] = extra

        torch.save(ckpt, path)


    def load_all(self, path: str, map_location=None):
        """
        save_all() で保存したチェックポイントをロード。
        device を変えて読みたい場合は map_location="cpu" などを指定。
        """
        ckpt = torch.load(path, map_location=map_location)

        self.P_net.load_state_dict(ckpt["P_net"])
        self.Q_net.load_state_dict(ckpt["Q_net"])
        self.P_target_net.load_state_dict(ckpt["P_target_net"])
        self.Q_target_net.load_state_dict(ckpt["Q_target_net"])

        return ckpt.get("extra", None)
    

    def mode2eval(self):
        self.P_net.eval()
        self.Q_net.eval()


    def mode2train(self):
        self.P_net.train()
        self.Q_net.train()
    

    @torch.no_grad()
    def soft_update(self, target_net, online_net, tau):
        """
        Polyak averaging:
          θ' ← (1-τ) θ' + τ θ
        """
        for target_param, online_param in zip(target_net.parameters(), online_net.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * online_param)

    
    def update_net(self,states,actions,rewards,states_next,dones=None):
        """
        1回の更新（Critic→Actor→Target soft update）
        戻り値： (q_loss, p_loss) のスカラー
        """
        # ---- minibatch を device 上 Tensor に統一 ----
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        states_next = torch.as_tensor(states_next, dtype=torch.float32, device=self.device)

        if rewards.dim() == 1:
            rewards = rewards.unsqueeze(1)

        if dones is None:
            dones = torch.zeros((states.shape[0], 1), dtype=torch.float32, device=self.device)
        else:
            dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)
            if dones.dim() == 1:
                dones = dones.unsqueeze(1)

        with torch.no_grad():
            actions_next_for_target = self.P_target_net(states_next)
            y_targets = rewards + self.Config.gamma*(1-dones)*self.Q_target_net(torch.cat([states_next, actions_next_for_target], dim=1))
        
        Q_values = self.Q_net(torch.cat([states,actions],dim=1))
        Q_loss = F.mse_loss(y_targets,Q_values)
        self.Q_optim.zero_grad()
        Q_loss.backward()
        self.Q_optim.step()

        # ---- Actor update ----
        # Actor 更新では Q_net を通すが、Q_net 自体は更新しないので凍結（計算の節約＋安全）
        for p in self.Q_net.parameters():
            p.requires_grad_(False)
        
        actions_for_Ploss = self.P_net(states)
        P_loss = -self.Q_net(torch.cat([states,actions_for_Ploss],dim=1)).mean()
        self.P_optim.zero_grad()
        P_loss.backward()
        self.P_optim.step()

        for p in self.Q_net.parameters():
            p.requires_grad_(True)

        self.soft_update(
            target_net=self.Q_target_net,
            online_net=self.Q_net,
            tau=self.Config.tau
            )
        self.soft_update(
            target_net=self.P_target_net,
            online_net=self.P_net,
            tau=self.Config.tau
            )
        
        return float(Q_loss.item()), float(P_loss.item())

In [None]:
from myReplayBuffer import ReplayBuffer

In [None]:
def train(
    env,
    agent,
    buffer,
    total_step=40000,
    warmup_steps=1000,
    batch_num=512,
):
    """Interact with ``env`` for ``total_step`` steps and train ``agent``.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` returning
            ``(obs, reward, terminated, truncated, info)``, and
            ``action_space.sample()``.
        agent: object exposing ``device``, ``P_net``, ``Q_net``,
            ``step_with_noise(obs)`` and ``update_net(...)``.
        buffer: replay buffer supporting ``len()``, ``add(...)`` and
            ``sample(n)``; ``sample`` must return a mapping with the keys
            ``"obs"``, ``"act"``, ``"rew"``, ``"obs_next"`` and ``"done"``.
        total_step: number of environment steps to run.
        warmup_steps: while the buffer holds fewer transitions than this,
            actions are sampled at random and no update is performed.
        batch_num: minibatch size requested from the buffer.

    Returns:
        ``(Q_loss_history, P_loss_history, reward_history)``. The loss
        histories hold one entry per 100 steps; ``reward_history`` holds the
        summed reward of each finished episode as built-in floats.
    """
    print("cuda available:", torch.cuda.is_available())
    print("agent device:", agent.device)
    print("P_net device:", next(agent.P_net.parameters()).device)
    print("Q_net device:", next(agent.Q_net.parameters()).device)

    # Logs returned to the caller.
    Q_loss_history = []
    P_loss_history = []
    reward_history = []
    episode_num = 1
    reward_log = 0.0  # running return of the current episode

    # 1) Reset the environment and take the first observation.
    obs, info = env.reset()

    # 2) Run total_step environment steps.
    for t in range(total_step):

        # ---- (A) Action selection: random during warm-up, noisy policy afterwards ----
        if len(buffer) < warmup_steps:
            action = env.action_space.sample()
        else:
            action = agent.step_with_noise(obs)

        # 3) Advance the environment by one step.
        obs_next, reward, terminated, truncated, info = env.step(action)
        # Accumulate as a built-in float so reward_history never holds NumPy
        # scalar objects (it is returned to the caller and may be serialized).
        reward_log += float(reward)

        # ---- (B) Store the transition ----
        # Only `terminated` is stored as the done flag: a time-limit
        # truncation is not marked done, so the TD target keeps bootstrapping
        # from obs_next in that case.
        done = float(terminated)
        buffer.add(obs, action, reward, obs_next, done)

        # 4) The episode itself ends on either signal, so reset on both.
        if terminated or truncated:
            logging.info('train episode %d: reward = %.2f',
                         episode_num, reward_log)
            episode_num += 1
            reward_history.append(reward_log)
            reward_log = 0.0
            obs, info = env.reset()
        else:
            obs = obs_next

        # 5) Skip learning until the buffer has enough transitions.
        if len(buffer) < warmup_steps:
            continue

        # 6) Sample a minibatch and run one agent update.
        minibatch = buffer.sample(batch_num)
        Q_loss, P_loss = agent.update_net(
            minibatch["obs"],
            minibatch["act"],
            minibatch["rew"],
            minibatch["obs_next"],
            minibatch["done"],
        )

        # 7) Keep only every 100th loss so the histories stay small.
        if t % 100 == 0:
            Q_loss_history.append(Q_loss)
            P_loss_history.append(P_loss)

    return Q_loss_history, P_loss_history, reward_history

In [None]:
# Training run: build the agent and a replay buffer sized for 3-dimensional
# observations and 1-dimensional actions, then train for `total_step` steps.
total_step = 60000
agent = DDPGAgent(Config=Config(), device=device)
DDPGReplayBuffer = ReplayBuffer(obs_dim=3, act_dim=1, size=2000, device=device)

# Qh / Ph: thinned critic / actor loss histories, rh: per-episode returns.
Qh, Ph, rh = train(
    env=env, agent=agent, buffer=DDPGReplayBuffer, total_step=total_step
)

In [None]:
# Close the environment now that training has finished.
env.close()

In [None]:
# Switch the online nets to eval mode for inference (saving itself works in
# train mode as well).
agent.mode2eval()

import os
from datetime import datetime

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = "./models/ddpg_final_" + stamp + ".pth"

# torch.save does not create missing directories, so make sure ./models exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

agent.save_all(
    save_path,
    extra={
        "total_step": total_step,
        # Optional. Stored as built-in floats so the checkpoint does not
        # embed NumPy scalar objects.
        "reward_history": [float(r) for r in rh],
    },
)
# Report the file that was actually written (the name includes the timestamp).
print("saved to " + save_path)

In [None]:
# import torch
# from dataclasses import asdict, is_dataclass

# def save_ddpg_checkpoint(agent, path: str, config=None, extra: dict | None = None):
#     """
#     agent を改造せずに、DDPG の全ネット（actor/critic + target）を保存する。
#     - config: 保存したければ Config を渡す（dataclassでもOK）
#     - extra : step数や報酬履歴など任意情報
#     """
#     ckpt = {
#         "P_net": agent.P_net.state_dict(),
#         "Q_net": agent.Q_net.state_dict(),
#         "P_target_net": agent.P_target_net.state_dict(),
#         "Q_target_net": agent.Q_target_net.state_dict(),
#     }

#     # optimizer も一緒に保存したい場合（最終保存なら無くてもOK）
#     if hasattr(agent, "P_optim"):
#         ckpt["P_optim"] = agent.P_optim.state_dict()
#     if hasattr(agent, "Q_optim"):
#         ckpt["Q_optim"] = agent.Q_optim.state_dict()

#     # Config を保存したい場合
#     if config is not None:
#         ckpt["config"] = asdict(config) if is_dataclass(config) else config

#     if extra is not None:
#         ckpt["extra"] = extra

#     torch.save(ckpt, path)
#     print(f"saved checkpoint: {path}")

In [None]:
# save_ddpg_checkpoint(
#     agent,
#     "./models/ddpg_final.pth",
#     #config=agent.Config,  # あるなら
#     extra={"reward_history": rh, "total_step": 40000},
# )