# 自作のActor-Criticノートブック

In [1]:
import copy
import logging
import sys
from dataclasses import dataclass, asdict, is_dataclass, field

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

from myActivator import tanhAndScale
from myFunction import make_squashed_gaussian

In [2]:
# Route INFO-and-above log records to stdout, prefixed with an HH:MM:SS
# timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
)

In [3]:
# Create the Pendulum environment, then log every attribute of its spec and
# of the unwrapped environment object.
# NOTE(review): render_mode="human" presumably draws every step and may slow
# training considerably - confirm that on-screen rendering is wanted here.
env = gym.make("Pendulum-v1", render_mode="human")
for attributes in (vars(env.spec), vars(env.unwrapped)):
    for key, value in attributes.items():
        logging.info('%s: %s', key, value)

22:53:50 [INFO] id: Pendulum-v1
22:53:50 [INFO] entry_point: gymnasium.envs.classic_control.pendulum:PendulumEnv
22:53:50 [INFO] reward_threshold: None
22:53:50 [INFO] nondeterministic: False
22:53:50 [INFO] max_episode_steps: 200
22:53:50 [INFO] order_enforce: True
22:53:50 [INFO] disable_env_checker: False
22:53:50 [INFO] kwargs: {'render_mode': 'human'}
22:53:50 [INFO] additional_wrappers: ()
22:53:50 [INFO] vector_entry_point: None
22:53:50 [INFO] namespace: None
22:53:50 [INFO] name: Pendulum
22:53:50 [INFO] version: 1
22:53:50 [INFO] max_speed: 8
22:53:50 [INFO] max_torque: 2.0
22:53:50 [INFO] dt: 0.05
22:53:50 [INFO] g: 10.0
22:53:50 [INFO] m: 1.0
22:53:50 [INFO] l: 1.0
22:53:50 [INFO] render_mode: human
22:53:50 [INFO] screen_dim: 500
22:53:50 [INFO] screen: None
22:53:50 [INFO] clock: None
22:53:50 [INFO] isopen: True
22:53:50 [INFO] action_space: Box(-2.0, 2.0, (1,), float32)
22:53:50 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
22:53:50 [INFO] spec

In [4]:
@dataclass
class Config:
    """Hyperparameters for the actor-critic agent.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: ``asdict(Config())`` then returns all settings and
    individual values can be overridden with ``Config(gamma=0.9)``.

    Create an instance (``Config()``) before use. The list-valued fields are
    built per instance by a factory, so they exist only on instances and are
    never shared between them.
    """

    # Hidden layer widths: kept simple (2 layers x 128 units).
    V_net_sizes: list[int] = field(default_factory=lambda: [128, 128])
    P_net_sizes: list[int] = field(default_factory=lambda: [128, 128])
    # Input / output widths of the value (V) and policy (P) networks.
    V_net_in: int = 3
    P_net_in: int = 3
    V_net_out: int = 1
    P_net_out: int = 2  # mu and log_std

    # Adam learning rates (kept on the small side).
    V_lr: float = 3e-4
    P_lr: float = 3e-4

    # Upper / lower bound of the action.
    u_high: float = 2.0
    u_low: float = -2.0

    # Clamp range for the policy's log standard deviation (wide on purpose).
    log_std_min: float = -20.0
    log_std_max: float = 2.0

    # Discount factor; close to 1 to emphasise long-term reward.
    gamma: float = 0.99

In [5]:
class ActorCriticAgent:
    def __init__(self, Config, device=None):
        # ... (既存のコードと同じ初期化部分) ...
        self.Config = Config
        self.device = torch.device(device) if device else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        
        self.u_high = torch.as_tensor(Config.u_high, dtype=torch.float32, device=self.device)
        self.u_low = torch.as_tensor(Config.u_low, dtype=torch.float32, device=self.device)
        
        self.V_net = self.build_net(Config.V_net_in, Config.V_net_sizes, Config.V_net_out).to(self.device)
        self.P_net = self.build_net(Config.P_net_in, Config.P_net_sizes, Config.P_net_out).to(self.device)
        
        self.V_optim = optim.Adam(self.V_net.parameters(), Config.V_lr)
        self.P_optim = optim.Adam(self.P_net.parameters(), Config.P_lr)
        
        self.log_std_min = Config.log_std_min
        self.log_std_max = Config.log_std_max

    def build_net(self, input_size, hidden_sizes, output_size):
        layers = []
        prev_size = input_size
        for h in hidden_sizes:
            layers.append(nn.Linear(prev_size, h))
            layers.append(nn.ReLU())
            prev_size = h
        layers.append(nn.Linear(prev_size, output_size))
        return nn.Sequential(*layers)

    def get_action_and_log_prob(self, state, deterministic=False):
        """行動と log_prob を同時に返す（収集時に使用）"""
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        if state.dim() == 1:
            state = state.unsqueeze(0)
        
        out = self.P_net(state)
        mu, log_std = torch.chunk(out, 2, dim=-1)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
        std = torch.exp(log_std)
        
        dist = make_squashed_gaussian(mu=mu, std=std, low=self.u_low, high=self.u_high)
        
        if deterministic:
            # 評価時は平均を使う（tanh + affine変換を適用）
            action = torch.tanh(mu) * (self.u_high - self.u_low) / 2 + (self.u_high + self.u_low) / 2
            log_prob = None
        else:
            action = dist.rsample()
            log_prob = dist.log_prob(action)  # ← ここで計算して保存！
        
        return action.squeeze(0), log_prob.squeeze(0) if log_prob is not None else None

    @torch.no_grad()
    def step(self, state):
        """推論用（勾配不要）"""
        action, _ = self.get_action_and_log_prob(state, deterministic=False)
        return action.cpu().numpy()

    def update_net_batch(self, states, log_probs, rewards, states_next, dones):
        """
        修正版：log_probs は収集時に計算済みのものを受け取る
        """
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        log_probs = torch.stack(log_probs).to(self.device)  # 収集時に保存したもの
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        states_next = torch.as_tensor(states_next, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        if rewards.dim() == 1:
            rewards = rewards.unsqueeze(1)
        if dones.dim() == 1:
            dones = dones.unsqueeze(1)
        if log_probs.dim() == 1:
            log_probs = log_probs.unsqueeze(1)

        # ========== Critic (V_net) の更新 ==========
        with torch.no_grad():
            V_next = self.V_net(states_next)
            y_targets = rewards + self.Config.gamma * (1 - dones) * V_next

        V_values = self.V_net(states)
        V_loss = F.mse_loss(V_values, y_targets)
        
        self.V_optim.zero_grad()
        V_loss.backward()
        self.V_optim.step()

        # ========== Actor (P_net) の更新 ==========
        # Advantage を計算（更新後の V_net ではなく、更新前の値を使う）
        with torch.no_grad():
            # 注意：V_values は更新前に計算済みなのでそのまま使える
            advantages = (y_targets - V_values)
            # Advantage の正規化（学習安定化のため重要！）
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # log_probs は収集時に計算済みなので、そのまま使う
        P_loss = -(advantages * log_probs).mean()
        
        self.P_optim.zero_grad()
        P_loss.backward()
        self.P_optim.step()

        return float(V_loss.item()), float(P_loss.item())

    
    def to(self,device):
        self.device = torch.device(device)
        self.V_net.to(self.device)
        self.P_net.to(self.device)
        return self
    

    def save_all(self,path:str,extra:dict|None=None):
        cfg = asdict(self.Config) if is_dataclass(self.Config) else self.Config
        ckpt = {
            "Config":cfg,
            "V_net":self.V_net.state_dict(),
            "P_net":self.P_net.state_dict(),
        }
        if extra is not None:
            ckpt["extra"] = extra
        
        torch.save(ckpt,path)

    
    def load_all(self,path:str,map_location=None):
        ckpt = torch.load(path,map_location=map_location)
        self.V_net.load_state_dict(ckpt["V_net"])
        self.P_net.load_state_dict(ckpt["P_net"])

        return ckpt.get("extra",None)
    

    def mode2eval(self):
        self.V_net.eval()
        self.P_net.eval()


    def mode2train(self):
        self.V_net.train()
        self.P_net.train()

In [6]:
def train(env, agent, rollout_num=200, rollout_len=2048):
    """Collect fixed-length rollouts and update the agent after each one.

    Episodes may span rollout boundaries: the environment is reset only when
    an episode terminates or is truncated, and the running episode return is
    carried over from one rollout to the next.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)`` in the
            five-tuple (state, reward, terminated, truncated, info) form.
        agent: Object exposing ``get_action_and_log_prob(state)`` and
            ``update_net_batch(states, log_probs, rewards, states_next, dones)``.
        rollout_num: Number of rollouts (and therefore of updates).
        rollout_len: Number of environment steps per rollout.

    Returns:
        List of completed-episode returns as plain Python floats.
    """
    reward_history = []
    episode_return = 0.0
    episode = 0
    state, info = env.reset()

    for rollout_idx in range(rollout_num):
        states = []
        log_probs = []  # log-probabilities are stored instead of actions
        rewards = []
        states_next = []
        dones = []

        for _ in range(rollout_len):
            # Gradients are needed here, so this is not wrapped in no_grad.
            action, log_prob = agent.get_action_and_log_prob(state)

            state_next, reward, terminated, truncated, info = env.step(action.detach().cpu().numpy())
            # Store a plain float so the history holds no numpy scalars.
            reward = float(reward)

            states.append(state.copy())
            log_probs.append(log_prob)  # kept as a tensor
            rewards.append(reward)
            states_next.append(state_next.copy())
            # Only true termination is recorded as done; a truncated episode
            # is reset below but its last transition is not marked done.
            dones.append(float(terminated))

            episode_return += reward

            if terminated or truncated:
                state, info = env.reset()
                episode += 1
                logging.info('episode %d: reward = %.2f', episode, episode_return)
                reward_history.append(episode_return)
                episode_return = 0.0
            else:
                state = state_next

        V_loss, P_loss = agent.update_net_batch(states, log_probs, rewards, states_next, dones)
        logging.debug('rollout %d: V_loss = %.4f, P_loss = %.4f', rollout_idx + 1, V_loss, P_loss)

        # Drop the references to this rollout's autograd graphs.
        del log_probs

    return reward_history

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
agent = ActorCriticAgent(Config=Config(), device=device)

# 500 rollouts of 100 environment steps each.
rollout_num, rollout_len = 500, 100

# rh receives the list of episode returns produced by train().
rh = train(env=env, agent=agent, rollout_num=rollout_num, rollout_len=rollout_len)

  states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
Consider using tensor.detach() first. (Triggered internally at /pytorch/aten/src/ATen/native/Scalar.cpp:22.)
  return float(V_loss.item()), float(P_loss.item())


22:54:00 [INFO] episode 1: reward = -1519.51
22:54:07 [INFO] episode 2: reward = -1209.30
22:54:13 [INFO] episode 3: reward = -1365.54
22:54:20 [INFO] episode 4: reward = -1083.47
22:54:27 [INFO] episode 5: reward = -981.90
22:54:34 [INFO] episode 6: reward = -1349.46
22:54:41 [INFO] episode 7: reward = -877.89
22:54:48 [INFO] episode 8: reward = -1331.64
22:54:55 [INFO] episode 9: reward = -1299.91
22:55:02 [INFO] episode 10: reward = -947.95
22:55:09 [INFO] episode 11: reward = -841.92
22:55:16 [INFO] episode 12: reward = -1651.72
22:55:23 [INFO] episode 13: reward = -899.33
22:55:30 [INFO] episode 14: reward = -1175.74
22:55:37 [INFO] episode 15: reward = -979.93
22:55:44 [INFO] episode 16: reward = -1013.94
22:55:51 [INFO] episode 17: reward = -1687.81
22:55:58 [INFO] episode 18: reward = -870.64
22:56:05 [INFO] episode 19: reward = -864.88
22:56:12 [INFO] episode 20: reward = -867.84
22:56:19 [INFO] episode 21: reward = -1008.77
22:56:26 [INFO] episode 22: reward = -990.75
22:56:3

ValueError: Expected parameter loc (Tensor of shape (1, 1)) of distribution Normal(loc: tensor([[nan]], device='cuda:0', grad_fn=<SplitBackward0>), scale: tensor([[nan]], device='cuda:0', grad_fn=<ExpBackward0>)) to satisfy the constraint Real(), but found invalid values:
tensor([[nan]], device='cuda:0', grad_fn=<SplitBackward0>)

In [1]:
# Close the environment. The NameError recorded below shows this cell was run
# in a kernel where `env` had not been created yet; run the cell that builds
# `env` first.
env.close()

NameError: name 'env' is not defined

In [None]:
# Switch to eval mode for inference (saving also works in train mode).
agent.mode2eval()

import os
from datetime import datetime

stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_path = "./models/actor_critic_final_" + stamp + ".pth"
# Create the target directory up front so a missing ./models does not make
# the save fail.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

agent.save_all(
    save_path,
    extra={
        "rollout_num": rollout_num,
        "rollout_len": rollout_len,
        "reward_history": rh,  # drop this entry if it is not needed
    },
)
# Report the path that was actually written, timestamp included.
print("saved to " + save_path)