<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/DPG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# https://chatgpt.com/c/67a97cce-a950-800e-b653-f92af5eb1388

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np

# 超参数
LR_ACTOR = 0.001
LR_CRITIC = 0.002
GAMMA = 0.99
TAU = 0.005  # 软更新参数
EPISODES = 200
MAX_STEPS = 200

# 创建环境
env = gym.make("Pendulum-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]  # 作用于动作输出范围

# 定义 Actor（确定性策略）
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, action_bound):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, action_dim)
        self.action_bound = action_bound  # 限制动作范围

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        action = torch.tanh(self.fc3(x)) * self.action_bound  # 输出范围 [-action_bound, action_bound]
        return action

# 定义 Critic（Q 值估计）
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)  # 状态和动作拼接
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        q_value = self.fc3(x)
        return q_value

# 初始化网络和优化器
actor = Actor(state_dim, action_dim, action_bound)
critic = Critic(state_dim, action_dim)
target_actor = Actor(state_dim, action_dim, action_bound)
target_critic = Critic(state_dim, action_dim)

# 复制参数
target_actor.load_state_dict(actor.state_dict())
target_critic.load_state_dict(critic.state_dict())

actor_optimizer = optim.Adam(actor.parameters(), lr=LR_ACTOR)
critic_optimizer = optim.Adam(critic.parameters(), lr=LR_CRITIC)
loss_fn = nn.MSELoss()

# 经验回放
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = []
        self.capacity = capacity

    def push(self, transition):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[i] for i in indices]
        return zip(*batch)

buffer = ReplayBuffer()

# 软更新函数
def soft_update(target, source):
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(TAU * source_param.data + (1.0 - TAU) * target_param.data)

# 训练 DPG
batch_size = 64
for episode in range(EPISODES):
    state = env.reset()[0]
    # state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    state = torch.tensor(state, dtype=torch.float32).view(1, -1)
    episode_reward = 0

    for step in range(MAX_STEPS):
        with torch.no_grad():
            action = actor(state).cpu().numpy()[0]

        next_state, reward, done, _, _ = env.step(action)
        buffer.push((state, action, reward, next_state, done))

        state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
        episode_reward += reward

        if len(buffer.buffer) > batch_size:
            # 采样数据
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            states = torch.cat(states)
            actions = torch.tensor(actions, dtype=torch.float32)
            rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.float32).unsqueeze(1)

            # 计算目标 Q 值
            with torch.no_grad():
                next_actions = target_actor(next_states)  # 目标 Actor 计算下一步动作
                target_q_values = target_critic(next_states, next_actions)  # 目标 Critic 计算 Q 值
                target_q = rewards + GAMMA * target_q_values * (1 - dones)  # 计算目标 Q 值

            # 更新 Critic 网络
            critic_q_values = critic(states, actions)  # 当前 Critic 计算 Q 值
            critic_loss = loss_fn(critic_q_values, target_q)  # 计算损失
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # 更新 Actor 网络（策略梯度）
            actor_loss = -critic(states, actor(states)).mean()  # 策略梯度
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # 软更新目标网络
            soft_update(target_actor, actor)
            soft_update(target_critic, critic)

        if done:
            break

    print(f"Episode {episode + 1}: Reward = {episode_reward:.2f}")

env.close()

  deprecation(
  deprecation(


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 3x128)