In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
import numpy as np


In [6]:

# 策略网络（Actor）
class Actor(nn.Module):
    """Policy network: maps a state vector to a probability for each discrete action.

    Architecture: two ReLU-activated fully connected hidden layers followed by a
    linear projection to ``action_dim`` logits, normalised with a softmax over
    the last dimension.

    Args:
        state_dim: Number of input features per state.
        action_dim: Number of discrete actions (size of the output distribution).
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Layers are created in the order input -> hidden -> head so that
        # parameter initialisation order (and state_dict keys) stay stable.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.action_head = nn.Linear(hidden_dim, action_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x``; the last dimension sums to 1."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.action_head(hidden)
        return self.softmax(logits)


In [7]:

# 价值网络（Critic）
class Critic(nn.Module):
    """Value network: maps a state vector to a single scalar state-value estimate.

    Architecture: two ReLU-activated fully connected hidden layers followed by a
    linear head with one output unit (no output activation).

    Args:
        state_dim: Number of input features per state.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, state_dim, hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value_head = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the estimated value of ``x``; the last dimension has size 1."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.value_head(second_hidden)

def train_actor_critic(env_name='CartPole-v1', num_episodes=1000, gamma=0.99, lr=1e-3, seed=None):
    """Train a one-step (TD(0)) actor-critic agent on a discrete-action Gym environment.

    After every environment step the critic is regressed towards the one-step
    TD target and the actor takes a policy-gradient step that uses the TD error
    as the advantage estimate. The total reward of every 10th episode is printed.

    Args:
        env_name: Id passed to ``gym.make``. The environment must expose a
            vector observation space (``shape[0]`` is used as the state size)
            and a discrete action space (``.n`` is used as the action count).
        num_episodes: Number of episodes to train for.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Adam learning rate, shared by the actor and critic optimizers.
        seed: Optional integer. When given, seeds PyTorch's global RNG (network
            initialisation and action sampling) and the environment's first
            reset. ``None`` (default) leaves all RNGs untouched.

    Returns:
        list: Total (undiscounted) reward collected in each episode, in order.
    """
    if seed is not None:
        torch.manual_seed(seed)

    env = gym.make(env_name)
    episode_rewards = []
    # try/finally guarantees the environment is released even if training raises.
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        actor = Actor(state_dim, action_dim)
        critic = Critic(state_dim)
        actor_optimizer = optim.Adam(actor.parameters(), lr=lr)
        critic_optimizer = optim.Adam(critic.parameters(), lr=lr)

        for episode in range(num_episodes):
            # Seed only the first reset; later resets continue the same RNG
            # stream so episodes differ from one another.
            if episode == 0 and seed is not None:
                state, _ = env.reset(seed=seed)
            else:
                state, _ = env.reset()
            ep_reward = 0

            done = False
            while not done:
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                action_probs = actor(state_tensor)
                dist = torch.distributions.Categorical(action_probs)
                action = dist.sample()
                log_prob = dist.log_prob(action)

                next_state, reward, terminated, truncated, _ = env.step(action.item())
                # The episode ends on either signal. Ignoring `truncated` would
                # let an episode run past the environment's time limit.
                done = terminated or truncated
                ep_reward += reward

                # Compute the TD error.
                value = critic(state_tensor)
                with torch.no_grad():
                    # The target is treated as a constant, so no graph is needed.
                    next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                    next_value = critic(next_state_tensor)
                    # Bootstrap unless the state is truly terminal; a time-limit
                    # truncation still has a meaningful next-state value.
                    target = reward + (1.0 - float(terminated)) * gamma * next_value
                td_error = target - value

                # Update the critic (minimise the squared TD error).
                critic_loss = td_error.pow(2).mean()
                critic_optimizer.zero_grad()
                critic_loss.backward()
                critic_optimizer.step()

                # Update the actor (policy gradient with the TD error as advantage).
                # .mean() reduces the single-element product to a true scalar.
                actor_loss = (-log_prob * td_error.detach()).mean()
                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()

                state = next_state

            episode_rewards.append(ep_reward)
            if (episode + 1) % 10 == 0:
                print(f"Episode {episode+1}, Reward: {ep_reward}")
    finally:
        env.close()

    return episode_rewards


In [8]:
# Entry point: launch training with train_actor_critic's default arguments.
if __name__ == "__main__":
    train_actor_critic()

  if not isinstance(terminated, (bool, np.bool8)):


Episode 10, Reward: 8.0
Episode 20, Reward: 14.0
Episode 30, Reward: 22.0
Episode 40, Reward: 9.0
Episode 50, Reward: 12.0
Episode 60, Reward: 25.0
Episode 70, Reward: 43.0
Episode 80, Reward: 53.0
Episode 90, Reward: 102.0
Episode 100, Reward: 9.0
Episode 110, Reward: 10.0
Episode 120, Reward: 8.0
Episode 130, Reward: 11.0
Episode 140, Reward: 8.0
Episode 150, Reward: 10.0
Episode 160, Reward: 9.0
Episode 170, Reward: 8.0
Episode 180, Reward: 9.0
Episode 190, Reward: 9.0
Episode 200, Reward: 10.0
Episode 210, Reward: 8.0
Episode 220, Reward: 9.0
Episode 230, Reward: 11.0
Episode 240, Reward: 10.0
Episode 250, Reward: 64.0
Episode 260, Reward: 33.0
Episode 270, Reward: 53.0
Episode 280, Reward: 30.0
Episode 290, Reward: 42.0
Episode 300, Reward: 24.0
Episode 310, Reward: 217.0
Episode 320, Reward: 168.0
Episode 330, Reward: 184.0
Episode 340, Reward: 111.0
Episode 350, Reward: 31.0
Episode 360, Reward: 89.0
Episode 370, Reward: 67.0
Episode 380, Reward: 40.0
Episode 390, Reward: 52.0
E