In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym

# 定义策略网络
class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.softmax(self.fc2(x), dim=-1)  # 输出动作概率分布
        return x

# 选择动作
def select_action(policy_net, state):
    state = torch.from_numpy(state).float()
    action_probs = policy_net(state)
    action = torch.multinomial(action_probs, 1).item()  # 根据概率选择动作
    return action, action_probs[action]

# 计算折扣回报
def discount_rewards(rewards, gamma=0.99):
    R = 0
    discounted_rewards = []
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    return discounted_rewards

# 训练策略网络
def train(policy_net, optimizer, episodes=500, gamma=0.99):
    env = gym.make('CartPole-v1')
    for episode in range(episodes):
        state, _ = env.reset()  # 提取状态
        log_probs = []
        rewards = []
        done = False

        # 运行一集
        while not done:
            action, log_prob = select_action(policy_net, state)
            next_state, reward, done, _ = env.step(action)
            
            log_probs.append(torch.log(log_prob))
            rewards.append(reward)

            state = next_state
        
        # 计算折扣奖励
        discounted_rewards = discount_rewards(rewards, gamma)
        discounted_rewards = torch.tensor(discounted_rewards)
        
        # 计算损失
        loss = -torch.sum(torch.stack(log_probs) * discounted_rewards)
        
        # 优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if episode % 50 == 0:
            print(f'Episode {episode}, Total Reward: {sum(rewards)}')

    env.close()

# 主函数
def main():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    env.close()

    policy_net = PolicyNetwork(state_size, action_size)
    optimizer = optim.Adam(policy_net.parameters(), lr=0.01)
    
    train(policy_net, optimizer)

if __name__ == "__main__":
    main()



AttributeError: module 'numpy' has no attribute 'bool8'