In [1]:
# 测试环境
%run env.py
env = CombinatorialAuctionEnv(n_agents=2, n_items=3, max_steps=5)
obs = env.reset()
print("环境重置成功!")
print(f"观察: {obs}")
print(f"观察形状: {[o.shape for o in obs]}")

# 测试一步执行
actions = [env.action_space.sample() for _ in range(env.n_agents)]
print(f"随机动作: {actions}")

next_obs, rewards, done, info = env.step(actions)
print(f"执行成功! 奖励: {rewards}, 完成: {done}")
env.render()

环境重置成功!
观察: [array([6.75237178, 2.7474508 , 8.15561182, 0.        , 0.        ,
       0.        , 0.        ]), array([3.96704494, 4.27406802, 6.3245583 , 0.        , 0.        ,
       0.        , 0.        ])]
观察形状: [(7,), (7,)]
随机动作: [array([9.43289 , 6.090355, 4.307592], dtype=float32), array([9.365489 , 3.9628537, 8.273278 ], dtype=float32)]
执行成功! 奖励: [-3.82852009  2.01696638], 完成: False

=== 回合 1 ===
真实估值:
  智能体 0: [6.75237178 2.7474508  8.15561182]
  智能体 1: [3.96704494 4.27406802 6.3245583 ]
出价:
  智能体 0: [9.43289  6.090355 4.307592]
  智能体 1: [9.365489  3.9628537 8.273278 ]
分配结果:
  智能体 0 获得物品: [0 1]
  智能体 1 获得物品: [2]
支付:
  智能体 0: 13.33
  智能体 1: 4.31
效用:
  智能体 0: -3.83
  智能体 1: 2.02


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [13]:
# 测试 MADDPG
%run maddpg.py

# 使用与环境相同的参数
n_agents = 2
obs_dims = [env.obs_dim] * n_agents  # 注意使用 env.obs_dim 而不是 env.n_items
act_dims = [env.n_items] * n_agents

maddpg = MADDPG(n_agents, obs_dims, act_dims)
print("MADDPG 初始化成功!")

# 测试动作生成
test_obs = env.reset()
test_actions = maddpg.act(test_obs)
print(f"MADDPG 生成的动作: {test_actions}")

MADDPG 初始化成功!
MADDPG 生成的动作: [array([0.37231618, 0.53775936, 0.44884092], dtype=float32), array([0.44975576, 0.6657661 , 0.56756955], dtype=float32)]


In [14]:
# Minimal training pass: roll out one episode with the current MADDPG policy,
# record every transition in the replay buffer, then attempt a single update.
import numpy as np

# Start a fresh episode.
obs_list = env.reset()
done = False
episode_reward = 0

# `done` starts out False, so the first step always runs; the exit test
# therefore sits at the bottom of the loop body.
while True:
    # Policy picks the joint action for the current observations.
    actions = maddpg.act(obs_list)

    # Apply it and accumulate the reward summed over all agents.
    next_obs_list, rewards, done, info = env.step(actions)
    episode_reward += np.sum(rewards)

    # Store the full transition tuple for later learning.
    maddpg.buffer.add((obs_list, actions, rewards, next_obs_list, done))

    # Move on to the next state.
    obs_list = next_obs_list

    print(f"步骤奖励: {rewards}, 累计奖励: {episode_reward}")

    if done:
        break

print(f"回合结束，总奖励: {episode_reward}")

# Try one learning step.
# NOTE(review): the recorded run stored only 5 transitions before this call,
# fewer than batch_size=32 — confirm that update() actually trains here
# rather than returning early.
print("尝试更新 MADDPG...")
maddpg.update(batch_size=32)
print("更新完成!")

步骤奖励: [ 2.11409236 11.52363563], 累计奖励: 13.63772798495957
步骤奖励: [ 2.11305282 11.510764  ], 累计奖励: 27.26154481207269
步骤奖励: [ 2.11333094 11.50952971], 累计奖励: 40.88440546147432
步骤奖励: [ 2.11354626 11.50805396], 累计奖励: 54.50600568125521
步骤奖励: [ 2.11386515 11.50652605], 累计奖励: 68.12639688042148
回合结束，总奖励: 68.12639688042148
尝试更新 MADDPG...
更新完成!
