In [1]:
import sys

sys.path.append(r"C:\Users\Eddie\Documents\marl_sigctrl\framework")
sys.path.append(r"C:\Users\Eddie\Documents\marl_sigctrl")


In [4]:
import torch
from torch.distributions import Categorical
from collections import namedtuple
from ding.rl_utils import ppo_policy_error


def test_ppo_policy_error():
    # 设置测试数据
    logit_new = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    logit_old = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    action = torch.tensor([2, 0])
    adv = torch.tensor([0.5, -0.5])
    weight = torch.tensor([1.0, 1.0])
    clip_ratio = 0.2
    dual_clip = None

    data = namedtuple('Data', ['logit_new', 'logit_old', 'action', 'adv', 'weight'])
    data = data(logit_new, logit_old, action, adv, weight)

    # 调用被测方法
    loss, info = ppo_policy_error(data, clip_ratio, dual_clip)

    # 验证损失和信息
    assert loss.policy_loss.item() >= 0
    assert loss.entropy_loss.item() >= 0
    assert info.approx_kl >= 0
    assert 0 <= info.clipfrac <= 1


def test_ppo_policy_error_DualClip():
    # 设置测试数据
    logit_new = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    logit_old = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    action = torch.tensor([2, 0])
    adv = torch.tensor([0.5, -0.5])
    weight = torch.tensor([1.0, 1.0])
    clip_ratio = 0.2
    dual_clip = 0.5

    data = namedtuple('Data', ['logit_new', 'logit_old', 'action', 'adv', 'weight'])
    data = data(logit_new, logit_old, action, adv, weight)

    # 调用被测方法
    loss, info = ppo_policy_error(data, clip_ratio, dual_clip)

    # 验证损失和信息
    assert loss.policy_loss.item() >= 0
    assert loss.entropy_loss.item() >= 0
    assert info.approx_kl >= 0
    assert 0 <= info.clipfrac <= 1


def test_ppo_policy_error_NoWeight():
    # 设置测试数据
    logit_new = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    logit_old = torch.tensor([[0.1, 0.2, 0.7], [0.6, 0.2, 0.2]])
    action = torch.tensor([2, 0])
    adv = torch.tensor([0.5, -0.5])
    weight = None
    clip_ratio = 0.2
    dual_clip = None

    data = namedtuple('Data', ['logit_new', 'logit_old', 'action', 'adv', 'weight'])
    data = data(logit_new, logit_old, action, adv, weight)

    # 调用被测方法
    loss, info = ppo_policy_error(data, clip_ratio, dual_clip)

    # 验证损失和信息
    assert loss.policy_loss.item() >= 0
    assert loss.entropy_loss.item() >= 0
    assert info.approx_kl >= 0
    assert 0 <= info.clipfrac <= 1

In [6]:

test_ppo_policy_error_DualClip()

AssertionError: 

被测方法行为:
 gae 函数实现了广义优势估计器（Generalized Advantage Estimator，GAE），这是一种用于计算强化学习中优势估计的算法。该算法使用折扣因子 gamma 和 GAE 参数 lambda_ 来计算优势。函数接收一个包含价值、下一个价值、奖励、完成标志和轨迹标志的命名元组。它处理这些输入，计算每个时间步的优势，并返回一个优势张量。

分支和所需的测试用例:

- [1]**分支 1**: 如果 done 为 None，则将其初始化为与 reward 形状相同的零张量。
- [2]**分支 2**: 如果 traj_flag 为 None，则将其设置为 done。
- [3]**分支 3**: 如果 value 的维度比 reward 多一个，则对 reward、done 和 traj_flag 进行扩展。
- [4]**循环**: 从最后一个时间步向前迭代，计算 GAE。

模拟需求: 不需要模拟，因为该函数直接处理输入张量，没有外部依赖或需要模拟的复杂交互。

In [11]:
# 每个智能体对应一个优势函数估计值：当 value 的形状为 (T, B) 时，每个智能体有一个优势估计值。
# 每个智能体的每一个动作都有一个估计值：当 value 的形状为 (T, B, A) 时，每个智能体的每个动作都有一个优势估计值。

In [9]:
from ding.rl_utils import gae
def test_gae():
    # 设置测试数据
    T, B = 5, 3
    value = torch.randn(T, B)
    next_value = torch.randn(T, B)
    reward = torch.randn(T, B)
    done = torch.randint(0, 2, (T, B)).float()
    traj_flag = torch.randint(0, 2, (T, B)).float()
    data = namedtuple('Data', ['value', 'next_value', 'reward', 'done', 'traj_flag'])(value, next_value, reward, done, traj_flag)

    # 使用默认参数调用 GAE
    adv = gae(data)
    print(adv)
    # 验证输出形状
    assert adv.shape == (T, B), "Advantage tensor shape is incorrect."

    # 验证计算
    gamma = 0.99
    lambda_ = 0.97
    next_value *= (1 - done)
    delta = reward + gamma * next_value - value
    factor = gamma * lambda_ * (1 - traj_flag)
    expected_adv = torch.zeros_like(value)
    gae_item = torch.zeros_like(value[0])
    for t in reversed(range(T)):
        gae_item = delta[t] + factor[t] * gae_item
        expected_adv[t] = gae_item

    assert torch.allclose(adv, expected_adv), "Calculated advantage does not match expected values."

    # 测试当 done 和 traj_flag 为 None 时
    data_none_flags = namedtuple('Data', ['value', 'next_value', 'reward', 'done', 'traj_flag'])(value, next_value, reward, None, None)
    adv_none_flags = gae(data_none_flags)
    assert torch.allclose(adv_none_flags, expected_adv), "Advantage calculation with None done/traj_flag is incorrect."

    # 测试当 value 的维度比 reward 多一个时
    value_expanded = torch.randn(T, B, 1)
    data_expanded = namedtuple('Data', ['value', 'next_value', 'reward', 'done', 'traj_flag'])(value_expanded, next_value, reward, done, traj_flag)
    adv_expanded = gae(data_expanded)
    assert adv_expanded.shape == (T, B, 1), "Advantage tensor shape is incorrect for expanded value."
    assert torch.allclose(adv_expanded.squeeze(-1), expected_adv), "Calculated advantage for expanded value does not match expected values."

In [None]:
test_gae()

tensor([[ 0.8593,  0.4486,  3.6517],
        [ 3.0480, -0.4445,  3.6012],
        [-0.8671,  1.9734,  1.0996],
        [ 0.8457, -1.2078,  0.2137],
        [ 1.7281,  0.2591,  0.3168]])


AssertionError: Advantage calculation with None done/traj_flag is incorrect.


# torch.no_grad() 的作用
在 PyTorch 中，torch.no_grad() 是一个上下文管理器，用于在上下文环境中关闭梯度计算。

在深度学习训练过程中，PyTorch 会自动跟踪所有涉及到 torch.Tensor 的操作，以便后续进行反向传播计算梯度。

然而，在某些情况下，我们并不需要计算梯度，例如在模型推理阶段或者进行一些不需要更新参数的计算时，关闭梯度计算可以带来以下好处：

- 节省内存：由于不需要存储中间计算结果用于反向传播，因此可以减少内存的使用。
- 提高计算速度：避免了梯度计算的开销，从而加快了计算速度。

In [19]:
import torch
import unittest

class TestNoGrad(unittest.TestCase):
    def test_no_grad(self):
        # 定义一个简单的线性模型
        model = torch.nn.Linear(10, 1)
        # 生成一个输入张量
        x = torch.randn(1, 10)

        # 在有梯度计算的情况下进行前向传播
        with torch.enable_grad():
            y_with_grad = model(x)
            self.assertEqual(y_with_grad.requires_grad, False, "梯度计算应该是开启的")

        # 在无梯度计算的情况下进行前向传播
        with torch.no_grad():
            y_no_grad = model(x)
            self.assertEqual(y_no_grad.requires_grad, False, "梯度计算应该是关闭的")


In [20]:
no_grad_test = TestNoGrad()
no_grad_test.test_no_grad()

AssertionError: True != False : 梯度计算应该是开启的

# 定义Critic网络来估计Q值和V值


In [16]:
import torch
import torch.nn as nn

class CriticNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims):
        super(CriticNetwork, self).__init__()
        # 用于估计Q值的网络
        q_layers = []

        # 状态价值对-优势估计
        input_dim = state_dim + action_dim
        for hidden_dim in hidden_dims:
            q_layers.append(nn.Linear(input_dim, hidden_dim))
            q_layers.append(nn.ReLU())
            input_dim = hidden_dim
        q_layers.append(nn.Linear(input_dim, 1))
        self.q_network = nn.Sequential(*q_layers)

        # 用于估计V值的网络
        v_layers = []
        input_dim = state_dim
        for hidden_dim in hidden_dims:
            v_layers.append(nn.Linear(input_dim, hidden_dim))
            v_layers.append(nn.ReLU())
            input_dim = hidden_dim
        v_layers.append(nn.Linear(input_dim, 1))
        self.v_network = nn.Sequential(*v_layers)

    def forward(self, state, action):
        # 计算Q值
        q_input = torch.cat([state, action], dim=-1)
        q_value = self.q_network(q_input)
        # 计算V值
        v_value = self.v_network(state)
        # 计算优势函数值
        advantage = q_value - v_value
        return advantage

# 单元测试
def test_critic_network():
    # 定义参数
    num_intersections = 3  # 交叉口数量
    state_features_per_intersection = 4  # 每个交叉口的状态特征数量
    state_dim = num_intersections * state_features_per_intersection
    action_dim = num_intersections
    hidden_dims = [16, 8]  # 隐藏层维度

    # 创建Critic网络实例
    critic = CriticNetwork(state_dim, action_dim, hidden_dims)

    # 随机生成一个全局状态和联合动作
    s_t = torch.randn(1, state_dim)  # 时间步t的全局状态
    a_t = torch.randint(0, 2, (1, action_dim)).float()  # 时间步t的联合动作

    # 计算优势函数估计值
    advantage = critic(s_t, a_t)

    print(f"全局状态: {s_t}")
    print(f"联合动作: {a_t}")
    print(f"优势函数估计值: {advantage.item()}")

# if __name__ == "__main__":
test_critic_network()

全局状态: tensor([[-0.2505,  0.2877,  1.8954,  0.6721,  1.6329,  0.7981, -1.7562,  1.1579,
         -0.0837, -2.8926,  0.5453,  0.4943]])
联合动作: tensor([[1., 0., 1.]])
优势函数估计值: 0.021983802318572998


In [17]:
import torch
import numpy as np

class MockEnv:
    """模拟两个交叉口的环境"""
    def __init__(self):
        self.n_agents = 2
        self.global_state_dim = 4  # 假设全局状态包含两个交叉口的信息
        self.local_obs_dim = 2     # 每个Actor观测到自身交叉口的状态
        
    def reset(self):
        return {'global_state': np.random.randn(self.global_state_dim),
                'local_obs': [np.random.randn(self.local_obs_dim) for _ in range(self.n_agents)]}
    
    def step(self, actions):
        # 动作空间A={0,1}，随机生成奖励
        rewards = [float(act == 1)*0.5 - 0.1 for act in actions]  # 切换相位获得+0.5，否则-0.1
        done = False
        return {'global_state': np.random.randn(self.global_state_dim),
                'local_obs': [np.random.randn(self.local_obs_dim) for _ in range(self.n_agents)],
                'rewards': rewards,
                'done': done}

In [18]:
class CentralizedCritic(torch.nn.Module):
    """中心化Critic网络"""
    def __init__(self, global_state_dim):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(global_state_dim, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 1)  # 输出全局状态价值V(s)
        )
    
    def forward(self, global_state):
        return self.net(global_state)

class Actor(torch.nn.Module):
    """分散式Actor网络"""
    def __init__(self, local_obs_dim):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(local_obs_dim, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 2),  # 输出动作概率（A={0,1}）
            torch.nn.Softmax(dim=-1)
        )
    
    def forward(self, local_obs):
        return self.net(local_obs)

In [32]:
def test_advantage_calculation():
    # 初始化环境与模型
    env = MockEnv()
    critic = CentralizedCritic(env.global_state_dim)
    actors = [Actor(env.local_obs_dim) for _ in range(env.n_agents)]
    
    # 模拟轨迹收集
    batch = {'global_states': [], 'rewards': [], 'local_obs': [], 'dones': []}
    obs = env.reset()
    for i in range(3):  # 收集3步数据
        actions = [np.random.choice([0,1], p=actor(torch.Tensor(obs['local_obs'][i])).detach().numpy()) 
                  for i, actor in enumerate(actors)]
        print(f"step {i} actions", actions)
        next_obs = env.step(actions)

        batch['global_states'].append(torch.Tensor(obs['global_state']))
        batch['local_obs'].append([torch.Tensor(o) for o in obs['local_obs']])
        batch['rewards'].append(next_obs['rewards'])
        batch['dones'].append(next_obs['done'])

        obs = next_obs
    for key, vals in batch.items():
        print(f"batch {key}: ", vals)
        print("--"*20)

    # 计算优势函数
    with torch.no_grad():
        values = critic(torch.stack(batch['global_states'])).squeeze().numpy()
        print("no grad critc values ", values)
        print("--"*20)
    
    # 优势函数计算逻辑（假设gamma=0.99）
    advantages = []
    for i in range(env.n_agents):
        agent_rewards = [r[i] for r in batch['rewards']]
        print(f"agent {i} rewards", agent_rewards)
        agent_advantages = []
        running_advantage = 0
        for t in reversed(range(len(agent_rewards))):
            running_advantage = agent_rewards[t] + 0.99 * running_advantage * (1 - int(batch['dones'][t]))
            delta = running_advantage - values[t]
            agent_advantages.insert(0, delta)
        advantages.append(agent_advantages)
        print(f"智能体{i}优势函数值", agent_advantages)
    # 验证断言
    assert len(advantages) == env.n_agents, "每个智能体应有独立优势序列"
    assert not np.allclose(advantages[0], advantages[1]), "不同智能体的优势值应不同（因奖励轨迹不同）"
    print("测试通过：优势函数按智能体独立计算")

# 执行测试
test_advantage_calculation()

step 0 actions [0, 1]
step 1 actions [0, 0]
step 2 actions [1, 1]
batch global_states:  [tensor([ 0.6529,  2.1936,  1.2539, -0.5296]), tensor([-0.4851,  0.1526, -0.1222,  0.2268]), tensor([-0.9882,  0.3741, -1.3519, -0.1919])]
----------------------------------------
batch rewards:  [[-0.1, 0.4], [-0.1, -0.1], [0.4, 0.4]]
----------------------------------------
batch local_obs:  [[tensor([-0.3493, -1.3071]), tensor([-0.4891,  2.1346])], [tensor([2.6298, 0.7472]), tensor([2.9983, 0.4752])], [tensor([1.0271, 1.7269]), tensor([-0.8710, -0.1085])]]
----------------------------------------
batch dones:  [False, False, False]
----------------------------------------
no grad critc values  [-0.49859232  0.03809349  0.33578724]
----------------------------------------
agent 0 rewards [-0.1, -0.1, 0.4]
智能体0优势函数值 [0.6916323171043396, 0.25790650761127476, 0.06421276330947878]
agent 1 rewards [0.4, -0.1, 0.4]
智能体1优势函数值 [1.1916323171043397, 0.25790650761127476, 0.06421276330947878]
测试通过：优势函数按智能体独立计