In [1]:
import torch
import time

In [7]:
def G_modified(X, model):
    """Closed-form Jacobian of f(X) = relu(X @ W) @ a / m w.r.t. (W, a).

    Args:
        X: input batch of shape (batch_size, input_dim).
        model: object exposing ``W`` of shape (input_dim, m) and ``a`` of
            shape (m,).

    Returns:
        Tensor of shape (batch_size, m * (input_dim + 1)). The first
        ``m * input_dim`` columns hold d f / d W[i, j] at column ``i * m + j``
        (the order of ``W.flatten()``); the last ``m`` columns hold
        d f / d a[j]. The result has the dtype of ``X @ model.W`` and lives on
        ``X.device``.

    Side effects:
        Prints the elapsed wall-clock time.
    """
    on_cuda = X.device.type == "cuda"

    # CUDA kernels are launched asynchronously; synchronize so the timer
    # measures the computation itself and not only the kernel launches.
    if on_cuda:
        torch.cuda.synchronize(X.device)
    start = time.perf_counter()

    input_dim, m = model.W.shape  # input_dim: input features, m: hidden neurons
    batch_size = X.shape[0]

    # Pre-activations <w_j, x> and their ReLU for every sample.
    relu_input = X @ model.W              # (batch_size, m)
    relu_output = torch.relu(relu_input)  # (batch_size, m)

    # Allocate with the dtype of the activations so float64 inputs are not
    # silently truncated to float32.
    J = torch.zeros(
        batch_size, m * (input_dim + 1),
        dtype=relu_input.dtype, device=X.device,
    )

    # d f / d W[i, j] = a[j] * x[i] * 1[<w_j, x> > 0] / m, for all j at once.
    mask = (relu_input > 0).to(relu_input.dtype)                 # (batch_size, m)
    W_grad = X.unsqueeze(2) * (model.a * mask).unsqueeze(1) / m  # (batch_size, input_dim, m)
    J[:, :m * input_dim] = W_grad.reshape(batch_size, -1)

    # d f / d a[j] = relu(<w_j, x>) / m
    J[:, m * input_dim:] = relu_output / m

    if on_cuda:
        torch.cuda.synchronize(X.device)
    end = time.perf_counter()
    print("计算Jacobian矩阵耗时：", end - start)
    return J

In [3]:
def G_modified_CUDA(X, model):
    """Broadcast (loop-free) Jacobian of f(X) = relu(X @ W) @ a / m w.r.t. (W, a).

    Despite the name, nothing here is CUDA-specific: every tensor is created
    on ``X.device``, so the function runs on CPU inputs as well.

    Args:
        X: input batch of shape (batch_size, input_dim).
        model: object exposing ``W`` of shape (input_dim, m) and ``a`` of
            shape (m,).

    Returns:
        Tensor of shape (batch_size, m * (input_dim + 1)). The first
        ``m * input_dim`` columns hold d f / d W[i, j] at column ``i * m + j``
        (the order of ``W.flatten()``); the last ``m`` columns hold
        d f / d a[j]. The result has the dtype of ``X @ model.W``.

    Side effects:
        Prints the elapsed wall-clock time.
    """
    device = X.device
    on_cuda = device.type == "cuda"

    # CUDA kernels are launched asynchronously; without a synchronize the
    # timer would only measure how long it takes to enqueue the work.
    if on_cuda:
        torch.cuda.synchronize(device)
    start = time.perf_counter()

    input_dim, m = model.W.shape  # input_dim: input features, m: hidden neurons
    batch_size = X.shape[0]

    # Pre-activations <w_j, x> and their ReLU for every sample.
    relu_input = X @ model.W              # (batch_size, m)
    relu_output = torch.relu(relu_input)  # (batch_size, m)

    # Allocate with the dtype of the activations so float64 inputs are not
    # silently truncated to float32.
    J = torch.zeros(
        batch_size, m * (input_dim + 1),
        dtype=relu_input.dtype, device=device,
    )

    # Indicator of active neurons, kept in the working dtype.
    mask = (relu_input > 0).to(relu_input.dtype)  # (batch_size, m)

    # Broadcast x[i] * 1[active_j] / m * a[j] over (batch, input_dim, m).
    W_grad = (X.unsqueeze(2) * mask.unsqueeze(1)) / m  # (batch_size, input_dim, m)
    W_grad = W_grad * model.a.view(1, 1, m)            # (batch_size, input_dim, m)

    # Flatten the per-sample (input_dim, m) block row-major into J.
    J[:, :m * input_dim] = W_grad.reshape(batch_size, -1)

    # d f / d a[j] = relu(<w_j, x>) / m
    J[:, m * input_dim:] = relu_output / m

    if on_cuda:
        torch.cuda.synchronize(device)
    end = time.perf_counter()
    print("CUDA计算Jacobian矩阵耗时：", end - start)
    return J

In [4]:
class test_model(torch.nn.Module):
    """Two-layer ReLU network computing relu(X @ W) @ a / m.

    ``W`` and ``a`` are wrapped as trainable parameters. ``D`` is accepted by
    the constructor but is neither stored nor used; ``m`` is stored as given
    and used as the divisor in ``forward`` (presumably the hidden width, i.e.
    ``W.shape[1]`` -- the class does not check this).
    """

    def __init__(self, D, m, W: torch.Tensor, a: torch.Tensor):
        super().__init__()
        self.m = m
        # nn.Parameter enables gradient tracking by default.
        self.W = torch.nn.Parameter(W)
        self.a = torch.nn.Parameter(a)

    def forward(self, X):
        """Return the network output for the batch ``X``."""
        hidden = torch.relu(X @ self.W)
        return hidden @ self.a / self.m

In [5]:
# Toy problem: D input features, m hidden neurons, and a batch of three samples.
D = 3
m = 2
W = torch.tensor([
    [0.5, -0.3],
    [0.8, 0.6],
    [-0.2, 0.7],
])
a = torch.tensor([0.9, -1.1])
X = torch.tensor([
    [1.0, 2.0, 3.0],
    [0.5, 1.0, 1.5],
    [3.0, -2.0, 5.0],
])
model = test_model(D, m, W, a)

In [8]:
# Evaluate both Jacobian implementations on the same batch.
J = G_modified(X, model)
print(J)
J_cuda = G_modified_CUDA(X, model)
print(J_cuda)

# Check the agreement explicitly instead of comparing the printed tensors by eye.
assert torch.allclose(J, J_cuda), "G_modified and G_modified_CUDA disagree"

计算Jacobian矩阵耗时： 0.001752614974975586
tensor([[ 0.4500, -0.5500,  0.9000, -1.1000,  1.3500, -1.6500,  0.7500,  1.5000],
        [ 0.2250, -0.2750,  0.4500, -0.5500,  0.6750, -0.8250,  0.3750,  0.7500],
        [ 0.0000, -1.6500, -0.0000,  1.1000,  0.0000, -2.7500,  0.0000,  0.7000]],
       grad_fn=<CopySlices>)
CUDA计算Jacobian矩阵耗时： 0.00024890899658203125
tensor([[ 0.4500, -0.5500,  0.9000, -1.1000,  1.3500, -1.6500,  0.7500,  1.5000],
        [ 0.2250, -0.2750,  0.4500, -0.5500,  0.6750, -0.8250,  0.3750,  0.7500],
        [ 0.0000, -1.6500, -0.0000,  1.1000,  0.0000, -2.7500,  0.0000,  0.7000]],
       grad_fn=<CopySlices>)


In [None]:
# Permutation matrix between the two flattenings of the W gradients.
# J stores d f / d W[i, j] at column i*m + j (input-major, the order of
# W.flatten()). P sends index i*m + j to index j*D + i (neuron-major) when it
# acts on a column vector: (P @ v)[j*D + i] = v[i*m + j].
# Created with J's dtype/device so the product below never mixes types.
P = torch.zeros((m*(D+1), m*(D+1)), dtype=J.dtype, device=J.device)

for i in range(D):
    for j in range(m):
        P[j*D + i, i*m + j] = 1
# The last m rows/columns form an identity block: gradients w.r.t. a stay put.
for i in range(m*D, m*(D+1)):
    P[i, i] = 1

# Each row of J is one gradient vector, so the column-vector action P @ v
# becomes a right-multiplication by P.T. Using J @ P would apply the inverse
# permutation, which scrambles the columns whenever D != m.
print(torch.mm(J, P.T))

In [None]:
def auto_grad_G(X, model):
    """Return autograd gradients of ``model(X)`` w.r.t. ``model.W`` and ``model.a``.

    Args:
        X: input passed straight to ``model``. ``model(X)`` must contain
            exactly one element, because ``backward()`` is called without an
            explicit gradient argument; larger outputs raise ``RuntimeError``.
        model: callable exposing leaf tensors ``W`` and ``a`` that require
            gradients.

    Returns:
        Tuple ``(model.W.grad, model.a.grad)``. The same tensors stay attached
        to the parameters after the call.
    """
    # backward() accumulates into .grad, so drop anything left from earlier
    # calls; otherwise a second call would return summed gradients.
    model.W.grad = None
    model.a.grad = None

    output = model(X)
    output.backward()
    return model.W.grad, model.a.grad

# Fresh model instance and a single-sample batch for the autograd check.
model = test_model(*W.shape, W, a)
X = torch.tensor([[1.0, 2.0, 3.0]])
w_grad, a_grad = auto_grad_G(X, model)

# Flattened weights, the returned W gradient, the same gradient read back from
# the parameter, and finally a next to its gradient.
print(model.W.detach().flatten())
print(w_grad)
print(model.W.grad.flatten())
print(model.a.detach().flatten(), a_grad.flatten())