In [1]:
import torch
from modules import Simple_Perceptron
from data import grip_data
import random
import time
import numpy as np
from utils import validate
from tqdm import tqdm

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # 如果有多个GPU
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 设置随机数种子，例如：
set_seed(42)

In [3]:
# device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Simple_Perceptron.Simple_Perceptron(41, 100, 1)
model = model.to(device)
train_loader, X_train, Y_train, X_test, Y_test, D = grip_data.load_data(l=64, device=device)
device

device(type='cuda')

In [6]:
model.loss(model(X_train), Y_train).mean()

tensor(1.0243, device='cuda:0', grad_fn=<MeanBackward0>)

In [3]:
def G_modified_CP(X, model):
    input_dim, m = model.W.shape  # m: 隐藏层神经元数量, input_dim: 输入维度
    batch_size = X.shape[0]       # batch_size: 批处理大小
    
    # 初始化 Jacobian 矩阵 J，大小为 (batch_size, m * (input_dim + 1))
    J = torch.zeros(batch_size, m * (input_dim + 1), device=X.device)
    
    # 计算所有样本的 <w_i, x> 和 ReLU 激活
    relu_input = X @ model.W  # (batch_size, m)
    relu_output = torch.relu(relu_input)  # (batch_size, m)
    
    # 遍历输入维度和神经元，确保顺序正确
    for i in range(input_dim):
        for j in range(m):
            mask = relu_output[:, j] > 0  # 只选择 ReLU 激活大于0的元素
            # 修正后的索引填充顺序
            J[:, j*input_dim + i] = (model.a[j] * X[:, i] * mask) / m
    
    # 对 a_i 的部分并行计算 Jacobian
    J[:, m*input_dim:] = relu_output / m
    
    return J

In [19]:
def G_modified(X, model):
    # 开始计时
    # start = time.time()
    
    input_dim, m = model.W.shape  # m: 隐藏层神经元数量, input_dim: 输入维度
    batch_size = X.shape[0]       # batch_size: 批处理大小
    
    # 初始化 Jacobian 矩阵 J，大小为 (batch_size, m * (input_dim + 1))
    J = torch.zeros(batch_size, m * (input_dim + 1), device=X.device)
    
    # 计算所有样本的 <w_i, x> 和 ReLU 激活
    relu_input = X @ model.W  # (batch_size, m)
    relu_output = torch.relu(relu_input)  # (batch_size, m)
    # 计算模型输出
    # 对 w_i 的部分并行计算 Jacobian
    for j in range(m):
        mask = relu_output[:, j] > 0  # 只选择 ReLU 激活大于0的元素
        J[:, j*input_dim:(j+1)*input_dim] = (model.a[j] * X * mask.view(-1, 1)) / m
    
    # 对 a_i 的部分并行计算 Jacobian
    J[:, m*input_dim:] = relu_output / m

    # 结束计时
    # end = time.time()
    # print("优化后Time: ", end - start)
    
    return J

In [5]:
def G(X, Y, model):
    # 开始计时
    start = time.time()
    
    input_dim, m= model.W.shape  # m: 隐藏层神经元数量, input_dim: 输入维度
    batch_size = X.shape[0]       # batch_size: 批处理大小
    
    # 初始化 Jacobian 矩阵 J，大小为 (batch_size, m * (input_dim + 1))
    J = torch.zeros(batch_size, m * (input_dim + 1), device=X.device)
    
    for i in range(batch_size):
        x = X[i].reshape(1, -1)  # x 的尺寸为 (,input_dim)
        # print(x.shape, model.W.shape)
        relu_input = x @ model.W 
        relu_output = torch.relu(relu_input)   # 计算 relu(<w_i, x>)，结果为 (m,)
        for j in range(m):
            if relu_output[0, j] > 0:
                # print(J[i, j*input_dim: (j+1)*input_dim].shape)
                # print(relu_output.shape)
                J[i, j*input_dim: (j+1)*input_dim] = model.a[j] * x.view(-1) / m
        J[i, m*input_dim:] = relu_output / m
    # 结束计时
    end = time.time()
    # print("优化前Time: ", end - start)
    return J


TypeError: G_modified() takes 2 positional arguments but 3 were given

In [6]:
for X, Y in train_loader:
    X_tmp = X
    Y_tmp = Y
    break


In [14]:
U = (model.forward(X_tmp) - Y_tmp.reshape(-1, 1))
J_2 = G_modified(X_tmp, model)
#=========Remain to be Update=============
theta_0 = torch.cat([model.W.flatten(), model.a.flatten()]).reshape(-1, 1)
J_3 = torch.zeros(U.shape[0], theta_0.numel(), device=device)
for i in range(U.shape[0]):
    U[i].backward(retain_graph=True)
    J_3[i] = torch.cat([model.W.grad.flatten(), model.a.grad.flatten()])
    model.W.grad.zero_()
    model.a.grad.zero_()
#=========================================

优化后Time:  0.011729717254638672


In [15]:
# 打印范数差异
print("Norm difference between J_3 and J_2:")
print(torch.norm(J_3 - J_2))

# 逐元素比较 J_3 和 J_2
print("Element-wise difference between J_3 and J_2:")
print(J_3 - J_2)

# 找出最大差异的位置
max_diff = torch.max(torch.abs(J_3 - J_2))
max_diff_idx = torch.argmax(torch.abs(J_3 - J_2))
print(f"Maximum difference: {max_diff.item()} at position {max_diff_idx}")

Norm difference between J_3 and J_2:
tensor(8.3748, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)
Element-wise difference between J_3 and J_2:
tensor([[0.0000e+00, 2.0560e-02, 8.5018e-03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 1.1807e-02, 9.6919e-05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 2.7847e-03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 4.1264e-03, 1.5707e-03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 1.4330e-02, 5.4546e-03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], device='cuda:0', grad_fn=<SubBackward0>)
Maximum difference: 0.11241750419139862 at position 21646


In [20]:
lr = 1
epochs = 100

for epoch in tqdm(range(epochs)):
    flag = True
    for X, Y in train_loader:
        if flag:
            U = (model(X) - Y.reshape(-1, 1))
            flag = False
        # 增广模型中的参数
        theta_0 = torch.cat([model.W.flatten(), model.a.flatten()]).reshape(-1, 1)
        #=====Jacobian 矩阵=========================
        J = G_modified(X, model)
        #===========================================
        # 转置矩阵 J_T
        with torch.no_grad():
        # 计算 A = I + 2Δt * J_n * J_n^T，确保 A 在 CUDA 上
            A = torch.eye(J.shape[0], device=device) + 2 * lr * torch.mm(J, J.T)
            
            # 使用 Cholesky 分解计算 A 的逆矩阵，确保操作在 CUDA 上
            L = torch.linalg.cholesky(A)
            A_inv = torch.cholesky_inverse(L)
            
            # 更新 U^{n+1}
            U_1 = torch.mm(A_inv, U)
            
            # 更新 theta^{n+1}
            theta_1 = theta_0 - 2 * lr * torch.mm(J.T, U_1)
            
            # 更新模型参数，确保更新后的参数在 GPU 上
            model.W.data = theta_1[:model.W.numel()].reshape(model.W.shape)
            model.a.data = theta_1[model.W.numel():].reshape(model.a.shape)
            
            # 更新 U_n 和 theta_n
            U = U_1
        
    validate(model, X_train, Y_train, X_test, Y_test, epoch, is_recoard=False)

 10%|█         | 1/10 [00:00<00:08,  1.02it/s]

Epoch 1: Train Loss: 0.8559078574180603, Test Loss: 0.8777651190757751


 20%|██        | 2/10 [00:01<00:07,  1.03it/s]

Epoch 2: Train Loss: 0.824286937713623, Test Loss: 0.8392741680145264


 30%|███       | 3/10 [00:02<00:06,  1.02it/s]

Epoch 3: Train Loss: 0.750862181186676, Test Loss: 0.7655225992202759


 40%|████      | 4/10 [00:03<00:05,  1.04it/s]

Epoch 4: Train Loss: 0.6808584332466125, Test Loss: 0.6921696066856384


 50%|█████     | 5/10 [00:04<00:04,  1.04it/s]

Epoch 5: Train Loss: 0.6468988656997681, Test Loss: 0.6569773554801941


 60%|██████    | 6/10 [00:05<00:03,  1.04it/s]

Epoch 6: Train Loss: 0.6141281127929688, Test Loss: 0.6221404075622559


 70%|███████   | 7/10 [00:06<00:02,  1.04it/s]

Epoch 7: Train Loss: 0.5697447657585144, Test Loss: 0.5789356231689453


 80%|████████  | 8/10 [00:07<00:01,  1.05it/s]

Epoch 8: Train Loss: 0.5346863269805908, Test Loss: 0.5419045090675354


 90%|█████████ | 9/10 [00:08<00:00,  1.05it/s]

Epoch 9: Train Loss: 0.5036500692367554, Test Loss: 0.5074693560600281


100%|██████████| 10/10 [00:09<00:00,  1.04it/s]

Epoch 10: Train Loss: 0.47113943099975586, Test Loss: 0.4733397364616394



