In [4]:
import torch
from modules import Simple_Perceptron
from data import grip_data
import random
import time
import numpy as np
from utils import validate
from tqdm import tqdm

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # 如果有多个GPU
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 设置随机数种子，例如：
set_seed(42)

In [2]:
# device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Simple_Perceptron.Simple_Perceptron(41, 100, 1)
model = model.to(device)
train_loader, X_train, Y_train, X_test, Y_test, D = grip_data.load_data(l=64, device=device)
device

device(type='cuda')

In [21]:
def G_modified(X, model):
    # 开始计时
    # start = time.time()
    
    input_dim, m = model.W.shape  # m: 隐藏层神经元数量, input_dim: 输入维度
    batch_size = X.shape[0]       # batch_size: 批处理大小
    
    # 初始化 Jacobian 矩阵 J，大小为 (batch_size, m * (input_dim + 1))
    J = torch.zeros(batch_size, m * (input_dim + 1), device=X.device)
    
    # 计算所有样本的 <w_i, x> 和 ReLU 激活
    relu_input = X @ model.W  # (batch_size, m)
    relu_output = torch.relu(relu_input)  # (batch_size, m)
    
    # 对 w_i 的部分并行计算 Jacobian
    for j in range(m):
        mask = relu_output[:, j] > 0  # 只选择 ReLU 激活大于0的元素
        J[:, j*input_dim:(j+1)*input_dim] = (model.a[j] * X * mask.view(-1, 1)) / m
    
    # 对 a_i 的部分并行计算 Jacobian
    J[:, m*input_dim:] = relu_output / m

    # # 结束计时
    # end = time.time()
    # print("优化后Time: ", end - start)
    
    return J

In [19]:
def G(X, Y, model):
    # 开始计时
    start = time.time()
    
    input_dim, m= model.W.shape  # m: 隐藏层神经元数量, input_dim: 输入维度
    batch_size = X.shape[0]       # batch_size: 批处理大小
    
    # 初始化 Jacobian 矩阵 J，大小为 (batch_size, m * (input_dim + 1))
    J = torch.zeros(batch_size, m * (input_dim + 1), device=X.device)
    
    for i in range(batch_size):
        x = X[i].reshape(1, -1)  # x 的尺寸为 (,input_dim)
        # print(x.shape, model.W.shape)
        relu_input = x @ model.W 
        relu_output = torch.relu(relu_input)   # 计算 relu(<w_i, x>)，结果为 (m,)
        for j in range(m):
            if relu_output[0, j] > 0:
                # print(J[i, j*input_dim: (j+1)*input_dim].shape)
                # print(relu_output.shape)
                J[i, j*input_dim: (j+1)*input_dim] = model.a[j] * x.view(-1) / m
        J[i, m*input_dim:] = relu_output / m
    # 结束计时
    end = time.time()
    # print("优化前Time: ", end - start)
    return J

for X, Y in train_loader:
    U = (model(X) - Y.reshape(-1, 1))
    J_1 = G(X, Y, model)
    J_2 = G_modified(X, Y, model)
    U = (model.forward(X) - Y.reshape(-1, 1))
    theta_0 = torch.cat([model.W.flatten(), model.a.flatten()]).reshape(-1, 1)
    J_3 = torch.zeros(U.shape[0], theta_0.numel(), device=device)
    #=========Remain to be Update=============
    for i in range(U.shape[0]):
        U[i].backward(retain_graph=True)
        J_3[i] = torch.cat([model.W.grad.flatten(), model.a.grad.flatten()])
        model.W.grad.zero_()
        model.a.grad.zero_()
    
    print(torch.norm(J_3 - J_2))
    break

tensor(179.6503, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)


In [23]:
lr = 1
epochs = 10

for epoch in tqdm(range(epochs)):
    flag = True
    for X, Y in train_loader:
        if flag:
            U = (model(X) - Y.reshape(-1, 1))
            flag = False
        # 增广模型中的参数
        theta_0 = torch.cat([model.W.flatten(), model.a.flatten()]).reshape(-1, 1)
        #=====Jacobian 矩阵=========================
        J = G_modified(X, model)
        #===========================================
        # 转置矩阵 J_T
        with torch.no_grad():
            J_T = J.T
            # 使用Cholesky分解来替代直接逆矩阵计算
            A = torch.eye(theta_0.numel(), device=device) + 2 * lr * torch.mm(J_T, J)
            # 进行 Cholesky 分解
            L = torch.linalg.cholesky(A)
            # 计算 A 的逆
            A_inv = torch.cholesky_inverse(L)
            # 更新 theta_1
            theta_1 = theta_0 - 2 * lr * torch.mm(torch.mm(A_inv, J_T), U)
            # 计算更新量
            U -= (2 * lr * (J @ A_inv) @ J_T) @ U
            # 更新参数
            model.W.data = theta_1[:model.W.numel()].reshape(model.W.shape)
            model.a.data = theta_1[model.W.numel():].reshape(model.a.shape) 
    validate(model, X_train, Y_train, X_test, Y_test, epoch, is_recoard=False)

 10%|█         | 1/10 [00:01<00:17,  1.90s/it]

Epoch 1: Train Loss: 0.42189860343933105, Test Loss: 0.43460991978645325


 20%|██        | 2/10 [00:03<00:14,  1.82s/it]

Epoch 2: Train Loss: 0.4833986461162567, Test Loss: 0.5065326690673828


 30%|███       | 3/10 [00:05<00:12,  1.80s/it]

Epoch 3: Train Loss: 0.4438907206058502, Test Loss: 0.4669342637062073


 40%|████      | 4/10 [00:07<00:10,  1.79s/it]

Epoch 4: Train Loss: 0.46449196338653564, Test Loss: 0.48483845591545105


 50%|█████     | 5/10 [00:08<00:08,  1.78s/it]

Epoch 5: Train Loss: 0.5468817353248596, Test Loss: 0.5971066951751709


 60%|██████    | 6/10 [00:10<00:07,  1.78s/it]

Epoch 6: Train Loss: 0.5308660864830017, Test Loss: 0.5275813341140747


 70%|███████   | 7/10 [00:12<00:05,  1.77s/it]

Epoch 7: Train Loss: 0.490449458360672, Test Loss: 0.5030314922332764


 80%|████████  | 8/10 [00:14<00:03,  1.78s/it]

Epoch 8: Train Loss: 0.48937782645225525, Test Loss: 0.5476897358894348


 90%|█████████ | 9/10 [00:16<00:01,  1.78s/it]

Epoch 9: Train Loss: 0.4861034154891968, Test Loss: 0.4984762966632843


100%|██████████| 10/10 [00:17<00:00,  1.78s/it]

Epoch 10: Train Loss: 0.4483572244644165, Test Loss: 0.42316755652427673



