In [6]:
import numpy as np
# Define the network parameters
# --- 定义确定的网络参数，便于调试和验证 ---
D, H, C = 3, 4, 2  # 输入维度(D=3), 隐藏层大小(H=4), 输出类别数(C=2)

# 手动设定固定参数（非随机）
W = np.array([
    [1.0, 0.5, -0.3],   # 隐藏层第1个神经元的权重
    [0.2, -1.0, 0.8],   # 隐藏层第2个神经元的权重
    [0.4, 0.7, -0.9],   # 隐藏层第3个神经元的权重
    [-0.6, 0.1, 0.5]    # 隐藏层第4个神经元的权重
], dtype=np.float32)     # 形状 (4,3)

b1 = np.array([0.1, -0.2, 0.3, -0.4], dtype=np.float32)  # 隐藏层偏置 (4,)

U = np.array([
    [0.5, -0.3, 0.2, 0.1],   # 输出层第1个类别的权重
    [-0.4, 0.6, 0.7, -0.8]   # 输出层第2个类别的权重
], dtype=np.float32)          # 形状 (2,4)

b2 = np.array([0.2, -0.1], dtype=np.float32)  # 输出层偏置 (2,)

x = np.array([0.5, -1.0, 0.8], dtype=np.float32)  # 输入向量 (3,)

y_true = np.array([0, 1], dtype=np.float32)       # 真实标签（第二类）

# Define the forward pass
def relu(z):
    return np.maximum(0, z)

def softmax(theta):
    exp_theta = np.exp(theta - np.max(theta))
    return exp_theta / np.sum(exp_theta)

def cross_entropy(y, y_true):
    return -np.sum(y_true * np.log(y))

def forward(x, W, b1, U, b2):
    z = W @ x + b1
    h = relu(z)
    theta = U @ h + b2
    y = softmax(theta)
    loss = cross_entropy(y, y_true)
    return loss, z, h, theta, y

# Compute analytical gradients
loss, z, h, theta, y = forward(x, W, b1, U, b2)
dL_dtheta = y - y_true
dL_dU = np.outer(dL_dtheta, h)
dL_db2 = dL_dtheta
dL_dh = U.T @ dL_dtheta
dL_dz = dL_dh * (z > 0)
dL_dW = np.outer(dL_dz, x)
dL_db1 = dL_dz
dL_dx = W.T @ dL_dz

# Numerical gradient approximation
h_numerical = 1e-5

def numerical_gradient(f, param):
    param_plus = param + h_numerical
    param_minus = param - h_numerical
    return (f(param_plus) - f(param_minus)) / (2 * h_numerical)

# Compute numerical gradients
numerical_dL_dU = np.zeros_like(U)
for i in range(U.shape[0]):
    for j in range(U.shape[1]):
        def f_U(u):
            U_temp = U.copy()
            U_temp[i, j] = u
            loss, _, _, _, _ = forward(x, W, b1, U_temp, b2)
            return loss
        numerical_dL_dU[i, j] = numerical_gradient(f_U, U[i, j])

numerical_dL_db2 = np.zeros_like(b2)
for i in range(b2.shape[0]):
    def f_b2(b):
        b2_temp = b2.copy()
        b2_temp[i] = b
        loss, _, _, _, _ = forward(x, W, b1, U, b2_temp)
        return loss
    numerical_dL_db2[i] = numerical_gradient(f_b2, b2[i])

numerical_dL_dW = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        def f_W(w):
            W_temp = W.copy()
            W_temp[i, j] = w
            loss, _, _, _, _ = forward(x, W_temp, b1, U, b2)
            return loss
        numerical_dL_dW[i, j] = numerical_gradient(f_W, W[i, j])

numerical_dL_db1 = np.zeros_like(b1)
for i in range(b1.shape[0]):
    def f_b1(b):
        b1_temp = b1.copy()
        b1_temp[i] = b
        loss, _, _, _, _ = forward(x, W, b1_temp, U, b2)
        return loss
    numerical_dL_db1[i] = numerical_gradient(f_b1, b1[i])

numerical_dL_dx = np.zeros_like(x)
for i in range(x.shape[0]):
    def f_x(x_val):
        x_temp = x.copy()
        x_temp[i] = x_val
        loss, _, _, _, _ = forward(x_temp, W, b1, U, b2)
        return loss
    numerical_dL_dx[i] = numerical_gradient(f_x, x[i])

# Compare analytical and numerical gradients
print("Analytical dL_dU:\n", dL_dU)
print("Numerical dL_dU:\n", numerical_dL_dU)
print("Analytical dL_db2:\n", dL_db2)
print("Numerical dL_db2:\n", numerical_dL_db2)
print("Analytical dL_dW:\n", dL_dW)
print("Numerical dL_dW:\n", numerical_dL_dW)
print("Analytical dL_db1:\n", dL_db1)
print("Numerical dL_db1:\n", numerical_dL_db1)
print("Analytical dL_dx:\n", dL_dx)
print("Numerical dL_dx:\n", numerical_dL_dx)

Analytical dL_dU:
 [[ 0.          0.38865328  0.          0.        ]
 [-0.         -0.3886532  -0.         -0.        ]]
Numerical dL_dU:
 [[ 0.         0.3874302  0.         0.       ]
 [ 0.        -0.3874302  0.         0.       ]]
Analytical dL_db2:
 [ 0.25237226 -0.2523722 ]
Numerical dL_db2:
 [ 0.25480986 -0.2503395 ]
Analytical dL_dW:
 [[ 0.         -0.          0.        ]
 [-0.11356751  0.22713502 -0.18170802]
 [-0.          0.         -0.        ]
 [ 0.         -0.          0.        ]]
Numerical dL_dW:
 [[ 0.          0.          0.        ]
 [-0.11175871  0.22798777 -0.18328428]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Analytical dL_db1:
 [ 0.         -0.22713502 -0.          0.        ]
Numerical dL_db1:
 [ 0.         -0.22798777  0.          0.        ]
Analytical dL_dx:
 [-0.04542701  0.22713502 -0.18170802]
Numerical dL_dx:
 [-0.04917383  0.22798777 -0.18328428]


In [7]:
import torch

# Define the network parameters
W_torch = torch.tensor(W, requires_grad=True)
b1_torch = torch.tensor(b1, requires_grad=True)
U_torch = torch.tensor(U, requires_grad=True)
b2_torch = torch.tensor(b2, requires_grad=True)
x_torch = torch.tensor(x, requires_grad=True)
y_true_torch = torch.tensor(y_true)

# Forward pass
z_torch = W_torch @ x_torch + b1_torch
h_torch = torch.relu(z_torch)
theta_torch = U_torch @ h_torch + b2_torch
y_torch = torch.softmax(theta_torch, dim=0)
loss_torch = -torch.sum(y_true_torch * torch.log(y_torch))

# Backward pass
loss_torch.backward()

# Compare gradients
print("PyTorch dL_dU:\n", U_torch.grad)
print("PyTorch dL_db2:\n", b2_torch.grad)
print("PyTorch dL_dW:\n", W_torch.grad)
print("PyTorch dL_db1:\n", b1_torch.grad)
print("PyTorch dL_dx:\n", x_torch.grad)

PyTorch dL_dU:
 tensor([[ 0.0000,  0.3887,  0.0000,  0.0000],
        [-0.0000, -0.3887, -0.0000, -0.0000]])
PyTorch dL_db2:
 tensor([ 0.2524, -0.2524])
PyTorch dL_dW:
 tensor([[ 0.0000, -0.0000,  0.0000],
        [-0.1136,  0.2271, -0.1817],
        [ 0.0000, -0.0000,  0.0000],
        [ 0.0000, -0.0000,  0.0000]])
PyTorch dL_db1:
 tensor([ 0.0000, -0.2271,  0.0000,  0.0000])
PyTorch dL_dx:
 tensor([-0.0454,  0.2271, -0.1817])


In [8]:
import numpy as np

# --- 定义确定的网络参数，便于调试和验证 ---
D, H, C = 3, 4, 2  # 输入维度(D=3), 隐藏层大小(H=4), 输出类别数(C=2)

# 手动设定固定参数（非随机）
W = np.array([
    [1.0, 0.5, -0.3],   # 隐藏层第1个神经元的权重
    [0.2, -1.0, 0.8],   # 隐藏层第2个神经元的权重
    [0.4, 0.7, -0.9],   # 隐藏层第3个神经元的权重
    [-0.6, 0.1, 0.5]    # 隐藏层第4个神经元的权重
], dtype=np.float32)     # 形状 (4,3)

b1 = np.array([0.1, -0.2, 0.3, -0.4], dtype=np.float32)  # 隐藏层偏置 (4,)

U = np.array([
    [0.5, -0.3, 0.2, 0.1],   # 输出层第1个类别的权重
    [-0.4, 0.6, 0.7, -0.8]   # 输出层第2个类别的权重
], dtype=np.float32)          # 形状 (2,4)

b2 = np.array([0.2, -0.1], dtype=np.float32)  # 输出层偏置 (2,)

x = np.array([0.5, -1.0, 0.8], dtype=np.float32)  # 输入向量 (3,)

y_true = np.array([0, 1], dtype=np.float32)       # 真实标签（第二类）

# --- 前向传播函数（保持不变）---
def relu(z):
    return np.maximum(0, z)

def softmax(theta):
    exp_theta = np.exp(theta - np.max(theta))
    return exp_theta / np.sum(exp_theta)

def cross_entropy(y, y_true):
    return -np.sum(y_true * np.log(y + 1e-12))  # 添加微小值防止log(0)

def forward(x, W, b1, U, b2):
    z = W @ x + b1
    h = relu(z)
    theta = U @ h + b2
    y = softmax(theta)
    loss = cross_entropy(y, y_true)
    return loss, z, h, theta, y

# --- 计算解析梯度 ---
loss, z, h, theta, y = forward(x, W, b1, U, b2)

# 梯度计算步骤
dL_dtheta = y - y_true  # (2,)
dL_dU = np.outer(dL_dtheta, h)  # (2,4)
dL_db2 = dL_dtheta.copy()        # (2,)
dL_dh = U.T @ dL_dtheta         # (4,)
dL_dz = dL_dh * (z > 0)         # (4,)
dL_dW = np.outer(dL_dz, x)      # (4,3)
dL_db1 = dL_dz.copy()           # (4,)
dL_dx = W.T @ dL_dz            # (3,)

# --- 数值梯度计算函数（修正后）---
h_numerical = 1e-5

def numerical_gradient(f, param):
    param_plus = param + h_numerical
    param_minus = param - h_numerical
    return (f(param_plus) - f(param_minus)) / (2 * h_numerical)

# 计算数值梯度
def compute_numerical_gradients():
    numerical = {}
    
    # U的梯度
    numerical_dL_dU = np.zeros_like(U)
    for i in range(U.shape[0]):
        for j in range(U.shape[1]):
            def f_U(u_val):
                U_temp = U.copy()
                U_temp[i,j] = u_val
                loss,_,_,_,_ = forward(x, W, b1, U_temp, b2)
                return loss
            numerical_dL_dU[i,j] = numerical_gradient(f_U, U[i,j])
    numerical['U'] = numerical_dL_dU

    # b2的梯度
    numerical_dL_db2 = np.zeros_like(b2)
    for i in range(b2.shape[0]):
        def f_b2(b_val):
            b2_temp = b2.copy()
            b2_temp[i] = b_val
            loss,_,_,_,_ = forward(x, W, b1, U, b2_temp)
            return loss
        numerical_dL_db2[i] = numerical_gradient(f_b2, b2[i])
    numerical['b2'] = numerical_dL_db2

    # W的梯度（类似U）
    numerical_dL_dW = np.zeros_like(W)
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            def f_W(w_val):
                W_temp = W.copy()
                W_temp[i,j] = w_val
                loss,_,_,_,_ = forward(x, W_temp, b1, U, b2)
                return loss
            numerical_dL_dW[i,j] = numerical_gradient(f_W, W[i,j])
    numerical['W'] = numerical_dL_dW

    # b1的梯度（类似b2）
    numerical_dL_db1 = np.zeros_like(b1)
    for i in range(b1.shape[0]):
        def f_b1(b_val):
            b1_temp = b1.copy()
            b1_temp[i] = b_val
            loss,_,_,_,_ = forward(x, W, b1_temp, U, b2)
            return loss
        numerical_dL_db1[i] = numerical_gradient(f_b1, b1[i])
    numerical['b1'] = numerical_dL_db1

    # x的梯度
    numerical_dL_dx = np.zeros_like(x)
    for i in range(x.shape[0]):
        def f_x(x_val):
            x_temp = x.copy()
            x_temp[i] = x_val
            loss,_,_,_,_ = forward(x_temp, W, b1, U, b2)
            return loss
        numerical_dL_dx[i] = numerical_gradient(f_x, x[i])
    numerical['x'] = numerical_dL_dx
    
    return numerical

numerical = compute_numerical_gradients()

# --- 结果对比 ---
print("Analytical dL_dU:\n", dL_dU)
print("Numerical dL_dU:\n", numerical_dL_dU)
print("Analytical dL_db2:\n", dL_db2)
print("Numerical dL_db2:\n", numerical_dL_db2)
print("Analytical dL_dW:\n", dL_dW)
print("Numerical dL_dW:\n", numerical_dL_dW)
print("Analytical dL_db1:\n", dL_db1)
print("Numerical dL_db1:\n", numerical_dL_db1)
print("Analytical dL_dx:\n", dL_dx)
print("Numerical dL_dx:\n", numerical_dL_dx)

Analytical dL_dU:
 [[ 0.          0.38865328  0.          0.        ]
 [-0.         -0.3886532  -0.         -0.        ]]
Numerical dL_dU:
 [[ 0.         0.3874302  0.         0.       ]
 [ 0.        -0.3874302  0.         0.       ]]
Analytical dL_db2:
 [ 0.25237226 -0.2523722 ]
Numerical dL_db2:
 [ 0.25480986 -0.2503395 ]
Analytical dL_dW:
 [[ 0.         -0.          0.        ]
 [-0.11356751  0.22713502 -0.18170802]
 [-0.          0.         -0.        ]
 [ 0.         -0.          0.        ]]
Numerical dL_dW:
 [[ 0.          0.          0.        ]
 [-0.11175871  0.22798777 -0.18328428]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Analytical dL_db1:
 [ 0.         -0.22713502 -0.          0.        ]
Numerical dL_db1:
 [ 0.         -0.22798777  0.          0.        ]
Analytical dL_dx:
 [-0.04542701  0.22713502 -0.18170802]
Numerical dL_dx:
 [-0.04917383  0.22798777 -0.18328428]


In [9]:
import torch

# Define the network parameters
W_torch = torch.tensor(W, requires_grad=True)
b1_torch = torch.tensor(b1, requires_grad=True)
U_torch = torch.tensor(U, requires_grad=True)
b2_torch = torch.tensor(b2, requires_grad=True)
x_torch = torch.tensor(x, requires_grad=True)
y_true_torch = torch.tensor(y_true)

# Forward pass
z_torch = W_torch @ x_torch + b1_torch
h_torch = torch.relu(z_torch)
theta_torch = U_torch @ h_torch + b2_torch
y_torch = torch.softmax(theta_torch, dim=0)
loss_torch = -torch.sum(y_true_torch * torch.log(y_torch))

# Backward pass
loss_torch.backward()

# Compare gradients
print("PyTorch dL_dU:\n", U_torch.grad)
print("PyTorch dL_db2:\n", b2_torch.grad)
print("PyTorch dL_dW:\n", W_torch.grad)
print("PyTorch dL_db1:\n", b1_torch.grad)
print("PyTorch dL_dx:\n", x_torch.grad)





PyTorch dL_dU:
 tensor([[ 0.0000,  0.3887,  0.0000,  0.0000],
        [-0.0000, -0.3887, -0.0000, -0.0000]])
PyTorch dL_db2:
 tensor([ 0.2524, -0.2524])
PyTorch dL_dW:
 tensor([[ 0.0000, -0.0000,  0.0000],
        [-0.1136,  0.2271, -0.1817],
        [ 0.0000, -0.0000,  0.0000],
        [ 0.0000, -0.0000,  0.0000]])
PyTorch dL_db1:
 tensor([ 0.0000, -0.2271,  0.0000,  0.0000])
PyTorch dL_dx:
 tensor([-0.0454,  0.2271, -0.1817])
