## 均方误差 MSELoss

In [3]:
import torch
import torch.nn as nn

# Toy regression batch: the predictions track gradients, the targets are constants.
y_true = torch.tensor([3.0, -0.5, 2.0, 7.0])
y_pred = torch.tensor([2.5, 0.0, 2.1, 7.8], requires_grad=True)

# Built-in mean squared error between predictions and targets.
criterion = nn.MSELoss()
loss = criterion(input=y_pred, target=y_true)
print("MSE Loss: ", loss.item())

# Autograd stores d(loss)/d(y_pred) in y_pred.grad.
loss.backward()
print("gradient of y: ", y_pred.grad)


MSE Loss:  0.2875000834465027
gradient of y:  tensor([-0.2500,  0.2500,  0.0500,  0.4000])


In [5]:
# Hand-written MSE: the mean of the squared residuals.
# Uses y_pred / y_true created in the nn.MSELoss cell.
diff = y_pred - y_true
loss = diff.pow(2).mean()

print("MSE Loss: ", loss.item())

# Analytic gradient of the mean squared error: (2 / n) * (y_pred - y_true).
n = y_pred.size(0)
grad_manual = (2.0 / n) * (y_pred - y_true)

print("gradient manual: ", grad_manual)

MSE Loss:  0.2875000834465027
gradient manual:  tensor([-0.2500,  0.2500,  0.0500,  0.4000], grad_fn=<MulBackward0>)


## 交叉熵
二分类 

torch.nn.BCELoss()   

$Loss = - \frac{1}{n}\sum_{i = 1}^{n}[y_i\log(\hat{y_i}) + (1 - y_i)\log(1 - \hat{y_i})]$   

$\frac{\partial Loss}{\partial \hat{y_i}} = -\frac{1}{n}(\frac{y_i}{\hat{y_i}} - \frac{1 - y_i}{1 - \hat{y_i}})$

torch.nn.BCEWithLogitsLoss()

$Loss = - \frac{1}{n}\sum_{i = 1}^{n}[y_i \log(\sigma(x_i)) + (1 - y_i)\log(1 - \sigma(x_i))]$  

$\sigma(x) = \frac{1}{1 + e^{-x}}$  

$\frac{d\hat{y_i}}{dx_i} = \frac{e^{-x_i}}{(1 + e^{-x_i})^2} = \hat{y_i}(1 - \hat{y_i})$，其中 $\hat{y_i} = \sigma(x_i)$   

$\frac{\partial Loss}{\partial x_i} = \frac{\partial Loss}{\partial \hat{y_i}} \times \frac{d\hat{y_i}}{dx_i} = -\frac{1}{n}(y_i - \hat{y_i})$


In [16]:
import torch
import torch.nn as nn

# Predicted probabilities and their binary targets (float32, as BCELoss expects floats).
probs = torch.tensor([0.668, 0.231, 0.881], requires_grad=True)
labels = torch.tensor([1.0, 0.0, 1.0], dtype=torch.float32)

# Built-in binary cross entropy on probabilities.
criterion = nn.BCELoss()
loss = criterion(input=probs, target=labels)
print("BCELoss: ", loss.item())

# Autograd stores d(loss)/d(probs) in probs.grad.
loss.backward()
print("gradient: ", probs.grad)


BCELoss:  0.2642763555049896
gradient:  tensor([-0.4990,  0.4335, -0.3784])


In [8]:
import torch
import torch.nn as nn

# Raw model outputs (logits, no sigmoid applied yet) and their binary targets.
labels = torch.tensor([1.0, 0.0, 1.0])  # shape: [batch_size]
logits = torch.tensor([0.7, -1.2, 2.0], requires_grad=True)  # shape: [batch_size]

# BCEWithLogitsLoss applies the sigmoid to the logits internally.
criterion = nn.BCEWithLogitsLoss()
loss = criterion(input=logits, target=labels)
print("Binary Cross Entropy Loss:", loss.item())

# Backpropagation: autograd stores d(loss)/d(logits) in logits.grad.
loss.backward()
print("logits 的梯度：", logits.grad)


Binary Cross Entropy Loss: 0.2644655108451843
logits 的梯度： tensor([-0.1106,  0.0772, -0.0397])


In [None]:
# Hand-written binary cross entropy.
# A batch of predicted probabilities (already passed through a sigmoid) and the true labels.
y_pred = torch.tensor([0.668, 0.231, 0.881], requires_grad=True)  # predicted probabilities
y_true = torch.tensor([1.0, 0.0, 1.0])  # labels
n = y_pred.shape[0]
epsilon = 1e-7  # keeps log() away from log(0)
loss = -torch.mean(y_true * torch.log(y_pred + epsilon) + (1 - y_true) * torch.log(1 - y_pred + epsilon))
print("Loss: ", loss.item())

# Closed form: dLoss/dy_pred_i = -(y_i / y_pred_i - (1 - y_i) / (1 - y_pred_i)) / n
gradient_manual = -(y_true/y_pred - (1 - y_true)/(1 - y_pred)) / n
print("Gradient: ", gradient_manual)

# Autograd reference: the hand-derived gradient must agree with backpropagation.
loss.backward()
print("Autograd gradient:", y_pred.grad)
# epsilon only perturbs the autograd result by ~1e-7, well inside this tolerance.
assert torch.allclose(gradient_manual.detach(), y_pred.grad, atol=1e-5)

Loss:  0.2642762362957001
Gradient:  tensor([-0.4990,  0.4335, -0.3784], grad_fn=<DivBackward0>)
Autograd gradient: tensor([-0.4990,  0.4335, -0.3784])


In [19]:
import torch
import torch.nn as nn

# Assume the model outputs raw logits (no sigmoid applied yet).
logits = torch.tensor([0.7, -1.2, 2.0], requires_grad=True)  # shape: [batch_size]
labels = torch.tensor([1.0, 0.0, 1.0])  # shape: [batch_size]
n = logits.shape[0]
epsilon = 0  # zero, so it leaves the formula below unchanged

# Apply the sigmoid once and reuse it for both the loss and the manual gradient.
y_pred = torch.sigmoid(logits)

# Targets come from `labels` defined in this cell, so the cell runs on a fresh kernel.
loss = -torch.mean(labels * torch.log(y_pred + epsilon) + (1 - labels) * torch.log(1 - y_pred + epsilon))

print("Binary Cross Entropy Loss:", loss.item())

# Closed form: dLoss/dx_i = -(y_i - sigmoid(x_i)) / n
gradient_manual = - (labels - y_pred)/n

print("gradient_manual: ", gradient_manual)
# Backpropagation
loss.backward()
print("logits 的梯度：", logits.grad)

Binary Cross Entropy Loss: 0.2644655704498291
gradient_manual:  tensor([-0.1106,  0.0772, -0.0397], grad_fn=<DivBackward0>)
logits 的梯度： tensor([-0.1106,  0.0772, -0.0397])


## 多分类
Loss function，我们不妨只考虑单个样本  

有C类

z = [z1, z2, z3,...,z_C]

y = [y1, y2, y3,...,y_C]

$y_i = \frac{e^{z_i}}{\sum_{j \in C}e^{z_j}}$

$Loss = - \log y_k$ k为真实标签

$\frac{d Loss}{d y_k} = - \frac{1}{y_k}$

$\frac{\partial y_k}{\partial z_k} = \frac{e^{z_k}\sum_{j \in C}e^{z_j} - e^{z_k}e^{z_k}}{(\sum_{j \in C}e^{z_j})^2} = y_k(1 - y_k)$

$\frac{\partial y_k}{\partial z_i} = -\frac{e^{z_k}e^{z_i}}{(\sum_{j \in C}e^{z_j})^2} = -y_ky_i \quad (i \neq k)$

$\frac{\partial Loss}{\partial z_k} = y_k - 1$

$\frac{\partial Loss}{\partial z_i} = y_i \quad (i \neq k)$

In [3]:
import torch
import torch.nn as nn

# Example logits (unnormalized scores) and the class index of each sample.
labels = torch.tensor([1, 2])
logits = torch.tensor(
    [[2.0, 0.5, 0.3],
     [0.2, 2.5, 0.3]],
    requires_grad=True,
)  # shape: [batch_size, num_classes]

# Built-in multi-class cross entropy, fed raw logits and class indices.
criterion = nn.CrossEntropyLoss()
loss = criterion(input=logits, target=labels)
print("Cross Entropy Loss:", loss.item())

# Backpropagation: autograd stores d(loss)/d(logits) in logits.grad.
loss.backward()
print("logits 的梯度:", logits.grad)


Cross Entropy Loss: 2.1160569190979004
logits 的梯度: tensor([[ 0.3557, -0.4206,  0.0650],
        [ 0.0414,  0.4129, -0.4543]])


In [7]:
def my_softmax(logits, verbose=False):
    """Row-wise softmax over dim=1 of a 2-D tensor of logits.

    Args:
        logits: tensor indexed as [row, class]; the softmax is taken over dim=1.
        verbose: when True, print exp(logits) and its row sums with and
            without keepdim, to show why keepdim=True is needed for the
            division to broadcast row by row.

    Returns:
        Tensor with the same shape as ``logits`` whose rows sum to 1.
    """
    if verbose:
        raw_exp = torch.exp(logits)
        print(raw_exp)
        print(torch.sum(raw_exp, dim=1))
        print(torch.sum(raw_exp, dim=1, keepdim=True))

    # Subtract the row maximum before exponentiating: softmax is invariant to
    # a per-row shift, and this keeps exp() from overflowing on large logits.
    row_max = logits.max(dim=1, keepdim=True).values.detach()
    exp_shifted = torch.exp(logits - row_max)
    return exp_shifted / exp_shifted.sum(dim=1, keepdim=True)

logits = torch.tensor([[2.0, 0.5, 0.3],
                       [0.2, 2.5, 0.3]], requires_grad=True)  # shape: [batch_size, num_classes]
my_softmax(logits, verbose=True)

tensor([[ 7.3891,  1.6487,  1.3499],
        [ 1.2214, 12.1825,  1.3499]], grad_fn=<ExpBackward0>)
tensor([10.3876, 14.7538], grad_fn=<SumBackward1>)
tensor([[10.3876],
        [14.7538]], grad_fn=<SumBackward1>)


tensor([[0.7113, 0.1587, 0.1299],
        [0.0828, 0.8257, 0.0915]], grad_fn=<DivBackward0>)

In [6]:
import torch
# Hand-written multi-class cross entropy.

# Example logits (unnormalized scores) and labels.
logits = torch.tensor([[2.0, 0.5, 0.3],
                       [0.2, 2.5, 0.3]], requires_grad=True)  # shape: [batch_size, num_classes]
labels = torch.tensor([1, 2])  # class index of each sample
n = logits.shape[0]

softmax = torch.softmax(logits, dim=1)
mysoft = my_softmax(logits)
print("my softmax: ", mysoft)
print("softmax: ", softmax)
print(labels.shape)
print(labels.unsqueeze(1).shape)

# Version 1: pick the predicted probability of the true class, then average -log.
probs = softmax.gather(dim=1, index = labels.unsqueeze(1)).squeeze()
print(probs)
loss = -torch.mean(torch.log(probs))
print(loss)


# Version 2: expand -log(softmax(z)_k) into log(sum_j exp(z_j)) - z_k.
index_loss = logits.gather(dim=1, index=labels.unsqueeze(1)).squeeze()
other_loss = torch.log(torch.sum(torch.exp(logits), dim = 1))
print(index_loss)
print(other_loss)
loss = torch.mean(other_loss - index_loss)
print(loss)

# Manual gradient: (softmax - one_hot(labels)) / n.
# Work on a copy so the in-place updates below do not overwrite `softmax`.
gradient_manual = softmax.clone()
gradient_manual[torch.arange(n), labels] -= 1
gradient_manual /= n
print("mannual gradient: ", gradient_manual)
loss.backward()
print("auto gradient: ", logits.grad)
# The hand-derived gradient must agree with backpropagation.
assert torch.allclose(gradient_manual.detach(), logits.grad, atol=1e-5)

my softmax:  tensor([[0.7113, 0.1587, 0.1299],
        [0.0828, 0.8257, 0.0915]], grad_fn=<DivBackward0>)
softmax:  tensor([[0.7113, 0.1587, 0.1299],
        [0.0828, 0.8257, 0.0915]], grad_fn=<SoftmaxBackward0>)
torch.Size([2])
torch.Size([2, 1])
tensor([0.1587, 0.0915], grad_fn=<SqueezeBackward0>)
tensor(2.1161, grad_fn=<NegBackward0>)
tensor([0.5000, 0.3000], grad_fn=<SqueezeBackward0>)
tensor([2.3406, 2.6915], grad_fn=<LogBackward0>)
tensor(2.1161, grad_fn=<MeanBackward0>)
mannual gradient:  tensor([[ 0.3557, -0.4206,  0.0650],
        [ 0.0414,  0.4129, -0.4543]], grad_fn=<DivBackward0>)
auto gradient:  tensor([[ 0.3557, -0.4206,  0.0650],
        [ 0.0414,  0.4129, -0.4543]])
