## Layer Normalization

$x \in \mathbf{R}^d$

$\mu = \frac{1}{d}\sum_{i}^dx_i$

$\sigma ^ 2 = \frac{1}{d}\sum_{i}^d(x_i - \mu)^2$

$\hat{x_i} = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}}$

$LayerNorm(x_i) = \gamma_i \hat{x_i} + \beta_i$

In [4]:
# LayerNorm
import torch
import torch.nn as nn

x = torch.randn(2, 3, 4)  # [B, S, E]

# LayerNorm 对 E 归一化
ln = nn.LayerNorm(4)
y = ln(x)
print(y.shape)  # ✅ [2, 5, 10]
print(y)
print(y[0, 0].mean(), y[0, 0].std())  # ≈ 0, ≈ 1

# BatchNorm1d 直接用会报错（因为输入不是 [B, C, L]）
# bn = nn.BatchNorm1d(5)
# try:
#     y = bn(x)  
# except Exception as e:
#     print("BatchNorm1d failed:", e)


torch.Size([2, 3, 4])
tensor([[[-1.1780, -0.3379,  1.5793, -0.0634],
         [ 0.7418,  0.3202,  0.6486, -1.7106],
         [ 0.5709, -1.6080, -0.0061,  1.0432]],

        [[ 1.1907, -0.1755, -1.5166,  0.5014],
         [ 0.4835, -0.2095,  1.2203, -1.4944],
         [-0.0659, -1.5755,  1.1090,  0.5324]]],
       grad_fn=<NativeLayerNormBackward0>)
tensor(3.1665e-08, grad_fn=<MeanBackward0>) tensor(1.1547, grad_fn=<StdBackward0>)


In [17]:
import torch
import torch.nn as nn

# 假设每个样本有 10 个特征，batch size 为 4
# x = torch.tensor([[1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#                   [11.0, 12, 13, 14, 15, 16, 17, 18, 19, 20],
#                   [21.0, 22, 23, 24, 25, 26, 27, 28, 29, 30],
#                   [31.0, 32, 33, 34, 35, 36, 37, 38, 39, 40]],
#                  dtype=torch.float32)  # shape: [4, 10]

# # 创建 BatchNorm1d，作用在 10 维特征上
# bn = nn.BatchNorm1d(10)

x = torch.randn(4, 3, 4)  # [B, S, E]
bn = nn.BatchNorm1d(3)
bn2 = nn.BatchNorm1d(3)
# 设置为训练模式
bn.train()
bn2.train()

# 执行归一化
out = bn(x)

y = x[0:2]
print(y)

out_2 = bn2(y)

# 打印结果
print("输入 x:\n", x)
print("\n归一化后输出 out:\n", out)
print("\n归一化后输出 out_2:\n", out_2)
print("\n可学习参数 gamma (weight):\n", bn.weight.data)
print("可学习参数 beta (bias):\n", bn.bias.data)


tensor([[[ 0.5926, -2.3817, -0.1639,  0.1778],
         [ 0.7832, -1.3496, -0.0598, -1.7809],
         [-0.7037, -0.5313,  0.2901, -0.2698]],

        [[ 0.8184,  1.2024, -0.8448, -0.0255],
         [ 0.5303,  1.1668, -0.0562, -1.9919],
         [-0.9228, -0.4486,  1.2760, -0.7444]]])
输入 x:
 tensor([[[ 0.5926, -2.3817, -0.1639,  0.1778],
         [ 0.7832, -1.3496, -0.0598, -1.7809],
         [-0.7037, -0.5313,  0.2901, -0.2698]],

        [[ 0.8184,  1.2024, -0.8448, -0.0255],
         [ 0.5303,  1.1668, -0.0562, -1.9919],
         [-0.9228, -0.4486,  1.2760, -0.7444]],

        [[-0.3610, -0.9783, -0.9163, -0.6306],
         [-0.7238, -0.0909, -1.3626, -0.6182],
         [-0.2154,  0.3414, -0.9250, -1.4365]],

        [[-0.7154, -2.0548,  0.4197,  0.0632],
         [-0.5378,  0.3008, -0.3595, -0.2863],
         [ 0.5011, -0.4610, -0.1347, -1.0008]]])

归一化后输出 out:
 tensor([[[ 1.0191e+00, -2.1549e+00,  2.1181e-01,  5.7646e-01],
         [ 1.3691e+00, -1.0940e+00,  3.9557e-01, -1.5921e+