# LayerNormalization

LN 的提出，是为了解决 BN 的两个问题：
1. BN 不适合 RNN 这类动态网络：每个 batch 中各样本的序列长度并不固定。
2. batch_size 变小时，模型性能会急剧下降。

## 具体做法
对每个样本在其自身的特征维度上做归一化。例如对形状为 $B \times C \times F$ 的数据，每个样本分别在 C 和 F 两个维度上计算均值和方差并归一化。

## 注意事项
在CNN中，BN的性能还是比LN要好，但是最近很多人将自然语言领域的模型用来处理图像，所以还是会涉及到LN

In [20]:
# 模型优化之Layer Normalization
import torch as t
import torch.nn as nn
from torch import Tensor
def layer_norm(X: Tensor,
               gamma: Tensor,
               beta: Tensor,
               eps: float,
               ) -> Tensor:
    """Apply layer normalization independently to every sample in a batch.

    Each sample ``X[i]`` is standardized with the mean and the biased
    (population) variance taken over all of its non-batch dimensions, then
    scaled element-wise by ``gamma`` and shifted by ``beta``.

    Args:
        X: Input whose first dimension is the batch dimension, followed by
            at least one feature dimension.
        gamma: Scale factor, broadcastable against ``X`` (typically shaped
            like ``X.shape[1:]``).
        beta: Shift term, broadcastable against ``X`` (typically shaped like
            ``X.shape[1:]``).
        eps: Small constant added to the variance before the square root to
            avoid division by zero.

    Returns:
        The normalized, scaled and shifted tensor. It has the same shape as
        ``X`` when ``gamma`` and ``beta`` are shaped like ``X.shape[1:]``.

    Raises:
        ValueError: If ``X`` has fewer than two dimensions, i.e. there is no
            feature dimension to normalize over.
    """
    if X.dim() < 2:
        raise ValueError(
            "layer_norm expects an input with a batch dimension and at least "
            f"one feature dimension, got shape {tuple(X.shape)}"
        )

    # Statistics are per sample: reduce over everything except the batch dim.
    feature_dims = tuple(range(1, X.dim()))
    mean = X.mean(dim=feature_dims, keepdim=True)
    centered = X - mean
    # Biased variance (divide by N), reusing the already centered values.
    var = centered.pow(2).mean(dim=feature_dims, keepdim=True)

    X_hat = centered / t.sqrt(var + eps)
    # gamma/beta broadcast over the batch dimension; no explicit repeat needed.
    return X_hat * gamma + beta

In [21]:
class LayerNorm(nn.Module):
    """Layer normalization module with a learnable scale and shift.

    The module owns two trainable tensors of shape ``normal_shape``:
    ``gamma`` (scale, initialized to ones) and ``beta`` (shift, initialized
    to zeros). It keeps no running statistics.
    """

    def __init__(self, normal_shape: tuple[int], eps=1e-5) -> None:
        """Create the scale/shift parameters.

        Args:
            normal_shape: Shape of the ``gamma`` and ``beta`` parameters.
            eps: Value forwarded to ``layer_norm`` as its ``eps`` argument.
        """
        super().__init__()
        self.normal_shape = normal_shape
        self.eps = eps
        # Trainable scale (gamma) and shift (beta); starting at 1 and 0 means
        # the affine step is initially the identity.
        self.gamma = nn.Parameter(t.ones(normal_shape))
        self.beta = nn.Parameter(t.zeros(normal_shape))

    def forward(self, X: Tensor) -> Tensor:
        """Normalize ``X`` by delegating to the module-level ``layer_norm``.

        Args:
            X: Input batch passed through to ``layer_norm`` unchanged.

        Returns:
            Whatever ``layer_norm`` returns for ``X`` with this module's
            ``gamma``, ``beta`` and ``eps``.
        """
        return layer_norm(X, self.gamma, self.beta, self.eps)


In [22]:
# Random batch of two samples plus the two layers to compare: the
# hand-written LayerNorm (ln1) and PyTorch's built-in nn.LayerNorm (ln2),
# both configured with the shape [3, 3, 3].
X = t.randn(2, 3, 3, 3)
ln1, ln2 = LayerNorm([3, 3, 3]), nn.LayerNorm([3, 3, 3])


In [23]:
# Call the module itself rather than .forward() so that any registered
# forward hooks are run as well.
ln1(X)

tensor([[[[-2.0675,  0.6669, -2.7354],
          [ 0.6440,  1.2522,  0.2103],
          [ 0.1085,  0.6687, -0.8088]],

         [[-1.0286,  1.1523, -0.7071],
          [ 0.3461,  1.0671, -0.4448],
          [ 0.6964,  0.8086, -0.5977]],

         [[-0.9233,  0.6457, -0.6459],
          [-0.9472,  0.4827,  1.2731],
          [ 0.5590,  0.8756, -0.5509]]],


        [[[-1.4413,  2.6829, -0.1880],
          [ 0.9779,  1.3153, -1.8779],
          [ 0.0497,  0.4556, -0.7516]],

         [[ 0.8969,  0.4723, -0.1573],
          [-0.7768,  0.6440, -1.3729],
          [-0.3492,  0.4028, -0.5894]],

         [[ 1.5562, -0.8485,  0.9283],
          [-0.6250, -0.3974, -0.1383],
          [ 0.1068, -1.2142,  0.2392]]]], grad_fn=<AddBackward0>)

In [24]:
# Call the module itself rather than .forward() so that any registered
# forward hooks are run as well.
ln2(X)

tensor([[[[-2.0675,  0.6669, -2.7354],
          [ 0.6440,  1.2522,  0.2103],
          [ 0.1085,  0.6687, -0.8088]],

         [[-1.0286,  1.1523, -0.7071],
          [ 0.3461,  1.0671, -0.4448],
          [ 0.6964,  0.8086, -0.5977]],

         [[-0.9233,  0.6457, -0.6459],
          [-0.9472,  0.4827,  1.2731],
          [ 0.5590,  0.8756, -0.5509]]],


        [[[-1.4413,  2.6829, -0.1880],
          [ 0.9779,  1.3153, -1.8779],
          [ 0.0497,  0.4556, -0.7516]],

         [[ 0.8969,  0.4723, -0.1573],
          [-0.7768,  0.6440, -1.3729],
          [-0.3492,  0.4028, -0.5894]],

         [[ 1.5562, -0.8485,  0.9283],
          [-0.6250, -0.3974, -0.1383],
          [ 0.1068, -1.2142,  0.2392]]]], grad_fn=<NativeLayerNormBackward0>)