# In [2]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine map.

    Each slice along the last dimension is normalized to zero mean and unit
    (biased) variance, then scaled by ``gamma`` and shifted by ``beta``.

    Args:
        num_features: Size of the last dimension of the input; ``gamma`` and
            ``beta`` are 1-D parameters of this length.
        esp: Small constant added to the variance so the denominator is never
            zero. (The name is a historical misspelling of "eps"; it is kept
            so existing keyword callers keep working.)
    """

    def __init__(self, num_features: int, esp: float = 1e-5):
        super().__init__()
        # Guards against division by zero when the variance is 0.
        self.esp = esp
        # gamma (scale) and beta (shift) let the model undo or adjust the
        # normalization, so the layer does not limit what it can represent.
        # Learnable scale, initialized to 1 (identity scaling).
        self.gamma = nn.Parameter(torch.ones(num_features))
        # Learnable shift, initialized to 0 (no shift).
        self.beta = nn.Parameter(torch.zeros(num_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last dimension and apply the affine map.

        Args:
            x: Input tensor whose last dimension must broadcast against
                ``gamma``/``beta`` (i.e. have size ``num_features``).

        Returns:
            A tensor of the same shape as ``x``.
        """
        # Statistics are taken along the feature (last) dimension.
        mean = x.mean(dim=-1, keepdim=True)
        # Layer norm uses the biased (population) variance, dividing by N.
        # torch.var defaults to the unbiased estimator (N - 1), which skews
        # the result and yields NaN when the last dimension has size 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        # Normalize the features.
        normed_x = (x - mean) / torch.sqrt(var + self.esp)
        # Apply the learnable scale and shift.
        return self.gamma * normed_x + self.beta

# Smoke test: run a random (2, 10) batch through the layer.
x = torch.rand(2, 10)
layer_norm = LayerNorm(10)
output = layer_norm(x)
# A bare `output` expression only displays in a REPL/notebook; print the shape
# explicitly so the check also works as a script. Prints torch.Size([2, 10]).
print(output.shape)

# Out: torch.Size([2, 10])