In [46]:
import torch
import torch.nn as nn
pdist = nn.PairwiseDistance(p=2)

## batch norm
按通道维度计算平均值和方差
$$
y=\frac{x-E(x)}{\sqrt{Var(x)+\epsilon}}*\gamma+\beta
$$

In [47]:
inputs = torch.rand([16,32,64,64])
# print(inputs)
mean = torch.mean(inputs, [0,2,3], keepdim=True) # 按通道维度即32那个维度，计算那个维度对应的16*64*64个值的平均值
var = torch.var(inputs, [0,2,3], keepdim=True)
# print(mean)
# print(var)
bn = nn.BatchNorm2d(32)(inputs)
bn_ = (inputs-mean)/torch.sqrt(var+1e-5)
# print(bn[:,1,:,:])
# print(bn_)
print(torch.sum(torch.abs(bn-bn_)))    # 结果差距很小
print(torch.sum(pdist(bn, bn_)))

tensor(13.8286, grad_fn=<SumBackward0>)
tensor(2.0101, grad_fn=<SumBackward0>)


## Layer norm
避开了batch维度的大小限制，直接按照batch维度对其他几个维度计算平均值和方差
$$
y=\frac{x-E(x)}{\sqrt{Var(x)+\epsilon}}*\gamma+\beta
$$

In [48]:
inputs = torch.rand([16,32,64,64])
mean = torch.mean(inputs, [1,2,3], keepdim=True) # 按batch维度即16那个维度，计算那个维度对应的32*64*64个值的平均值
var = torch.var(inputs, [1,2,3], keepdim=True)
ln = nn.LayerNorm([32,64,64])(inputs)
ln_ = (inputs-mean)/torch.sqrt(var+1e-5)
print(torch.sum(torch.abs(ln-ln_)))    # 结果差距很小
print(torch.sum(pdist(ln, ln_)))

tensor(6.9151, grad_fn=<SumBackward0>)
tensor(1.0266, grad_fn=<SumBackward0>)


## Instance norm
 归一化的维度为[H,W]；根据N，C维度计算其他两个维度的平均值和方差，即计算H× W个数据的平均值和方差

In [49]:
inputs = torch.rand([16,32,64,64])
mean = torch.mean(inputs, [2,3], keepdim=True) # 按batch和channel维度即[16,32]那个维度，计算那个维度对应的64*64个值的平均值
var = torch.var(inputs, [2,3], keepdim=True)
In = nn.InstanceNorm2d(32)(inputs)
In_ = (inputs-mean)/torch.sqrt(var+1e-5)
print(torch.sum(torch.abs(In-In_)))    # 结果差距很小
print(torch.sum(pdist(In, In_)))

tensor(221.6718)
tensor(31.9452)


## Group norm
介于LN和IN之间，其首先将channel分为许多组（group），对每一组做归一化，即先将feature的维度由[N, C, H, W]reshape为[N, G，C//G , H, W]，归一化的维度为[C//G , H, W]

In [50]:
groups = 8
inputs = torch.rand([16,32,64,64])
inputs_ = inputs.reshape([16, groups, 32//groups, 64, 64])
mean = torch.mean(inputs_, [2,3,4], keepdim=True) # 按batch和group维度即[16,8]那个维度，计算那个维度对应的4*64*64个值的平均值
var = torch.var(inputs_, [2,3,4], keepdim=True)
gn = nn.GroupNorm(groups, 32)(inputs)
gn_ = (inputs_-mean)/torch.sqrt(var+1e-5)
gn_ = gn_.reshape([16, 32, 64,64])
print(torch.sum(torch.abs(gn-gn_)))    # 结果差距很小
print(torch.sum(pdist(In, In_)))

tensor(55.4126, grad_fn=<SumBackward0>)
tensor(31.9452)


batchNorm是在batch上，对小batchsize效果不好；
layerNorm在通道方向上，主要对RNN作用明显；
instanceNorm在图像像素上，用在风格化迁移；
GroupNorm将channel分组，然后再做归一化, 在batchsize<16的时候, 可以使用这种归一化；