In [10]:
import time
import torch
from torch import nn, optim
import torch.nn.functional as F

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
def batch_norm(is_training,X,gamma,beta,moving_mean,moving_var,eps,momentum):
    if not is_training:
        # 预测模式下，直接使用传入的移动平均所得的均值和方差
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2,4)
        if len(X.shape) == 2:
            # 使用全连接层的情况
            mean = X.mean(dim=0)
            var = ((X - mean)**2).mean(dim=0)
        else:
            # 使用二维卷积层的情况。通道维度axis=1
            # 计算通道维上均值和方差
            # 这里保持X的形状以便后面可以做广播运算
            mean = X.mean(dim=0,keepdim=True).mean(dim=2,keepdim=True).mean(dim=3,keepdim=True)
            var = ((X - mean) ** 2).mean(dim=0, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
        # 训练模式下使用但该案的均值和方差做标准化
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # 更新移动平均的均值和方差
        moving_mean = momentum * moving_var + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y,moving_mean,moving_var

In [29]:
X = torch.rand(2,4)
mean = X.mean(dim=0)
var = ((X - mean)**2).mean(dim=0)
print(X)
print(((X - mean)**2))
print(mean)
print(var)

tensor([[0.1381, 0.1119, 0.9487, 0.7644],
        [0.6178, 0.9555, 0.4787, 0.8178]])
tensor([[0.0575, 0.1779, 0.0552, 0.0007],
        [0.0575, 0.1779, 0.0552, 0.0007]])
tensor([0.3780, 0.5337, 0.7137, 0.7911])
tensor([0.0575, 0.1779, 0.0552, 0.0007])


# BatchNorm层

In [12]:
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        if num_dims == 2: # num_dims对于全连接层和卷积层来说分别为2和4。
            shape = (1,num_features)
        else:
            shape = (1, num_features,1,1)
            
        #参与求梯度和迭代的拉伸和偏移参数，分别初始化成0和1
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.ones(shape))
        
        #不参与求梯度和迭代的变量，在内存上初始化为0
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)
    
    def forward(self,X):
        # 如果X不在内存上，将moving_mean和moving_var复制到X所在显存上
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
            
        # 保存更新过的moving_mean和moving_var, 
        # Module实例的traning属性默认为true, 调用.eval()后设成false
        Y,self.moving_mean, self.moving_var = batch_norm(
            self.training, 
            X, 
            self.gamma,
            self.beta,
            self.moving_mean,
            self.moving_var,
            eps=1e-5,
            momentum=0.9)
        return Y

#  使用批量归一化层的LeNet

In [13]:
net = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            BatchNorm(6, num_dims=4), # num_dims对于全连接层和卷积层来说分别为2和4。
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            BatchNorm(16, num_dims=4),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),
            d2l.FlattenLayer(),
            nn.Linear(16*4*4, 120),
            BatchNorm(120, num_dims=2),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            BatchNorm(84, num_dims=2),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

In [19]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)


training on  cpu
epoch 1, loss 1.0850, train acc 0.774, test acc 0.100, time 17.8 sec


KeyboardInterrupt: 

查看第一个批量归一化层学习到的拉伸参数gamma和偏移参数beta

In [21]:
net[1].gamma.view(-1,)
net[1].beta.view(-1,)

tensor([0.8527, 0.7832, 1.1020, 0.8823, 0.8849, 1.0511],
       grad_fn=<ViewBackward>)

# 简洁实现

Pytorch中nn模块定义的BatchNorm1d和BatchNorm2d类使用起来更加简单，二者分别用于全连接层和卷积层，都需要指定输入的num_features参数值。

In [None]:
net = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            nn.BatchNorm2d(6), #2d用于卷积层
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.BatchNorm2d(16),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2),
            d2l.FlattenLayer(),
            nn.Linear(16*4*4, 120),
            nn.BatchNorm1d(120), #1d用于全连接层
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.BatchNorm1d(84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )


In [None]:
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)
