In [2]:
import time
import torch
from torch import nn,optim
import torch.nn.functional as F

import sys
# Add the parent directory to the module search path
# (presumably where d2lzh_pytorch lives -- confirm against the repo layout).
sys.path.append('..')
import d2lzh_pytorch as d2l
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def batch_norm(is_training,X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``X`` and return the output with updated running statistics.

    Args:
        is_training: When true, normalize with the statistics of ``X`` itself
            and update the running statistics; when false, normalize with the
            ``moving_mean`` / ``moving_var`` passed in and leave them untouched.
        X: Input tensor. In training mode it must be 2-D (statistics are taken
            over dim 0) or 4-D (statistics are taken over dims 0, 2 and 3).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean carried over from previous calls.
        moving_var: Running variance carried over from previous calls.
        eps: Small constant added to the variance before the square root.
        momentum: Weight given to the previous running value; the new batch
            statistic gets weight ``1 - momentum``.

    Returns:
        A tuple ``(Y, moving_mean, moving_var)`` where
        ``Y = gamma * X_hat + beta``. In training mode the returned running
        statistics are new tensors that are not part of the autograd graph.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: normalize with the running statistics that were passed in.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2,4)
        if len(X.shape) == 2:
            # Fully connected case: one mean/variance per column.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Convolutional case: one mean/variance per index of dim 1;
            # keepdim=True keeps the result broadcastable against X.
            mean = X.mean(dim=(0,2,3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0,2,3), keepdim=True)
        # Training: normalize with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping, not something to
        # differentiate through. Updating them outside autograd keeps the
        # returned tensors from holding on to this step's graph (and, through
        # the previous running value, to every earlier step's graph).
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    # Learnable scale and shift.
    Y = gamma * X_hat + beta
    return Y,moving_mean,moving_var

In [4]:
class BatchNorm(nn.Module):
    """Batch-normalization layer that keeps running statistics for inference.

    ``gamma`` (scale) and ``beta`` (shift) are learnable parameters.
    ``moving_mean`` and ``moving_var`` are registered as buffers: they receive
    no gradient updates, but they are included in ``state_dict()`` and follow
    the module through ``.to()`` / ``.cuda()``.
    """
    def __init__(self,num_features,num_dims):
        """Create the layer.

        Args:
            num_features: Number of features (2-D input) or channels (4-D input).
            num_dims: 2 selects the shape ``(1, num_features)``; any other
                value selects ``(1, num_features, 1, 1)``.
        """
        super(BatchNorm,self).__init__()
        if num_dims == 2:
            shape = (1,num_features)
        else:
            shape = (1,num_features,1,1)
        # Learnable: updated by the optimizer.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Not learnable: running statistics. Registering them as buffers is
        # what makes them persist in checkpoints and move with the module.
        self.register_buffer('moving_mean', torch.zeros(shape))
        # The running variance starts at one so that an evaluation pass early
        # in training does not divide by a near-zero standard deviation.
        self.register_buffer('moving_var', torch.ones(shape))
    def forward(self,X):
        """Normalize ``X`` and, in training mode, refresh the running statistics.

        Args:
            X: Input tensor passed on to ``batch_norm``.

        Returns:
            The normalized, scaled and shifted tensor.
        """
        # Keep the running statistics on the same device as the input.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        Y,moving_mean,moving_var = batch_norm(self.training,X,self.gamma,
                                              self.beta,self.moving_mean,
                                              self.moving_var,eps=1e-5,momentum=0.9)
        # Store the statistics detached so the module never keeps a reference
        # to a previous step's autograd graph.
        self.moving_mean = moving_mean.detach()
        self.moving_var = moving_var.detach()
        return Y

In [5]:
# LeNet-style network with a BatchNorm layer in front of every sigmoid.
# The 16*4*4 input size of the first linear layer presumably corresponds to
# 28x28 single-channel images (two 5x5 convolutions and two 2x2 poolings) --
# confirm against the data loader.
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten the feature maps before the fully connected stage
    d2l.FlattenLayer(),
    # Fully connected stage 1
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(120, num_dims=2),
    nn.Sigmoid(),
    # Fully connected stage 2
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    # Output layer: 10 scores
    nn.Linear(in_features=84, out_features=10),
)

In [6]:
# Hyperparameters
batch_size = 256
lr = 0.001
num_epochs = 5

# Data iterators for the training and test splits
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Optimize every parameter of the network with Adam, then run training
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.0076, train acc 0.777, test acc 0.830, time 11.0 sec
epoch 2, loss 0.4735, train acc 0.858, test acc 0.842, time 9.3 sec
epoch 3, loss 0.3798, train acc 0.874, test acc 0.858, time 9.2 sec
epoch 4, loss 0.3430, train acc 0.882, test acc 0.866, time 9.6 sec
epoch 5, loss 0.3187, train acc 0.889, test acc 0.818, time 9.4 sec


In [7]:
# Learned scale of the first BatchNorm layer, flattened to one dimension
net[1].gamma.view(-1)

tensor([1.0567, 1.0016, 0.9698, 0.9174, 1.1968, 0.9967], device='cuda:0',
       grad_fn=<ViewBackward>)

In [8]:
# Learned shift of the first BatchNorm layer, flattened to one dimension
net[1].beta.view(-1)

tensor([-0.6612, -0.7462, -0.5699, -0.3830,  0.2342, -0.1123], device='cuda:0',
       grad_fn=<ViewBackward>)

In [None]:
import matpl