In [2]:
import sys
sys.path.insert(0, '..')
import gluonbook as gb
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn

def batch_norm(X, gamma, beta, moving_mean, moving_var,
               eps, momentum):
    """Batch-normalize ``X`` and return the (possibly updated) running stats.

    The mode is taken from ``autograd.is_training()``:

    * Training: ``X`` is normalized with the mean and variance of the current
      batch, and the running statistics are blended as
      ``momentum * old + (1.0 - momentum) * batch``.
    * Otherwise: ``X`` is normalized with the supplied ``moving_mean`` and
      ``moving_var``, which are returned unchanged.

    Args:
        X: Input array. On the training path it must have 2 dimensions
            (output of a fully connected layer) or 4 dimensions (output of a
            2-D convolution layer).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean used on the prediction path.
        moving_var: Running variance used on the prediction path.
        eps: Constant added to the variance before the square root.
        momentum: Weight given to the old running statistics when updating.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y`` is
        ``gamma * X_hat + beta``.

    Raises:
        AssertionError: In training mode, if ``X`` has neither 2 nor 4
            dimensions.
    """
    if autograd.is_training():
        rank = len(X.shape)
        assert rank in (2, 4)
        if rank == 2:
            # After a fully connected layer: reduce over axis 0 so that one
            # mean/variance is obtained per feature.
            reduce_args = {'axis': 0}
        else:
            # After a 2-D convolution layer: one mean/variance per channel
            # (axis=1). keepdims keeps the result broadcastable against X.
            reduce_args = {'axis': (0, 2, 3), 'keepdims': True}
        batch_mean = X.mean(**reduce_args)
        batch_var = ((X - batch_mean) ** 2).mean(**reduce_args)
        # In training mode normalize with the statistics of the current batch.
        X_hat = (X - batch_mean) / nd.sqrt(batch_var + eps)
        # Update the running mean and variance.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * batch_mean
        moving_var = momentum * moving_var + (1.0 - momentum) * batch_var
    else:
        # Prediction mode: use the running statistics passed in as they are.
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    # Scale and shift.
    Y = gamma * X_hat + beta
    return (Y, moving_mean, moving_var)

  from ._conv import register_converters as _register_converters


In [3]:
class BatchNorm(nn.Block):
    """Batch-normalization layer that wraps the ``batch_norm`` function.

    ``gamma`` and ``beta`` are registered as trainable parameters. The running
    mean and variance are kept as plain arrays on the instance
    (``moving_mean`` / ``moving_variance``) and are overwritten with the values
    returned by ``batch_norm`` on every forward pass.

    Args:
        num_features: Number of features (2-D input) or channels (4-D input).
        num_dims: ``2`` selects the parameter shape ``(1, num_features)``;
            any other value selects ``(1, num_features, 1, 1)``.
        **kwargs: Forwarded to ``nn.Block.__init__``.
    """
    def __init__(self, num_features, num_dims, **kwargs):
        super(BatchNorm, self).__init__(**kwargs)
        shape = (1,num_features) if num_dims == 2 else (1,num_features,1,1)
        # Trainable parameters: beta (shift) starts at 0, gamma (scale) at 1.
        self.beta = self.params.get('beta', shape=shape, init=init.Zero())
        self.gamma = self.params.get('gamma', shape=shape, init=init.One())
        # Running statistics; they take no part in gradient computation and
        # are created without an explicit context.
        self.moving_mean = nd.zeros(shape)
        # The running variance starts at 1, not 0. With a zero start, a
        # prediction-mode forward pass before any training step would divide
        # by sqrt(eps) and blow the activations up, and the early running
        # estimates would be biased towards 0. Gluon's built-in nn.BatchNorm
        # also initializes its running variance to ones.
        self.moving_variance = nd.ones(shape)
    def forward(self, X):
        """Normalize ``X`` and store the running statistics returned.

        Args:
            X: Input array; its context decides where the running statistics
                are kept.

        Returns:
            The normalized, scaled and shifted output of ``batch_norm``.
        """
        # If the running statistics are not on the same context as X, copy
        # moving_mean and moving_variance over to X's context first.
        if self.moving_mean.context != X.context:
            self.moving_mean = self.moving_mean.copyto(X.context)
            self.moving_variance = self.moving_variance.copyto(X.context)
        # Keep the moving_mean / moving_variance values returned by batch_norm.
        Y, self.moving_mean, self.moving_variance = batch_norm(
            X, self.gamma.data(), self.beta.data(), self.moving_mean,
            self.moving_variance, eps=1e-5, momentum=0.9)
        return Y

In [5]:

# LeNet-style network with a BatchNorm layer between every
# convolution / dense layer and its sigmoid activation.
net = nn.Sequential()
net.add(
    nn.Conv2D(6, kernel_size=5),
    BatchNorm(6, num_dims=4),
    nn.Activation('sigmoid'),
    nn.MaxPool2D(pool_size=2, strides=2),
    nn.Conv2D(16, kernel_size=5),
    BatchNorm(16, num_dims=4),
    nn.Activation('sigmoid'),
    nn.MaxPool2D(pool_size=2, strides=2),
    nn.Dense(120),
    BatchNorm(120, num_dims=2),
    nn.Activation('sigmoid'),
    nn.Dense(84),
    BatchNorm(84, num_dims=2),
    nn.Activation('sigmoid'),
    nn.Dense(10)
)

# Training setup: plain SGD with softmax cross-entropy on Fashion-MNIST.
lr = 1.0
ctx = gb.try_gpu()
# force_reinit=True so that re-running the cell starts from fresh weights.
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
train_data, test_data = gb.load_data_fashion_mnist(batch_size=256)
gb.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=100)

training on gpu(0)
epoch 1, loss 0.6536, train acc 0.768, test acc 0.826, time 3.7 sec
epoch 2, loss 0.3949, train acc 0.857, test acc 0.821, time 3.7 sec
epoch 3, loss 0.3456, train acc 0.875, test acc 0.873, time 3.7 sec
epoch 4, loss 0.3215, train acc 0.882, test acc 0.856, time 3.7 sec
epoch 5, loss 0.3001, train acc 0.891, test acc 0.868, time 3.7 sec
epoch 6, loss 0.2862, train acc 0.896, test acc 0.880, time 3.7 sec
epoch 7, loss 0.2724, train acc 0.902, test acc 0.884, time 3.7 sec
epoch 8, loss 0.2624, train acc 0.904, test acc 0.884, time 3.7 sec
epoch 9, loss 0.2516, train acc 0.908, test acc 0.897, time 3.7 sec
epoch 10, loss 0.2437, train acc 0.911, test acc 0.862, time 3.7 sec
epoch 11, loss 0.2378, train acc 0.912, test acc 0.884, time 3.7 sec
epoch 12, loss 0.2307, train acc 0.915, test acc 0.815, time 3.7 sec
epoch 13, loss 0.2245, train acc 0.917, test acc 0.889, time 3.7 sec
epoch 14, loss 0.2175, train acc 0.920, test acc 0.903, time 3.7 sec
epoch 15, loss 0.2113, t

KeyboardInterrupt: 

In [4]:
# NOTE(review): `net` is not built in this cell; it has to exist in the kernel
# already (presumably from a network-definition cell run earlier) -- confirm.

# Device and data.
ctx = gb.try_gpu()
train_data, test_data = gb.load_data_fashion_mnist(batch_size=256)

# Objective and optimizer: softmax cross-entropy trained with plain SGD.
lr = 1.0
loss = gluon.loss.SoftmaxCrossEntropyLoss()
# force_reinit=True so that re-running the cell starts from fresh weights.
net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})

gb.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=100)

training on gpu(0)
epoch 1, loss 0.6647, train acc 0.761, test acc 0.715, time 6.4 sec
epoch 2, loss 0.4011, train acc 0.855, test acc 0.826, time 3.7 sec
epoch 3, loss 0.3511, train acc 0.871, test acc 0.860, time 3.7 sec
epoch 4, loss 0.3221, train acc 0.882, test acc 0.841, time 3.7 sec
epoch 5, loss 0.2999, train acc 0.891, test acc 0.869, time 3.7 sec
epoch 6, loss 0.2872, train acc 0.894, test acc 0.773, time 3.7 sec
epoch 7, loss 0.2748, train acc 0.899, test acc 0.859, time 3.7 sec
epoch 8, loss 0.2634, train acc 0.903, test acc 0.868, time 3.6 sec
epoch 9, loss 0.2555, train acc 0.907, test acc 0.878, time 3.6 sec
epoch 10, loss 0.2466, train acc 0.908, test acc 0.864, time 3.7 sec
epoch 11, loss 0.2429, train acc 0.910, test acc 0.889, time 3.7 sec
epoch 12, loss 0.2331, train acc 0.915, test acc 0.884, time 3.7 sec
epoch 13, loss 0.2277, train acc 0.916, test acc 0.883, time 3.7 sec
epoch 14, loss 0.2220, train acc 0.918, test acc 0.897, time 3.7 sec
epoch 15, loss 0.2166, t

In [6]:

# LeNet-style variant in which BatchNorm is placed at the end of each stage
# (after the activation, and after pooling in the convolutional stages)
# rather than directly after the convolution / dense layer.
net = nn.Sequential()
# Convolutional stage 1.
net.add(nn.Conv2D(6, kernel_size=5),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        BatchNorm(6, num_dims=4))
# Convolutional stage 2.
net.add(nn.Conv2D(16, kernel_size=5),
        nn.Activation('sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        BatchNorm(16, num_dims=4))
# Dense stage 1.
net.add(nn.Dense(120),
        nn.Activation('sigmoid'),
        BatchNorm(120, num_dims=2))
# Dense stage 2.
net.add(nn.Dense(84),
        nn.Activation('sigmoid'),
        BatchNorm(84, num_dims=2))
# Output layer.
net.add(nn.Dense(10))

# Training setup: plain SGD with softmax cross-entropy on Fashion-MNIST.
lr = 1.0
ctx = gb.try_gpu()
# force_reinit=True so that re-running the cell starts from fresh weights.
net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
train_data, test_data = gb.load_data_fashion_mnist(batch_size=256)
gb.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=100)

training on gpu(0)
epoch 1, loss 0.7385, train acc 0.745, test acc 0.794, time 3.4 sec
epoch 2, loss 0.4175, train acc 0.845, test acc 0.844, time 3.4 sec
epoch 3, loss 0.3713, train acc 0.861, test acc 0.868, time 3.3 sec
epoch 4, loss 0.3381, train acc 0.873, test acc 0.853, time 3.3 sec
epoch 5, loss 0.3171, train acc 0.882, test acc 0.880, time 3.3 sec
epoch 6, loss 0.3043, train acc 0.886, test acc 0.882, time 3.3 sec
epoch 7, loss 0.2886, train acc 0.893, test acc 0.877, time 3.3 sec
epoch 8, loss 0.2782, train acc 0.895, test acc 0.890, time 3.3 sec
epoch 9, loss 0.2689, train acc 0.898, test acc 0.875, time 3.3 sec
epoch 10, loss 0.2576, train acc 0.902, test acc 0.885, time 3.3 sec
epoch 11, loss 0.2515, train acc 0.904, test acc 0.880, time 3.6 sec
epoch 12, loss 0.2442, train acc 0.908, test acc 0.888, time 3.3 sec
epoch 13, loss 0.2369, train acc 0.910, test acc 0.889, time 3.3 sec
epoch 14, loss 0.2313, train acc 0.913, test acc 0.887, time 3.4 sec
epoch 15, loss 0.2284, t

KeyboardInterrupt: 

In [7]:

# ReLU version of the LeNet-style variant with BatchNorm at the end of each
# stage (after the activation, and after pooling in the convolutional stages).
net = nn.Sequential()
# Convolutional stage 1.
net.add(nn.Conv2D(6, kernel_size=5),
        nn.Activation('relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        BatchNorm(6, num_dims=4))
# Convolutional stage 2.
net.add(nn.Conv2D(16, kernel_size=5),
        nn.Activation('relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        BatchNorm(16, num_dims=4))
# Dense stage 1.
net.add(nn.Dense(120),
        nn.Activation('relu'),
        BatchNorm(120, num_dims=2))
# Dense stage 2.
net.add(nn.Dense(84),
        nn.Activation('relu'),
        BatchNorm(84, num_dims=2))
# Output layer.
net.add(nn.Dense(10))

# Training setup: plain SGD with softmax cross-entropy on Fashion-MNIST.
# This cell uses a learning rate of 0.1.
lr = 0.1
ctx = gb.try_gpu()
# force_reinit=True so that re-running the cell starts from fresh weights.
net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()
train_data, test_data = gb.load_data_fashion_mnist(batch_size=256)
gb.train(train_data, test_data, net, loss, trainer, ctx, num_epochs=100)


training on gpu(0)
epoch 1, loss 0.5337, train acc 0.809, test acc 0.853, time 3.4 sec
epoch 2, loss 0.3677, train acc 0.867, test acc 0.864, time 3.4 sec
epoch 3, loss 0.3264, train acc 0.882, test acc 0.876, time 3.4 sec
epoch 4, loss 0.3025, train acc 0.889, test acc 0.888, time 3.4 sec
epoch 5, loss 0.2837, train acc 0.896, test acc 0.854, time 3.4 sec
epoch 6, loss 0.2693, train acc 0.901, test acc 0.891, time 3.5 sec
epoch 7, loss 0.2585, train acc 0.905, test acc 0.901, time 3.4 sec
epoch 8, loss 0.2475, train acc 0.909, test acc 0.898, time 3.4 sec
epoch 9, loss 0.2383, train acc 0.912, test acc 0.893, time 3.4 sec
epoch 10, loss 0.2288, train acc 0.916, test acc 0.903, time 3.4 sec
epoch 11, loss 0.2219, train acc 0.918, test acc 0.898, time 3.4 sec
epoch 12, loss 0.2122, train acc 0.922, test acc 0.899, time 3.4 sec
epoch 13, loss 0.2085, train acc 0.922, test acc 0.894, time 3.5 sec
epoch 14, loss 0.2010, train acc 0.926, test acc 0.899, time 3.4 sec
epoch 15, loss 0.1936, t