## 06-03 Batch Normalization

In [1]:
import numpy as np 
import torch
import torch.nn as nn

torch.__version__

'1.3.1'

### 1. Batch Normalization and Initialization

In [7]:
class MLP(nn.Module):
    """Stack of bias-free ``Linear -> BatchNorm1d -> ReLU`` layers.

    Every layer maps ``num_hidden`` features to ``num_hidden`` features. The
    forward pass prints the standard deviation of the activations after each
    layer so the effect of normalization/initialization can be observed.

    Args:
        num_hidden: Width of every layer (input and output feature count).
        num_layers: Number of Linear/BatchNorm1d pairs to stack.
    """

    def __init__(self, num_hidden, num_layers=100):
        super().__init__()
        self.num_hidden = num_hidden

        linear_layers = []
        for _ in range(num_layers):
            linear_layers.append(nn.Linear(num_hidden, num_hidden, bias=False))
        self.linears = nn.ModuleList(linear_layers)

        norm_layers = []
        for _ in range(num_layers):
            norm_layers.append(nn.BatchNorm1d(num_hidden))
        self.bns = nn.ModuleList(norm_layers)

    def forward(self, x):
        """Run ``x`` through the stack, reporting the activation std per layer.

        Stops early (returning the current activations) as soon as the
        activation std becomes NaN.
        """
        for depth in range(len(self.linears)):
            projected = self.linears[depth](x)
            normalized = self.bns[depth](projected)
            x = torch.relu(normalized)

            spread = x.std()
            if torch.isnan(spread):
                print('output is nan in {} layers'.format(depth))
                break
            print('layers:{}, std:{}'.format(depth, spread.item()))
        return x

    def initialize(self):
        """Re-initialize the weight of every ``nn.Linear`` with Kaiming normal."""
        linear_modules = [mod for mod in self.modules() if isinstance(mod, nn.Linear)]
        for mod in linear_modules:
            # Alternative to compare against: nn.init.normal_(mod.weight.data, std=1)
            nn.init.kaiming_normal_(mod.weight.data)

# Seed the RNG so the weights, the random input and therefore the per-layer
# std values printed by the forward pass are reproducible on a fresh
# "Restart Kernel -> Run All".
torch.manual_seed(0)

num_hidden = 256  # width of every layer
num_layers = 100  # number of Linear/BatchNorm1d pairs
batch_size = 16

# Build the network and apply the initialization defined by MLP.initialize().
net = MLP(num_hidden, num_layers)
net.initialize()

# Random batch of shape (batch_size, num_hidden); the forward pass prints the
# activation std after every layer.
inputs = torch.randn((batch_size, num_hidden))
outputs = net(inputs)
print(outputs)

layers:0, std:0.5761674642562866
layers:1, std:0.5949079394340515
layers:2, std:0.578766942024231
layers:3, std:0.5829827189445496
layers:4, std:0.5845738649368286
layers:5, std:0.5818923115730286
layers:6, std:0.5772672891616821
layers:7, std:0.5799934267997742
layers:8, std:0.5806533694267273
layers:9, std:0.5835395455360413
layers:10, std:0.5771958827972412
layers:11, std:0.5888833403587341
layers:12, std:0.5752442479133606
layers:13, std:0.585811972618103
layers:14, std:0.5752991437911987
layers:15, std:0.5831204652786255
layers:16, std:0.5782873630523682
layers:17, std:0.578441321849823
layers:18, std:0.5825368762016296
layers:19, std:0.5842101573944092
layers:20, std:0.5705402493476868
layers:21, std:0.5783528685569763
layers:22, std:0.5758209228515625
layers:23, std:0.5793800950050354
layers:24, std:0.580898106098175
layers:25, std:0.5877857208251953
layers:26, std:0.5863972306251526
layers:27, std:0.5840093493461609
layers:28, std:0.5815438032150269
layers:29, std:0.57673364877

### 2. Batch Normalization Function

nn.BatchNorm1d

In [12]:
batch_size = 3
num_features = 5
momentum = 0.3

feature_shape = (1,)

# Feature k (0-based) is filled with the constant k + 1 and every sample in the
# batch is identical, giving an input of shape (batch_size, num_features, 1).
feature_map = torch.ones(feature_shape)
feature_maps = torch.stack([feature_map * (i + 1) for i in range(num_features)], dim=0)
feature_maps_bs = torch.stack([feature_maps for i in range(batch_size)], dim=0)
print('input data:\n {}\n shape: {}'.format(feature_maps_bs, feature_maps_bs.shape))

bn = nn.BatchNorm1d(num_features=num_features, momentum=momentum)

# Hand-computed running statistics for the 2nd feature, starting from the
# same initial values (mean 0, variance 1) as the layer's buffers.
running_mean, running_var = 0, 1

# Batch statistics of the 2nd feature (index 1), derived from the data rather
# than hard-coded. They do not change between iterations, so compute them once.
# Tensor.var() defaults to the unbiased estimator.
second_feature = feature_maps_bs[:, 1]
mean_t = second_feature.mean().item()
var_t = second_feature.var().item()

for i in range(2):
    # A forward pass in training mode updates bn.running_mean / bn.running_var.
    outputs = bn(feature_maps_bs)
    print('iteration: {}, running mean: {}'.format(i, bn.running_mean))
    print('iteration: {}, running var: {}'.format(i, bn.running_var))

    # Exponential moving average: new = (1 - momentum) * old + momentum * batch_stat
    running_mean = (1 - momentum) * running_mean + momentum * mean_t
    running_var = (1 - momentum) * running_var + momentum * var_t

    print('iteration: {}, 2nd feature running mean: {}'.format(i, running_mean))
    print('iteration: {}, 2nd feature running var: {}'.format(i, running_var))

input data:
 tensor([[[1.],
         [2.],
         [3.],
         [4.],
         [5.]],

        [[1.],
         [2.],
         [3.],
         [4.],
         [5.]],

        [[1.],
         [2.],
         [3.],
         [4.],
         [5.]]])
 shape: torch.Size([3, 5, 1])
iteration: 0, running mean: tensor([0.3000, 0.6000, 0.9000, 1.2000, 1.5000])
iteration: 0, running var: tensor([0.7000, 0.7000, 0.7000, 0.7000, 0.7000])
iteration: 0, 2nd feature running mean: 0.6
iteration: 0, 2nd feature running var: 0.7
iteration: 1, running mean: tensor([0.5100, 1.0200, 1.5300, 2.0400, 2.5500])
iteration: 1, running var: tensor([0.4900, 0.4900, 0.4900, 0.4900, 0.4900])
iteration: 1, 2nd feature running mean: 1.02
iteration: 1, 2nd feature running var: 0.48999999999999994


nn.BatchNorm2d

In [13]:
# Demo dimensions: 3 samples, 6 channels, each channel a 2x2 map.
batch_size = 3
num_features = 6
momentum = 0.3

feature_shape = (2, 2)

# Channel c (0-based) is a constant map filled with c + 1; the same stack of
# channels is repeated for every sample in the batch.
feature_map = torch.ones(feature_shape)
feature_maps = torch.stack(
    [feature_map * scale for scale in range(1, num_features + 1)],
    dim=0,
)
feature_maps_bs = torch.stack([feature_maps] * batch_size, dim=0)
print('input data:\n {}\n shape: {}'.format(feature_maps_bs, feature_maps_bs.shape))

bn = nn.BatchNorm2d(num_features=num_features, momentum=momentum)

running_mean, running_var = 0, 1

for i in range(2):
    outputs = bn(feature_maps_bs)

    # Running statistics: one entry per channel.
    print('iteration: {}, running_mean.shape: {}'.format(i, bn.running_mean.size()))
    print('iteration: {}, running_var.shape: {}'.format(i, bn.running_var.size()))

    # Learnable affine parameters: also one entry per channel.
    print('iteration: {}, weight.shape: {}'.format(i, bn.weight.size()))
    print('iteration: {}, bias.shape: {}'.format(i, bn.bias.size()))

input data:
 tensor([[[[1., 1.],
          [1., 1.]],

         [[2., 2.],
          [2., 2.]],

         [[3., 3.],
          [3., 3.]],

         [[4., 4.],
          [4., 4.]],

         [[5., 5.],
          [5., 5.]],

         [[6., 6.],
          [6., 6.]]],


        [[[1., 1.],
          [1., 1.]],

         [[2., 2.],
          [2., 2.]],

         [[3., 3.],
          [3., 3.]],

         [[4., 4.],
          [4., 4.]],

         [[5., 5.],
          [5., 5.]],

         [[6., 6.],
          [6., 6.]]],


        [[[1., 1.],
          [1., 1.]],

         [[2., 2.],
          [2., 2.]],

         [[3., 3.],
          [3., 3.]],

         [[4., 4.],
          [4., 4.]],

         [[5., 5.],
          [5., 5.]],

         [[6., 6.],
          [6., 6.]]]])
 shape: torch.Size([3, 6, 2, 2])
iteration: 0, running_mean.shape: torch.Size([6])
iteration: 0, running_var.shape: torch.Size([6])
iteration: 0, weight.shape: torch.Size([6])
iteration: 0, bias.shape: torch.Size([6])
iteratio

nn.BatchNorm3d

In [14]:
# Demo dimensions: 3 samples, 4 channels, each channel a 2x2x3 volume.
batch_size = 3
num_features = 4
momentum = 0.3

feature_shape = (2, 2, 3)

# Build one constant volume per channel (channel k holds the value k + 1),
# then replicate the channel stack once per sample.
feature_map = torch.ones(feature_shape)
per_channel = []
for channel in range(num_features):
    per_channel.append(feature_map * (channel + 1))
feature_maps = torch.stack(per_channel, dim=0)
feature_maps_bs = torch.stack(batch_size * [feature_maps], dim=0)
print('input data:\n {}\n shape: {}'.format(feature_maps_bs, feature_maps_bs.shape))

bn = nn.BatchNorm3d(num_features=num_features, momentum=momentum)

running_mean, running_var = 0, 1

for i in range(2):
    outputs = bn(feature_maps_bs)

    # Report the shapes of the running statistics ...
    mean_shape = bn.running_mean.shape
    var_shape = bn.running_var.shape
    print('iteration: {}, running_mean.shape: {}'.format(i, mean_shape))
    print('iteration: {}, running_var.shape: {}'.format(i, var_shape))

    # ... and of the learnable affine parameters.
    weight_shape = bn.weight.shape
    bias_shape = bn.bias.shape
    print('iteration: {}, weight.shape: {}'.format(i, weight_shape))
    print('iteration: {}, bias.shape: {}'.format(i, bias_shape))

input data:
 tensor([[[[[1., 1., 1.],
           [1., 1., 1.]],

          [[1., 1., 1.],
           [1., 1., 1.]]],


         [[[2., 2., 2.],
           [2., 2., 2.]],

          [[2., 2., 2.],
           [2., 2., 2.]]],


         [[[3., 3., 3.],
           [3., 3., 3.]],

          [[3., 3., 3.],
           [3., 3., 3.]]],


         [[[4., 4., 4.],
           [4., 4., 4.]],

          [[4., 4., 4.],
           [4., 4., 4.]]]],



        [[[[1., 1., 1.],
           [1., 1., 1.]],

          [[1., 1., 1.],
           [1., 1., 1.]]],


         [[[2., 2., 2.],
           [2., 2., 2.]],

          [[2., 2., 2.],
           [2., 2., 2.]]],


         [[[3., 3., 3.],
           [3., 3., 3.]],

          [[3., 3., 3.],
           [3., 3., 3.]]],


         [[[4., 4., 4.],
           [4., 4., 4.]],

          [[4., 4., 4.],
           [4., 4., 4.]]]],



        [[[[1., 1., 1.],
           [1., 1., 1.]],

          [[1., 1., 1.],
           [1., 1., 1.]]],


         [[[2., 2., 2.],
    