# Vanishing/Exploding Gradients Problem

## Glorot and He Initialization

In [4]:
import torch
import torch.nn as nn

# Fully connected layer mapping 40 inputs to 10 outputs.
layer = nn.Linear(in_features=40, out_features=10)
# The weight matrix is stored as (out_features, in_features).
layer.weight.shape

torch.Size([10, 40])

In [5]:
# Row 0 of the (10, 40) weight matrix: the 40 input weights of the first output
# unit, still at nn.Linear's default random initialization.
layer.weight.data[0]

tensor([ 0.0381,  0.0389,  0.1085,  0.0407, -0.1058, -0.1046,  0.0603,  0.0576,
         0.0638, -0.0148, -0.1172,  0.1388,  0.0791, -0.1340,  0.0588,  0.0765,
        -0.0298,  0.1425,  0.0880,  0.1417, -0.0578, -0.1159,  0.0319, -0.0018,
        -0.1426,  0.0153,  0.0917, -0.1019,  0.0428,  0.0068,  0.0844, -0.0469,
        -0.1578,  0.1303,  0.1105, -0.0995, -0.0700, -0.0508,  0.0376,  0.1548])

In [6]:
# nn.Linear's default init draws weights from U(-1/sqrt(fan_in), +1/sqrt(fan_in)).
# He (Kaiming) uniform init uses the bound sqrt(6 / fan_in) instead, so scaling the
# default weights by sqrt(6) turns them into He-initialized weights.
# NOTE: this mutates the weights in place, so re-running this cell scales them again.
layer.weight.data *= 6**0.5  # He init

In [7]:
# Same row after the in-place scaling: every entry is sqrt(6) ~= 2.45 times larger
# (e.g. 0.0381 -> 0.0933).
layer.weight.data[0]

tensor([ 0.0933,  0.0952,  0.2657,  0.0997, -0.2591, -0.2561,  0.1477,  0.1411,
         0.1563, -0.0362, -0.2870,  0.3401,  0.1938, -0.3283,  0.1440,  0.1873,
        -0.0731,  0.3490,  0.2155,  0.3471, -0.1416, -0.2839,  0.0782, -0.0044,
        -0.3494,  0.0375,  0.2245, -0.2496,  0.1048,  0.0167,  0.2067, -0.1149,
        -0.3866,  0.3191,  0.2707, -0.2437, -0.1715, -0.1245,  0.0922,  0.3792])

In [8]:
# The biases also start out as small random (non-zero) values by default.
layer.bias.data

tensor([-0.0649, -0.0632, -0.0706, -0.0283, -0.0608, -0.1432,  0.0786,  0.0967,
         0.0878, -0.0658])

In [9]:
# Reset the biases to zero in place; zero_() returns the zeroed tensor, which the
# cell displays.
layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [10]:
# He initialization of a fresh layer using PyTorch's built-in initializers.
layer = nn.Linear(in_features=40, out_features=10)
# He (Kaiming) uniform: weights ~ U(-sqrt(6 / fan_in), +sqrt(6 / fan_in)).
nn.init.kaiming_uniform_(layer.weight, mode="fan_in", nonlinearity="relu")
# zeros_() returns the bias Parameter, which is what the cell displays.
nn.init.zeros_(layer.bias)

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)

In [11]:
def use_he_init(module: nn.Module) -> None:
    """Apply He (Kaiming) uniform initialization to a linear layer.

    Intended to be passed to ``nn.Module.apply``, which calls it once for every
    submodule. Modules that are not ``nn.Linear`` are left untouched.

    Args:
        module: Any module; only ``nn.Linear`` instances are re-initialized.
            Their weights are drawn with ``nn.init.kaiming_uniform_`` and their
            bias, if the layer has one, is set to zero.
    """
    if not isinstance(module, nn.Linear):
        return
    nn.init.kaiming_uniform_(module.weight)
    # Layers built with bias=False have module.bias set to None; zeros_(None)
    # would raise, so only reset the bias when it exists.
    if module.bias is not None:
        nn.init.zeros_(module.bias)
# Small MLP: 40 inputs -> 30 hidden units -> 1 output, with a ReLU after each layer.
model = nn.Sequential(
    nn.Linear(40, 30),
    nn.ReLU(),
    nn.Linear(30, 1),
    nn.ReLU(),
)
# apply() calls use_he_init on every submodule (only the Linear layers are
# affected) and returns the model, which the cell displays.
model.apply(use_he_init)

Sequential(
  (0): Linear(in_features=40, out_features=30, bias=True)
  (1): ReLU()
  (2): Linear(in_features=30, out_features=1, bias=True)
  (3): ReLU()
)

In [12]:
# He init needs to know the leaky ReLU's negative slope (alpha): the gain is
# sqrt(2 / (1 + alpha**2)). kaiming_uniform_'s default a=0 corresponds to a plain
# ReLU, so the slope is read from the activation module and passed explicitly.
model = nn.Sequential(nn.Linear(40, 10), nn.LeakyReLU(negative_slope=0.2))  # alpha is negative slope
nn.init.kaiming_uniform_(model[0].weight, a=model[1].negative_slope, nonlinearity="leaky_relu")
nn.init.zeros_(model[0].bias)
model  # last expression: display the model

Sequential(
  (0): Linear(in_features=40, out_features=10, bias=True)
  (1): LeakyReLU(negative_slope=0.2)
)