# Introduction

Experiment with PyTorch Weights Initialisation.

In [2]:
# Import Standard Libraries
import torch
import torch.nn as nn

# Layer Definition

In [4]:
# Define a Layer to initialise
layer = nn.Linear(5, 5)

In [7]:
# Usually the weights are initialised with the Xavier Initialisation
layer.weight

Parameter containing:
tensor([[-7.2488e-02, -1.1921e-01, -7.6537e-02, -5.1171e-05, -1.3085e-01],
        [ 3.9598e-01, -4.0044e-01,  4.0640e-01, -2.2635e-01,  1.4261e-01],
        [-4.3520e-01, -3.9455e-01,  9.7896e-02,  4.3159e-01, -1.2327e-01],
        [ 3.9971e-01,  3.4355e-01,  2.0090e-01,  1.3588e-01,  2.7037e-01],
        [ 4.1686e-01, -4.4352e-01, -2.9747e-01, -1.0779e-01,  2.4341e-01]],
       requires_grad=True)

In [8]:
layer.weight.data

tensor([[-7.2488e-02, -1.1921e-01, -7.6537e-02, -5.1171e-05, -1.3085e-01],
        [ 3.9598e-01, -4.0044e-01,  4.0640e-01, -2.2635e-01,  1.4261e-01],
        [-4.3520e-01, -3.9455e-01,  9.7896e-02,  4.3159e-01, -1.2327e-01],
        [ 3.9971e-01,  3.4355e-01,  2.0090e-01,  1.3588e-01,  2.7037e-01],
        [ 4.1686e-01, -4.4352e-01, -2.9747e-01, -1.0779e-01,  2.4341e-01]])

# Uniform Initialisation

In [11]:
# NOTE: The '_' implies an in_place operation
# Initialise the weights with uniform distributed values from a to b
nn.init.uniform_(layer.weight.data, a=0, b=3)

tensor([[2.3615, 1.9547, 2.4783, 0.3394, 0.9608],
        [0.7245, 1.2889, 1.2515, 1.0568, 1.1905],
        [1.6503, 1.0915, 0.0711, 1.0232, 0.4786],
        [2.3917, 0.9893, 1.8717, 1.7230, 1.6079],
        [1.7654, 2.3208, 2.5131, 0.2856, 2.3064]])

# Normal Distribution

In [12]:
nn.init.normal_(layer.weight.data, mean=0.0, std=1.0)

tensor([[-0.5931, -0.1266, -2.1361, -0.2345, -1.2032],
        [ 0.0805,  0.3737,  1.0846,  1.8180, -2.1737],
        [ 0.4277,  1.7683,  0.5878, -0.3763,  0.4127],
        [-0.1888,  0.0473, -1.2144,  0.8918, -0.6653],
        [-0.1872, -0.6459, -0.2771,  0.5146,  0.2216]])

In [13]:
nn.init.normal_(layer.weight.data, mean=0.0, std=0.2)

tensor([[-0.0814,  0.0566, -0.2195, -0.0798, -0.0303],
        [-0.1445,  0.0052, -0.1558,  0.4677, -0.2113],
        [ 0.0615, -0.2238,  0.1719,  0.1280,  0.1497],
        [-0.2210,  0.1540, -0.4170, -0.2353,  0.0749],
        [ 0.0930,  0.0867, -0.0754,  0.3241,  0.0860]])

Shrinking the Standard Deviation would result in closer values.

# Constant Initialisation

Useful when initialise the Bias.

In [14]:
nn.init.constant_(layer.weight, 8)

Parameter containing:
tensor([[8., 8., 8., 8., 8.],
        [8., 8., 8., 8., 8.],
        [8., 8., 8., 8., 8.],
        [8., 8., 8., 8., 8.],
        [8., 8., 8., 8., 8.]], requires_grad=True)

In [15]:
nn.init.constant_(layer.bias, 0)

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

In [16]:
# Another way to intialise the Bias to zero
nn.init.zeros_(layer.bias)

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)

# Xavier Initialisation

In [17]:
nn.init.xavier_uniform_(layer.weight.data)

tensor([[ 0.3335,  0.5168, -0.1817, -0.3870, -0.0390],
        [-0.5368, -0.4168,  0.1458,  0.5605, -0.5042],
        [-0.7677,  0.7031, -0.1493,  0.3173, -0.4605],
        [ 0.6563, -0.3810,  0.5848, -0.7694,  0.7013],
        [-0.3772, -0.2059,  0.3171, -0.4665, -0.3061]])

In [18]:
nn.init.xavier_normal_(layer.weight.data)

tensor([[-0.2754, -0.3600,  0.2767,  0.9130, -0.4385],
        [-0.5473, -0.4185,  0.4147,  0.3779, -0.8795],
        [ 0.2540,  0.3041, -0.0303,  0.2051, -0.6307],
        [-0.9589, -0.0322, -0.8565, -0.5455,  0.2822],
        [-0.3635, -0.4457, -0.7713, -0.1532,  0.3847]])