In [None]:
import torch
from torch import nn

In [None]:
# Toy input laid out as (batch, sequence, embedding): 1 sequence of 2 tokens,
# each token carrying a 3-dimensional embedding.
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
# This walkthrough computes the normalization statistics over the last two
# axes together, so the batch axis is moved next to the embedding axis:
# (batch, sequence, embedding) -> (sequence, batch, embedding).
B, S, E = inputs.size()
# permute() really swaps the axes. reshape(S, B, E) would only re-chunk the
# flat memory, which matches a transpose only while B == 1 and silently mixes
# tokens from different sequences for larger batches.
inputs = inputs.permute(1, 0, 2)
inputs.size()

torch.Size([2, 1, 3])

In [None]:
# The learnable scale (gamma) and shift (beta) are shaped like the trailing
# two axes of `inputs`, the axes the normalization statistics are taken over.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))   # scale starts at 1
beta = nn.Parameter(torch.zeros(*parameter_shape))   # shift starts at 0

In [None]:
"""Both parameters have only the trailing two dimensions of `inputs`,
because those are the axes the normalization is applied over here."""
gamma.shape, beta.shape

(torch.Size([1, 3]), torch.Size([1, 3]))

In [None]:
# Axes to reduce over: one negative index per entry of `parameter_shape`,
# counted backwards from the last axis.
dims = list(range(-1, -len(parameter_shape) - 1, -1))
dims  # the trailing axes the statistics are computed over

[-1, -2]

In [None]:
"""Mean over the axes listed in `dims`. keepdim=True keeps the reduced axes
as size 1 so the result broadcasts against `inputs` later."""
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [None]:
# Display the mean values computed in the previous cell.
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [None]:
"""Standard deviation for the normalization: the mean squared deviation
(variance) over `dims`, then the square root."""
epsilon = 1e-5  # added to the variance so the denominator can never be zero
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [None]:
"""Normalize: subtract the mean, then divide by the standard deviation."""
y = (inputs - mean).div(std)
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [None]:
# Final layer-normalization output: scale the normalized values by gamma,
# then shift them by beta.
out = y * gamma + beta
# The displayed tensor carries a grad_fn because gamma and beta are
# nn.Parameter objects, so autograd records the operation that produced it.
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

Create a class for what we have done so far

In [None]:

import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing axes of a tensor.

    The statistics are computed over the last ``len(parameters_shape)`` axes
    of the input; the normalized values are then scaled by ``gamma`` and
    shifted by ``beta``. Subclassing ``nn.Module`` registers both tensors as
    parameters of the module.

    Args:
        parameters_shape: Shape of ``gamma`` and ``beta``; its length decides
            how many trailing axes are normalized.
        eps: Constant added to the variance before the square root so the
            denominator cannot become zero.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))   # scale, starts at 1
        self.beta = nn.Parameter(torch.zeros(parameters_shape))   # shift, starts at 0

    def forward(self, input):
        """Normalize ``input`` and print every intermediate result.

        Args:
            input: Tensor whose trailing axes broadcast with
                ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        # Use the argument itself, never a notebook-level variable, so the
        # result depends only on what the caller passes in.
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (input - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

Apply the layer normalization using the layer norm class

In [None]:
batch_size = 3
sentence_length = 5
embedding_dim = 8
# A dedicated, seeded generator makes the random input identical on every
# run without touching PyTorch's global random state.
generator = torch.Generator().manual_seed(0)
# Note: this rebinds `inputs` to a (sentence_length, batch_size, embedding_dim) tensor.
inputs = torch.randn(sentence_length, batch_size, embedding_dim, generator=generator)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.2364, -0.6338, -0.8136,  1.1681,  1.2041, -1.2963,  1.3588,
          -0.2183],
         [ 0.9932,  0.5894, -1.1784, -0.5347, -0.3060,  0.3411,  1.5260,
           1.6832],
         [ 0.9735, -1.2352,  0.3909, -0.8722,  0.6739, -2.6895,  0.1192,
          -0.5708]],

        [[ 1.6261, -0.7122,  0.0905,  0.7380,  0.2301,  1.1817, -1.2036,
           1.0570],
         [ 0.6507,  0.2438, -0.2595,  1.5971,  1.4274,  0.4059, -0.3605,
           0.7279],
         [ 0.2835,  1.8822, -0.2638, -0.9029,  0.2947,  0.7813,  2.1086,
          -0.1727]],

        [[ 1.4508,  0.4152,  0.0220, -0.7879,  1.3054, -0.1078,  0.3189,
           0.6838],
         [-1.4393,  0.0056, -1.1333,  0.9543,  2.0164,  1.4971, -0.9736,
          -0.8344],
         [ 0.2225, -0.1916,  0.1652,  1.5632, -0.9313, -0.2051, -0.7447,
           0.4788]],

        [[ 0.5901,  1.6292,  0.2596,  0.1191,  1.0151,  0.8403,  1.2528,
          -0.5778],
         [-2.0575,  0.2575, 

In [None]:
# Only the last dimension of `inputs` is passed as the parameter shape, so
# this instance normalizes over the embedding axis alone.
layer_norm = LayerNormalization(inputs.shape[-1:])

In [None]:
# Run the forward pass; forward() prints the mean, standard deviation,
# normalized values and final output along the way.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[ 0.0666],
         [ 0.3892],
         [-0.4013]],

        [[ 0.3760],
         [ 0.5541],
         [ 0.5014]],

        [[ 0.4125],
         [ 0.0116],
         [ 0.0446]],

        [[ 0.6410],
         [ 0.1005],
         [-0.6724]],

        [[-0.0118],
         [ 0.4259],
         [ 0.3956]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[0.9665],
         [0.9466],
         [1.1251]],

        [[0.9075],
         [0.6621],
         [0.9792]],

        [[0.6912],
         [1.2358],
         [0.7275]],

        [[0.6554],
         [1.1473],
         [0.9747]],

        [[0.5136],
         [0.6755],
         [0.9308]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-0.3135, -0.7247, -0.9107,  1.1397,  1.1769, -1.4101,  1.3371,
          -0.2947],
         [ 0.6380,  0.2115, -1.6561, -0.9761, -0.7345, -0.0508,  1.2009,
           1.3670],
         [ 1.2219, -0.7413,  0.7041, -0.4185,  0.9556, -2.0338,  0.4627,
          -0.1507]],



In [None]:
# Sanity check on the first slice of the output: the mean should be ~0.
# The std lands slightly above 1 because Tensor.std() applies Bessel's
# correction by default, whereas forward() divides by the biased
# (population) standard deviation of each row.
out[0].mean(), out[0].std()

(tensor(-2.9802e-08, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))