Layer Normalisation: the main idea is to rescale a layer's outputs so that they have a mean of 0 and a variance of 1.

mean (u) = (x1 + x2 + x3 + x4) / n          (here n = 4, the number of elements)
variance (v) = 1/n * ( (x1-u)^2 + (x2-u)^2 + (x3-u)^2 + (x4-u)^2 )

normalisation (applied to each element separately, not summed): x_i_norm = (x_i - u) / (v)^(1/2)   for i = 1, ..., 4

In [3]:
import torch
import torch.nn as nn

# Seed the global RNG first so that both the random batch and the Linear
# layer's initial weights are reproducible from run to run.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)  # 2 samples with 5 features each
print(batch_example.shape)

# A tiny network: project 5 features to 6, then clamp negatives to zero.
layer = nn.Sequential(
    nn.Linear(5, 6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out, out.shape, sep="\n")

torch.Size([2, 5])
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
torch.Size([2, 6])


In [None]:
# Per-row statistics of the layer output: reduce over the last (feature)
# dimension and keep it as size 1 so the results broadcast against `out`.
# Note: torch.var is called with its default settings here.
mean = torch.mean(out, dim=-1, keepdim=True)
variance = torch.var(out, dim=-1, keepdim=True)
print(mean, variance, sep="\n")

tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [13]:
# Standardise every row: subtract its mean, then divide by its standard
# deviation (square root of the variance computed above).
out_norm = torch.div(out - mean, variance.sqrt())
print(out_norm)

# Re-measure the per-row statistics of the normalised output.
mean_norm = torch.mean(out_norm, dim=-1, keepdim=True)
variance_norm = torch.var(out_norm, dim=-1, keepdim=True)
print(" mean : ", mean_norm)
print("var : ", variance_norm)

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
 mean :  tensor([[2.4835e-09],
        [1.9868e-08]], grad_fn=<MeanBackward1>)
var :  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [14]:
# Turn off scientific notation so tensor values are printed in fixed-point
# form, then print the same two statistics again.
torch.set_printoptions(sci_mode=False)
for label, stat in ((" mean : ", mean_norm), ("var : ", variance_norm)):
    print(label, stat)

 mean :  tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
var :  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [16]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension of the input.

    Every vector along the last dimension is shifted to zero mean and scaled
    to unit variance, then passed through a learnable element-wise affine
    transform ``scale * x_norm + shift``. ``scale`` starts at ones and
    ``shift`` at zeros, so a freshly built module only normalises.

    Args:
        emb_dim: Length of the learnable ``scale`` and ``shift`` vectors;
            they are multiplied with / added to the normalised input, so the
            input's last dimension must broadcast against it.
        eps: Small constant added to the variance before taking the square
            root, so a zero-variance input does not divide by zero.
            Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps  # numerical-stability term inside the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))   # learnable gain
        self.shift = nn.Parameter(torch.zeros(emb_dim))  # learnable offset

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` normalised along its last dimension.

        Args:
            x: Input tensor; statistics are taken over ``dim=-1``.

        Returns:
            A tensor of the same shape as ``x`` (after broadcasting with
            ``scale`` / ``shift``).
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by n rather than n - 1 (population variance).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [17]:
# Run the raw random batch through the LayerNorm module, then check the
# per-row mean and (population) variance of what comes out.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, unbiased=False, keepdim=True)

print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
