<a href="https://colab.research.google.com/github/VinaySingh561/LARGE-LANGUAGE-MODELS/blob/main/Layer_Normalization_in_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import torch
import torch.nn as nn
# Seed the global RNG so the random batch and the layer's initial weights
# come out the same on every Restart & Run All.
torch.manual_seed(123)
# Named `batch_example` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
batch_example = torch.randn(2, 5)  # 2 rows, 5 features each
# Linear projection 5 -> 6 followed by ReLU (negative activations become 0).
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


In [10]:
# Per-row statistics over the last (feature) dimension. keepdim=True keeps a
# trailing size-1 axis so the results broadcast against `out` later on.
reduce_kwargs = {"dim": -1, "keepdim": True}
mean = out.mean(**reduce_kwargs)
# No `unbiased`/`correction` argument is passed, so torch's default applies.
var = out.var(**reduce_kwargs)
print("Mean : \n", mean)
print("Variance : \n", var)

Mean : 
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance : 
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


In [12]:
# Standardise each row: subtract its mean, then divide by its standard
# deviation (square root of the variance computed in the previous cell).
std = torch.sqrt(var)
out_norm = (out - mean) / std
print(out_norm)

tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)


In [19]:
# Print plain decimals instead of scientific notation so near-zero means
# are easy to read.
torch.set_printoptions(sci_mode=False)

# Row-wise statistics of the normalised activations, computed the same way
# as `mean` and `var` were for the raw activations.
norm_mean = out_norm.mean(dim=-1, keepdim=True)
norm_var = out_norm.var(dim=-1, keepdim=True)

# (label, tensor) pairs, printed in before/after order.
for label, value in (
    ("Mean before layer normalization \n ", mean),
    ("Mean after layer normalization \n ", norm_mean),
    ("Var before layer normalization \n ", var),
    ("Var after layer normalization \n ", norm_var),
):
  print(label, value)

Mean before layer normalization 
  tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Mean after layer normalization 
  tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Var before layer normalization 
  tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)
Var after layer normalization 
  tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [25]:
class Layernorm(nn.Module):
  """Layer normalization over the last dimension with a learnable affine.

  Each slice along the last axis is shifted to zero mean and scaled to unit
  (biased) variance, then multiplied by `scale` and offset by `shift`.

  Args:
    emb_dim: Size of the last dimension of the inputs; `scale` and `shift`
      are 1-D parameters of this length.
    eps: Small constant added to the variance before the square root so the
      division stays finite when a slice has zero variance. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    # Initialised to the identity transform: scale = 1, shift = 0.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize `x` along its last dimension and apply scale/shift.

    Args:
      x: Tensor whose last dimension has size `emb_dim`.

    Returns:
      Tensor of the same shape as `x`.
    """
    mean = x.mean(dim=-1, keepdim=True)
    # unbiased=False turns Bessel's correction OFF: divide by N, not N - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    normalized = (x - mean) / torch.sqrt(var + self.eps)
    return self.scale * normalized + self.shift

In [28]:
def mean_var(inputs):
  """Return `(mean, variance)` of `inputs` along its last dimension.

  Both results keep the reduced axis as size 1. The variance is the biased
  estimate (divides by N), because `unbiased=False` is passed.
  """
  reduce_kwargs = dict(dim=-1, keepdim=True)
  return inputs.mean(**reduce_kwargs), inputs.var(unbiased=False, **reduce_kwargs)

In [29]:
# Named `sample_batch` rather than `input` so the built-in input() stays
# usable in the kernel.
sample_batch = torch.randn(2, 5)
# Layernorm normalises the last axis, so size the module from that axis.
emb_dim = sample_batch.shape[-1]
Layernormalization = Layernorm(emb_dim)
Normalized_output = Layernormalization(sample_batch)
# mean_var returns a (mean, variance) tuple, printed here before and after.
print(" Mean and Variance before Normalization : \n", mean_var(sample_batch))
print(" Mean and Variance after  Normalization : \n", mean_var(Normalized_output))

 Mean and Variance before Normalization : 
 (tensor([[-0.4793],
        [ 0.0112]]), tensor([[2.4954],
        [0.7209]]))
 Mean and Variance after  Normalization : 
 (tensor([[    -0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>), tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>))
