In [1]:
import torch
import torch.nn as nn

In [5]:
# Toy batch laid out as (batch, sequence, embedding) = (1, 2, 3).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Swap the batch and sequence axes -> (sequence, batch, embedding).
# permute() actually reorders the axes; the deprecated Tensor.resize() only
# reinterprets the underlying memory, which matches a transpose only when B == 1.
inputs = inputs.permute(1, 0, 2)
print("Shape: ", inputs.size())
inputs

Shape:  torch.Size([2, 1, 3])


tensor([[[0.2000, 0.1000, 0.3000]],

        [[0.5000, 0.1000, 0.1000]]])

In [18]:
# gamma (scale) starts at ones and beta (shift) at zeros; both are shaped like
# the last two dimensions of `inputs`.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))

gamma.shape, beta.shape

(torch.Size([3, 8]), torch.Size([3, 8]))

In [19]:
# Negative indices of the trailing axes to reduce over: -1, -2, ... one per
# entry in `parameter_shape`.
dims = list(range(-1, -len(parameter_shape) - 1, -1))
dims

[-1, -2]

In [20]:
# Mean over the axes in `dims`; keepdim=True keeps the reduced axes as size 1 so
# the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
print(mean.shape)
mean

torch.Size([5, 1, 1])


tensor([[[-0.0155]],

        [[-0.1688]],

        [[ 0.1098]],

        [[ 0.0224]],

        [[-0.0540]]])

In [21]:
# Small constant added to the variance before the square root.
epsilon = 1e-5
# Variance as the mean of squared deviations from `mean` over `dims`.
var = torch.mean((inputs - mean).pow(2), dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.9439]],

        [[1.0442]],

        [[0.6804]],

        [[0.9724]],

        [[0.9446]]])

In [22]:
# Normalize: subtract the mean, then divide by the standard deviation.
y = torch.div(inputs - mean, std)
y

tensor([[[-0.6058,  1.6298,  1.3305,  0.7093, -0.4536, -1.0804,  0.5552,
          -2.1253],
         [ 0.6390, -0.5451, -0.4671,  1.1381, -0.0183, -0.0692,  2.1459,
          -1.5521],
         [-0.7086,  0.2646,  0.5917, -0.4039, -1.3006, -0.2569,  0.6478,
          -0.0649]],

        [[ 0.6384,  0.5546,  0.8327,  0.7626,  1.8832, -0.7146,  0.4405,
           1.2791],
         [-0.9689,  1.9229, -0.0103,  0.4134, -1.9839, -0.4046, -1.1101,
           0.2057],
         [ 0.6626, -1.4404, -0.9034,  0.6538, -0.2449, -0.6609, -1.0664,
          -0.7413]],

        [[-0.9166, -1.2624,  0.8926,  0.7634,  0.8716,  0.2077, -0.5040,
          -1.1245],
         [ 1.3516, -0.9081,  0.0818,  0.1162,  0.3905, -1.4489,  1.2314,
           0.4351],
         [ 0.2721,  2.5996, -0.2269, -0.3493, -0.3936, -0.7441, -1.8143,
           0.4792]],

        [[-1.3926,  2.4016,  0.2430,  0.6469, -0.2895,  0.6609,  0.2984,
          -1.1430],
         [-1.3658, -0.3062, -0.6057,  0.1037,  0.9170, -0.1174, 

In [23]:
# Scale the normalized values by gamma and shift them by beta.
out = torch.add(torch.mul(gamma, y), beta)
out

tensor([[[-0.6058,  1.6298,  1.3305,  0.7093, -0.4536, -1.0804,  0.5552,
          -2.1253],
         [ 0.6390, -0.5451, -0.4671,  1.1381, -0.0183, -0.0692,  2.1459,
          -1.5521],
         [-0.7086,  0.2646,  0.5917, -0.4039, -1.3006, -0.2569,  0.6478,
          -0.0649]],

        [[ 0.6384,  0.5546,  0.8327,  0.7626,  1.8832, -0.7146,  0.4405,
           1.2791],
         [-0.9689,  1.9229, -0.0103,  0.4134, -1.9839, -0.4046, -1.1101,
           0.2057],
         [ 0.6626, -1.4404, -0.9034,  0.6538, -0.2449, -0.6609, -1.0664,
          -0.7413]],

        [[-0.9166, -1.2624,  0.8926,  0.7634,  0.8716,  0.2077, -0.5040,
          -1.1245],
         [ 1.3516, -0.9081,  0.0818,  0.1162,  0.3905, -1.4489,  1.2314,
           0.4351],
         [ 0.2721,  2.5996, -0.2269, -0.3493, -0.3936, -0.7441, -1.8143,
           0.4792]],

        [[-1.3926,  2.4016,  0.2430,  0.6469, -0.2895,  0.6609,  0.2984,
          -1.1430],
         [-1.3658, -0.3062, -0.6057,  0.1037,  0.9170, -0.1174, 

## Class code

In [24]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing ``parameters_shape`` dimensions.

    The input is normalized to zero mean and unit (biased) variance across the
    last ``len(parameters_shape)`` axes, then scaled by ``gamma`` and shifted
    by ``beta``.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters, so
    they appear in ``.parameters()`` / ``state_dict()`` and follow ``.to()``.
    It also makes instances callable: ``layer(x)`` is equivalent to
    ``layer.forward(x)``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over
            (e.g. ``inputs.size()[-1:]``). A single ``int`` is treated as a
            one-element shape.
        eps: Small constant added to the variance for numerical stability.
        verbose: When True (default, matching the original behaviour), the
            intermediate mean, standard deviation, normalized tensor and
            output are printed on every forward pass.
    """

    def __init__(self, parameters_shape, eps=1e-5, verbose=True):
        super().__init__()
        # Accept a bare int the same way a one-element shape is accepted.
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = tuple(parameters_shape)
        self.eps = eps
        self.verbose = verbose

        # Learnable parameters: gamma (scaling factor) and beta (shift factor)
        self.gamma = nn.Parameter(torch.ones(self.parameters_shape))
        self.beta = nn.Parameter(torch.zeros(self.parameters_shape))

    def forward(self, inputs):
        """Normalize ``inputs`` over the trailing dims, then scale and shift.

        Args:
            inputs: Tensor whose trailing dimensions broadcast against
                ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Trailing axes to reduce over: -1, -2, ... one per parameter dim.
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        if self.verbose:
            print(f"Mean \n ({mean.size()}): \n {mean}")

        # Biased variance (mean of squared deviations); eps keeps sqrt/div stable.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        if self.verbose:
            print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (inputs - mean) / std
        if self.verbose:
            print(f"y \n ({y.size()}) = \n {y}")

        # Scale and shift the normalized input using learnable parameters
        out = self.gamma * y + self.beta
        if self.verbose:
            print(f"out \n ({out.size()}) = \n {out}")

        return out


In [25]:
# Seed the RNG so the random inputs (and everything derived from them) are
# reproducible under Restart Kernel -> Run All.
torch.manual_seed(0)

batch_size = 3
sentence_length = 5
embedding_dim = 8
# Random inputs laid out as (sentence_length, batch_size, embedding_dim).
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.3925e+00, -1.6934e+00, -6.5006e-01, -1.4838e+00, -2.9387e-01,
          -9.2520e-02,  8.8869e-01,  1.0690e+00],
         [-7.0391e-01, -1.4360e+00,  5.1696e-01,  3.5007e-01, -6.1551e-02,
           9.2585e-01, -1.2313e+00, -1.7234e+00],
         [-1.9445e+00,  5.0140e-01, -1.8556e+00, -8.5962e-01, -9.9335e-01,
          -5.6037e-01,  1.5703e+00,  1.1237e-01]],

        [[-7.4327e-01,  1.3000e+00,  3.5844e-01, -4.3841e-01,  7.4540e-01,
           1.3511e+00,  6.1987e-01, -1.2158e+00],
         [ 5.1978e-01,  9.9815e-01, -8.5679e-02, -2.8547e-01,  5.4419e-01,
          -1.0778e-01, -1.1590e+00,  1.8168e+00],
         [ 3.4760e-01, -7.8709e-01,  3.2790e-01,  6.4962e-01, -2.1193e-01,
          -1.1368e-01,  8.2569e-01,  1.5813e-01]],

        [[-3.8157e-02, -1.5351e+00, -1.5647e-02,  8.1497e-01, -1.4372e+00,
          -1.8255e+00, -1.2092e-01, -7.3382e-01],
         [-2.8171e-01, -3.2226e-01,  1.1458e+00,  4.0444e-01,  1.6055e+00,
          

In [26]:
# Normalize over the last dimension of `inputs` only.
layer_norm = LayerNormalization(inputs.shape[-1:])

In [27]:
# Run the forward pass on `inputs`; forward() prints the intermediate mean,
# standard deviation, normalized tensor and final output as it goes.
out = layer_norm.forward(inputs)

Mean 
 (torch.Size([5, 3, 1])): 
 tensor([[[-0.1079],
         [-0.4204],
         [-0.5037]],

        [[ 0.2472],
         [ 0.2801],
         [ 0.1495]],

        [[-0.6114],
         [ 0.8198],
         [-0.4101]],

        [[-0.8573],
         [-0.4736],
         [-0.0536]],

        [[-0.0811],
         [-0.6063],
         [ 0.0268]]])
Standard Deviation 
 (torch.Size([5, 3, 1])): 
 tensor([[[1.0820],
         [0.9274],
         [1.1161]],

        [[0.8887],
         [0.8426],
         [0.4816]],

        [[0.8648],
         [1.1531],
         [0.9769]],

        [[1.0044],
         [1.1169],
         [0.4699]],

        [[0.8150],
         [1.1561],
         [0.8350]]])
y 
 (torch.Size([5, 3, 8])) = 
 tensor([[[ 1.3867, -1.4654, -0.5010, -1.2716, -0.1718,  0.0142,  0.9211,
           1.0878],
         [-0.3057, -1.0950,  1.0107,  0.8308,  0.3869,  1.4516, -0.8744,
          -1.4050],
         [-1.2910,  0.9006, -1.2113, -0.3189, -0.4388, -0.0508,  1.8583,
           0.5520]],



In [28]:
# Mean and standard deviation of the first slice of the output (all elements of
# out[0] pooled together).
torch.mean(out[0]), torch.std(out[0])

(tensor(4.9671e-09, grad_fn=<MeanBackward0>),
 tensor(1.0215, grad_fn=<StdBackward0>))