In [None]:
import torch
import torch.nn as nn

In [None]:
# Toy batch laid out as (batch, sequence, embedding).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size()
# Move the sequence axis to the front -> (S, B, E). permute swaps the axes and keeps
# every embedding vector attached to its (batch, position); reshape would merely
# re-chunk the flat buffer, which matches a transpose only when B == 1.
inputs = inputs.permute(1, 0, 2)
inputs.size()

torch.Size([2, 1, 3])

In [None]:
# The normalized (and learnable-parameter) shape is the trailing two dims: (batch, embedding).
parameter_shape = inputs.shape[-2:]
parameter_shape

torch.Size([1, 3])

In [None]:
# Learnable affine parameters: scale starts at 1 and shift at 0, i.e. an identity transform.
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))

In [None]:
# Negative indices of the trailing dims to reduce over, one per entry of parameter_shape.
dims = [-k for k in range(1, len(parameter_shape) + 1)]
dims

[-1, -2]

In [None]:
# Mean over the trailing dims; keepdim=True keeps them as size-1 so it broadcasts against inputs.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean.shape

torch.Size([2, 1, 1])

In [None]:
# Show the mean tensor computed in the previous cell.
mean

tensor([[[0.2000]],

        [[0.2333]]])

In [None]:
# Population (biased) variance over the same trailing dims used for the mean.
epsilon = 1e-5  # small constant added under the square root to avoid dividing by zero
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
std

tensor([[[0.0817]],

        [[0.1886]]])

In [None]:
# Standardize: subtract the mean, then divide by the (epsilon-stabilized) standard deviation.
y = (inputs - mean).div(std)

In [None]:
# Show the normalized tensor computed in the previous cell.
y

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])

In [None]:
# Affine step: scale by gamma, then shift by beta (both broadcast over the leading dim).
out = torch.add(torch.mul(gamma, y), beta)
out

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)

### Custom Layer Normalization Implementation

This cell defines a custom `LayerNormalization` module using PyTorch's `nn.Module`. Layer Normalization is a technique used to normalize the inputs of a layer across the features, which can help stabilize and accelerate training of deep neural networks, particularly in models like transformers.

The implementation includes:
- Learnable `gamma` and `beta` parameters for scaling and shifting the normalized output.
- Calculation of the mean and variance across the specified dimensions.
- Normalization of the input using the calculated mean and standard deviation.
- An affine transformation using the `gamma` and `beta` parameters.
- Debugging print statements to show intermediate values and shapes.

In [None]:
import torch
import torch.nn as nn

class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of the input.

    The input is standardized (zero mean, unit variance) over its last
    ``len(parameters_shape)`` dimensions and then transformed element-wise
    as ``gamma * y + beta``.

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over;
            also the shape of the learnable ``gamma`` and ``beta``. A single
            int is treated as a one-element shape.
        eps: Constant added to the variance before the square root for
            numerical stability.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        if isinstance(parameters_shape, int):
            # Allow LayerNormalization(8) as shorthand for LayerNormalization((8,)).
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Identity affine transform at initialization: scale 1, shift 0.
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalize ``inputs`` and apply the learnable scale and shift.

        Prints the intermediate mean, standard deviation, normalized values
        and final output for inspection.

        Args:
            inputs: Tensor whose trailing dimensions match ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Reduce over the module's own trailing dims; this must not depend on
        # notebook-level globals, otherwise the constructor arguments are ignored.
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        print(f"Mean \n ({mean.size()}): \n {mean}")
        # Population (biased) variance, stabilized with the configured eps.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation \n ({std.size()}): \n {std}")
        y = (inputs - mean) / std
        print(f"y \n ({y.size()}) = \n {y}")
        out = self.gamma * y + self.beta
        print(f"out \n ({out.size()}) = \n {out}")
        return out

In [None]:
# Toy dimensions for a sequence-first input of shape (sentence_length, batch_size, embedding_dim).
batch_size = 3       # number of sequences in the batch
sentence_length = 5  # number of positions per sequence
embedding_dim = 8    # size of the feature vector at each position

# Draw from a dedicated, seeded generator so Restart & Run All reproduces the same
# tensor without altering the global RNG state.
rng = torch.Generator().manual_seed(0)
inputs = torch.randn(sentence_length, batch_size, embedding_dim, generator=rng)

print(f"input \n ({inputs.size()}) = \n {inputs}")

input 
 (torch.Size([5, 3, 8])) = 
 tensor([[[-1.2450e+00,  1.1626e+00,  5.4151e-01, -1.1042e+00,  3.5732e-01,
          -6.9579e-01, -1.1969e+00,  5.9790e-01],
         [ 1.0233e+00, -9.6010e-01, -1.4094e+00,  3.9346e-01,  1.7239e+00,
          -5.3952e-01,  7.7403e-01, -5.7328e-01],
         [-1.2331e-01, -1.0926e+00, -9.8302e-01, -1.4572e+00, -7.0129e-01,
           1.0467e+00, -5.0996e-01,  9.1159e-01]],

        [[ 1.0450e+00, -3.5307e-02,  9.4701e-02, -2.8675e-01,  3.0977e-01,
           1.0187e+00, -2.1272e-02,  5.0088e-01],
         [-8.4592e-01, -1.0629e+00, -5.5482e-01, -5.4918e-01, -1.0241e-01,
          -1.6936e+00,  2.2802e-01,  1.4542e-01],
         [-7.6899e-01, -2.1538e-01, -8.6672e-02,  6.3904e-01,  1.4018e+00,
           4.3134e-01, -1.6876e-02,  7.9366e-01]],

        [[ 4.8937e-01,  2.0301e-01,  6.9634e-01,  1.2340e-01, -4.6241e-01,
           4.6583e-01,  3.9648e-01, -7.8475e-01],
         [ 1.0100e+00, -2.1067e-01,  3.9061e-01, -1.2532e+00,  1.6536e+00,
          