In [18]:
# Normalization stabilizes and speeds up training,
# so that gradient descent behaves more consistently.

import torch
import torch.nn as nn
import numpy as np

layer = torch.tensor(
[[0.2, 0.1, 0.3],
 [0.5, 0.1, 0.1]]
)  # 2 words, each represented by a 3-dimensional vector

# Statistics of the first word. The mean is written out by hand:
# the sum of its 3 components multiplied by 1/3.
mean11 = 1 / 3 * layer[0].sum()
std11  = layer[0].std()
print(f'Mean: {mean11.item():.2f}, Std: {std11.item():.2f}')
# Tensor.std() applies Bessel's correction by default (divides by N - 1),
# so the result is 0.10 rather than the population std of about 0.08.

# Statistics of the second word, using the built-in mean this time.
mean21 = layer[1].mean()
std21  = layer[1].std()
print(f'Mean: {mean21.item():.2f}, Std: {std21.item():.2f}')

# Standardize the first word: subtract its mean, divide by its (sample) std,
# then round to 2 decimals for display.
print("layer 0 norm:", np.round(((layer[0] - mean11) / std11).numpy(), 2))

Mean: 0.20, Std: 0.10
Mean: 0.23, Std: 0.23
layer 0 norm: [-0. -1.  1.]


In [19]:
# One batch (B = 1) holding a sequence of S = 2 words, each an E = 3 dim embedding.
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
B, S, E = inputs.size() # B is batch
# Swap the batch and sequence axes to obtain (S, B, E). transpose moves whole
# word vectors along with their axis; reshape would only reinterpret the flat
# memory order and would mix words across batch entries as soon as B > 1.
inputs = inputs.transpose(0, 1)
inputs.size() # layer normalization will be applied to a layer across batches

torch.Size([2, 1, 3])

In [21]:
# gamma (scale) and beta (shift) span the trailing two dimensions of `inputs`.
# Both are nn.Parameter objects, i.e. values that training is meant to update.
parameter_shape = inputs.shape[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))
print(gamma.size(), beta.size())

# Negative indices of the axes to reduce over: one per parameter dimension,
# counted from the last axis backwards.
dims = [-offset for offset in range(1, len(parameter_shape) + 1)]
print(dims)

torch.Size([1, 3]) torch.Size([1, 3])
[-1, -2]


In [23]:
# Average over the axes listed in `dims`; keepdim=True keeps those axes with
# size 1 so the result broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
print(mean.shape)
print(mean)

torch.Size([2, 1, 1])
tensor([[[0.2000]],

        [[0.2333]]])


In [24]:
# Variance as the mean of squared deviations over `dims` (no Bessel's
# correction); epsilon is added before the square root.
epsilon = 1e-5
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
std = torch.sqrt(var + epsilon)
print(std)

tensor([[[0.0817]],

        [[0.1886]]])


In [25]:
# Standardize: remove the mean, then divide by the standard deviation.
y = torch.div(inputs - mean, std)
print(y)

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])


In [31]:
# Apply the learnable scale (gamma) and shift (beta) to the standardized values.
out = torch.add(gamma * y, beta)
print(out)

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)


In [40]:
import torch
from torch import nn

class LayerNormalization(nn.Module):
    """Layer normalization with a learnable scale (gamma) and shift (beta).

    Statistics are computed over the trailing ``len(parameters_shape)``
    dimensions of the input, so ``gamma`` and ``beta`` (both of shape
    ``parameters_shape``) must broadcast against those trailing dimensions.

    Subclassing ``nn.Module`` registers ``gamma`` and ``beta`` as parameters,
    so they show up in ``.parameters()`` and can be handed to an optimizer.

    Args:
        parameters_shape: Shape of ``gamma`` and ``beta``.
        eps: Small constant added to the variance before taking the square
            root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and apply the learnable scale and shift.

        Prints the size and value of every intermediate tensor (mean, std,
        normalized values, output) as a trace of the computation.

        Args:
            input: Tensor whose trailing dimensions are compatible with
                ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``input``.
        """
        # Reduce over the last len(parameters_shape) axes, e.g. [-1, -2].
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        # Use the argument itself, never a same-named global tensor.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"mean {mean.size()}\n{mean}")
        # Mean of squared deviations, without Bessel's correction.
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"std {std.size()}\n{std}")
        y = (input - mean) / std
        print(f"y {y.size()}\n{y}")
        out = self.gamma * y  + self.beta
        print(f"out {out.size()}\n{out}")
        return out
    
# Fix the RNG seed so the random inputs, and therefore every printed value
# below, are identical on each Restart & Run All.
torch.manual_seed(0)

batch_size = 3
sentence_length = 5
embedding_dim = 8 
inputs = torch.randn(sentence_length, batch_size, embedding_dim)

print(inputs.size())
print(inputs.size()[-2:]) # perform norm to last two dimensions (batch, embeddings)

# gamma/beta take the shape of the trailing (batch, embedding) dimensions.
layer_norm = LayerNormalization(inputs.size()[-2:])
out = layer_norm.forward(inputs)

torch.Size([5, 3, 8])
torch.Size([3, 8])
mean torch.Size([5, 1, 1])
tensor([[[-0.3462]],

        [[-0.3627]],

        [[ 0.0518]],

        [[ 0.1670]],

        [[-0.1598]]])
std torch.Size([5, 1, 1])
tensor([[[1.0113]],

        [[0.9629]],

        [[1.0710]],

        [[0.9919]],

        [[0.8889]]])
y torch.Size([5, 3, 8])
tensor([[[ 1.0614e+00, -4.7620e-01, -5.8960e-01, -7.0821e-01, -7.5087e-01,
           5.6443e-01, -1.0498e+00, -1.0689e+00],
         [-7.1629e-02, -4.3661e-01,  1.9082e+00,  3.9167e-01, -9.1295e-01,
           1.3029e+00, -1.2868e+00, -5.3802e-01],
         [-9.0042e-02,  1.4245e+00, -3.6603e-01, -1.6860e-03, -1.1025e+00,
          -5.4567e-01,  9.8800e-01,  2.3543e+00]],

        [[ 8.3086e-01,  1.1783e+00,  1.2845e-01,  1.8076e-01, -2.1545e+00,
          -6.3254e-01,  8.3142e-01, -7.3211e-01],
         [ 2.3212e-02, -5.9913e-01, -8.7102e-01,  3.7432e-01, -8.7464e-01,
          -5.0119e-01,  4.4294e-01, -2.3912e+00],
         [-1.7329e-01,  7.5511e-01, -1.0