# Playground for Normalization Layer

## Prototype Normalization Layer
* Normalizes activation levels to mean 0 and variance 1
    * subtracts the mean from the activation values
    * divides the centered activations by sqrt(variance + eps), where eps is a small constant that prevents division by 0
* Contains additional learnable parameters to shift mean from 0 if necessary
* Contains additional learnable parameters to scale variance away from 1 if necessary

In [None]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension of the input.

    Each slice along the last dimension is centered on its mean and divided
    by the square root of its (biased) variance plus ``eps``. The result is
    then multiplied element-wise by the learnable ``scale`` and offset by the
    learnable ``shift``, so the layer can move away from mean 0 / variance 1
    if training finds that useful.

    Args:
        embed_dim: Length of the learnable ``scale`` and ``shift`` vectors.
            The last dimension of the input is expected to have this size so
            that the parameters line up with it.
        verbose: When true, ``forward`` prints the input, mean, variance and
            the normalized values (before scale/shift are applied).
        eps: Small constant added to the variance before the square root to
            prevent division by 0. Defaults to ``1e-5``.
    """

    def __init__(self, embed_dim, verbose=False, eps=1e-5):
        super().__init__()

        self.eps = eps          # prevents division by 0
        # Initialised to the identity transform: shift = 0, scale = 1.
        self.shift = nn.Parameter(torch.zeros(embed_dim))
        self.scale = nn.Parameter(torch.ones(embed_dim))
        self.verbose = verbose

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift.

        Args:
            x: Input tensor; statistics are computed over ``dim=-1``.

        Returns:
            ``scale * normalized + shift``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(variance + self.eps)

        if self.verbose:
            print('Normalizing (results without learned parameters) ...')
            print('    Input: ', x)
            print('    Mean: ', mean)
            print('    Variance: ', variance)
            print('    Normalized: ', norm_x)

        return self.scale * norm_x + self.shift