In [None]:
import torch
import torch.nn as nn

# Batch Normalization

Purpose:

* BatchNorm is primarily used to address the internal covariate shift problem in deep neural networks by normalizing the activations of each layer across mini-batches during training.

Normalization Strategy:

* BatchNorm normalizes the activations across the mini-batch dimension. It calculates the mean and standard deviation of each feature across the mini-batch and normalizes the features using these statistics.

Learnable Parameters:

* BatchNorm introduces learnable parameters (scale and shift) to the normalized data, allowing the model to adapt and learn the optimal normalization for each layer.

Applicability:

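* BatchNorm is commonly used in deep feed-forward networks, especially convolutional neural networks (CNNs). It is harder to apply to recurrent neural networks (RNNs), where activation statistics change from time step to time step and sequence lengths vary.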

Performance:

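* BatchNorm is sensitive to batch size: with small batches the batch statistics are noisy estimates, which can make training unstable, and at inference time it relies on running statistics accumulated during training, which may not match the statistics used while training.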

### References:

[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167)

In [None]:
class BatchNorm1d(nn.Module):
    """Batch normalization over the channel dimension of 2D or 3D inputs.

    In training mode each channel is normalized with the mean and (biased)
    variance of the current mini-batch, and exponential moving averages of
    the batch statistics are stored in the ``running_mean`` / ``running_var``
    buffers. In eval mode those running statistics are used instead.

    Args:
        num_features: Number of channels ``C``; inputs are expected to be
            shaped ``(N, C)`` or ``(N, C, L)``.
        eps: Constant added to the variance before the square root for
            numerical stability.
        momentum: Weight given to the current batch statistics when updating
            the running averages.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Learnable per-channel scale (gamma) and shift (beta).
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))
        # Running statistics are buffers: saved in the state dict, not trained.
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))

    def forward(self, x):
        """Normalize ``x`` per channel and apply the learnable affine transform.

        Args:
            x: Tensor of shape ``(N, C)`` or ``(N, C, L)``.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``x`` is neither 2D nor 3D.
        """
        if x.dim() not in (2, 3):
            raise ValueError(f"expected 2D or 3D input (got {x.dim()}D input)")

        # Statistics are taken over every dimension except the channel one.
        if x.dim() == 2:
            reduce_dims = (0,)
            stat_shape = (1, -1)
            count = x.size(0)
        else:
            reduce_dims = (0, 2)
            stat_shape = (1, -1, 1)
            count = x.size(0) * x.size(2)

        if self.training:
            mean = x.mean(dim=reduce_dims, keepdim=True)
            var = x.var(dim=reduce_dims, unbiased=False, keepdim=True)

            # The running averages must not become part of the autograd
            # graph; otherwise every step would keep the previous graphs alive.
            with torch.no_grad():
                batch_mean = mean.detach().reshape(-1)
                batch_var = var.detach().reshape(-1)
                if count > 1:
                    # The running variance stores the unbiased estimate
                    # (Bessel's correction); normalization uses the biased one.
                    batch_var = batch_var * (count / (count - 1))
                # In-place updates keep the registered buffers' identity.
                self.running_mean.mul_(1 - self.momentum).add_(self.momentum * batch_mean)
                self.running_var.mul_(1 - self.momentum).add_(self.momentum * batch_var)
        else:
            mean = self.running_mean.view(stat_shape)
            var = self.running_var.view(stat_shape)

        x_normalized = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma.view(stat_shape) * x_normalized + self.beta.view(stat_shape)

# Layer Normalization:

Purpose:

* LayerNorm normalizes the activations of each layer across feature dimensions independently for each sample in a mini-batch. It aims to stabilize the training process and reduce the impact of the scale of input features.

Normalization Strategy:

* LayerNorm normalizes the activations across the feature dimension independently for each sample in the mini-batch. For every sample it calculates the mean and standard deviation over that sample's features and normalizes the features using these statistics.

Learnable Parameters:

* LayerNorm introduces learnable per-feature scale (gamma) and shift (beta) parameters, which are applied after the activations have been normalized with the mean and standard deviation computed across the feature dimension.

Applicability:

* LayerNorm is often used in natural language processing (NLP) tasks, such as sequence modeling and language understanding, where sequence lengths can vary across samples and batch statistics are therefore unreliable.

Performance:

* LayerNorm is less sensitive to batch size and performs well even in small batch sizes. It can also be applied in scenarios where the sequence length varies across samples.

### References:
[LAYERNORM](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm)

[Layer Normalization](https://arxiv.org/abs/1607.06450)

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the trailing dimension(s) of the input.

    Each sample is normalized with the mean and biased variance computed over
    its last ``len(normalized_shape)`` dimensions, then scaled by ``gamma``
    and shifted by ``beta``.

    Args:
        normalized_shape: Size of the trailing dimension(s) to normalize
            over, given as an int (last dimension only) or a sequence of ints.
        eps: Constant added to the variance before the square root for
            numerical stability.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        # Learnable elementwise scale and shift, shaped like the normalized dims.
        self.gamma = nn.Parameter(torch.ones(normalized_shape))
        self.beta = nn.Parameter(torch.zeros(normalized_shape))

        # An int means "last dimension only"; a sequence covers as many
        # trailing dimensions as it has entries, matching gamma/beta's shape.
        n_dims = 1 if isinstance(normalized_shape, int) else len(normalized_shape)
        self._reduce_dims = tuple(range(-n_dims, 0))

    def forward(self, x):
        """Normalize ``x`` over its trailing dimension(s).

        Args:
            x: Tensor whose trailing dimensions match ``normalized_shape``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=self._reduce_dims, keepdim=True)
        # Biased variance with eps inside the square root: stays finite even
        # when the normalized dimension holds a single element.
        var = x.var(dim=self._reduce_dims, unbiased=False, keepdim=True)
        return self.gamma * (x - mean) / torch.sqrt(var + self.eps) + self.beta

# Root Mean Square Normalization:
Purpose:

- RMSNorm normalizes the inputs by dividing them by the root mean square (RMS) value along a specific axis, typically along the feature dimension. It aims to stabilize and accelerate training by normalizing the magnitude of the inputs.

Normalization Strategy:

- RMSNorm normalizes the inputs by dividing them by the root mean square (RMS) value along a specific axis, typically along the feature dimension. For each sample it calculates the RMS over the features along that axis, $\mathrm{RMS}(x) = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n} x_i^2 + \epsilon}$, and divides the features by this value. Unlike LayerNorm, the mean is not subtracted.

Learnable Parameters:

- RMSNorm introduces a single learnable per-feature gain vector (gamma, named `weight` in the code below) to scale the normalized data, but it does not introduce any parameter to shift the data.

Applicability:

- RMSNorm is less common compared to BatchNorm and LayerNorm but can be useful in scenarios where stabilizing the magnitude of inputs is beneficial, such as in reinforcement learning or certain types of recurrent neural networks.

Performance:

- RMSNorm is relatively simple and computationally efficient compared to BatchNorm and LayerNorm. It can provide stable performance across different batch sizes and input dimensions.

### References:

[Root Mean Square Layer Normalization](https://arxiv.org/abs/1910.07467)

[]()

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    The input is divided by the root mean square of its last dimension and
    then multiplied by a learnable gain; no mean is subtracted and no shift
    is applied.

    Args:
        dim: Length of the learnable gain vector (presumably the size of the
            input's last dimension, so that it broadcasts against it).
        eps: Constant added to the mean square before the square root.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Gain starts at one, so the module initially only rescales by 1/RMS.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor):
        """Return ``x`` divided by its last-dimension RMS, scaled by ``weight``."""
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        rms = torch.sqrt(mean_square + self.eps)
        normalized = x / rms
        return self.weight * normalized