In [2]:
import numpy as np


# Step 1 - Build a toy batch of activations arriving from the previous layer.
# Rows are samples, columns are features: (batch_size, num_features).
batch_size, num_features = 3, 4

# Seed the global NumPy RNG so the printed numbers are reproducible.
np.random.seed(42)
# Standard-normal draws, scaled by 5 and shifted by 10.
x = 10 + 5 * np.random.randn(batch_size, num_features)
print("Input activations (x):", x, sep="\n")
print("Shape of x:", x.shape)


# Step 2 - Batch statistics: reduce over axis 0 (the batch axis) so that
# each feature column gets its own mean and variance.

mu = x.mean(axis=0)
var = x.var(axis=0)
print("\nMean per feature:", mu)
print("Variance per feature:", var)


# Step 3 - Standardize every feature column.
# eps keeps the denominator away from zero when a column has no spread.

eps = 1e-5
x_hat = (x - mu) / np.sqrt(var + eps)
print("\nNormalized activations (x_hat):", x_hat, sep="\n")
print("Mean of normalized features (should be ~0):", x_hat.mean(axis=0))
print("Var of normalized features (should be ~1):", x_hat.var(axis=0))


# Step 4 - Per-feature scale (gamma) and shift (beta).
# With gamma = 1 and beta = 0 the output equals the normalized input.

gamma = np.ones(num_features)
beta = np.zeros(num_features)

y = x_hat * gamma + beta
print("\nOutput after applying gamma and beta (y):", y, sep="\n")


# Step 5 - Report the shape of every array used above.

print("\nShapes:")
print(
    "\n".join(
        f"{label} {array.shape}"
        for label, array in (
            ("x      :", x),
            ("mu     :", mu),
            ("var    :", var),
            ("x_hat  :", x_hat),
            ("gamma  :", gamma),
            ("beta   :", beta),
            ("y (output):", y),
        )
    )
)


Input activations (x):
[[12.48357077  9.30867849 13.23844269 17.61514928]
 [ 8.82923313  8.82931522 17.89606408 13.83717365]
 [ 7.65262807 12.71280022  7.68291154  7.67135123]]
Shape of x: (3, 4)

Mean per feature: [ 9.65514399 10.28359798 12.93913943 13.04122472]
Variance per feature: [ 4.23073226  2.98880996 17.42953869 16.79662062]

Normalized activations (x_hat):
[[ 1.37510752 -0.56392178  0.07169162  1.11603525]
 [-0.40153637 -0.84119945  1.187324    0.19421113]
 [-0.97357114  1.40512123 -1.25901561 -1.31024638]]
Mean of normalized features (should be ~0): [ 0.00000000e+00  0.00000000e+00 -1.48029737e-16  0.00000000e+00]
Var of normalized features (should be ~1): [0.99999764 0.99999665 0.99999943 0.9999994 ]

Output after applying gamma and beta (y):
[[ 1.37510752 -0.56392178  0.07169162  1.11603525]
 [-0.40153637 -0.84119945  1.187324    0.19421113]
 [-0.97357114  1.40512123 -1.25901561 -1.31024638]]

Shapes:
x      : (3, 4)
mu     : (4,)
var    : (4,)
x_hat  : (3, 4)
gamma  : (4