In [2]:
import torch
import torch.nn as nn

In [3]:
# Demo: what LayerNorm does to a ReLU-activated vector (zero mean, ~unit std).

# Tunable constants for this demo, kept in one place instead of inline literals.
SEED = 42       # fixed seed so the printed tensors are reproducible
D_MODEL = 10    # length of the toy "residual stream" vector
LN_EPS = 1e-5   # added to the variance inside LayerNorm for numerical stability

# Set seed for reproducibility
torch.manual_seed(SEED)

# Create a random vector (e.g., simulating a residual stream)
x = torch.randn(D_MODEL)  # shape: [d_model]
print("Original vector:\n", x)

# Apply ReLU (simulating post-activation in a transformer block).
# Negative entries become 0, so the vector is no longer zero-centred.
x_relu = torch.relu(x)
print("\nAfter ReLU:\n", x_relu)

# Initialize LayerNorm over the LAST dimension only.
# Passing the feature size (shape[-1]) rather than the full tensor shape gives
# the same module for this 1-D vector, but also keeps it valid for batched
# input of shape [..., d_model]; the full shape would instead normalize over
# every dimension and tie the module to one specific batch size.
layer_norm = nn.LayerNorm(normalized_shape=x_relu.shape[-1], eps=LN_EPS)

# Apply LayerNorm.
# The module owns learnable weight/bias parameters (elementwise_affine defaults
# to True), which is why the printed result carries a grad_fn.
x_normed = layer_norm(x_relu)
print("\nAfter LayerNorm:\n", x_normed)

# Optionally: Check mean and std after LN.
# unbiased=False matches the biased variance estimator LayerNorm uses; the std
# lands slightly below 1 because eps is added to the variance before the sqrt.
print("\nMean of normalized vector:", x_normed.mean().item())
print("Std of normalized vector:", x_normed.std(unbiased=False).item())

Original vector:
 tensor([ 0.3367,  0.1288,  0.2345,  0.2303, -1.1229, -0.1863,  2.2082, -0.6380,
         0.4617,  0.2674])

After ReLU:
 tensor([0.3367, 0.1288, 0.2345, 0.2303, 0.0000, 0.0000, 2.2082, 0.0000, 0.4617,
        0.2674])

After LayerNorm:
 tensor([-0.0801, -0.4129, -0.2438, -0.2504, -0.6191, -0.6191,  2.9155, -0.6191,
         0.1199, -0.1911], grad_fn=<NativeLayerNormBackward0>)

Mean of normalized vector: 1.1920929132713809e-08
Std of normalized vector: 0.9999872446060181
