# 5-Layer Neural Network

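A fully connected neural network with five layers (four ReLU hidden layers followed by a sigmoid output layer) and configurable layer sizes, implemented with NumPy.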

**Architecture:**
```
Input (X) -> Layer1 -> Layer2 -> Layer3 -> Layer4 -> Layer5 (Output)
```

## Import Libraries

In [None]:
import numpy as np

## FiveLayerNN Class

In [None]:
class FiveLayerNN:
    """
    5-Layer Neural Network with configurable layer sizes.

    Architecture:
    Input (X) -> Layer1 -> Layer2 -> Layer3 -> Layer4 -> Layer5 (Output)

    Attributes:
        layer_dims: the layer sizes [n_x, n1, n2, n3, n4, n5] given at construction
        learning_rate: step size stored for gradient descent
        seed: seed used to draw the initial weights
        parameters: dict holding 'W1'..'W5' and 'b1'..'b5'
        cache: dict, empty after construction
        gradients: dict, empty after construction
    """

    # Number of weight layers; layer_dims needs one more entry (the input size).
    _NUM_LAYERS = 5

    def __init__(self, layer_dims, learning_rate=0.01, seed=42):
        """
        Initialize the neural network.

        Args:
            layer_dims: sequence of 6 positive integers [n_x, n1, n2, n3, n4, n5]
                        where n_x is input size, n5 is output size
            learning_rate: learning rate for gradient descent
            seed: seed for the weight initialization; the default (42)
                  reproduces the historical initial weights, None draws
                  fresh random weights

        Raises:
            ValueError: if layer_dims does not have exactly 6 entries or
                        contains a non-positive size
        """
        expected_len = self._NUM_LAYERS + 1
        if len(layer_dims) != expected_len:
            raise ValueError(
                f"layer_dims must contain {expected_len} sizes "
                f"[n_x, n1, n2, n3, n4, n5], got {len(layer_dims)}"
            )
        if any(dim <= 0 for dim in layer_dims):
            raise ValueError(f"all layer sizes must be positive, got {list(layer_dims)}")

        self.layer_dims = layer_dims
        self.learning_rate = learning_rate
        self.seed = seed
        self.parameters = {}
        self.cache = {}
        self.gradients = {}

        self._initialize_parameters()

    def _initialize_parameters(self):
        """Initialize W and b for all 5 layers using He initialization."""
        # A private RandomState yields the same stream as np.random.seed(seed)
        # followed by np.random.randn, but leaves the global NumPy RNG alone,
        # so building a network no longer resets randomness elsewhere.
        rng = np.random.RandomState(self.seed)

        for l in range(1, self._NUM_LAYERS + 1):
            fan_in = self.layer_dims[l - 1]
            fan_out = self.layer_dims[l]
            # He initialization: scale standard-normal draws by sqrt(2 / fan_in)
            # so activations neither blow up nor vanish through the ReLU layers.
            self.parameters[f'W{l}'] = rng.randn(fan_out, fan_in) * np.sqrt(2 / fan_in)
            self.parameters[f'b{l}'] = np.zeros((fan_out, 1))

## Activation Functions

In [None]:
# Add activation functions to the class
def relu(self, Z):
    """Rectified linear unit: element-wise maximum of 0 and Z."""
    rectified = np.maximum(0, Z)
    return rectified

def relu_derivative(self, Z):
    """Derivative of ReLU: 1.0 where Z is strictly positive, 0.0 elsewhere."""
    is_active = np.greater(Z, 0)
    return is_active.astype(float)

def sigmoid(self, Z):
    """Logistic sigmoid of Z, element-wise.

    Z is clipped to [-500, 500] before exponentiation so that np.exp
    stays within floating-point range for extreme inputs.
    """
    bounded = np.clip(Z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(self, Z):
    """Derivative of the sigmoid at Z: sigmoid(Z) * (1 - sigmoid(Z))."""
    activation = self.sigmoid(Z)
    complement = 1 - activation
    return activation * complement

def softmax(self, Z):
    """Column-wise softmax of Z (normalised along axis 0).

    The per-column maximum is subtracted before exponentiating, which
    leaves the result unchanged but avoids overflow for large entries.
    """
    column_peak = np.max(Z, axis=0, keepdims=True)
    weights = np.exp(Z - column_peak)
    normaliser = np.sum(weights, axis=0, keepdims=True)
    return weights / normaliser

# Register the activation helpers above as FiveLayerNN methods
setattr(FiveLayerNN, 'relu', relu)
setattr(FiveLayerNN, 'relu_derivative', relu_derivative)
setattr(FiveLayerNN, 'sigmoid', sigmoid)
setattr(FiveLayerNN, 'sigmoid_derivative', sigmoid_derivative)
setattr(FiveLayerNN, 'softmax', softmax)

## Forward Propagation

In [None]:
def forward_propagation(self, X):
    """
    Forward pass through all 5 layers.

    Layers 1-4: Linear -> ReLU
    Layer 5:    Linear -> Sigmoid

    Stores the input as cache['A0'] and, for every layer l, the
    pre-activation cache['Z{l}'] and activation cache['A{l}'].

    Returns:
        A5, the sigmoid output of layer 5.
    """
    output_layer = 5

    self.cache['A0'] = X
    activation = X

    for idx in range(1, output_layer + 1):
        weights = self.parameters[f'W{idx}']
        bias = self.parameters[f'b{idx}']

        pre_activation = np.dot(weights, activation) + bias

        # Hidden layers use ReLU; only the final layer uses the sigmoid.
        if idx == output_layer:
            activation = self.sigmoid(pre_activation)
        else:
            activation = self.relu(pre_activation)

        self.cache[f'Z{idx}'] = pre_activation
        self.cache[f'A{idx}'] = activation

    return activation

# Register forward_propagation as a FiveLayerNN method
setattr(FiveLayerNN, 'forward_propagation', forward_propagation)

## Backward Propagation

**Layer 5 (Output, sigmoid activation with binary cross-entropy loss):**
- dZ⁵ = A⁵ - Y
- dW⁵ = (1/m) * dZ⁵ · (A⁴)ᵀ
- db⁵ = (1/m) * Σ dZ⁵  (summed over the m examples, i.e. along axis 1)

**Layers 4-1 (ReLU, so g' is the ReLU derivative; `·` is a matrix product, `*` is element-wise):**
- dZˡ = ((Wˡ⁺¹)ᵀ · dZˡ⁺¹) * g'ˡ(Zˡ)
- dWˡ = (1/m) * dZˡ · (Aˡ⁻¹)ᵀ, where A⁰ = X
- dbˡ = (1/m) * Σ dZˡ

In [None]:
def backward_propagation(self, Y):
    """
    Backward pass computing dZ, dW, db for all 5 layers.

    Reads 'A0'..'A5' and 'Z1'..'Z4' from self.cache (the values left by
    the most recent forward pass) and writes 'dZ{l}', 'dW{l}' and 'db{l}'
    for l = 5..1 into self.gradients.

    Args:
        Y: label array with the same shape as the cached output A5,
           i.e. (n5, m) with one column per example.

    Raises:
        ValueError: if Y's shape differs from A5's. Without this check a
                    transposed Y would silently broadcast in ``A5 - Y``
                    and produce gradients of the wrong shape.
        KeyError: if the cache has not been populated yet.
    """
    A_out = self.cache['A5']
    if Y.shape != A_out.shape:
        raise ValueError(
            f"Y has shape {Y.shape} but the network output has shape {A_out.shape}"
        )

    m = Y.shape[1]

    # Output layer: dZ⁵ = A⁵ - Y
    dZ = A_out - Y

    # Walk from layer 5 down to layer 1; every layer uses the same formulas.
    for l in range(5, 0, -1):
        A_prev = self.cache[f'A{l - 1}']  # A⁰ is the input X

        self.gradients[f'dZ{l}'] = dZ
        # dWˡ = (1/m) * dZˡ · (Aˡ⁻¹)ᵀ
        self.gradients[f'dW{l}'] = (1/m) * np.dot(dZ, A_prev.T)
        # dbˡ = (1/m) * Σ dZˡ  (sum over examples)
        self.gradients[f'db{l}'] = (1/m) * np.sum(dZ, axis=1, keepdims=True)

        if l > 1:
            # dZˡ⁻¹ = (Wˡ)ᵀ · dZˡ * g'(Zˡ⁻¹), with g' the ReLU derivative
            dZ = np.dot(self.parameters[f'W{l}'].T, dZ) * self.relu_derivative(self.cache[f'Z{l - 1}'])

# Register backward_propagation as a FiveLayerNN method
setattr(FiveLayerNN, 'backward_propagation', backward_propagation)

## Update Parameters

In [None]:
def update_parameters(self):
    """Apply one gradient-descent step, in place, to every W and b.

    Each parameter has learning_rate times its gradient (from
    self.gradients) subtracted from it.
    """
    for idx in range(1, 6):
        # Weights first, then the bias, for each layer.
        for kind in ('W', 'b'):
            key = f'{kind}{idx}'
            self.parameters[key] -= self.learning_rate * self.gradients[f'd{key}']

# Register update_parameters as a FiveLayerNN method
setattr(FiveLayerNN, 'update_parameters', update_parameters)

## Loss Function

In [None]:
def compute_loss(self, Y):
    """Binary cross-entropy between Y and the cached output A5.

    The cached activations are clipped to [1e-15, 1 - 1e-15] first so
    that neither logarithm is evaluated at zero. The summed loss is
    divided by m = Y.shape[1].
    """
    m = Y.shape[1]

    # Keep probabilities strictly inside (0, 1) to avoid log(0).
    probs = np.clip(self.cache['A5'], 1e-15, 1 - 1e-15)

    positive_term = Y * np.log(probs)
    negative_term = (1 - Y) * np.log(1 - probs)
    total = np.sum(positive_term + negative_term)

    return -(1/m) * total

# Register compute_loss as a FiveLayerNN method
setattr(FiveLayerNN, 'compute_loss', compute_loss)

## Training

In [None]:
def train(self, X, Y, epochs=1000, print_loss=True):
    """Run full-batch gradient descent for the given number of epochs.

    Every epoch performs, in order: forward pass on X, loss computation
    against Y, backward pass, parameter update. The recorded loss is
    therefore the one measured before that epoch's update.

    Args:
        X: inputs handed to forward_propagation.
        Y: labels handed to compute_loss and backward_propagation.
        epochs: number of iterations.
        print_loss: when true, print the loss on every 100th epoch
                    (starting with epoch 0).

    Returns:
        List with one loss value per epoch.
    """
    history = []

    for epoch in range(epochs):
        self.forward_propagation(X)

        loss = self.compute_loss(Y)
        history.append(loss)

        self.backward_propagation(Y)
        self.update_parameters()

        # Report only when asked to, and only on multiples of 100.
        if not print_loss or epoch % 100:
            continue
        print(f"Epoch {epoch:4d} | Loss: {loss:.6f}")

    return history

# Register train as a FiveLayerNN method
setattr(FiveLayerNN, 'train', train)

## Prediction

In [None]:
def predict(self, X):
    """Return 0/1 predictions for X.

    Runs a forward pass and labels an output 1 when it is strictly
    greater than 0.5, otherwise 0.
    """
    probabilities = self.forward_propagation(X)
    above_threshold = np.greater(probabilities, 0.5)
    return above_threshold.astype(int)

def accuracy(self, X, Y):
    """Return the percentage (0-100) of predictions on X that equal Y."""
    matches = self.predict(X) == Y
    fraction_correct = np.mean(matches)
    return fraction_correct * 100

# Register predict and accuracy as FiveLayerNN methods
setattr(FiveLayerNN, 'predict', predict)
setattr(FiveLayerNN, 'accuracy', accuracy)

## Demo - XOR-like Problem

In [None]:
# Build a toy XOR-like dataset: two standard-normal features per example,
# labelled 1 when the features share a sign (product > 0) and 0 otherwise.
np.random.seed(1)
m = 1000  # number of samples

X = np.random.randn(2, m)
Y = (X[0] * X[1] > 0).astype(int).reshape(1, -1)

print(f"X shape: {X.shape}", f"Y shape: {Y.shape}", sep="\n")

In [None]:
# Layer sizes in order: [input, layer1, layer2, layer3, layer4, output]
layer_dims = [2, 8, 8, 6, 4, 1]

# Instantiate the network (this cell only builds it; no training happens here)
nn = FiveLayerNN(layer_dims, learning_rate=0.1)

# Banner summarising the run configuration
print("=" * 50, "5-Layer Neural Network Training", "=" * 50, sep="\n")
print(f"Architecture: {layer_dims}", f"Training samples: {m}", "=" * 50, sep="\n")

In [None]:
# Train the network on the full dataset for 1000 epochs with loss printing
# enabled; `losses` keeps whatever train() returns (its per-epoch loss list).
losses = nn.train(X, Y, epochs=1000, print_loss=True)

In [None]:
# Report accuracy on the same data the network was trained on
print("=" * 50)
print(f"Final Accuracy: {nn.accuracy(X, Y):.2f}%", "=" * 50, sep="\n")

In [None]:
# Print gradient shapes for verification: for each layer, show the shapes of
# the dW, db and dZ arrays currently stored in nn.gradients.
print("\nGradient Shapes:")
for l in range(1, 6):
    # One output line per layer; the three f-strings are concatenated.
    print(f"  Layer {l}: dW{l}{nn.gradients[f'dW{l}'].shape}, "
          f"db{l}{nn.gradients[f'db{l}'].shape}, "
          f"dZ{l}{nn.gradients[f'dZ{l}'].shape}")