In [82]:
import numpy as np
from keras.datasets import mnist
import time

## Part 1: Forward Propagation

In [83]:
# Seed NumPy's global random generator so that the random draws made later in
# this notebook (np.random.randn for weight initialization and
# np.random.permutation for the train/validation split) are repeatable when
# the notebook is run top to bottom from a fresh kernel.
np.random.seed(10)

In [84]:
def initialize_parameters(layer_dims):
    """
    Build the initial weight matrices and bias vectors for every layer.

    Weights are sampled from a standard normal distribution and divided by the
    size of the previous layer; biases start at zero.

    Arguments:
    layer_dims -- list of layer sizes, input layer first

    Returns:
    parameters -- dictionary mapping "W<l>" to an array of shape
                  (layer_dims[l], layer_dims[l-1]) and "b<l>" to a zero array
                  of shape (layer_dims[l], 1), for l = 1 .. len(layer_dims) - 1
    """
    parameters = {}
    # Walk over consecutive (previous size, current size) pairs, numbering layers from 1.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(size_pairs, start=1):
        parameters[f"W{layer}"] = np.random.randn(fan_out, fan_in) / fan_in
        parameters[f"b{layer}"] = np.zeros((fan_out, 1))
    return parameters


In [85]:
def linear_forward(A, W, b):
    """
    Apply the affine transform of one layer: Z = W . A + b.

    Arguments:
    A -- activations coming from the previous layer
    W -- weight matrix of the current layer
    b -- bias vector of the current layer (added to W . A via broadcasting)

    Returns:
    Z -- pre-activation values of the current layer
    linear_cache -- dictionary with keys "A", "W" and "b" holding the inputs,
                    kept for the backward pass
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, dict(A=A, W=W, b=b)


In [86]:
def softmax(Z):
    """
    Column-wise softmax of Z.

    Arguments:
    Z -- pre-activation values; each column is normalized independently

    Returns:
    A -- softmax probabilities, same shape as Z, each column summing to 1
    activation_cache -- Z after subtracting each column's maximum (the shifted
                        values actually fed to exp), kept for backpropagation
    """
    # Subtracting the per-column maximum leaves the softmax unchanged but keeps
    # exp() from overflowing.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(shifted)
    column_totals = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_totals, shifted


In [87]:
def relu(Z):
    """
    Element-wise rectified linear unit.

    Arguments:
    Z -- pre-activation values

    Returns:
    A -- Z with every negative entry replaced by 0
    activation_cache -- the untouched input Z, kept for backpropagation
    """
    rectified = np.maximum(0, Z)
    return rectified, Z


In [88]:
def linear_activation_forward(A_prev, W, B, activation):
    """
    Run one LINEAR->ACTIVATION step of the forward pass.

    Parameters:
    -----------
    A_prev : np.ndarray
        Activations from the previous layer, of shape (size of previous layer, number of examples).
    W : np.ndarray
        Weights matrix of the current layer, of shape (size of current layer, size of previous layer).
    B : np.ndarray
        Bias vector of the current layer, of shape (size of current layer, 1).
    activation : str
        Name of the activation to apply: "relu" or "softmax".

    Returns:
    --------
    A : np.ndarray
        The post-activation value, of shape (size of current layer, number of examples).
    cache : dict
        A dictionary containing:
        - "linear_cache": the cache returned by linear_forward (A_prev, W, B).
        - "activation_cache": the cache returned by the activation function.

    Raises:
    -------
    Exception:
        If the provided activation function is not "relu" or "softmax".

    """
    # The affine part is computed first, exactly once, whatever the activation.
    Z, linear_cache = linear_forward(A_prev, W, B)

    # Reject unknown activation names before dispatching.
    if activation not in ('relu', 'softmax'):
        raise Exception('Activation function must be either "relu" or "softmax"')

    activation_fn = relu if activation == 'relu' else softmax
    A, activation_cache = activation_fn(Z)

    return A, {"linear_cache": linear_cache, "activation_cache": activation_cache}


In [89]:
def l_model_forward(X, parameters, use_batchnorm=False):
    """
    Forward pass through the whole network: [LINEAR->RELU]*(L-1)->LINEAR->SOFTMAX.

    Arguments:
    X -- input data
    parameters -- dictionary with "W<l>" and "b<l>" entries for every layer;
                  the number of layers is taken as len(parameters) // 2
    use_batchnorm -- when True, the output of every ReLU layer is passed
                     through apply_batchnorm before feeding the next layer

    Returns:
    AL -- output of the final (softmax) layer
    caches -- list with one cache per layer, in layer order
    """
    caches = []
    num_layers = len(parameters) // 2
    activations = X

    # Hidden layers: LINEAR -> RELU, optionally followed by normalization.
    for layer in range(1, num_layers):
        activations, layer_cache = linear_activation_forward(
            activations, parameters[f"W{layer}"], parameters[f"b{layer}"], "relu"
        )
        if use_batchnorm:
            activations = apply_batchnorm(activations)
        caches.append(layer_cache)

    # Output layer: LINEAR -> SOFTMAX (never normalized).
    AL, output_cache = linear_activation_forward(
        activations, parameters[f"W{num_layers}"], parameters[f"b{num_layers}"], "softmax"
    )
    caches.append(output_cache)

    return AL, caches


In [90]:
def compute_cost(AL, Y):
    """
    Categorical cross-entropy cost averaged over the examples.

    Arguments:
    AL -- probability vector from softmax, shape (num_classes, number of examples)
    Y -- ground truth labels, one-hot encoded, shape (num_classes, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    epsilon = 1e-8  # keeps log() finite at 0 and the division safe
    num_examples = Y.shape[1]
    log_likelihood = Y * np.log(AL + epsilon)
    return -np.sum(log_likelihood) / (num_examples + epsilon)


In [91]:
def apply_batchnorm(A):
    """
    Standardize every row of A using that row's mean and variance.

    Statistics are taken along axis 1, so each row is shifted to zero mean and
    scaled by the square root of its variance (plus a small epsilon).

    Arguments:
    A -- activations of a given layer

    Returns:
    NA -- normalized activations, same shape as A
    """
    epsilon = 1e-8  # avoids division by zero for constant rows
    row_mean = np.mean(A, axis=1, keepdims=True)
    row_var = np.var(A, axis=1, keepdims=True)
    centered = A - row_mean
    return centered / np.sqrt(row_var + epsilon)


# Part 2: Backward Propagation

In [92]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear (affine) part of a single layer.

    Parameters:
    -----------
    dZ : numpy.ndarray
        Gradient of the cost with respect to the linear output of the current layer.
    cache : dict
        Dictionary containing:
        - "A": Activations from the previous layer.
        - "W": Weights of the current layer.
        - "b": Biases of the current layer (not needed for the gradients).

    Returns:
    --------
    dA_prev : numpy.ndarray
        Gradient of the cost with respect to the activations of the previous layer.
    dW : numpy.ndarray
        Gradient of the cost with respect to the weights, averaged over the examples.
    db : numpy.ndarray
        Gradient of the cost with respect to the biases, averaged over the examples.
    """
    prev_activations, weights = cache["A"], cache["W"]
    # Examples are stored as columns of the cached activations.
    num_examples = prev_activations.shape[1]

    weight_grad = dZ @ prev_activations.T / num_examples
    input_grad = weights.T @ dZ
    bias_grad = np.sum(dZ, axis=1, keepdims=True) / num_examples

    return input_grad, weight_grad, bias_grad


In [93]:
def linear_activation_backward(dA, cache, activation):
    """
    Run one LINEAR->ACTIVATION step of the backward pass.

    Parameters:
    -----------
    dA : np.ndarray
        Value handed to the activation's backward function,
        of shape (size of current layer, number of examples).
    cache : dict
        Dictionary containing:
        - "linear_cache": Cached values (A_prev, W, B) from the forward propagation step.
        - "activation_cache": Cache expected by the activation's backward function.
    activation : str
        The activation function used in the forward propagation. Must be either "relu" or "softmax".

    Returns:
    --------
    dA_prev : np.ndarray
        Gradient of the cost with respect to the activation of the previous layer,
        of shape (size of previous layer, number of examples).
    dW : np.ndarray
        Gradient of the cost with respect to the weights of the current layer,
        of shape (size of current layer, size of previous layer).
    db : np.ndarray
        Gradient of the cost with respect to the biases of the current layer,
        of shape (size of current layer, 1).

    Raises:
    -------
    Exception:
        If the provided activation function is not "relu" or "softmax".
    """
    linear_cache = cache["linear_cache"]
    activation_cache = cache["activation_cache"]

    # Reject unknown activation names before dispatching.
    if activation not in ("relu", "softmax"):
        raise Exception('Activation function must be either "relu" or "softmax"')

    # First undo the activation, then the affine transform.
    activation_backward = relu_backward if activation == "relu" else softmax_backward
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db


In [94]:
def relu_backward(dA, activation_cache):
    """
    Backward pass through a ReLU activation.

    Parameters:
    -----------
    dA : numpy.ndarray
        Gradient of the cost with respect to the activation of the current layer.
    activation_cache : numpy.ndarray
        Stored linear activation (Z) from the forward propagation.

    Returns:
    --------
    dZ : numpy.ndarray
        dA where Z was strictly positive, zero elsewhere.
    """
    # ReLU passes the gradient through only where its input was positive.
    positive_mask = (activation_cache > 0).astype(float)
    return dA * positive_mask


In [95]:
def softmax_backward(dA, activation_cache):
    """
    Backward pass through the softmax output layer.

    The result is simply dA - Y. In this notebook l_model_backward passes the
    softmax output AL as `dA`, which makes the result the gradient of the
    cross-entropy cost with respect to Z.

    Parameters:
    -----------
    dA : numpy.ndarray
        Value from which the labels are subtracted (the softmax output AL when
        called from l_model_backward).
    activation_cache : dict
        Dictionary containing:
        - "Y": True labels (one-hot encoded).

    Returns:
    --------
    dZ : numpy.ndarray
        dA minus the true labels.
    """
    return dA - activation_cache["Y"]


In [96]:
def l_model_backward(AL, Y, caches):
    """
    Backward pass through the whole network.

    Parameters:
    -----------
    AL : numpy.ndarray
        Post-activation output of the final layer.
    Y : numpy.ndarray
        True labels in one-hot encoded format.
    caches : list
        List of caches for each layer, generated during forward propagation.

    Returns:
    --------
    grads : dict
        For every layer l, "dW<l>" and "db<l>" hold the parameter gradients and
        "dA<l>" holds the gradient that layer l passes back to the layer before it.
    """
    num_layers = len(caches)
    output_cache = caches[-1]

    # Output layer: the softmax backward step needs the labels, so they are
    # bundled into the activation cache next to the stored Z.
    dA_prev, dW, db = linear_activation_backward(
        AL,
        {
            "linear_cache": output_cache["linear_cache"],
            "activation_cache": {"Z": output_cache["activation_cache"], "Y": Y},
        },
        activation="softmax",
    )
    grads = {f"dA{num_layers}": dA_prev, f"dW{num_layers}": dW, f"db{num_layers}": db}

    # Hidden ReLU layers, walking from layer L-1 down to layer 1.
    for layer in range(num_layers - 1, 0, -1):
        dA_prev, dW, db = linear_activation_backward(dA_prev, caches[layer - 1], activation="relu")
        grads.update({f"dA{layer}": dA_prev, f"dW{layer}": dW, f"db{layer}": db})

    return grads


In [97]:
def update_parameters(parameters, grads, learning_rate):
    """
    Apply one gradient-descent step to every weight matrix and bias vector.

    The arrays stored in `parameters` are modified in place.

    Arguments:
    parameters -- Dictionary containing the DNN parameters ("W<l>", "b<l>")
    grads -- Dictionary containing the gradients ("dW<l>", "db<l>")
    learning_rate -- Learning rate for gradient descent

    Returns:
    parameters -- The same dictionary, holding the updated parameters
    """
    num_layers = len(parameters) // 2  # one W and one b per layer

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = f"{prefix}{layer}"
            parameters[key] -= learning_rate * grads[f"d{key}"]

    return parameters


# Part 3: Train the network and produce predictions

In [98]:
def l_layer_model(X, Y, layers_dims, learning_rate, num_iterations=100, batch_size=256, use_bnorm=False):
    """
    Trains a deep neural network with mini-batch gradient descent and optional
    batch normalization.

    The samples are split 80/20 into training and validation subsets using a
    random permutation. Every 100 mini-batches the cost of the current batch
    and the validation cost are recorded and printed; training stops early when
    the validation cost changes by less than 1e-4 between two consecutive
    checkpoints.

    Parameters:
    ---------
    X : np.ndarray
        Input data with shape (features, number_of_samples).
    Y : np.ndarray
        True labels (one-hot encoded) with shape (classes, number_of_samples).
    layers_dims : list
        List specifying the size of each layer in the network.
    learning_rate : float
        Learning rate for the gradient descent updates.
    num_iterations : int, optional
        Total number of epochs for training. Default is 100.
    batch_size : int, optional
        Number of samples per mini-batch. Default is 256.
    use_bnorm : bool, optional
        If True, apply batch normalization during training.

    Returns:
    -------
    parameters : dict
        The learned weights and biases of the network.
    costs_train : list
        Training cost values recorded at the checkpoints.

    Raises:
    ------
    ValueError
        If the arguments would result in no mini-batch being processed.
    """
    def print_weight_magnitudes(header):
        # Report the mean absolute weight of every layer under the given header.
        print(header)
        print(f"This run is {'with' if use_bnorm else 'without'} batch normalization. (use_bnorm={use_bnorm})")
        for l in range(1, len(layers_dims)):
            avg_magnitude = np.mean(np.abs(parameters[f"W{l}"]))
            print(f"Layer {l}: Average Magnitude = {avg_magnitude:.5f}")

    # Lists to track costs for training and validation
    costs_train = []
    costs_valid = []

    # Split data into training (80%) and validation (20%)
    num_samples = X.shape[1]
    train_size = int(0.8 * num_samples)
    indices = np.random.permutation(num_samples)
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:]

    X_train, X_valid = X[:, train_indices], X[:, valid_indices]
    Y_train, Y_valid = Y[:, train_indices], Y[:, valid_indices]

    # Without at least one mini-batch there is nothing to report at the end;
    # fail with a clear message instead of an undefined-variable error.
    if num_iterations < 1 or batch_size < 1 or X_train.shape[1] == 0:
        raise ValueError(
            "l_layer_model needs num_iterations >= 1, batch_size >= 1 and at least one training sample."
        )

    # Initialize parameters for the network
    parameters = initialize_parameters(layers_dims)

    # Number of mini-batches processed so far, and the current (0-based) epoch
    batch_counter = 0
    total_epochs = 0

    # Training loop over epochs
    for epoch in range(num_iterations):
        total_epochs = epoch

        # Mini-batch loop
        for start in range(0, X_train.shape[1], batch_size):
            end = min(start + batch_size, X_train.shape[1])
            X_batch = X_train[:, start:end]
            Y_batch = Y_train[:, start:end]
            batch_counter += 1

            # Forward pass, backward pass, parameter update
            AL, caches = l_model_forward(X_batch, parameters, use_bnorm)
            gradients = l_model_backward(AL, Y_batch, caches)
            parameters = update_parameters(parameters, gradients, learning_rate)

            # Only every 100th mini-batch is a logging checkpoint
            if batch_counter % 100 != 0:
                continue

            # Training cost of the batch just processed
            train_cost = compute_cost(AL, Y_batch)
            costs_train.append(train_cost)

            # Validation cost with the freshly updated parameters
            AL_valid, _ = l_model_forward(X_valid, parameters, use_bnorm)
            valid_cost = compute_cost(AL_valid, Y_valid)
            costs_valid.append(valid_cost)

            print(f"Step {batch_counter}: Training Cost = {train_cost:.5f}, Validation Cost = {valid_cost:.5f}")

            # Early stopping: validation cost barely moved since the last checkpoint
            if len(costs_valid) > 1 and abs(costs_valid[-2] - costs_valid[-1]) < 0.0001:
                validation_accuracy = predict(X_valid, Y_valid, parameters, use_bnorm)
                print("Early stopping activated.")
                print(f"Final Training Cost: {train_cost}")
                print(f"Final Validation Cost: {valid_cost}")
                print(f"Accuracy on validation: {validation_accuracy}")
                print(f"Epoch: {total_epochs}, Iterations: {batch_counter}")

                print_weight_magnitudes("\nAverage Magnitude of Weights ('without' L2):")

                return parameters, costs_train

    # Final cost computations. The validation activations are recomputed with
    # the final parameters: the ones from the last checkpoint may be stale, or
    # missing altogether when fewer than 100 mini-batches were run.
    final_train_cost = compute_cost(AL, Y_batch)
    AL_valid, _ = l_model_forward(X_valid, parameters, use_bnorm)
    final_valid_cost = compute_cost(AL_valid, Y_valid)

    print("Training complete.")
    print(f"Final Training Cost: {final_train_cost}")
    print(f"Final Validation Cost: {final_valid_cost}")
    print("Accuracy on validation: ", predict(X_valid, Y_valid, parameters, use_bnorm))
    print(f"Epoch: {total_epochs}, Iterations: {batch_counter}")

    print_weight_magnitudes("\nAverage Magnitude of Weights (without L2):")

    return parameters, costs_train


In [99]:
def predict(X, Y, parameters, use_bnorm=False):
    """
    Run the network on X and measure how often it picks the labelled class.

    Arguments:
    X -- Input data, numpy array of shape (height*width, number_of_examples)
    Y -- True labels of the data, one-hot encoded, shape (num_of_classes, number_of_examples)
    parameters -- Dictionary containing the DNN architecture's parameters
    use_bnorm -- Forwarded to l_model_forward as use_batchnorm

    Returns:
    accuracy -- Fraction of examples classified correctly (a value between 0 and 1)
    """
    probabilities, _ = l_model_forward(X, parameters, use_batchnorm=use_bnorm)

    # The predicted / true class is the row index of the largest entry per column.
    is_correct = np.argmax(probabilities, axis=0) == np.argmax(Y, axis=0)

    return np.mean(is_correct)


# MNIST dataset

In [100]:
# Fetch MNIST as (images, labels) pairs for the training and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# One row per image: flatten each 2D image into a vector and scale the pixel
# values to the range [0, 1]
x_train = x_train.reshape(len(x_train), -1) / 255.0
x_test = x_test.reshape(len(x_test), -1) / 255.0

# One-hot encode the labels by picking rows of the identity matrix
num_classes = 10
y_train_one_hot = np.eye(num_classes)[y_train]
y_test_one_hot = np.eye(num_classes)[y_test]

# From here on the label variables hold the one-hot encoded versions
y_train, y_test = y_train_one_hot, y_test_one_hot


In [101]:
# Train the network (hidden layers of 20, 7 and 5 units, 10 outputs) without
# batch normalization, then report accuracy and the elapsed wall time.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
past = time.perf_counter()
params, loss_lst = l_layer_model(x_train.T, y_train.T, [x_train.shape[1], 20, 7, 5, 10],
                                 learning_rate=0.009, num_iterations=100,
                                 batch_size=256, use_bnorm=False)

print("Accuracy on train: ", predict(x_train.T, y_train.T, params))
print("Accuracy on test: ", predict(x_test.T, y_test.T, params))

print(f"The time it takes: {np.round(time.perf_counter() - past, 2)} seconds")


Step 100: Training Cost = 2.30193, Validation Cost = 2.30221
Step 200: Training Cost = 2.30199, Validation Cost = 2.30168
Step 300: Training Cost = 2.29946, Validation Cost = 2.30100
Step 400: Training Cost = 2.30110, Validation Cost = 2.30024
Step 500: Training Cost = 2.29900, Validation Cost = 2.29941
Step 600: Training Cost = 2.30050, Validation Cost = 2.29857
Step 700: Training Cost = 2.29997, Validation Cost = 2.29753
Step 800: Training Cost = 2.29411, Validation Cost = 2.29640
Step 900: Training Cost = 2.29421, Validation Cost = 2.29502
Step 1000: Training Cost = 2.29466, Validation Cost = 2.29339
Step 1100: Training Cost = 2.28864, Validation Cost = 2.29133
Step 1200: Training Cost = 2.28832, Validation Cost = 2.28870
Step 1300: Training Cost = 2.28311, Validation Cost = 2.28523
Step 1400: Training Cost = 2.28001, Validation Cost = 2.28064
Step 1500: Training Cost = 2.27407, Validation Cost = 2.27427
Step 1600: Training Cost = 2.26511, Validation Cost = 2.26523
Step 1700: Traini

In [102]:
# Same architecture and hyper-parameters, this time with batch normalization
# enabled both for training and for the accuracy evaluation.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
past = time.perf_counter()
params, loss_lst = l_layer_model(x_train.T, y_train.T, [x_train.shape[1], 20, 7, 5, 10],
                                 learning_rate=0.009, num_iterations=100,
                                 batch_size=256, use_bnorm=True)

print("Accuracy on train: ", predict(x_train.T, y_train.T, params, True))
print("Accuracy on test: ", predict(x_test.T, y_test.T, params, True))

print(f"The time it takes: {np.round(time.perf_counter() - past, 2)} seconds")


Step 100: Training Cost = 2.12300, Validation Cost = 2.12958
Step 200: Training Cost = 2.00878, Validation Cost = 1.99921
Step 300: Training Cost = 1.89166, Validation Cost = 1.91129
Step 400: Training Cost = 1.81946, Validation Cost = 1.83523
Step 500: Training Cost = 1.71411, Validation Cost = 1.76397
Step 600: Training Cost = 1.69367, Validation Cost = 1.70335
Step 700: Training Cost = 1.61979, Validation Cost = 1.63817
Step 800: Training Cost = 1.56912, Validation Cost = 1.57706
Step 900: Training Cost = 1.54291, Validation Cost = 1.51601
Step 1000: Training Cost = 1.44681, Validation Cost = 1.46045
Step 1100: Training Cost = 1.35723, Validation Cost = 1.40691
Step 1200: Training Cost = 1.29295, Validation Cost = 1.35561
Step 1300: Training Cost = 1.31666, Validation Cost = 1.30986
Step 1400: Training Cost = 1.32658, Validation Cost = 1.27529
Step 1500: Training Cost = 1.19009, Validation Cost = 1.23859
Step 1600: Training Cost = 1.20094, Validation Cost = 1.20442
Step 1700: Traini

# L2 Regularization

In [103]:
def compute_cost(AL, Y, params, lambd=0):
    """
    Categorical cross-entropy cost plus an L2 penalty on the weights.

    Arguments:
    ----------
    AL : numpy.ndarray
        Probability vector from softmax, shape (num_classes, number of examples).
    Y : numpy.ndarray
        Ground truth labels, one-hot encoded, shape (num_classes, number of examples).
    params : dict
        Dictionary containing the parameters of the network (weights and biases).
        Only entries whose key starts with "W" contribute to the penalty.
    lambd : float
        Regularization strength (default is 0, meaning no regularization).

    Returns:
    --------
    cost : float
        The total cost (cross-entropy + L2 regularization penalty).
    """
    epsilon = 1e-8  # keeps log() finite at 0 and the division safe
    num_examples = Y.shape[1]

    # Data term: cross-entropy averaged over the examples
    data_term = -np.sum(Y * np.log(AL + epsilon)) / (num_examples + epsilon)

    # Penalty term: lambd / (2m) times the sum of squared weights (biases excluded)
    squared_weights = sum(
        np.sum(np.square(value)) for name, value in params.items() if name.startswith("W")
    )
    penalty_term = (lambd / (2 * num_examples)) * squared_weights

    return data_term + penalty_term


In [104]:
def l_model_backward(AL, Y, caches, parameters, lambd=0):
    """
    Backward pass through the whole network, with optional L2 regularization.

    Parameters:
    -----------
    AL : numpy.ndarray
        Post-activation output of the final layer.
    Y : numpy.ndarray
        True labels in one-hot encoded format.
    caches : list
        List of caches for each layer, generated during forward propagation.
    parameters : dict
        Dictionary containing the DNN parameters (weights and biases); only
        read when lambd > 0.
    lambd : float
        Regularization strength (default is 0, meaning no regularization).

    Returns:
    --------
    grads : dict
        For every layer l, "dW<l>" and "db<l>" hold the parameter gradients and
        "dA<l>" holds the gradient that layer l passes back to the layer before it.
    """
    grads = {}
    num_layers = len(caches)
    num_examples = AL.shape[1]
    output_cache = caches[-1]

    # A single loop from the output layer down to layer 1; `upstream` is what
    # gets handed to the current layer's backward step.
    upstream = AL
    for layer in range(num_layers, 0, -1):
        if layer == num_layers:
            # Softmax layer: its backward step needs the labels next to Z.
            layer_cache = {
                "linear_cache": output_cache["linear_cache"],
                "activation_cache": {"Z": output_cache["activation_cache"], "Y": Y},
            }
            kind = "softmax"
        else:
            layer_cache = caches[layer - 1]
            kind = "relu"

        upstream, dW, db = linear_activation_backward(upstream, layer_cache, activation=kind)
        if lambd > 0:
            # Derivative of the L2 penalty: (lambd / m) * W
            dW += (lambd / num_examples) * parameters[f"W{layer}"]

        grads[f"dA{layer}"] = upstream
        grads[f"dW{layer}"] = dW
        grads[f"db{layer}"] = db

    return grads


In [105]:
def l_layer_model(X, Y, layers_dims, learning_rate, num_iterations=100, batch_size=256, use_bnorm=False,lamda=0.001):
    """
    Trains a deep neural network with mini-batch gradient descent, optional
    batch normalization and L2 regularization.

    The samples are split 80/20 into training and validation subsets using a
    random permutation. Every 100 mini-batches the cost of the current batch
    and the validation cost are recorded and printed; training stops early when
    the validation cost changes by less than 1e-4 between two consecutive
    checkpoints.

    Parameters:
    ---------
    X : np.ndarray
        Input data with shape (features, number_of_samples).
    Y : np.ndarray
        True labels (one-hot encoded) with shape (classes, number_of_samples).
    layers_dims : list
        List specifying the size of each layer in the network.
    learning_rate : float
        Learning rate for the gradient descent updates.
    num_iterations : int, optional
        Total number of epochs for training. Default is 100.
    batch_size : int, optional
        Number of samples per mini-batch. Default is 256.
    use_bnorm : bool, optional
        If True, apply batch normalization during training.
    lamda : float, optional
        L2 regularization strength, used both in the gradients and in the
        reported costs. Default is 0.001; 0 disables regularization.

    Returns:
    -------
    parameters : dict
        The learned weights and biases of the network.
    costs_train : list
        Training cost values recorded at the checkpoints.

    Raises:
    ------
    ValueError
        If the arguments would result in no mini-batch being processed.
    """
    def print_weight_magnitudes():
        # Report the mean absolute weight of every layer; the header says
        # whether L2 regularization was actually in effect for this run.
        print(f"\nAverage Magnitude of Weights ({'with' if lamda > 0 else 'without'} L2):")
        print(f"This run is {'with' if use_bnorm else 'without'} batch normalization. (use_bnorm={use_bnorm})")
        for l in range(1, len(layers_dims)):
            avg_magnitude = np.mean(np.abs(parameters[f"W{l}"]))
            print(f"Layer {l}: Average Magnitude = {avg_magnitude:.5f}")

    # Lists to track costs for training and validation
    costs_train = []
    costs_valid = []

    # Split data into training (80%) and validation (20%)
    num_samples = X.shape[1]
    train_size = int(0.8 * num_samples)
    indices = np.random.permutation(num_samples)
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:]

    X_train, X_valid = X[:, train_indices], X[:, valid_indices]
    Y_train, Y_valid = Y[:, train_indices], Y[:, valid_indices]

    # Without at least one mini-batch there is nothing to report at the end;
    # fail with a clear message instead of an undefined-variable error.
    if num_iterations < 1 or batch_size < 1 or X_train.shape[1] == 0:
        raise ValueError(
            "l_layer_model needs num_iterations >= 1, batch_size >= 1 and at least one training sample."
        )

    # Initialize parameters for the network
    parameters = initialize_parameters(layers_dims)

    # Number of mini-batches processed so far, and the current (0-based) epoch
    batch_counter = 0
    total_epochs = 0

    # Training loop over epochs
    for epoch in range(num_iterations):
        total_epochs = epoch

        # Mini-batch loop
        for start in range(0, X_train.shape[1], batch_size):
            end = min(start + batch_size, X_train.shape[1])
            X_batch = X_train[:, start:end]
            Y_batch = Y_train[:, start:end]
            batch_counter += 1

            # Forward pass
            AL, caches = l_model_forward(X_batch, parameters, use_bnorm)

            # Backward pass. Both the parameters and the regularization
            # strength must be passed: with only `lamda` given positionally it
            # would land in the `parameters` slot and regularization would
            # silently stay off.
            gradients = l_model_backward(AL, Y_batch, caches, parameters, lamda)

            # Update parameters
            parameters = update_parameters(parameters, gradients, learning_rate)

            # Only every 100th mini-batch is a logging checkpoint
            if batch_counter % 100 != 0:
                continue

            # Training cost of the batch just processed
            train_cost = compute_cost(AL, Y_batch,parameters,lamda)
            costs_train.append(train_cost)

            # Validation cost with the freshly updated parameters
            AL_valid, _ = l_model_forward(X_valid, parameters, use_bnorm)
            valid_cost = compute_cost(AL_valid, Y_valid,parameters,lamda)
            costs_valid.append(valid_cost)

            print(f"Step {batch_counter}: Training Cost = {train_cost:.5f}, Validation Cost = {valid_cost:.5f}")

            # Early stopping: validation cost barely moved since the last checkpoint
            if len(costs_valid) > 1 and abs(costs_valid[-2] - costs_valid[-1]) < 0.0001:
                validation_accuracy = predict(X_valid, Y_valid, parameters, use_bnorm)
                print("Early stopping activated.")
                print(f"Final Training Cost: {train_cost}")
                print(f"Final Validation Cost: {valid_cost}")
                print(f"Accuracy on validation: {validation_accuracy}")
                print(f"Epoch: {total_epochs}, Iterations: {batch_counter}")

                print_weight_magnitudes()

                return parameters, costs_train

    # Final cost computations. The validation activations are recomputed with
    # the final parameters: the ones from the last checkpoint may be stale, or
    # missing altogether when fewer than 100 mini-batches were run.
    final_train_cost = compute_cost(AL, Y_batch,parameters,lamda)
    AL_valid, _ = l_model_forward(X_valid, parameters, use_bnorm)
    final_valid_cost = compute_cost(AL_valid, Y_valid,parameters,lamda)

    print("Training complete.")
    print(f"Final Training Cost: {final_train_cost}")
    print(f"Final Validation Cost: {final_valid_cost}")
    print("Accuracy on validation: ", predict(X_valid, Y_valid, parameters, use_bnorm))
    print(f"Epoch: {total_epochs}, Iterations: {batch_counter}")

    print_weight_magnitudes()

    return parameters, costs_train


In [106]:
# Reload MNIST from scratch (this repeats the loading cell used before the
# first experiments, so the raw integer labels are available again)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every 2D image into one row and scale the pixel values to [0, 1]
x_train = x_train.reshape(x_train.shape[0], -1) / 255.0
x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

# Build one-hot label matrices by indexing rows of the identity matrix
num_classes = 10
y_train_one_hot = np.eye(num_classes)[y_train]
y_test_one_hot = np.eye(num_classes)[y_test]

# Replace the integer labels with their one-hot encoded counterparts
y_train, y_test = y_train_one_hot, y_test_one_hot


In [107]:
# Train with L2 regularization (lamda=0.001) and without batch normalization,
# then report accuracy and the elapsed wall time.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
past = time.perf_counter()
params, loss_lst = l_layer_model(x_train.T, y_train.T, [x_train.shape[1], 20, 7, 5, 10],
                                 learning_rate=0.009, num_iterations=100,
                                 batch_size=256, use_bnorm=False,lamda=0.001)

print("Accuracy on train: ", predict(x_train.T, y_train.T, params))
print("Accuracy on test: ", predict(x_test.T, y_test.T, params))

print(f"The time it takes: {np.round(time.perf_counter() - past, 2)} seconds")


Step 100: Training Cost = 2.30219, Validation Cost = 2.30218
Step 200: Training Cost = 2.30168, Validation Cost = 2.30168
Step 300: Training Cost = 2.30150, Validation Cost = 2.30106
Step 400: Training Cost = 2.30034, Validation Cost = 2.30052
Step 500: Training Cost = 2.30066, Validation Cost = 2.29985
Step 600: Training Cost = 2.29799, Validation Cost = 2.29923
Step 700: Training Cost = 2.29959, Validation Cost = 2.29855
Step 800: Training Cost = 2.29883, Validation Cost = 2.29784
Step 900: Training Cost = 2.29708, Validation Cost = 2.29707
Step 1000: Training Cost = 2.29583, Validation Cost = 2.29619
Step 1100: Training Cost = 2.29392, Validation Cost = 2.29523
Step 1200: Training Cost = 2.29880, Validation Cost = 2.29413
Step 1300: Training Cost = 2.28662, Validation Cost = 2.29290
Step 1400: Training Cost = 2.29350, Validation Cost = 2.29145
Step 1500: Training Cost = 2.28749, Validation Cost = 2.28980
Step 1600: Training Cost = 2.28899, Validation Cost = 2.28783
Step 1700: Traini

In [108]:
# Train with both L2 regularization (lamda=0.001) and batch normalization;
# batch normalization is also enabled for the accuracy evaluation.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
past = time.perf_counter()
params, loss_lst = l_layer_model(x_train.T, y_train.T, [x_train.shape[1], 20, 7, 5, 10],
                                 learning_rate=0.009, num_iterations=100,
                                 batch_size=256, use_bnorm=True,lamda=0.001)

print("Accuracy on train: ", predict(x_train.T, y_train.T, params, True))
print("Accuracy on test: ", predict(x_test.T, y_test.T, params, True))

print(f"The time it takes: {np.round(time.perf_counter() - past, 2)} seconds")


Step 100: Training Cost = 2.16448, Validation Cost = 2.16424
Step 200: Training Cost = 2.04966, Validation Cost = 2.04966
Step 300: Training Cost = 2.00821, Validation Cost = 1.96802
Step 400: Training Cost = 1.87288, Validation Cost = 1.88100
Step 500: Training Cost = 1.81977, Validation Cost = 1.80244
Step 600: Training Cost = 1.75071, Validation Cost = 1.72280
Step 700: Training Cost = 1.63294, Validation Cost = 1.64756
Step 800: Training Cost = 1.58684, Validation Cost = 1.57784
Step 900: Training Cost = 1.49652, Validation Cost = 1.51583
Step 1000: Training Cost = 1.45530, Validation Cost = 1.45923
Step 1100: Training Cost = 1.43551, Validation Cost = 1.41013
Step 1200: Training Cost = 1.45518, Validation Cost = 1.36640
Step 1300: Training Cost = 1.27884, Validation Cost = 1.32881
Step 1400: Training Cost = 1.33898, Validation Cost = 1.28988
Step 1500: Training Cost = 1.28501, Validation Cost = 1.25679
Step 1600: Training Cost = 1.23635, Validation Cost = 1.22542
Step 1700: Traini