In [21]:
import numpy as np 
import matplotlib.pyplot as plt

In [22]:
def initialise(layer_sizes, activation='relu', seed=42):
    """Create Xavier-initialised weights and zero biases for a dense network.

    Args:
        layer_sizes: Unit counts per layer, input layer first. Each pair of
            consecutive entries defines one weight matrix.
        activation: Name of the hidden-layer activation. It is only printed
            and passed back to the caller; it is not validated here.
        seed: Value handed to ``np.random.seed`` (reseeds NumPy's global
            generator as a side effect).

    Returns:
        Tuple ``(weights, biases, layer_sizes, activation)``. ``weights[i]``
        has shape ``(layer_sizes[i], layer_sizes[i+1])`` and ``biases[i]`` is
        a zero array of shape ``(1, layer_sizes[i+1])``.
    """
    np.random.seed(seed)
    depth = len(layer_sizes) - 1

    weights, biases = [], []
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        # Xavier scale: standard deviation sqrt(2 / (fan_in + fan_out))
        xavier_scale = np.sqrt(2 / (fan_in + fan_out))
        weights.append(np.random.randn(fan_in, fan_out) * xavier_scale)
        biases.append(np.zeros((1, fan_out)))

    for summary_line in (
        f"Initialized network with layer sizes: {layer_sizes}",
        f"Activation function: {activation}",
        f"Number of layers: {depth}",
    ):
        print(summary_line)
    return weights, biases, layer_sizes, activation

In [23]:
def count_parameters(weights, biases):
    """Return the total number of scalar entries in the paired weight and bias arrays.

    Layers are paired with ``zip``, so surplus entries in the longer list are ignored.
    """
    return sum(layer_w.size + layer_b.size for layer_w, layer_b in zip(weights, biases))

Activation functions with their respective derivatives

In [25]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    zero_floor = 0
    return np.maximum(zero_floor, x)
def relu_derivative(x):
    """ReLU gradient: 1.0 where ``x`` is strictly positive, 0.0 elsewhere (including at 0)."""
    positive_mask = x > 0
    return positive_mask.astype(float)

def tanh(x):
    """Hyperbolic tangent activation, applied element-wise via ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed
def tanh_derivative(x):
    """Derivative of tanh at ``x``: ``1 - tanh(x)**2``, element-wise."""
    tanh_x = np.tanh(x)
    return 1 - tanh_x ** 2

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Derivative of the logistic sigmoid at ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    activated = sigmoid(x)
    complement = 1 - activated
    return activated * complement

def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    Normalising over ``axis=-1`` (instead of a hard-coded ``axis=1``) gives the
    same result for 2-D ``(batch, classes)`` input and also accepts a single
    1-D logit vector.

    Args:
        x: Array of logits; the classes are taken to lie along the last axis.

    Returns:
        Array with the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    # Subtracting the per-row maximum makes the largest exponent exp(0) = 1,
    # which avoids overflow without changing the result.
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def apply_activation(x, activation_type):
    """Apply the named hidden-layer activation to ``x``.

    Args:
        x: Pre-activation values, forwarded unchanged to the chosen function.
        activation_type: One of ``'relu'``, ``'tanh'`` or ``'sigmoid'``.

    Returns:
        Whatever the selected activation function returns for ``x``.

    Raises:
        ValueError: If ``activation_type`` is not one of the supported names.
    """
    if activation_type == 'relu':
        return relu(x)
    if activation_type == 'tanh':
        return tanh(x)
    if activation_type == 'sigmoid':
        return sigmoid(x)
    # f-string so the offending name is actually interpolated into the message
    raise ValueError(f"Unsupported activation function: {activation_type}")

Forward propagation throughout the network

mathematical formulation: 
    For each layer *l*: 
    Z[l] = A[l-1] @ W[l] + b[l] (Linear transformation)
    A[l] = activation(Z[l]) (Activation Function)

    Final Layer uses Softmax for classification

    Args: 
        X: Input data (batch_size, input_features)
        return_cache: If true, return intermediate values for back prop

    returns: 
        Predictions: Final output probability (batch_size, num_classes)
        cache: Dictionary of intermediate values (if return_cache=True)

In [26]:
def forward_pass(X, weights, biases, activation_type, return_cache=False):
    """Propagate ``X`` through every layer and return the output of the last one.

    Each layer computes ``Z = A_prev @ W + b``. All layers except the last
    apply ``apply_activation(Z, activation_type)``; the last applies
    ``softmax(Z)``.

    Args:
        X: Input batch, used as the layer-0 activation.
        weights: Per-layer weight matrices.
        biases: Per-layer bias arrays, indexed in step with ``weights``.
        activation_type: Activation name forwarded to ``apply_activation``.
        return_cache: When truthy, also return the intermediate values.

    Returns:
        The final-layer output, or ``(output, cache)`` when ``return_cache`` is
        truthy. ``cache`` maps ``'A0'`` to ``X`` and ``'Z{l}'`` / ``'A{l}'`` to
        the pre-activation / activation of layer ``l`` (1-based).
    """
    depth = len(weights)
    output_idx = depth - 1

    activations = X
    cache = {'A0': X}  # the input counts as the activation of layer 0

    for idx in range(depth):
        pre_activation = np.dot(activations, weights[idx]) + biases[idx]

        # Only the output layer goes through softmax
        if idx == output_idx:
            activations = softmax(pre_activation)
        else:
            activations = apply_activation(pre_activation, activation_type)

        cache[f'Z{idx+1}'] = pre_activation
        cache[f'A{idx+1}'] = activations

    return (activations, cache) if return_cache else activations

def predict(X, weights, biases, activation_type):
    """Return the index of the highest-probability class for each row of ``X``.

    Args:
        X: Input batch passed to ``forward_pass``.
        weights: Per-layer weight matrices.
        biases: Per-layer bias arrays.
        activation_type: Hidden-layer activation name.

    Returns:
        Array of class indices: the argmax over axis 1 of the forward-pass output.
    """
    # The network parameters must be forwarded; calling forward_pass with only
    # X raised TypeError for the missing positional arguments.
    probabilities = forward_pass(X, weights, biases, activation_type, return_cache=False)
    return np.argmax(probabilities, axis=1)

In [27]:
def testing_forward_pass():
    """Smoke-test the forward pass on the four XOR input points.

    Builds a freshly initialised 2-2-2 ReLU network with ``initialise``, runs
    the XOR inputs through ``forward_pass`` and prints the probabilities,
    predicted classes and accuracy against the XOR labels.

    Returns:
        Tuple ``(weights, biases, layer_sizes, activation, X, y, cache)``.
    """
    banner = "=" * 50
    rule = '-' * 30

    print(banner)
    print("Testing forward pass")
    print(banner)

    # XOR truth table: the four input pairs and their class labels
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    y = np.array([0, 1, 1, 0], dtype=int)

    weights, biases, layer_sizes, activation = initialise(layer_sizes=[2, 2, 2], activation='relu')

    print("\nNetwork Architecture:")
    for layer_no, (n_in, n_out) in enumerate(zip(layer_sizes, layer_sizes[1:]), start=1):
        print(f" Layer {layer_no}: {n_in} -> {n_out}")

    print(f"\n{rule}")
    print("FORWARD PASS:")
    print(f"{rule}")

    predictions, cache = forward_pass(X, weights, biases, activation, return_cache=True)
    predicted_classes = predictions.argmax(axis=1)
    accuracy_pct = np.mean(predicted_classes == y) * 100

    print("\nFinal Results:")
    print(f" Predicted Probabilities:\n{predictions}")
    print(f" Predicted Classes: {predicted_classes}")
    print(f" True Classes: {y}")
    print(f"Accuracy: {accuracy_pct:.2f}%")

    return weights, biases, layer_sizes, activation, X, y, cache

In [28]:
if __name__ == "__main__":
    # Run the XOR smoke test and keep its results available as notebook globals.
    weights, biases, layer_sizes, activation, X, y, cache = testing_forward_pass()

    # Summary of the helpers defined above. The initialiser is listed under its
    # real name, `initialise`, rather than the non-existent `initialize_network`.
    print("\n" + "="*50)
    print("FORWARD PASS IMPLEMENTATION COMPLETE!")
    print("="*50)
    print("Available functions:")
    print("- initialise(layer_sizes, activation, seed)")
    print("- forward_pass(X, weights, biases, activation_type, return_cache)")
    print("- predict(X, weights, biases, activation_type)")
    print("- All activation functions: relu, tanh, sigmoid, softmax")

Testing forward pass
Initialized network with layer sizes: [2, 2, 2]
Activation function: relu
Number of layers: 2

Network Architecture:
 Layer 1: 2 -> 2
 Layer 2: 2 -> 2

------------------------------
FORWARD PASS:
------------------------------

Final Results:
 Predicted Probabilities:
[[0.5        0.5       ]
 [0.64980355 0.35019645]
 [0.49999898 0.50000102]
 [0.63692715 0.36307285]]
 Predicted Classes: [0 0 1 0]
 True Classes: [0 1 1 0]
Accuracy: 75.00%

FORWARD PASS IMPLEMENTATION COMPLETE!
Available functions:
- initialize_network(layer_sizes, activation, seed)
- forward_pass(X, weights, biases, activation_type, return_cache)
- predict(X, weights, biases, activation_type)
- All activation functions: relu, tanh, sigmoid, softmax


Loss Function

In [1]:
def softmax(x):
    '''
    Convert logits into probabilities with a numerically stable softmax.

    Normalisation runs along the last axis, so both a single logit vector and
    a batch of logit rows are handled.

    Args:
        x: A numpy array of shape (n,) or (m, n) where n is the number of classes.
    Returns:
        A numpy array with the same shape as x holding the softmax probabilities.
    '''
    peak = np.max(x, axis=-1, keepdims=True)
    # After subtracting the peak the largest exponent is exp(0) = 1, so exp cannot overflow
    unnormalised = np.exp(x - peak)
    partition = np.sum(unnormalised, axis=-1, keepdims=True)

    return unnormalised / partition

In [3]:
def cross_entropy_loss(y_true, y_pred):

    '''
    Compute the mean cross-entropy loss between true labels and predicted probabilities.
    Mathematical Formula:
    L = -1/N * Σ(i=1 to N) Σ(j=1 to C) y_true[i,j] * log(y_pred[i,j])

    Args:
        y_true: Either a 1-D array-like of m class indices, or an (m, n)
            array-like of one-hot encoded labels. Non-integer 1-D labels
            (e.g. float-typed class ids) are cast to integers before indexing.
        y_pred: An (m, n) array-like of predicted probabilities.
    Returns:
        A tuple (loss, y_true_onehot): the average cross-entropy over the batch,
        and the (m, n) one-hot label matrix that was used to compute it.
    Raises:
        ValueError: If the one-hot labels and the predictions differ in shape.
    '''
    # Accept plain Python lists as well as arrays (no copy is made for arrays)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    batch_size = y_pred.shape[0]
    num_classes = y_pred.shape[1]

    if y_true.ndim == 1:
        # Fancy indexing needs integer indices; cast float-typed class ids
        labels = y_true if np.issubdtype(y_true.dtype, np.integer) else y_true.astype(np.intp)
        y_true_onehot = np.zeros((batch_size, num_classes))
        y_true_onehot[np.arange(batch_size), labels] = 1
    else:
        y_true_onehot = y_true

    # Guard against silent broadcasting of a mis-shaped label matrix
    if y_true_onehot.shape != y_pred.shape:
        raise ValueError(
            f"y_true shape {y_true_onehot.shape} does not match y_pred shape {y_pred.shape}"
        )

    # Clip so that log() never sees exactly 0 (or 1)
    epsilon = 1e-15
    y_pred_clipped = np.clip(y_pred, epsilon, 1 - epsilon)

    # Calculate the per-sample cross-entropy, then average over the batch
    sample_losses = -np.sum(y_true_onehot * np.log(y_pred_clipped), axis=1)

    return np.mean(sample_losses), y_true_onehot