In [8]:
import numpy as np 
import matplotlib.pyplot as plt

In [9]:
def initialise(layer_sizes, activation='relu', seed=42):
    """Create Xavier-initialised weights and zero biases for a dense network.

    Args:
        layer_sizes: Sequence of unit counts, input layer first, output layer last.
        activation: Name of the hidden-layer activation; it is only printed and
            returned here, not validated.
        seed: Value handed to ``np.random.seed`` (reseeds NumPy's global RNG).

    Returns:
        Tuple ``(weights, biases, layer_sizes, activation)`` where ``weights[i]``
        has shape ``(layer_sizes[i], layer_sizes[i+1])`` and ``biases[i]`` has
        shape ``(1, layer_sizes[i+1])``.
    """
    np.random.seed(seed)
    num_layers = len(layer_sizes) - 1

    weights, biases = [], []

    # Walk over consecutive (fan_in, fan_out) pairs, drawing weights in layer order
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        xavier_scale = np.sqrt(2 / (fan_in + fan_out))
        weights.append(np.random.randn(fan_in, fan_out) * xavier_scale)
        biases.append(np.zeros((1, fan_out)))

    for summary_line in (
        f"Initialized network with layer sizes: {layer_sizes}",
        f"Activation function: {activation}",
        f"Number of layers: {num_layers}",
    ):
        print(summary_line)

    return weights, biases, layer_sizes, activation

In [10]:
def count_parameters(weights, biases):
    """Return the total number of scalar entries across all weight and bias arrays.

    Args:
        weights: Iterable of arrays, one per layer.
        biases: Iterable of arrays paired position-wise with ``weights``.

    Returns:
        Sum of ``w.size + b.size`` over the paired layers.
    """
    return sum(layer_w.size + layer_b.size for layer_w, layer_b in zip(weights, biases))

Activation functions with their respective derivatives

In [11]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified
def relu_derivative(x):
    """Gradient of ReLU: 1.0 where ``x`` is strictly positive, 0.0 elsewhere."""
    positive_mask = x > 0
    return positive_mask.astype(float)

def tanh(x):
    """Hyperbolic tangent activation (thin wrapper around ``np.tanh``)."""
    squashed = np.tanh(x)
    return squashed
def tanh_derivative(x):
    """Gradient of tanh evaluated at pre-activation ``x``: ``1 - tanh(x)^2``."""
    tanh_x = np.tanh(x)
    return 1 - tanh_x * tanh_x

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The naive formula computes ``exp(-x)``, which overflows (RuntimeWarning)
    for large negative ``x``. Here only ``exp(-|x|)`` is evaluated, which is
    always in (0, 1], and the algebraically equivalent branch is chosen per
    element.

    Args:
        x: Scalar or array of pre-activations.

    Returns:
        Sigmoid of ``x``, element-wise, with the same shape as ``x``.
    """
    decay = np.exp(-np.abs(x))  # never overflows: the exponent is <= 0
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x)
    result = np.where(np.greater_equal(x, 0), 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as-is
    return result[()]

def sigmoid_derivative(x):
    """Gradient of the sigmoid at pre-activation ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    s = sigmoid(x)
    complement = 1 - s
    return complement * s

def softmax(x):
    """Numerically stable softmax over the last axis.

    Normalising over ``axis=-1`` (instead of a hard-coded ``axis=1``) gives the
    same result for 2-D ``(batch, classes)`` input, additionally accepts a
    single 1-D logit vector, and matches the softmax that is redefined later
    in this notebook.

    Args:
        x: Array of logits with classes along the last axis.

    Returns:
        Array of the same shape whose entries along the last axis sum to 1.
    """
    # Subtracting the per-row maximum keeps exp() from overflowing
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def apply_activation(x, activation_type):
    """Apply the named hidden-layer activation to ``x``.

    Args:
        x: Pre-activation values.
        activation_type: One of ``'relu'``, ``'tanh'`` or ``'sigmoid'``.

    Returns:
        The result of the selected activation function applied to ``x``.

    Raises:
        ValueError: If ``activation_type`` is not one of the supported names.
    """
    if activation_type == 'relu':
        return relu(x)
    elif activation_type == 'tanh':
        return tanh(x)
    elif activation_type == 'sigmoid':
        return sigmoid(x)
    else:
        # f-string so the offending name actually appears in the message
        raise ValueError(f"Unsupported activation function: {activation_type}")

Forward propagation throughout the network

mathematical formulation: 
    For each layer *l*: 
    Z[l] = A[l-1] @ W[l] + b[l] (Linear transformation)
    A[l] = activation(Z[l]) (Activation Function)

    The final layer uses softmax for classification

    Args: 
        X: Input data (batch_size, input_features)
        return_cache: If True, also return the intermediate values needed for backpropagation

    returns: 
        Predictions: Final output probabilities (batch_size, num_classes)
        cache: Dictionary of intermediate values (if return_cache=True)

In [12]:
def forward_pass(X, weights, biases, activation_type, return_cache=False):
    """Propagate ``X`` through every layer and return the final activations.

    Each layer computes ``Z = A_prev @ W + b``. All layers except the last pass
    ``Z`` through ``apply_activation``; the last layer uses ``softmax``.

    Args:
        X: Input batch, used as the activation of layer 0.
        weights: List of per-layer weight matrices.
        biases: List of per-layer bias arrays, indexed like ``weights``.
        activation_type: Activation name forwarded to ``apply_activation``.
        return_cache: When True, also return the dict of intermediate values.

    Returns:
        The last layer's activations, or ``(activations, cache)`` when
        ``return_cache`` is True. ``cache`` maps ``'A0'`` to ``X`` and
        ``'Z{k}'`` / ``'A{k}'`` to layer ``k``'s pre-activation / activation.
    """
    cache = {'A0': X}  # the input doubles as layer 0's activation
    current = X
    depth = len(weights)

    for layer_no, layer_w in enumerate(weights, start=1):
        pre_activation = np.dot(current, layer_w) + biases[layer_no - 1]

        if layer_no == depth:
            current = softmax(pre_activation)  # output layer
        else:
            current = apply_activation(pre_activation, activation_type)

        cache[f'Z{layer_no}'] = pre_activation
        cache[f'A{layer_no}'] = current

    return (current, cache) if return_cache else current

def predict(X, weights, biases, activation_type):
    """Return the most probable class index for each row of ``X``.

    Args:
        X: Input batch.
        weights: Per-layer weight matrices, forwarded to ``forward_pass``.
        biases: Per-layer bias arrays, forwarded to ``forward_pass``.
        activation_type: Hidden-layer activation name, forwarded to ``forward_pass``.

    Returns:
        Array of ``argmax`` indices taken along axis 1 of the predicted probabilities.
    """
    # The network parameters must be forwarded; forward_pass has no defaults for them
    probabilities = forward_pass(X, weights, biases, activation_type, return_cache=False)
    return np.argmax(probabilities, axis=1)

In [13]:
def testing_forward_pass():
    print("="*50)
    print("Testing forward pass")
    print("="*50)

    X = np.array([[0,0],[0,1],[1,0],[1,1]], dtype=np.float32)
    y = np.array([0,1,1,0], dtype=int)  # XOR truth table

    weights, biases, layer_sizes, activation = initialise(layer_sizes=[2, 2, 2], activation='relu')

    print("\nNetwork Architecture:")
    for i in range(len(layer_sizes)-1):
        print(f" Layer {i+1}: {layer_sizes[i]} -> {layer_sizes[i+1]}")

    # forward pass
    print(f"\n{'-'*30}")
    print("FORWARD PASS:")
    print(f"{'-'*30}")

    predictions, cache = forward_pass(X, weights, biases, activation, return_cache=True)
    predicted_classes = np.argmax(predictions, axis=1)

    print(f"\nFinal Results:")
    print(f" Predicted Probabilities:\n{predictions}")
    print(f" Predicted Classes: {predicted_classes}")
    print(f" True Classes: {y}")
    print(f"Accuracy: {np.mean(predicted_classes == y) * 100:.2f}%")

    return weights, biases, layer_sizes, activation, X, y, cache

Loss Function

In [15]:
def softmax(x):
    '''
    Softmax over the last axis of a vector or matrix of logits.

    Args:
        x: A numpy array of shape (n,) or (m, n), with the n classes on the last axis.
    Returns:
        A numpy array with the same shape as x holding the softmax probabilities.
    '''
    # Shift by the per-row maximum so the exponentials cannot overflow
    row_peak = np.max(x, axis=-1, keepdims=True)
    numerators = np.exp(x - row_peak)
    normaliser = numerators.sum(axis=-1, keepdims=True)
    return numerators / normaliser

In [16]:
def cross_entropy_loss(y_true, y_pred):
    '''
    Compute the cross-entropy loss between true labels and predicted probabilities.
    Mathematical Formula:
    L = -1/N * Σ(i=1 to N) Σ(j=1 to C) y_true[i,j] * log(y_pred[i,j])

    Args:
        y_true: Either a 1-D array-like of integer class indices (length m), or an
            (m, n) array-like of one-hot encoded labels.
        y_pred: An (m, n) array-like of predicted probabilities.
    Returns:
        A tuple (loss, y_true_onehot):
            loss: The mean cross-entropy loss over the batch.
            y_true_onehot: The (m, n) one-hot labels used in the computation
                (built from the class indices when y_true is 1-D, otherwise
                y_true itself).
    '''
    # Accept plain Python sequences as well as numpy arrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    batch_size = y_pred.shape[0]
    num_classes = y_pred.shape[1]

    if y_true.ndim == 1:
        # Expand class indices into one-hot rows
        y_true_onehot = np.zeros((batch_size, num_classes))
        y_true_onehot[np.arange(batch_size), y_true] = 1
    else:
        y_true_onehot = y_true

    # Clip so that log() never sees exactly 0 (which would give -inf)
    epsilon = 1e-15
    y_pred_clipped = np.clip(y_pred, epsilon, 1 - epsilon)

    # Calculate the per-sample cross-entropy, then average over the batch
    sample_losses = -np.sum(y_true_onehot * np.log(y_pred_clipped), axis=1)

    return np.mean(sample_losses), y_true_onehot

In [17]:
def test_cross_entropy_loss():
    """Test the cross-entropy loss function with different scenarios"""
    print("="*60)
    print("TESTING CROSS-ENTROPY LOSS FUNCTION")
    print("="*60)
    
    # Test data - XOR problem
    y_true = np.array([0, 1, 1, 0])  # True classes
    
    # Scenario 1: Perfect predictions
    y_pred_perfect = np.array([
        [1.0, 0.0],  # Perfectly predicts class 0
        [0.0, 1.0],  # Perfectly predicts class 1  
        [0.0, 1.0],  # Perfectly predicts class 1
        [1.0, 0.0]   # Perfectly predicts class 0
    ])
    
    # Scenario 2: Random predictions (50-50)
    y_pred_random = np.array([
        [0.5, 0.5],
        [0.5, 0.5],
        [0.5, 0.5],
        [0.5, 0.5]
    ])
    
    # Scenario 3: Completely wrong predictions
    y_pred_wrong = np.array([
        [0.0, 1.0],  # Predicts class 1, true is 0
        [1.0, 0.0],  # Predicts class 0, true is 1
        [1.0, 0.0],  # Predicts class 0, true is 1
        [0.0, 1.0]   # Predicts class 1, true is 0
    ])
    
    # Scenario 4: Realistic predictions (like your network output)
    y_pred_realistic = np.array([
        [0.7, 0.3],   # Somewhat confident in class 0
        [0.45, 0.55], # Slightly favors class 1
        [0.4, 0.6],   # Somewhat confident in class 1
        [0.8, 0.2]    # Very confident in class 0
    ])
    
    # Test all scenarios
    scenarios = [
        ("Perfect Predictions", y_pred_perfect),
        ("Random Predictions", y_pred_random), 
        ("Wrong Predictions", y_pred_wrong),
        ("Realistic Predictions", y_pred_realistic)
    ]
    
    print(f"True labels: {y_true}\n")
    
    for name, y_pred in scenarios:
        loss, y_true_onehot = cross_entropy_loss(y_true, y_pred)
        predicted_classes = np.argmax(y_pred, axis=1)
        accuracy = np.mean(predicted_classes == y_true) * 100
        
        print(f"{name}:")
        print(f"  Predictions:\n{y_pred}")
        print(f"  Predicted classes: {predicted_classes}")
        print(f"  Cross-entropy loss: {loss:.6f}")
        print(f"  Accuracy: {accuracy:.1f}%")
        print(f"  Loss interpretation: {'Excellent' if loss < 0.1 else 'Good' if loss < 0.5 else 'Poor' if loss < 1.0 else 'Very Poor'}")
        print()
    
    return y_true_onehot

In [22]:
if __name__ == "__main__":
    # Run the loss demos in order; the last two helpers are not defined in the
    # cells visible here and must already exist in the kernel.
    test_cross_entropy_loss()
    calculate_loss_with_your_network_output()
    explain_cross_entropy_math()

    closing_rule = "=" * 60
    print(f"\n{closing_rule}")
    print("CROSS-ENTROPY LOSS IMPLEMENTATION COMPLETE!")
    print(closing_rule)

TESTING CROSS-ENTROPY LOSS FUNCTION
True labels: [0 1 1 0]

Perfect Predictions:
  Predictions:
[[1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]]
  Predicted classes: [0 1 1 0]
  Cross-entropy loss: 0.000000
  Accuracy: 100.0%
  Loss interpretation: Excellent

Random Predictions:
  Predictions:
[[0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]
  Predicted classes: [0 0 0 0]
  Cross-entropy loss: 0.693147
  Accuracy: 50.0%
  Loss interpretation: Poor

Wrong Predictions:
  Predictions:
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]
  Predicted classes: [1 0 0 1]
  Cross-entropy loss: 34.538776
  Accuracy: 0.0%
  Loss interpretation: Very Poor

Realistic Predictions:
  Predictions:
[[0.7  0.3 ]
 [0.45 0.55]
 [0.4  0.6 ]
 [0.8  0.2 ]]
  Predicted classes: [0 1 1 0]
  Cross-entropy loss: 0.422120
  Accuracy: 100.0%
  Loss interpretation: Good

LOSS CALCULATION WITH YOUR NETWORK OUTPUT
Your Network Results:
True labels:      [0 1 1 0]
Predicted classes: [0 0 1 0]
Predicted probs:
[[0.5        0.5       ]
 [0.64980355 0