# Neural Network Utility

This notebook provides a reusable feed-forward neural network implementation that can be used across different chapters and examples. It supports momentum-based gradient descent, L2 regularization, per-epoch learning-rate decay, and early stopping, and it is trained against a binary cross-entropy loss.

## Features

- Flexible layer configuration
- Multiple activation functions (ReLU, Sigmoid)
- Momentum-based gradient descent with per-epoch learning-rate decay and He weight initialization
- L2 regularization and early stopping
- Training utilities and visualizations

## Usage Example

```python
from utilities.neural_network import EnhancedNeuralNetwork, ReLU, Sigmoid

# Create a network
network = EnhancedNeuralNetwork(
    layer_sizes=[2, 4, 1],
    activations=[ReLU(), Sigmoid()]
)

# Train the network
losses = network.train(
    X=X_train,
    y=y_train,
    epochs=1000,
    learning_rate=0.01,
    momentum=0.9,
    l2_lambda=0.01
)
```

In [None]:
# Import required libraries
import numpy as np
from typing import List, Tuple, Optional
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that the random weight
# initialization performed in Layer (via np.random.randn) is reproducible.
np.random.seed(42)

In [None]:
class Activation:
    """Base class for activation functions.

    Subclasses provide two static methods:

    * ``forward(x)``  -- the activation applied element-wise to ``x``.
    * ``backward(x)`` -- the derivative of the activation evaluated at the
      pre-activation input ``x`` (not at the activation's output).
    """

    @staticmethod
    def forward(x: np.ndarray) -> np.ndarray:
        """Apply the activation element-wise. Must be overridden."""
        raise NotImplementedError

    @staticmethod
    def backward(x: np.ndarray) -> np.ndarray:
        """Return the element-wise derivative at ``x``. Must be overridden."""
        raise NotImplementedError


class ReLU(Activation):
    """Rectified Linear Unit activation function

    Forward: f(x) = max(0, x)
    Backward: f'(x) = 1 if x > 0 else 0
    """

    @staticmethod
    def forward(x: np.ndarray) -> np.ndarray:
        """Return ``max(0, x)`` element-wise."""
        return np.maximum(0, x)

    @staticmethod
    def backward(x: np.ndarray) -> np.ndarray:
        """Return an integer 0/1 mask; the derivative at exactly 0 is taken as 0."""
        return np.where(x > 0, 1, 0)


class Sigmoid(Activation):
    """Sigmoid activation function

    Forward: f(x) = 1 / (1 + exp(-x))
    Backward: f'(x) = f(x) * (1 - f(x))
    """

    @staticmethod
    def forward(x: np.ndarray) -> np.ndarray:
        """Return the logistic sigmoid of ``x`` element-wise.

        Uses a numerically stable formulation: the exponential is only ever
        evaluated at ``-|x|``, so it cannot overflow for large-magnitude
        inputs (the naive ``exp(-x)`` overflows for very negative ``x``).
        """
        x = np.asarray(x)
        # decay is in (0, 1]; exp(-|x|) may underflow to 0 but never overflows.
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x)        -> numerator 1
        # x <  0: e^x / (1 + e^x)       -> numerator e^x == decay
        return np.where(x >= 0, 1.0, decay) / (1.0 + decay)

    @staticmethod
    def backward(x: np.ndarray) -> np.ndarray:
        """Return ``f(x) * (1 - f(x))`` where ``f`` is the sigmoid."""
        s = Sigmoid.forward(x)
        return s * (1 - s)

In [None]:
class Layer:
    """Fully connected layer trained with momentum SGD and L2 regularization.

    Args:
        input_size (int): Number of input features
        output_size (int): Number of output features
        activation (Activation): Activation function to use

    Attributes:
        weights (np.ndarray): Layer weights, shape (input_size, output_size)
        bias (np.ndarray): Layer biases, shape (1, output_size)
        activation (Activation): Activation function
        weight_momentum (np.ndarray): Momentum (velocity) for weight updates
        bias_momentum (np.ndarray): Momentum (velocity) for bias updates
        input (np.ndarray | None): Input cached by the last forward pass
        activation_input (np.ndarray | None): Pre-activation values cached by
            the last forward pass
        output (np.ndarray | None): Output cached by the last forward pass
        weight_gradients (np.ndarray): Weight gradient (including the L2 term)
            from the most recent backward pass. Only present after
            ``backward`` has been called at least once.
        bias_gradients (np.ndarray): Bias gradient from the most recent
            backward pass. Only present after ``backward`` has been called.
    """

    def __init__(self, input_size: int, output_size: int, activation: "Activation"):
        # He initialization: N(0, 1) scaled by sqrt(2 / fan_in)
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.bias = np.zeros((1, output_size))
        self.activation = activation

        # Momentum parameters
        self.weight_momentum = np.zeros_like(self.weights)
        self.bias_momentum = np.zeros_like(self.bias)

        # Cache for backpropagation
        self.input = None
        self.output = None
        self.activation_input = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass through the layer.

        Computes ``activation(x @ weights + bias)`` and caches the input, the
        pre-activation values and the output for the next backward pass.

        Args:
            x (np.ndarray): 2-D input whose last dimension equals input_size.

        Returns:
            np.ndarray: The layer output.
        """
        self.input = x
        self.activation_input = x @ self.weights + self.bias
        self.output = self.activation.forward(self.activation_input)
        return self.output

    def backward(self, grad_output: np.ndarray, learning_rate: float,
                 momentum: float = 0.9, l2_lambda: float = 0.01) -> np.ndarray:
        """Backward pass through the layer with momentum and L2 regularization.

        Updates ``weights`` and ``bias`` in place and returns the gradient
        with respect to this layer's input.

        Args:
            grad_output (np.ndarray): Gradient of the loss with respect to
                this layer's output.
            learning_rate (float): Step size for the update.
            momentum (float): Momentum coefficient.
            l2_lambda (float): L2 strength; contributes ``l2_lambda * weights``
                to the weight gradient (the bias is not regularized).

        Returns:
            np.ndarray: Gradient of the loss with respect to the layer input.

        Raises:
            RuntimeError: If called before any forward pass.
        """
        if self.input is None or self.activation_input is None:
            raise RuntimeError("Layer.backward() called before Layer.forward()")

        # Gradient of activation
        grad_activation = grad_output * self.activation.backward(self.activation_input)

        # Gradient for the upstream layer. This must use the weights that
        # produced the cached forward pass, so compute it BEFORE the update.
        grad_input = grad_activation @ self.weights.T

        # Gradients with L2 regularization
        grad_weights = self.input.T @ grad_activation + l2_lambda * self.weights
        grad_bias = np.sum(grad_activation, axis=0, keepdims=True)

        # Keep the latest gradients so diagnostics (e.g. gradient-flow plots)
        # can inspect them after training.
        self.weight_gradients = grad_weights
        self.bias_gradients = grad_bias

        # Update with momentum
        self.weight_momentum = momentum * self.weight_momentum - learning_rate * grad_weights
        self.bias_momentum = momentum * self.bias_momentum - learning_rate * grad_bias

        self.weights += self.weight_momentum
        self.bias += self.bias_momentum

        return grad_input

In [None]:
class EnhancedNeuralNetwork:
    """Neural network with advanced training features

    Args:
        layer_sizes (List[int]): List of layer sizes (including input and output)
        activations (List[Activation]): List of activation functions for each layer

    Attributes:
        layers (List[Layer]): List of network layers
        best_loss (float): Best loss achieved during training
        patience_counter (int): Counter for early stopping

    Note:
        ``best_loss`` and ``patience_counter`` are initialized in the
        constructor and are not reset by ``train``, so early-stopping state
        carries over between successive ``train`` calls.
    """

    def __init__(self, layer_sizes: List[int], activations: List["Activation"]):
        assert len(layer_sizes) >= 2, "Need at least input and output layers"
        assert len(layer_sizes) - 1 == len(activations), "Need activation for each layer except input"

        # One Layer per consecutive (fan_in, fan_out) pair of sizes.
        self.layers = [
            Layer(fan_in, fan_out, activation)
            for fan_in, fan_out, activation in zip(layer_sizes, layer_sizes[1:], activations)
        ]

        self.best_loss = float('inf')
        self.patience_counter = 0

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward pass through the network.

        Feeds ``x`` through every layer in order and returns the last
        layer's output.
        """
        current_input = x
        for layer in self.layers:
            current_input = layer.forward(current_input)
        return current_input

    def train(self, X: np.ndarray, y: np.ndarray, epochs: int,
              learning_rate: float = 0.01, momentum: float = 0.9,
              l2_lambda: float = 0.01, patience: int = 5,
              lr_decay: float = 0.95) -> List[float]:
        """Train the network with full-batch gradient descent.

        Each epoch runs one forward pass over all of ``X``, evaluates binary
        cross-entropy plus an L2 penalty, checks early stopping, then
        back-propagates through the layers in reverse order and decays the
        learning rate.

        Args:
            X (np.ndarray): Input features
            y (np.ndarray): Target values. Must contain as many elements as
                the network output; it is reshaped to the output's shape if
                the shapes differ (e.g. 1-D labels for a single-output net).
            epochs (int): Number of training epochs
            learning_rate (float): Initial learning rate
            momentum (float): Momentum coefficient
            l2_lambda (float): L2 regularization strength
            patience (int): Early stopping patience; training stops once the
                loss has failed to improve for more than ``patience``
                consecutive epochs.
            lr_decay (float): Learning rate decay factor, applied every epoch

        Returns:
            List[float]: Training loss history

        Raises:
            ValueError: If ``y`` does not have the same number of elements as
                the network output.
        """
        eps = 1e-15  # keeps log() and the divisions away from zero
        y = np.asarray(y)
        losses = []
        current_lr = learning_rate

        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)

            # Align targets with the output. Without this a 1-D y of length n
            # silently broadcasts against an (n, 1) output into an (n, n) array.
            if y.shape != output.shape:
                if y.size != output.size:
                    raise ValueError(
                        f"y has shape {y.shape}, which is incompatible with "
                        f"network output shape {output.shape}"
                    )
                y = y.reshape(output.shape)

            # Compute loss with L2 regularization. The 1/2 factor makes the
            # penalty's derivative equal the `l2_lambda * weights` term that
            # Layer.backward applies, so the reported loss is the objective
            # actually being minimized.
            l2_reg = 0.5 * l2_lambda * sum(np.sum(layer.weights ** 2) for layer in self.layers)
            loss = -np.mean(y * np.log(output + eps) +
                            (1 - y) * np.log(1 - output + eps)) + l2_reg
            losses.append(loss)

            # Early stopping check
            if loss < self.best_loss:
                self.best_loss = loss
                self.patience_counter = 0
            else:
                self.patience_counter += 1
                if self.patience_counter > patience:
                    print(f"Early stopping at epoch {epoch}")
                    break

            # Backward pass: derivative of the mean cross-entropy, so divide
            # by the number of elements that were averaged over.
            grad_output = -(y / (output + eps) -
                            (1 - y) / (1 - output + eps)) / y.size

            for layer in reversed(self.layers):
                grad_output = layer.backward(grad_output, current_lr,
                                             momentum, l2_lambda)

            # Learning rate decay
            current_lr *= lr_decay

            # Progress report every 100 epochs (LR shown is the decayed value)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}, LR: {current_lr:.6f}")

        return losses

## Visualization Utilities

The following functions help visualize the neural network's behavior and training progress.

In [None]:
def plot_decision_boundary(X: np.ndarray, y: np.ndarray, 
                          network: EnhancedNeuralNetwork, 
                          title: str = "Decision Boundary"):
    """Draw a network's predictions over the 2D feature plane.

    The network is evaluated on a regular grid covering the data range
    (padded by 0.5 on every side, grid step 0.02). Its output is drawn as
    filled contours, with the samples scattered on top and coloured by
    their target value.

    Args:
        X (np.ndarray): Input features (n_samples, 2)
        y (np.ndarray): Target values (n_samples, 1)
        network (EnhancedNeuralNetwork): Trained neural network
        title (str): Plot title
    """
    padding = 0.5
    step = 0.02
    first_feature = X[:, 0]
    second_feature = X[:, 1]

    # Axis ranges: the data extent widened by the padding.
    x_min = first_feature.min() - padding
    x_max = first_feature.max() + padding
    y_min = second_feature.min() - padding
    y_max = second_feature.max() + padding

    # Evaluate the network at every grid point, then restore the grid shape.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = network.forward(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(first_feature, second_feature, c=y.ravel(), alpha=0.8)
    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()


def plot_training_curve(losses: List[float], title: str = "Training Loss"):
    """Show the recorded loss values as a line plot over epochs.

    Args:
        losses (List[float]): List of training losses
        title (str): Plot title
    """
    canvas_size = (10, 6)
    plt.figure(figsize=canvas_size)

    # One point per recorded loss; the x axis is the position in the list.
    plt.plot(losses)

    plt.grid(True)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(title)
    plt.show()

In [None]:
def visualize_weights(network: EnhancedNeuralNetwork, layer_index: int, 
                      figsize: Tuple[int, int] = (12, 4), 
                      title: str = "Weight Matrix Visualization"):
    """Visualize the weight matrix of a specific layer as an annotated heatmap

    Rendered with matplotlib only. The colour scale is symmetric around zero
    so that the midpoint of the diverging 'coolwarm' map corresponds to a
    weight of 0, and every cell is labelled with its value to two decimals.

    Args:
        network (EnhancedNeuralNetwork): Neural network instance
        layer_index (int): Index of the layer to visualize
        figsize (Tuple[int, int]): Figure size
        title (str): Plot title
    """
    weights = np.asarray(network.layers[layer_index].weights)
    n_rows, n_cols = weights.shape

    # Symmetric colour limits centred on 0; fall back to 1.0 when the matrix
    # is empty, all-zero or contains non-finite values.
    limit = float(np.max(np.abs(weights))) if weights.size else 0.0
    if not np.isfinite(limit) or limit == 0.0:
        limit = 1.0

    plt.figure(figsize=figsize)
    image = plt.imshow(weights, cmap='coolwarm', vmin=-limit, vmax=limit,
                       aspect='auto')
    plt.colorbar(image)

    # Write each weight's value into its cell.
    for row in range(n_rows):
        for col in range(n_cols):
            plt.text(col, row, f"{weights[row, col]:.2f}",
                     ha='center', va='center')

    plt.xticks(range(n_cols))
    plt.yticks(range(n_rows))
    plt.title(f"{title} - Layer {layer_index}")
    plt.xlabel("Output Features")
    plt.ylabel("Input Features")
    plt.show()

def plot_activation_distributions(network: EnhancedNeuralNetwork, X: np.ndarray, 
                                figsize: Tuple[int, int] = (15, 5)):
    """Draw one histogram per stage of the network's forward pass.

    The first panel shows the raw input ``X``; each following panel shows
    the output of the corresponding layer. Layers are run through their own
    ``forward`` method, so their cached forward-pass values are overwritten.

    Args:
        network (EnhancedNeuralNetwork): Neural network instance
        X (np.ndarray): Input data
        figsize (Tuple[int, int]): Figure size
    """
    # Stage 0 is the input itself; every layer consumes the previous stage.
    stages = [X]
    for layer in network.layers:
        stages.append(layer.forward(stages[-1]))

    panel_count = len(stages)
    plt.figure(figsize=figsize)

    for index, values in enumerate(stages):
        plt.subplot(1, panel_count, index + 1)
        plt.hist(values.flatten(), bins=50, density=True)
        plt.title(f"Layer {index} Output")
        plt.xlabel("Activation Value")
        plt.ylabel("Density")

    plt.tight_layout()
    plt.show()

def plot_gradient_flow(network: EnhancedNeuralNetwork):
    """Plot the mean absolute weight gradient of each layer with error bars.

    Only layers exposing a ``weight_gradients`` attribute are included. If
    no layer has one, a message is printed and nothing is plotted.

    Args:
        network (EnhancedNeuralNetwork): Neural network instance
    """
    # (mean, std) of |gradient| for every layer that carries gradient data.
    stats = [
        (np.mean(np.abs(layer.weight_gradients)),
         np.std(np.abs(layer.weight_gradients)))
        for layer in network.layers
        if hasattr(layer, 'weight_gradients')
    ]

    if len(stats) == 0:
        print("No gradient information available. Run backpropagation first.")
        return

    means = tuple(mean for mean, _ in stats)
    stds = tuple(std for _, std in stats)

    plt.figure(figsize=(10, 6))
    plt.errorbar(range(len(means)), means, yerr=stds, fmt='o-', capsize=5)
    plt.title("Gradient Flow Across Layers")
    plt.xlabel("Layer Index")
    plt.ylabel("Mean Absolute Gradient (with std)")
    plt.grid(True)
    plt.show()