# Neural Network from Scratch

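Build a complete neural network whose math is implemented with NumPy alone, to understand the fundamentals. (scikit-learn is used only to generate and split the toy datasets, and Matplotlib only for plotting.)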

## Learning Objectives
- Implement forward propagation
- Implement backpropagation
- Train a neural network on real data
- Understand gradient descent

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so that anything drawn through np.random
# (e.g. randomly initialized weights) is reproducible from run to run.
np.random.seed(42)

## 1. Activation Functions

Implement common activation functions and their derivatives.

In [None]:
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    np.exp cannot overflow on extreme values.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def sigmoid_derivative(x):
    """Derivative of the sigmoid with respect to its input.

    Evaluated as sigmoid(x) * (1 - sigmoid(x)), computing the sigmoid
    only once.
    """
    activated = sigmoid(x)
    return activated * (1 - activated)

def relu(x):
    """Rectified linear unit: element-wise max(0, x)."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Derivative of ReLU: 1.0 where x > 0, otherwise 0.0.

    Args:
        x: Pre-activation value(s); a NumPy array, a Python scalar, or a
            (nested) list of numbers.

    Returns:
        A float NumPy array with the same shape as ``x`` (0-d for scalars).
        The derivative at exactly 0 is taken to be 0.
    """
    # Coerce first: a plain Python comparison yields a bool (or raises for
    # lists), neither of which has .astype(). The other activation helpers
    # already accept scalars and lists, so this keeps them consistent.
    return (np.asarray(x) > 0).astype(float)

def tanh(x):
    """Hyperbolic tangent, applied element-wise (thin wrapper over np.tanh)."""
    squashed = np.tanh(x)
    return squashed

def tanh_derivative(x):
    """Derivative of tanh with respect to its input: 1 - tanh(x)^2."""
    squashed = np.tanh(x)
    return 1 - squashed**2

# Plot each activation next to its derivative as a visual sanity check
x = np.linspace(-5, 5, 100)
plt.figure(figsize=(15, 4))

# One panel per activation: (subplot position, display name, function, derivative)
panels = [
    (131, 'Sigmoid', sigmoid, sigmoid_derivative),
    (132, 'ReLU', relu, relu_derivative),
    (133, 'Tanh', tanh, tanh_derivative),
]

for position, name, activation_fn, derivative_fn in panels:
    plt.subplot(position)
    plt.plot(x, activation_fn(x), label=name)
    plt.plot(x, derivative_fn(x), label=f'{name} Derivative', linestyle='--')
    plt.legend()
    plt.grid(True)
    plt.title(name)

plt.tight_layout()
plt.show()

## 2. Neural Network Class

Implement a flexible neural network class.

In [None]:
class NeuralNetwork:
    """Fully connected feed-forward network for binary classification.

    Every hidden layer uses the same configurable activation; the output
    layer is always a sigmoid. Training is full-batch gradient descent on
    the binary cross-entropy loss.
    """

    # Activation names accepted by the constructor.
    SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid', 'tanh')

    def __init__(self, layer_sizes, activation='relu'):
        """
        Initialize neural network

        Args:
            layer_sizes: List of layer sizes [input, hidden1, hidden2, ..., output]
            activation: Activation function ('relu', 'sigmoid', 'tanh')

        Raises:
            ValueError: If fewer than two layer sizes are given, or the
                activation name is not one of SUPPORTED_ACTIVATIONS.
        """
        # Fail at construction time rather than deep inside forward(): with
        # fewer than two sizes there is no weight matrix to build, and an
        # unknown activation would otherwise surface as a confusing error
        # on the first forward pass.
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size, "
                f"got {layer_sizes!r}"
            )
        if activation not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}"
            )

        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes)
        self.activation = activation

        # Initialize weights and biases
        self.weights = []
        self.biases = []

        # He initialization (variance 2/fan_in) for ReLU, Xavier-style
        # (variance 1/fan_in) for the other activations.
        gain = 2.0 if activation == 'relu' else 1.0

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            w = np.random.randn(fan_in, fan_out) * np.sqrt(gain / fan_in)
            b = np.zeros((1, fan_out))

            self.weights.append(w)
            self.biases.append(b)

    @staticmethod
    def _align_labels(y, shape):
        """Return the labels as an array of exactly ``shape``.

        Labels that hold the right number of elements but a different layout
        (typically 1-D ``(n,)`` versus the network's ``(n, 1)`` output) are
        reshaped. Without this, NumPy would broadcast ``(n, 1) - (n,)`` into
        an ``(n, n)`` matrix and corrupt the loss and gradients.

        Raises:
            ValueError: If the number of labels does not match ``shape``.
        """
        y = np.asarray(y)
        if y.shape == shape:
            return y
        if y.size != int(np.prod(shape)):
            raise ValueError(
                f"Labels of shape {y.shape} cannot be matched to "
                f"predictions of shape {shape}"
            )
        return y.reshape(shape)

    def _activate(self, x):
        """Apply the configured hidden-layer activation to ``x``."""
        if self.activation == 'relu':
            return relu(x)
        if self.activation == 'sigmoid':
            return sigmoid(x)
        if self.activation == 'tanh':
            return tanh(x)
        # Reachable only if self.activation was reassigned after construction.
        raise ValueError(f"Unsupported activation {self.activation!r}")

    def _activate_derivative(self, x):
        """Apply the derivative of the configured hidden-layer activation to ``x``."""
        if self.activation == 'relu':
            return relu_derivative(x)
        if self.activation == 'sigmoid':
            return sigmoid_derivative(x)
        if self.activation == 'tanh':
            return tanh_derivative(x)
        # Reachable only if self.activation was reassigned after construction.
        raise ValueError(f"Unsupported activation {self.activation!r}")

    def forward(self, X):
        """
        Forward propagation

        Caches every layer's pre-activation (``self.z_values``) and
        post-activation (``self.activations``, starting with ``X`` itself)
        so that ``backward`` can reuse them.

        Args:
            X: Input data (batch_size, input_size)

        Returns:
            Output predictions (sigmoid outputs of the last layer)
        """
        self.z_values = []  # Pre-activation values
        self.activations = [X]  # Post-activation values

        A = X

        # Hidden layers: every weight matrix except the last one
        for W, b in zip(self.weights[:-1], self.biases[:-1]):
            Z = A @ W + b
            A = self._activate(Z)

            self.z_values.append(Z)
            self.activations.append(A)

        # Output layer (sigmoid for binary classification)
        Z = A @ self.weights[-1] + self.biases[-1]
        A = sigmoid(Z)

        self.z_values.append(Z)
        self.activations.append(A)

        return A

    def backward(self, X, y, learning_rate=0.01):
        """
        Backpropagation

        Uses the values cached by the most recent ``forward`` call, so
        ``forward`` must have been run on the same ``X`` first. Weights and
        biases are updated in place with one gradient-descent step.

        Args:
            X: Input data
            y: True labels; reshaped to the prediction shape if they hold
                the same number of elements in a different layout
            learning_rate: Learning rate

        Raises:
            ValueError: If the number of labels does not match the number
                of cached predictions.
        """
        m = X.shape[0]
        y = self._align_labels(y, self.activations[-1].shape)

        # Output layer gradient: with a sigmoid output and binary
        # cross-entropy loss, dL/dZ simplifies to (prediction - label).
        dZ = self.activations[-1] - y

        # Backpropagate through layers, last to first
        for i in range(len(self.weights) - 1, -1, -1):
            # Compute gradients (activations[i] is the input to layer i)
            dW = (1/m) * (self.activations[i].T @ dZ)
            db = (1/m) * np.sum(dZ, axis=0, keepdims=True)

            if i > 0:
                # Propagate to previous layer. This must happen before the
                # update below so it uses the pre-update weights.
                dA = dZ @ self.weights[i].T
                dZ = dA * self._activate_derivative(self.z_values[i-1])

            # Update weights and biases
            self.weights[i] -= learning_rate * dW
            self.biases[i] -= learning_rate * db

    def train(self, X, y, epochs=1000, learning_rate=0.01, verbose=True):
        """
        Train the neural network

        Runs ``epochs`` full-batch iterations of forward pass, loss
        computation and backward pass.

        Args:
            X: Training data
            y: Training labels; reshaped to the prediction shape if they
                hold the same number of elements in a different layout
            epochs: Number of training epochs
            learning_rate: Learning rate
            verbose: Print progress (loss and accuracy every 100 epochs)

        Returns:
            List of losses, one per epoch, each measured before that
            epoch's weight update

        Raises:
            ValueError: If the number of labels does not match the number
                of predictions.
        """
        losses = []

        for epoch in range(epochs):
            # Forward pass
            predictions = self.forward(X)
            targets = self._align_labels(y, predictions.shape)

            # Compute loss (binary cross-entropy); the 1e-8 keeps log()
            # finite when a prediction saturates at exactly 0 or 1.
            loss = -np.mean(
                targets * np.log(predictions + 1e-8)
                + (1 - targets) * np.log(1 - predictions + 1e-8)
            )
            losses.append(loss)

            # Backward pass
            self.backward(X, targets, learning_rate)

            if verbose and epoch % 100 == 0:
                accuracy = np.mean((predictions > 0.5) == targets)
                print(f"Epoch {epoch}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")

        return losses

    def predict(self, X):
        """Make predictions: integer 0/1 labels from thresholding the output at 0.5."""
        predictions = self.forward(X)
        return (predictions > 0.5).astype(int)

# Confirmation message so the cell produces visible output once the class
# definition above has been executed.
print("Neural Network class implemented!")

## 3. Train on Moons Dataset

Test the neural network on a dataset whose two classes are not linearly separable.

In [None]:
# Build the two-moons toy problem; labels are reshaped into a column vector (n, 1)
X, y = make_moons(n_samples=1000, noise=0.2, random_state=42)
y = y.reshape(-1, 1)

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scatter the full dataset, one colour per class
plt.figure(figsize=(8, 6))
flat_labels = y.flatten()
for class_id, colour in ((0, 'blue'), (1, 'red')):
    members = flat_labels == class_id
    plt.scatter(X[members, 0], X[members, 1], c=colour, label=f'Class {class_id}', alpha=0.6)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Moons Dataset')
plt.legend()
plt.grid(True)
plt.show()

for split_name, split in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} samples: {split.shape[0]}")

In [None]:
# Two hidden layers (16 and 8 units) with ReLU, one sigmoid output unit
nn = NeuralNetwork([2, 16, 8, 1], activation='relu')

# Full-batch gradient descent; train() returns the per-epoch loss history
losses = nn.train(X_train, y_train, epochs=1000, learning_rate=0.1, verbose=True)

# Loss curve over the training epochs
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Loss')
ax.grid(True)
plt.show()

In [None]:
# Score the trained network on the held-out split:
# accuracy is the fraction of predicted labels that equal the true labels
test_predictions = nn.predict(X_test)
test_accuracy = (test_predictions == y_test).mean()
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# Visualize decision boundary
def plot_decision_boundary(model, X, y, h=0.02, padding=0.5):
    """Plot a model's predicted class regions with the data points on top.

    The model is evaluated on a regular grid covering the data (plus a
    margin), the predicted classes are drawn as shaded regions, and the
    samples are scattered over them coloured by their true class.

    Args:
        model: Object with a ``predict(points)`` method that takes an
            (n_points, 2) array and returns one class label (0 or 1) per row.
        X: Array of 2-D sample coordinates; column 0 is plotted on the
            x-axis and column 1 on the y-axis.
        y: True class labels (0 or 1) for ``X``; any shape that flattens to
            one label per sample.
        h: Grid spacing used when sampling the model.
        padding: Margin added around the data range on every side.
    """
    labels = np.asarray(y).ravel()

    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict every grid point in one batch, then fold back to the grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    # Reversed colormap: plain 'RdYlBu' maps low values to red and high
    # values to blue, which would shade the class-0 region red while the
    # class-0 points below are drawn blue. '_r' makes regions match points.
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu_r')
    plt.scatter(X[labels == 0, 0], X[labels == 0, 1], c='blue', label='Class 0', edgecolors='k')
    plt.scatter(X[labels == 1, 0], X[labels == 1, 1], c='red', label='Class 1', edgecolors='k')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Decision Boundary')
    plt.legend()
    plt.show()

# Draw the trained network's predicted regions together with the held-out test points.
plot_decision_boundary(nn, X_test, y_test)

## 4. Experiment with Different Architectures

Try different network architectures and activation functions.

In [None]:
# Layer layouts to compare: one or two hidden layers of increasing width
architectures = [
    [2, 8, 1],
    [2, 16, 1],
    [2, 16, 8, 1],
    [2, 32, 16, 1],
]

results = []

for arch in architectures:
    print(f"\nTraining architecture: {arch}")

    # Fresh ReLU network per layout, trained silently for 500 epochs
    nn = NeuralNetwork(arch, activation='relu')
    losses = nn.train(X_train, y_train, epochs=500, learning_rate=0.1, verbose=False)

    # Held-out accuracy: fraction of test labels predicted correctly
    test_pred = nn.predict(X_test)
    test_acc = (test_pred == y_test).mean()

    results.append(dict(architecture=str(arch), accuracy=test_acc, final_loss=losses[-1]))

    print(f"Test Accuracy: {test_acc:.4f}")

# Tabulate the comparison (one row per architecture)
import pandas as pd
df = pd.DataFrame(results)
print("\nResults Summary:")
print(df)

## Summary

You've implemented:
- ✅ Activation functions (sigmoid, ReLU, tanh)
- ✅ Forward propagation
- ✅ Backpropagation
- ✅ Complete training loop
- ✅ Decision boundary visualization
- ✅ Architecture comparison

**Key Takeaways**:
- Neural networks can learn non-linear decision boundaries
- Deeper networks can capture more complex patterns
- Proper initialization is important
- ReLU works well for hidden layers