# Forward and Backward Propagation in Neural Networks

In this notebook, we will:
- Understand the idea of forward propagation (computing output)
- Understand backward propagation (computing gradients)
- Implement a **2-layer neural network** from scratch
- Train it on a toy dataset

## 1. Import Libraries

In [None]:
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

## 2. Generate Dataset
We will use scikit-learn's `make_moons` generator: a 2D toy dataset of two interleaving half-moons with binary labels (0/1).

In [None]:
# Draw 200 noisy two-moons samples; the fixed random_state keeps reruns identical.
X, y = make_moons(n_samples=200, noise=0.2, random_state=42)
y = y[:, np.newaxis]  # labels as an (n_samples, 1) column vector

# Scatter the two feature columns, coloured by class label.
plt.scatter(*X.T, c=y[:, 0], cmap=plt.cm.coolwarm)
plt.title("Toy Dataset (Moons)")
plt.show()

## 3. Helper Functions

In [None]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``np.exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here the exponential is only
    ever taken of a non-positive number, so it always lies in [0, 1].

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp of a value <= 0: cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Algebraically identical.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d arrays so scalar input still yields a scalar.
    return result[()] if result.ndim == 0 else result

def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    Note: this expression equals the derivative of the sigmoid only when
    ``x`` is already a sigmoid *output* ``a = sigmoid(z)``, since
    ``d sigmoid / dz = a * (1 - a)``. Passing the raw pre-activation ``z``
    does not give the derivative.
    """
    complement = 1 - x
    return x * complement

def initialize_parameters(input_dim, hidden_dim, output_dim, seed=42, scale=0.01):
    """Create the initial weights and biases of the 2-layer network.

    Weights are drawn from a standard normal distribution and multiplied by
    ``scale``; biases start at zero. A private ``RandomState`` is used so the
    call is reproducible without reseeding NumPy's global random generator.
    With the default ``seed`` and ``scale`` the returned values match what
    ``np.random.seed(42)`` followed by ``np.random.randn(...) * 0.01`` gives.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of hidden units.
        output_dim: Number of output units.
        seed: Seed for the private random generator (default 42).
        scale: Multiplier applied to the normal samples (default 0.01).

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(input_dim, hidden_dim)``,
        ``(1, hidden_dim)``, ``(hidden_dim, output_dim)``, ``(1, output_dim)``.
    """
    rng = np.random.RandomState(seed)
    # Draw order (W1 first, then W2) matters for reproducing the same values.
    W1 = rng.randn(input_dim, hidden_dim) * scale
    b1 = np.zeros((1, hidden_dim))
    W2 = rng.randn(hidden_dim, output_dim) * scale
    b2 = np.zeros((1, output_dim))
    return W1, b1, W2, b2

## 4. Forward Propagation
- Compute hidden layer output: $A_1 = \tanh(X W_1 + b_1)$
- Compute final prediction: $A_2 = \sigma(A_1 W_2 + b_2)$

In [None]:
def forward_propagation(X, W1, b1, W2, b2):
    """Run one forward pass through the 2-layer network.

    Hidden layer: affine map followed by tanh.
    Output layer: affine map followed by sigmoid.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: the pre-activation and activation of the
        hidden layer, then the pre-activation and activation of the output
        layer. The intermediates are returned so they can be reused when
        computing gradients.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = np.tanh(hidden_pre)

    output_pre = np.dot(hidden_act, W2) + b2
    output_act = sigmoid(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act

## 5. Backward Propagation
- Compute gradients using the chain rule
- Return the gradients; the weight update itself is applied in the training loop (Section 6)

In [None]:
def backward_propagation(X, y, Z1, A1, Z2, A2, W2):
    """Compute the gradients of the mean loss w.r.t. all parameters.

    ``dZ2 = A2 - y`` is the gradient of binary cross-entropy with respect to
    the pre-activation of a sigmoid output unit, so this routine is valid for
    that loss/activation pairing (the one used by ``train``). The hidden
    layer is assumed to use tanh, whose derivative is ``1 - A1**2``.

    Args:
        X: Input batch, one sample per row.
        y: Targets. Any shape holding exactly ``A2.size`` values is accepted
            (e.g. a flat ``(m,)`` vector) and is reshaped to ``A2.shape``;
            without this, a 1-D ``y`` would broadcast against ``A2`` into an
            ``(m, m)`` matrix.
        Z1, Z2: Pre-activations from the forward pass (unused here; kept so
            the signature mirrors the forward pass outputs).
        A1: Hidden-layer activations.
        A2: Output-layer activations (predictions).
        W2: Hidden-to-output weight matrix.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``, each averaged over the batch.

    Raises:
        ValueError: If ``y`` cannot be reshaped to ``A2.shape``.
    """
    m = X.shape[0]
    # Align labels with predictions to avoid silent (m,1) - (m,) broadcasting.
    y = np.asarray(y).reshape(A2.shape)

    # Output layer.
    dZ2 = A2 - y
    dW2 = (1/m) * np.dot(A1.T, dZ2)
    db2 = (1/m) * np.sum(dZ2, axis=0, keepdims=True)

    # Hidden layer: push the error back through W2, then through tanh.
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * (1 - np.power(A1, 2))  # derivative of tanh
    dW1 = (1/m) * np.dot(X.T, dZ1)
    db1 = (1/m) * np.sum(dZ1, axis=0, keepdims=True)

    return dW1, db1, dW2, db2

## 6. Training the Network

In [None]:
def train(X, y, hidden_dim=4, learning_rate=0.1, epochs=1000):
    """Train the 2-layer network with full-batch gradient descent.

    Each epoch runs a forward pass on all of ``X``, records the binary
    cross-entropy loss, computes gradients, and takes one gradient step.
    The loss is printed every 100 epochs.

    Args:
        X: Input array with one sample per row.
        y: Binary targets, one per sample. A flat ``(m,)`` vector is accepted
            and converted to an ``(m, 1)`` column; left 1-D it would broadcast
            against the ``(m, 1)`` predictions into an ``(m, m)`` matrix and
            silently corrupt the loss.
        hidden_dim: Number of hidden units.
        learning_rate: Step size of the gradient update.
        epochs: Number of full-batch iterations.

    Returns:
        Tuple ``(W1, b1, W2, b2, losses)`` where ``losses`` holds the loss
        measured at the start of each epoch (before that epoch's update).

    Raises:
        ValueError: If the number of labels differs from the number of rows
            in ``X``.
    """
    input_dim = X.shape[1]
    output_dim = 1

    # Normalise labels to a column so they line up with the predictions.
    y = np.asarray(y).reshape(-1, output_dim)
    if y.shape[0] != X.shape[0]:
        raise ValueError(
            f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
        )

    W1, b1, W2, b2 = initialize_parameters(input_dim, hidden_dim, output_dim)
    losses = []

    for i in range(epochs):
        # Forward pass
        Z1, A1, Z2, A2 = forward_propagation(X, W1, b1, W2, b2)

        # Compute loss (Binary Cross-Entropy); 1e-8 guards against log(0)
        loss = -np.mean(y*np.log(A2+1e-8) + (1-y)*np.log(1-A2+1e-8))
        losses.append(loss)

        # Backward pass
        dW1, db1, dW2, db2 = backward_propagation(X, y, Z1, A1, Z2, A2, W2)

        # Update weights (plain gradient descent step)
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2

        if i % 100 == 0:
            print(f"Epoch {i}, Loss: {loss:.4f}")

    return W1, b1, W2, b2, losses

## 7. Run Training

In [None]:
# Fit the network on the moons data and keep the per-epoch loss history.
W1, b1, W2, b2, losses = train(
    X,
    y,
    hidden_dim=4,
    learning_rate=0.1,
    epochs=1000,
)

# Loss curve: one point per epoch, in training order.
plt.plot(losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss over Epochs")
plt.show()

## 8. Decision Boundary

In [None]:
def plot_decision_boundary(X, y, W1, b1, W2, b2, h=0.01):
    """Plot the network's output over the 2D input plane with the data on top.

    A regular grid covering the data range (padded by 1 on every side) is
    pushed through ``forward_propagation``; the resulting output activation
    is drawn as filled contours and the samples are scattered over it.

    Args:
        X: Samples with two feature columns (column 0 on the x-axis,
            column 1 on the y-axis).
        y: Labels used to colour the scatter points.
        W1, b1, W2, b2: Network parameters passed to ``forward_propagation``.
        h: Grid spacing. Smaller values give a smoother picture but the
            number of evaluated points grows as ``1 / h**2``.

    Raises:
        ValueError: If ``h`` is not positive.
    """
    if h <= 0:
        raise ValueError(f"h must be positive, got {h}")

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Evaluate the network on every grid point; only the final output is needed.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    _, _, _, A2 = forward_propagation(grid_points, W1, b1, W2, b2)
    Z = A2.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.6)
    plt.scatter(X[:,0], X[:,1], c=np.ravel(y), cmap=plt.cm.coolwarm, edgecolors='k')
    plt.title("Decision Boundary")
    plt.show()

# Visualise the network using the parameters in W1, b1, W2, b2.
plot_decision_boundary(X, y, W1, b1, W2, b2)

## ✅ Summary
- **Forward Propagation**: Computes outputs step by step.
- **Backward Propagation**: Computes gradients using the chain rule.
- Together, they allow neural networks to **learn weights** by minimizing loss.