# Neural Network From Scratch

In this notebook, we will implement a **Multi-Layer Perceptron (MLP)** from scratch using `numpy`.

Key components:
1.  **Dense Layer**: Fully connected layer.
2.  **Activation Functions**: Sigmoid, ReLU.
3.  **Loss Function**: Mean Squared Error (MSE).
4.  **Backpropagation**: Computes the gradients used for gradient descent optimization.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## 1. Helper Functions (Activations)

In [None]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The value is computed from ``exp(-|x|)``, which always lies in (0, 1],
    so large-magnitude inputs no longer overflow ``np.exp`` (the naive form
    evaluates ``exp(1000)`` for ``x = -1000`` and emits a RuntimeWarning).

    Args:
        x: Real scalar or array-like.

    Returns:
        Sigmoid of ``x`` with the same shape as the input; a NumPy scalar
        when ``x`` is a scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # bounded by 1, cannot overflow
    # For x >= 0: 1 / (1 + e^-x).  For x < 0: e^x / (1 + e^x).  Both are the
    # same function, written so that only exp of a non-positive number appears.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op otherwise.
    return result[()]

def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    This is the slope of the sigmoid expressed in terms of its *output*:
    callers must pass an already-activated value ``s = sigmoid(z)``, not
    the pre-activation ``z``.

    Args:
        x: Sigmoid output(s), scalar or array.

    Returns:
        ``x * (1 - x)`` with the same shape as ``x``.
    """
    complement = 1 - x
    return x * complement

def relu(x):
    """Return the rectified linear unit ``max(0, x)`` element-wise.

    Args:
        x: Scalar or array of real numbers.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Return the ReLU slope: 1 where ``x > 0``, otherwise 0.

    Args:
        x: Scalar or array of real numbers.

    Returns:
        Integer array of ones and zeros shaped like ``x``; the slope at
        exactly zero is reported as 0.
    """
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

## 2. Neural Network Class

In [None]:
class NeuralNetwork:
    """Two-layer perceptron (input -> hidden -> output) with sigmoid activations.

    Training is full-batch: each ``backward`` call uses every row of ``X``
    and applies a single update to all weights and biases.

    Args:
        inputs: Number of input features.
        hidden: Number of hidden units.
        outputs: Number of output units.
        learning_rate: Step size used by ``backward`` unless overridden per
            call. Defaults to 0.1.
    """

    def __init__(self, inputs, hidden, outputs, learning_rate=0.1):
        self.input_size = inputs
        self.hidden_size = hidden
        self.output_size = outputs
        self.learning_rate = learning_rate

        # Parameters are drawn uniformly from [0, 1) using NumPy's global
        # random state, so np.random.seed(...) makes a run reproducible.

        # Input -> Hidden
        self.W1 = np.random.uniform(size=(self.input_size, self.hidden_size))
        self.b1 = np.random.uniform(size=(1, self.hidden_size))

        # Hidden -> Output
        self.W2 = np.random.uniform(size=(self.hidden_size, self.output_size))
        self.b2 = np.random.uniform(size=(1, self.output_size))

    def forward(self, X):
        """Run a forward pass and cache the intermediate activations.

        Args:
            X: Batch of samples, shape (n_samples, input_size).

        Returns:
            Network output, shape (n_samples, output_size). Also stored on
            ``self.output``; ``self.z1``, ``self.a1`` and ``self.z2`` hold
            the intermediate values needed by ``backward``.
        """
        # Input -> Hidden
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = sigmoid(self.z1)

        # Hidden -> Output
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.output = sigmoid(self.z2)
        return self.output

    def backward(self, X, y, output, learning_rate=None):
        """Backpropagate the error and apply one gradient-descent step.

        Must be called after ``forward`` on the same ``X`` because it reads
        the cached hidden activation ``self.a1``.

        Args:
            X: Batch of samples, shape (n_samples, input_size).
            y: Targets. Reshaped to ``output.shape`` when the element count
                matches, so a flat 1-D target vector cannot silently
                broadcast into an (n, n) error matrix.
            output: The value returned by ``forward(X)``.
            learning_rate: Optional override for this step; falls back to
                ``self.learning_rate`` when omitted.
        """
        if learning_rate is None:
            learning_rate = self.learning_rate

        # Accept plain lists as well as arrays (``X.T`` needs an ndarray).
        X = np.asarray(X)
        y = np.asarray(y)
        if y.shape != output.shape and y.size == output.size:
            y = y.reshape(output.shape)

        # With E = 0.5 * sum((y - output) ** 2), dE/dz2 = -(y - output) * s'(z2).
        # d_output is therefore the NEGATIVE gradient w.r.t. z2, and the
        # same holds for d_hidden w.r.t. z1.
        self.error = y - output
        self.d_output = self.error * sigmoid_derivative(output)

        # Propagate through W2 before W2 is modified below.
        self.error_hidden = self.d_output.dot(self.W2.T)
        self.d_hidden = self.error_hidden * sigmoid_derivative(self.a1)

        # Adding the negative gradient is ordinary gradient descent on E.
        self.W2 += self.a1.T.dot(self.d_output) * learning_rate
        self.b2 += np.sum(self.d_output, axis=0, keepdims=True) * learning_rate

        self.W1 += X.T.dot(self.d_hidden) * learning_rate
        self.b1 += np.sum(self.d_hidden, axis=0, keepdims=True) * learning_rate

    def train(self, X, y, epochs=10000):
        """Fit the network with ``epochs`` full-batch update steps.

        Args:
            X: Training samples, shape (n_samples, input_size).
            y: Training targets matching the network output.
            epochs: Number of forward/backward iterations.
        """
        # Convert once up front instead of on every iteration.
        X = np.asarray(X)
        y = np.asarray(y)
        for _ in range(epochs):
            output = self.forward(X)
            self.backward(X, y, output)

## 3. Training on XOR Dataset

The XOR problem is a classic non-linear classification problem that a single linear layer cannot solve.

In [None]:
# XOR truth table: one row per input pair, one target column.
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
y = np.array([[0], [1], [1], [0]])

# 2 inputs -> 4 hidden units -> 1 output.
nn = NeuralNetwork(inputs=2, hidden=4, outputs=1)
nn.train(X, y, epochs=20000)

# Show the trained network's response to the four training inputs.
predictions = nn.forward(X)
print("Predicted Output:\n", predictions)