# MNIST Digit Classifier: Two-Layer Neural Network

This notebook builds a two-layer neural network from scratch using NumPy and trains it to classify MNIST handwritten digits.

## Data Loading and Preprocessing

Load the MNIST dataset, scale the pixel values by 1/255, one-hot encode the labels, and split the data into training and test sets.

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load MNIST dataset
def load_mnist():
    """
    Fetch MNIST from OpenML and return ``(features, labels)``.

    The features are the dataset's ``data`` divided by 255.0; the labels are
    the dataset's ``target`` cast to ``int`` and converted to a NumPy array.
    """
    dataset = fetch_openml('mnist_784', version=1)
    features = dataset.data / 255.0
    # The target must expose ``to_numpy()`` -- presumably a pandas Series,
    # TODO confirm against the installed scikit-learn/pandas versions.
    labels = dataset.target.astype(int).to_numpy()
    return features, labels

# One-hot encode labels
def one_hot_encode(y, num_classes):
    """
    One-hot encode the labels in ``y`` over the classes ``0 .. num_classes - 1``.

    ``y`` is reshaped into a single column before encoding, and the encoder is
    asked for a dense (non-sparse) result.
    """
    label_column = y.reshape(-1, 1)
    class_ids = range(num_classes)
    encoder = OneHotEncoder(categories=[class_ids], sparse_output=False)
    return encoder.fit_transform(label_column)

# Split dataset
def prepare_data(test_size=0.2):
    """
    Load MNIST, one-hot encode the labels into 10 classes, and split the data.

    Returns the result of ``train_test_split`` unchanged, i.e. in the order
    ``X_train, X_test, y_train, y_test``. The split uses ``random_state=42``,
    so it is the same on every call.
    """
    features, labels = load_mnist()
    targets = one_hot_encode(labels, num_classes=10)
    splits = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=42,
    )
    return splits

# Build the train/test split once for the rest of the notebook.
X_train, X_test, y_train, y_test = prepare_data()

# The network below works on plain NumPy arrays, so convert the feature splits.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

print(f"Training Data Shape: {X_train.shape}, Test Data Shape: {X_test.shape}")

## Neural Network Implementation

Implementing the two-layer neural network: forward propagation, cross-entropy loss, backpropagation, prediction, and gradient-descent training.

In [None]:
import numpy as np

class TwoLayerNN:
    """
    Fully connected network with one ReLU hidden layer and a softmax output.

    Parameters are stored as ``W1``/``b1`` (input -> hidden) and ``W2``/``b2``
    (hidden -> output). ``forward`` caches ``Z1``, ``A1``, ``Z2`` and ``A2`` on
    the instance and ``backward`` reads them, so ``backward`` must be called
    after ``forward`` on the same batch.
    """

    # Lower bound applied to probabilities before taking the log, so that a
    # softmax output that underflowed to exactly 0 cannot produce an inf loss.
    _EPS = 1e-12

    def __init__(self, input_size, hidden_size, output_size, seed=None):
        """
        Initialize weights with small Gaussian noise and biases with zeros.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            seed: Optional seed for reproducible weights. When ``None`` the
                weights are drawn from NumPy's global random state.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.W1 = rng.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = rng.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    @staticmethod
    def _to_class_indices(y):
        """
        Return labels as class indices.

        Arrays with more than one dimension are treated as one-hot rows and
        reduced with ``argmax`` along axis 1; anything else is returned as is.
        """
        y = np.asarray(y)
        if y.ndim > 1:
            return np.argmax(y, axis=1)
        return y

    def relu(self, Z):
        """
        ReLU activation function: element-wise ``max(0, Z)``.
        """
        return np.maximum(0, Z)

    def relu_derivative(self, Z):
        """
        Derivative of ReLU: 1.0 where ``Z > 0``, otherwise 0.0.
        """
        return (Z > 0).astype(float)

    def softmax(self, Z):
        """
        Row-wise softmax activation function.

        The row maximum is subtracted before exponentiating so that large
        inputs do not overflow; this does not change the result.
        """
        expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return expZ / np.sum(expZ, axis=1, keepdims=True)

    def forward(self, X):
        """
        Forward pass.

        Stores the intermediate values ``Z1``, ``A1``, ``Z2`` and ``A2`` on the
        instance and returns ``A2``, the softmax output for each row of ``X``.
        """
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = self.relu(self.Z1)
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = self.softmax(self.Z2)
        return self.A2

    def compute_loss(self, y_true, y_pred):
        """
        Compute the mean cross-entropy loss.

        Args:
            y_true: Labels, either class indices or one-hot rows.
            y_pred: Predicted probabilities, one row per sample.

        Returns:
            The average negative log-probability assigned to the true class.
            Probabilities below ``_EPS`` are raised to ``_EPS`` first, so the
            result stays finite even when a predicted probability is 0.
        """
        labels = self._to_class_indices(y_true)
        m = labels.shape[0]

        true_class_probs = y_pred[np.arange(m), labels]
        log_likelihood = -np.log(np.maximum(true_class_probs, self._EPS))
        return np.sum(log_likelihood) / m

    def backward(self, X, y):
        """
        Backpropagation to compute gradients.

        Uses the activations cached by the most recent ``forward`` call.

        Args:
            X: Input batch that was passed to ``forward``.
            y: Labels, either class indices or one-hot rows.

        Return: dW1, db1, dW2, db2
        """
        m = X.shape[0]
        labels = self._to_class_indices(y)

        y_one_hot = np.zeros_like(self.A2)
        y_one_hot[np.arange(m), labels] = 1

        # Gradient of softmax + cross-entropy with respect to Z2.
        dZ2 = self.A2 - y_one_hot
        dW2 = np.dot(self.A1.T, dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        dA1 = np.dot(dZ2, self.W2.T)
        dZ1 = dA1 * self.relu_derivative(self.Z1)
        dW1 = np.dot(X.T, dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m

        return dW1, db1, dW2, db2

    def predict(self, X):
        """
        Predict class labels: the index of the largest probability per row.
        """
        probabilities = self.forward(X)
        return np.argmax(probabilities, axis=1)

    def train(self, X, y, epochs, learning_rate):
        """
        Train the model using full-batch gradient descent.

        Each epoch runs one forward pass, one backward pass and one parameter
        update over all of ``X``. The loss is printed on every 10th epoch and
        on the last one; the printed value is the loss measured before that
        epoch's update.
        """
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)

            # Compute loss
            loss = self.compute_loss(y, y_pred)

            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X, y)

            # Update parameters
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            # Print loss every 10 epochs or last epoch
            if epoch % 10 == 0 or epoch == epochs - 1:
                print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")

## Model Training and Evaluation

In [None]:
# Network dimensions: one input per feature column, 64 hidden units
# (a tunable choice), and one output per digit class.
input_size = X_train.shape[1]
hidden_size = 64
output_size = 10

model = TwoLayerNN(input_size, hidden_size, output_size)

# Full-batch gradient descent hyperparameters.
epochs = 100
learning_rate = 0.5
model.train(X_train, y_train, epochs, learning_rate)

# Test accuracy: fraction of predictions that match the one-hot test labels.
predictions = model.predict(X_test)
accuracy = (predictions == np.argmax(y_test, axis=1)).mean()
print(f"Test Accuracy: {accuracy * 100:.2f}%")