# Image Classification on fashion MNIST dataset

In [11]:
# libraries
import numpy as np
import pandas as pd

### Pre-processing function
- Loads the dataset from a CSV file and separates it into labels (first column) and image pixels (remaining columns)
- Normalizes the pixel values to the range 0 to 1

In [12]:
# Load dataset from CSV
def load_fashion_mnist(csv_path):
    """
    Read a Fashion-MNIST CSV and separate pixel data from class labels.

    :param csv_path: Location of the CSV file (passed straight to ``pd.read_csv``).
    :return: Tuple ``(images, labels)``. ``images`` holds every column after the
             first, cast to float32 and divided by 255; ``labels`` is the first column.
    """
    table = pd.read_csv(csv_path).values

    # Column 0 carries the class id; everything after it is pixel data.
    pixel_block = table[:, 1:]
    class_ids = table[:, 0]

    # Dividing by 255 maps 0-255 pixel intensities onto [0, 1].
    scaled_pixels = pixel_block.astype(np.float32) / 255.0

    return scaled_pixels, class_ids

### Information on the Fashion-MNIST dataset
- The training dataset contains 60,000 images and the test dataset contains 10,000 images.
- Each image is 28x28 pixels which are converted into an array of 784 elements.
- Each image is labeled with a number from 0 to 9 which represents the class of the image.
- The validation dataset is obtained as 20% of the training dataset.

### Data size
* Training dataset - 48,000 images
* Validation dataset - 12,000 images
* Test dataset - 10,000 images

In [13]:
# Read the training and test CSV files
train_images, train_labels = load_fashion_mnist("datasets/fashion-mnist_train.csv")
test_images, test_labels = load_fashion_mnist("datasets/fashion-mnist_test.csv")

# Hold out the trailing 20% of the training rows as the validation set
num_train = int(0.8 * len(train_images))
val_images = train_images[num_train:]
val_labels = train_labels[num_train:]
train_images = train_images[:num_train]
train_labels = train_labels[:num_train]

print(f"Train: {train_images.shape}, Validation: {val_images.shape}, Test: {test_images.shape}")

Train: (48000, 784), Validation: (12000, 784), Test: (10000, 784)


### Converting labels into one-hot encoding vectors

In [14]:
def one_hot_encode(labels, num_classes=10):
    """
    Turn integer class ids into one-hot rows.

    :param labels: Integer class indices used to index the identity matrix.
    :param num_classes: Width of each one-hot vector (default 10).
    :return: Array whose entry ``k`` is row ``labels[k]`` of the
             ``num_classes`` x ``num_classes`` identity matrix.
    """
    identity = np.eye(num_classes)
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return identity[labels]

# One-hot targets for the training, validation and test splits
y_train_one_hot, y_val_one_hot, y_test_one_hot = map(
    one_hot_encode, (train_labels, val_labels, test_labels)
)


# MLP for Image Classification

In [15]:
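# NOTE: Earlier draft of the MLP (full-batch gradient descent with early stopping), kept
# commented out for reference only. The active implementation is the MLP class in the next cell.
# import numpy as np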

# class MLP:
#     def __init__(self, layer_sizes, activation='relu', dropout_rate=0.0):
#         """
#         Initializes an MLP with a variable number of layers.
        
#         :param layer_sizes: List containing sizes of each layer (including input & output).
#                             Example: [784, 128, 64, 10] -> 2 hidden layers (128 and 64 neurons).
#         :param activation: Activation function ('relu', 'leaky_relu', 'tanh', 'gelu').
#         :param dropout_rate: Dropout rate (0.0 means no dropout, 0.5 means 50% dropout).
#         """
#         self.num_layers = len(layer_sizes) - 1
#         self.weights = []
#         self.biases = []
#         self.dropout_rate = dropout_rate
#         self.activation_func = self.get_activation_function(activation)
#         self.activation_derivative = self.get_activation_derivative(activation)

#         # Xavier Initialization
#         for i in range(self.num_layers):
#             self.weights.append(np.random.randn(layer_sizes[i], layer_sizes[i + 1]) / np.sqrt(layer_sizes[i]))
#             self.biases.append(np.zeros((1, layer_sizes[i + 1])))

#     def get_activation_function(self, activation):
#         """
#         Returns the appropriate activation function.
#         """
#         if activation == 'relu':
#             return lambda x: np.maximum(0, x)
#         elif activation == 'leaky_relu':
#             return lambda x: np.where(x > 0, x, 0.01 * x)
#         elif activation == 'tanh':
#             return lambda x: np.tanh(x)
#         elif activation == 'gelu':
#             return lambda x: 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))
#         else:
#             raise ValueError("Unsupported activation function")

#     def get_activation_derivative(self, activation):
#         """
#         Returns the derivative of the activation function.
#         """
#         if activation == 'relu':
#             return lambda x: (x > 0).astype(float)
#         elif activation == 'leaky_relu':
#             return lambda x: np.where(x > 0, 1, 0.01)
#         elif activation == 'tanh':
#             return lambda x: 1 - np.tanh(x)**2
#         elif activation == 'gelu':
#             return lambda x: 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3))) + \
#                             0.5 * x * (1 - np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3))**2) * \
#                             (np.sqrt(2 / np.pi) * (1 + 3 * 0.044715 * x**2))
#         else:
#             raise ValueError("Unsupported activation function")

#     def softmax(self, x):
#         exps = np.exp(x - np.max(x, axis=1, keepdims=True))
#         return exps / np.sum(exps, axis=1, keepdims=True)

#     def cross_entropy_loss(self, y_true, y_pred):
#         num_samples = y_true.shape[0]
#         return -np.sum(y_true * np.log(y_pred + 1e-9)) / num_samples

#     def cross_entropy_derivative(self, y_true, y_pred):
#         return y_pred - y_true

#     def forward(self, X, training=True):
#         """
#         Forward propagation through multiple layers with dropout.
#         """
#         self.activations = [X]
#         self.z_values = []
#         self.drop_masks = []

#         for i in range(self.num_layers - 1):  # Hidden layers
#             z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
#             self.z_values.append(z)

#             a = self.activation_func(z)

#             # Apply dropout during training
#             if training and self.dropout_rate > 0:
#                 mask = (np.random.rand(*a.shape) > self.dropout_rate) / (1.0 - self.dropout_rate)
#                 self.drop_masks.append(mask)
#                 a *= mask
#             else:
#                 self.drop_masks.append(None)

#             self.activations.append(a)

#         # Output layer with softmax (no dropout)
#         z_out = np.dot(self.activations[-1], self.weights[-1]) + self.biases[-1]
#         self.z_values.append(z_out)
#         self.activations.append(self.softmax(z_out))

#         return self.activations[-1]

#     def backward(self, y_true, learning_rate=0.01):
#         """
#         Backpropagation through multiple layers.
#         """
#         num_samples = y_true.shape[0]
#         grads_W = [None] * self.num_layers
#         grads_b = [None] * self.num_layers

#         # Compute gradients for output layer
#         dZ = self.cross_entropy_derivative(y_true, self.activations[-1])
#         grads_W[-1] = np.dot(self.activations[-2].T, dZ) / num_samples
#         grads_b[-1] = np.sum(dZ, axis=0, keepdims=True) / num_samples

#         # Backpropagate through hidden layers
#         for i in range(self.num_layers - 2, -1, -1):
#             dA = np.dot(dZ, self.weights[i + 1].T)

#             if self.drop_masks[i] is not None:  # Apply dropout mask during backprop
#                 dA *= self.drop_masks[i]

#             dZ = dA * self.activation_derivative(self.z_values[i])
#             grads_W[i] = np.dot(self.activations[i].T, dZ) / num_samples
#             grads_b[i] = np.sum(dZ, axis=0, keepdims=True) / num_samples

#         # Update weights and biases
#         for i in range(self.num_layers):
#             self.weights[i] -= learning_rate * grads_W[i]
#             self.biases[i] -= learning_rate * grads_b[i]

#     def train(self, X_train, y_train, X_val=None, y_val=None, epochs=100, learning_rate=0.01, patience=10):
#         best_val_loss = float('inf')
#         patience_counter = 0

#         for epoch in range(epochs):
#             y_pred = self.forward(X_train, training=True)
#             loss = self.cross_entropy_loss(y_train, y_pred)

#             # Backpropagation step
#             self.backward(y_train, learning_rate)

#             if (epoch+1) % 10 == 0:
#                 # Compute accuracy
#                 y_true_labels = np.argmax(y_train, axis=1) 
#                 y_pred_labels = np.argmax(y_pred, axis=1)   
#                 accuracy = np.mean(y_pred_labels == y_true_labels) * 100

#                 print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f} - Accuracy: {accuracy:.2f}%")

#             # Validation step
#             if X_val is not None and y_val is not None:
#                 y_val_pred = self.forward(X_val, training=False)
#                 val_loss = self.cross_entropy_loss(y_val, y_val_pred)

#                 if val_loss < best_val_loss:
#                     best_val_loss = val_loss
#                     patience_counter = 0
#                 else:
#                     patience_counter += 1

#                 if patience_counter >= patience:
#                     print(f"Early stopping at epoch {epoch+1}")
#                     break

#     def predict(self, X):
#         """
#         Predict labels for given input.
#         """
#         return np.argmax(self.forward(X, training=False), axis=1)


In [16]:
class MLP:
    """
    Fully connected feed-forward classifier trained with mini-batch gradient descent.

    Hidden layers share one activation function and optional inverted dropout;
    the output layer is a softmax trained with cross-entropy loss.
    """

    def __init__(self, layer_sizes, activation='relu', dropout_rate=0.0, optimizer='sgd', learning_rate=0.01, momentum=0.9, seed=None):
        """
        Initializes an MLP with a variable number of layers.

        :param layer_sizes: List of sizes of each layer (including input & output).
                            Example: [784, 128, 64, 10] -> 2 hidden layers.
        :param activation: Activation function ('relu', 'leaky_relu', 'tanh', 'sigmoid').
        :param dropout_rate: Dropout rate (0.5 means 50% dropout). Values <= 0 disable
                             dropout; values >= 1 are rejected.
        :param optimizer: Optimization method ('sgd' or 'momentum').
        :param learning_rate: Learning rate for weight updates.
        :param momentum: Momentum term for gradient updates (only for momentum optimizer).
        :param seed: Optional seed. When given, weight initialization, dropout masks and
                     epoch shuffling draw from a private ``np.random.RandomState(seed)``;
                     when None (default) the global ``np.random`` stream is used.
        :raises ValueError: If ``layer_sizes`` has fewer than two entries, ``dropout_rate``
                            is >= 1, or ``activation`` / ``optimizer`` is not supported.
        """
        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes must contain at least an input and an output size")
        if dropout_rate >= 1:
            # The dropout mask is divided by (1 - dropout_rate), so the rate must stay below 1.
            raise ValueError("dropout_rate must be smaller than 1")
        if optimizer not in ('sgd', 'momentum'):
            # Without this check an unknown name silently trained with plain SGD.
            raise ValueError("Unsupported optimizer")

        self.num_layers = len(layer_sizes) - 1
        self.weights = []
        self.biases = []
        self.dropout_rate = dropout_rate
        self.activation_func = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.momentum = momentum

        # Source of randomness: a private seeded generator, or the global NumPy stream.
        self.rng = np.random.RandomState(seed) if seed is not None else np.random

        # Initialize weight velocities for momentum-based optimization
        self.velocity_W = [np.zeros((layer_sizes[i], layer_sizes[i + 1])) for i in range(self.num_layers)]
        self.velocity_b = [np.zeros((1, layer_sizes[i + 1])) for i in range(self.num_layers)]

        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in); biases start at zero
        for i in range(self.num_layers):
            limit = np.sqrt(2 / layer_sizes[i])
            self.weights.append(self.rng.randn(layer_sizes[i], layer_sizes[i + 1]) * limit)
            self.biases.append(np.zeros((1, layer_sizes[i + 1])))

    @staticmethod
    def _sigmoid(x):
        """ Logistic sigmoid written as 0.5 * (1 + tanh(x / 2)) so large |x| cannot overflow exp. """
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def get_activation_function(self, activation):
        """
        Returns the hidden-layer activation function.

        :param activation: One of 'relu', 'leaky_relu', 'tanh', 'sigmoid'.
        :return: Callable applied element-wise to pre-activations.
        :raises ValueError: If the name is not supported.
        """
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'leaky_relu':
            return lambda x: np.where(x > 0, x, 0.01 * x)
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        elif activation == 'sigmoid':
            return self._sigmoid
        else:
            raise ValueError("Unsupported activation function")

    def get_activation_derivative(self, activation):
        """
        Returns the derivative of the activation function.

        :param activation: One of 'relu', 'leaky_relu', 'tanh', 'sigmoid'.
        :return: Callable mapping pre-activations to element-wise derivatives.
        :raises ValueError: If the name is not supported.
        """
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'leaky_relu':
            return lambda x: np.where(x > 0, 1, 0.01)
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x) ** 2
        elif activation == 'sigmoid':
            sigmoid = self._sigmoid

            def sigmoid_derivative(x):
                # Evaluate the sigmoid once and reuse it: s'(x) = s(x) * (1 - s(x))
                s = sigmoid(x)
                return s * (1 - s)

            return sigmoid_derivative
        else:
            raise ValueError("Unsupported activation function")

    def softmax(self, x):
        """ Computes row-wise softmax for the output layer (row max subtracted before exp). """
        exps = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)

    def cross_entropy_loss(self, y_true, y_pred):
        """
        Computes the mean cross-entropy loss.

        :param y_true: One-hot targets, one row per sample.
        :param y_pred: Predicted class probabilities, same shape as ``y_true``.
        :return: Summed cross-entropy divided by the number of rows in ``y_true``.
        """
        num_samples = y_true.shape[0]
        # The 1e-9 offset keeps log() finite when a predicted probability is exactly 0.
        return -np.sum(y_true * np.log(y_pred + 1e-9)) / num_samples

    def forward(self, X, training=True):
        """
        Forward propagation through multiple layers with optional dropout.

        Stores ``self.activations``, ``self.z_values`` and ``self.drop_masks``,
        which ``backward`` reads afterwards.

        :param X: Input batch, one row per sample.
        :param training: When True and ``dropout_rate > 0``, dropout is applied to hidden layers.
        :return: Softmax probabilities of the output layer.
        """
        self.activations = [X]
        self.z_values = []
        self.drop_masks = []

        for i in range(self.num_layers - 1):  # Hidden layers
            z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            self.z_values.append(z)

            a = self.activation_func(z)

            # Inverted dropout: kept units are scaled by 1 / (1 - rate) during training
            if training and self.dropout_rate > 0:
                mask = (self.rng.rand(*a.shape) > self.dropout_rate) / (1.0 - self.dropout_rate)
                self.drop_masks.append(mask)
                a *= mask
            else:
                self.drop_masks.append(None)

            self.activations.append(a)

        # Output layer with softmax (no dropout)
        z_out = np.dot(self.activations[-1], self.weights[-1]) + self.biases[-1]
        self.z_values.append(z_out)
        self.activations.append(self.softmax(z_out))

        return self.activations[-1]

    def backward(self, y_true):
        """
        Backpropagation through multiple layers, followed by one parameter update.

        Must be called after ``forward`` on the same batch, because it reads the
        activations, pre-activations and dropout masks stored by that call.

        :param y_true: One-hot targets for the batch last passed to ``forward``.
        """
        num_samples = y_true.shape[0]
        grads_W = [None] * self.num_layers
        grads_b = [None] * self.num_layers

        # Softmax combined with cross-entropy gives the output-layer error (prediction - target)
        dZ = self.activations[-1] - y_true
        grads_W[-1] = np.dot(self.activations[-2].T, dZ) / num_samples
        grads_b[-1] = np.sum(dZ, axis=0, keepdims=True) / num_samples

        # Backpropagate through hidden layers
        for i in range(self.num_layers - 2, -1, -1):
            dA = np.dot(dZ, self.weights[i + 1].T)

            if self.drop_masks[i] is not None:  # Apply dropout mask during backprop
                dA *= self.drop_masks[i]

            dZ = dA * self.activation_derivative(self.z_values[i])
            grads_W[i] = np.dot(self.activations[i].T, dZ) / num_samples
            grads_b[i] = np.sum(dZ, axis=0, keepdims=True) / num_samples

        # Update weights and biases with optimizer
        for i in range(self.num_layers):
            if self.optimizer == 'momentum':
                self.velocity_W[i] = self.momentum * self.velocity_W[i] - self.learning_rate * grads_W[i]
                self.velocity_b[i] = self.momentum * self.velocity_b[i] - self.learning_rate * grads_b[i]
                self.weights[i] += self.velocity_W[i]
                self.biases[i] += self.velocity_b[i]
            else:
                self.weights[i] -= self.learning_rate * grads_W[i]
                self.biases[i] -= self.learning_rate * grads_b[i]

    def train(self, X_train, y_train, X_val=None, y_val=None, epochs=100, batch_size=64):
        """
        Train the model using mini-batch stochastic gradient descent (SGD).

        The training rows are reshuffled at the start of every epoch. When both
        ``X_val`` and ``y_val`` are given, validation loss and accuracy are printed
        after each epoch.

        :param X_train: Training inputs, one row per sample.
        :param y_train: One-hot training targets.
        :param X_val: Optional validation inputs.
        :param y_val: Optional one-hot validation targets.
        :param epochs: Number of passes over the training data.
        :param batch_size: Number of samples per gradient step.
        """
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            indices = np.arange(num_samples)
            self.rng.shuffle(indices)
            # Fancy indexing copies, so the caller's arrays are never reordered
            X_train, y_train = X_train[indices], y_train[indices]

            for i in range(0, num_samples, batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]

                # forward() caches what backward() needs; its return value is not used here
                self.forward(X_batch, training=True)
                self.backward(y_batch)

            # Evaluate on validation data
            if X_val is not None and y_val is not None:
                y_val_pred = self.forward(X_val, training=False)
                val_loss = self.cross_entropy_loss(y_val, y_val_pred)
                val_accuracy = np.mean(np.argmax(y_val_pred, axis=1) == np.argmax(y_val, axis=1)) * 100
                print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss:.4f} - Val Acc: {val_accuracy:.2f}%")

    def predict(self, X):
        """ Predicts class indices (argmax of the softmax output) for input data, with dropout disabled. """
        return np.argmax(self.forward(X, training=False), axis=1)


## Experimenting under different conditions
- Varying the number of hidden layers
- Varying the dropout rate
- Varying the activation functions

In [17]:
# One hidden layer, ReLU, 20% dropout
mlp1 = MLP(
    layer_sizes=[784, 20, 10],
    activation='relu',
    dropout_rate=0.2,
)
# Two hidden layers, tanh, 30% dropout
mlp2 = MLP(
    layer_sizes=[784, 128, 64, 10],
    activation='tanh',
    dropout_rate=0.3,
)
# Three hidden layers, ReLU, 10% dropout
mlp3 = MLP(
    layer_sizes=[784, 256, 128, 64, 10],
    activation='relu',
    dropout_rate=0.1,
)

## Training and Testing

In [18]:
# Fit mlp1 for 20 epochs; batch_size is left at the train() default
mlp1.train(
    X_train=train_images,
    y_train=y_train_one_hot,
    X_val=val_images,
    y_val=y_val_one_hot,
    epochs=20,
)

Epoch 1/20 - Val Loss: 0.8185 - Val Acc: 73.47%
Epoch 2/20 - Val Loss: 0.6702 - Val Acc: 78.29%
Epoch 3/20 - Val Loss: 0.6088 - Val Acc: 80.28%
Epoch 4/20 - Val Loss: 0.5723 - Val Acc: 81.00%
Epoch 5/20 - Val Loss: 0.5520 - Val Acc: 81.67%
Epoch 6/20 - Val Loss: 0.5339 - Val Acc: 82.37%
Epoch 7/20 - Val Loss: 0.5169 - Val Acc: 82.56%
Epoch 8/20 - Val Loss: 0.5077 - Val Acc: 82.79%
Epoch 9/20 - Val Loss: 0.4998 - Val Acc: 82.95%
Epoch 10/20 - Val Loss: 0.4948 - Val Acc: 83.20%
Epoch 11/20 - Val Loss: 0.4864 - Val Acc: 83.29%
Epoch 12/20 - Val Loss: 0.4828 - Val Acc: 83.38%
Epoch 13/20 - Val Loss: 0.4754 - Val Acc: 83.43%
Epoch 14/20 - Val Loss: 0.4725 - Val Acc: 83.67%
Epoch 15/20 - Val Loss: 0.4681 - Val Acc: 83.79%
Epoch 16/20 - Val Loss: 0.4641 - Val Acc: 83.93%
Epoch 17/20 - Val Loss: 0.4622 - Val Acc: 84.17%
Epoch 18/20 - Val Loss: 0.4572 - Val Acc: 83.83%
Epoch 19/20 - Val Loss: 0.4562 - Val Acc: 84.06%
Epoch 20/20 - Val Loss: 0.4522 - Val Acc: 84.20%


In [24]:
# Score mlp1 on the validation split: percentage of predicted class ids matching the labels
predictions = mlp1.predict(val_images)
accuracy = 100 * (predictions == val_labels).mean()
print(f"Validation Accuracy: {accuracy:.2f}%")

Validation Accuracy: 84.20%
