### Importing Required Libraries

We start by importing the necessary Python libraries for numerical computation, data visualization, and data handling:

- **NumPy** – for numerical operations  
- **Matplotlib** – for creating visualizations  
- **Pandas** – for data manipulation and analysis  

We also define the mathematical constant **Euler's Number (E)** for later use.


In [None]:
import numpy as np
# import nnfs
# from nnfs.datasets import spiral_data
import matplotlib.pyplot as plt
import pandas as pd
# nnfs.init()
# Decimal approximation of Euler's number e, defined here for later use.
E = 2.71828182846



- Creates a dense (fully connected) layer class for a neural network.
- Initializes small random weights and zero biases.
- The forward pass computes the outputs; the backward pass calculates gradients for the weights, biases, and inputs.

In [None]:
class Layer_Dense:
    """Fully connected layer computing ``np.dot(inputs, weights) + biases``.

    ``forward`` caches ``inputs`` and ``output``; ``backward`` stores the
    gradients in ``dweights``, ``dbiases`` and ``dinputs``.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create an ``(n_inputs, n_neurons)`` weight matrix and a zero bias row."""
        # Standard-normal draws scaled down by 0.01 keep the initial weights small.
        gaussian_draws = np.random.randn(n_inputs, n_neurons)
        self.weights = 0.01 * gaussian_draws
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output and remember ``inputs`` for ``backward``."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Store parameter and input gradients given the upstream gradient."""
        # Gradient flowing back to whatever produced this layer's inputs.
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Bias gradient: upstream gradient summed over the sample axis.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)


Defines two activation functions for neural networks:

* ReLU: outputs positive values, zeroing negatives; backward pass stops gradients for inactive neurons.
* Softmax: converts outputs to probabilities; backward pass uses the Jacobian matrix for gradient calculation.


In [None]:
class Activation_ReLU:
    """Rectified linear unit: passes positive values and zeroes the rest."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``output`` to ``max(0, inputs)`` element-wise."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Copy the upstream gradient and zero it wherever the input was <= 0."""
        # Work on a copy so the caller's gradient array is left untouched.
        gradient = dvalues.copy()
        inactive = self.inputs <= 0
        gradient[inactive] = 0
        self.dinputs = gradient


class Activation_Softmax:
    """Row-wise softmax activation.

    ``forward`` stores the probabilities in ``self.output``; ``backward``
    stores the gradient with respect to the inputs in ``self.dinputs``.
    """

    def forward(self, inputs):
        """Turn each row of ``inputs`` into values that sum to 1."""
        # Subtracting the row maximum keeps the largest exponent at 0, so
        # np.exp cannot overflow; the ratio below is unaffected by the shift.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Apply the softmax Jacobian to the upstream gradient ``dvalues``.

        For one sample with output ``s`` the Jacobian is
        ``diag(s) - s s^T``, so ``J @ d == s * d - s * sum(s * d)``.
        Evaluating that closed form for all rows at once avoids building a
        Jacobian matrix per sample, and the result is always floating point
        (an integer ``dvalues`` is no longer truncated on assignment).
        """
        dvalues = np.asarray(dvalues)
        weighted = self.output * dvalues
        row_totals = np.sum(weighted, axis=1, keepdims=True)
        self.dinputs = weighted - self.output * row_totals



Base Loss class with a `Calculate` method.
Calls `forward` to compute per-sample losses, then returns their mean as the final loss value.


In [None]:
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def Calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        return np.mean(self.forward(output, y))


Categorical Cross-Entropy loss implementation.
In `forward`, it clips predictions to avoid log(0), selects the correct class probabilities, and returns the negative log-likelihoods.
In `backward`, it converts class indices to one-hot if needed and computes the gradient with respect to predictions.


In [None]:
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy for class-index or one-hot targets."""

    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Args:
            y_pred: predicted probabilities, one row per sample.
            y_true: 1-D class indices or a 2-D one-hot array.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        samples = len(y_pred)
        # Keep probabilities away from 0 and 1 so the log stays finite.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if y_true.ndim == 1:
            # Sparse labels: pick the predicted probability of the true class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask zeroes everything except the true class.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got {y_true.ndim} dimensions"
            )
        return -np.log(correct_confidences)

    def backward(self, dvalues, y_true):
        """Store the gradient of the loss w.r.t. the predictions in ``dinputs``.

        Args:
            dvalues: the predicted probabilities the loss was computed on.
            y_true: 1-D class indices or a 2-D one-hot array.
        """
        y_true = np.asarray(y_true)
        samples = len(dvalues)
        labels = len(dvalues[0])

        if y_true.ndim == 1:
            # Expand class indices to one-hot rows.
            y_true = np.eye(labels)[y_true]
        # Divide by the sample count so the gradient matches the mean loss.
        self.dinputs = -y_true / dvalues / samples


Combined softmax activation and categorical cross-entropy loss for efficiency.
`forward` runs softmax then computes the loss.
`backward` simplifies gradient calculation using the derivative of softmax + CCE, avoiding the full Jacobian.


In [None]:
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation fused with categorical cross-entropy loss."""

    def __init__(self):
        self.activation = Activation_Softmax()
        self.loss = Loss_CategoricalCrossentropy()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep the result, and return the loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.Calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Store ``(dvalues - one_hot(y_true)) / samples`` in ``dinputs``.

        ``dvalues`` is copied before modification; ``y_true`` may be class
        indices or a one-hot array.
        """
        sample_count = len(dvalues)

        # One-hot targets are collapsed to class indices for the indexing below.
        class_ids = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        gradient = dvalues.copy()
        gradient[range(sample_count), class_ids] -= 1
        self.dinputs = gradient / sample_count

SGD optimizer with optional learning rate decay.
`pre_update_params` adjusts learning rate before updates,
`update_params` applies gradients,
`post_update_params` increments iteration count.


In [None]:
class Optimizer_SGD:
    """Plain stochastic gradient descent with optional learning-rate decay."""

    def __init__(self, learning_rate=1.0, decay=0.01):
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate

    def update_params(self, layer):
        """Step ``layer.weights``/``layer.biases`` against their gradients.

        Layers lacking ``dweights`` or ``dbiases`` are left untouched.
        """
        has_gradients = hasattr(layer, 'dweights') and hasattr(layer, 'dbiases')
        if not has_gradients:
            return
        rate = self.current_learning_rate
        layer.weights += -rate * layer.dweights
        layer.biases += -rate * layer.dbiases

    def pre_update_params(self):
        """Recompute the current learning rate when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def post_update_params(self):
        """Advance the iteration counter used by the decay schedule."""
        self.iterations += 1

Adam optimizer implementation. It combines momentum (`beta_1`) with RMSProp-like adaptive learning rates (`beta_2`), applies bias correction to both running averages, and supports optional learning-rate decay and a small `epsilon` to prevent division by zero.


In [None]:
class Optimizer_Adam:
    """Adam optimizer: bias-corrected momentum and squared-gradient averages."""

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,
                 beta_1=0.9, beta_2=0.999):
        self.iterations = 0
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Recompute the current learning rate when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def _advance(self, momentums, cache, gradient):
        """Return updated ``(momentums, cache, parameter_delta)`` for one tensor."""
        step = self.iterations + 1

        # Exponential moving averages of the gradient and the squared gradient.
        momentums = self.beta_1 * momentums + (1 - self.beta_1) * gradient
        cache = self.beta_2 * cache + (1 - self.beta_2) * gradient**2

        # Both averages start at zero; dividing by (1 - beta**step)
        # compensates for that during the first iterations.
        momentums_corrected = momentums / (1 - self.beta_1 ** step)
        cache_corrected = cache / (1 - self.beta_2 ** step)

        delta = -self.current_learning_rate * momentums_corrected / \
            (np.sqrt(cache_corrected) + self.epsilon)
        return momentums, cache, delta

    def update_params(self, layer):
        """Apply one Adam step to ``layer.weights`` and ``layer.biases``.

        The running averages are stored on the layer itself and are created
        as zeros the first time the layer is seen.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_momentums, layer.weight_cache, weight_delta = self._advance(
            layer.weight_momentums, layer.weight_cache, layer.dweights)
        layer.bias_momentums, layer.bias_cache, bias_delta = self._advance(
            layer.bias_momentums, layer.bias_cache, layer.dbiases)

        layer.weights += weight_delta
        layer.biases += bias_delta

    def post_update_params(self):
        """Advance the iteration counter used for decay and bias correction."""
        self.iterations += 1




A straightforward MNIST loader that parses the raw IDX files directly — no `tensorflow` or `torch` needed, just binary reading and a NumPy reshape.

It:

* Skips the IDX file headers (`16` bytes for images, `8` for labels)
* Reads the pixel/label data directly into NumPy arrays
* Reshapes image arrays to `(num_samples, 28, 28)`
* Returns train/test sets from the given directory

The files must already be present in the given directory in uncompressed form.


In [None]:
import os
import cv2
import numpy as np

def load_mnist_images(filepath):
    """Read an uncompressed IDX3 image file into a ``uint8`` array.

    The 16-byte header holds four big-endian 32-bit integers: the magic
    number (2051 for image files), the image count, the row count and the
    column count. The header is validated and used for the output shape
    instead of assuming 28x28, so a truncated or still-compressed file fails
    with a clear error rather than yielding garbage pixels.

    Args:
        filepath: path to the raw (already gunzipped) IDX3 file.

    Returns:
        Read-only ``np.uint8`` array of shape ``(count, rows, cols)``.

    Raises:
        ValueError: if the header is incomplete, the magic number is wrong,
            or the payload size does not match the header.
    """
    with open(filepath, 'rb') as f:
        header = f.read(16)
        buf = f.read()

    if len(header) != 16:
        raise ValueError(f"{filepath}: incomplete IDX image header")
    magic, count, rows, cols = (
        int.from_bytes(header[offset:offset + 4], 'big')
        for offset in range(0, 16, 4)
    )
    if magic != 2051:
        raise ValueError(
            f"{filepath}: not an IDX image file (magic number {magic}, expected 2051)"
        )

    data = np.frombuffer(buf, dtype=np.uint8)
    if data.size != count * rows * cols:
        raise ValueError(
            f"{filepath}: header declares {count}x{rows}x{cols} pixels "
            f"but file holds {data.size} bytes"
        )
    return data.reshape(count, rows, cols)

def load_mnist_labels(filepath):
    """Read an uncompressed IDX1 label file into a ``uint8`` array.

    The 8-byte header holds two big-endian 32-bit integers: the magic number
    (2049 for label files) and the label count. Both are checked so that a
    truncated or still-compressed file is rejected instead of being read as
    labels.

    Args:
        filepath: path to the raw (already gunzipped) IDX1 file.

    Returns:
        Read-only 1-D ``np.uint8`` array with one entry per label.

    Raises:
        ValueError: if the header is incomplete, the magic number is wrong,
            or the number of label bytes does not match the header.
    """
    with open(filepath, 'rb') as f:
        header = f.read(8)
        buf = f.read()

    if len(header) != 8:
        raise ValueError(f"{filepath}: incomplete IDX label header")
    magic = int.from_bytes(header[0:4], 'big')
    count = int.from_bytes(header[4:8], 'big')
    if magic != 2049:
        raise ValueError(
            f"{filepath}: not an IDX label file (magic number {magic}, expected 2049)"
        )

    data = np.frombuffer(buf, dtype=np.uint8)
    if data.size != count:
        raise ValueError(
            f"{filepath}: header declares {count} labels but file holds {data.size}"
        )
    return data

def create_data_mnist(path):
    """Load the four MNIST IDX files found in ``path``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``load_mnist_images`` and ``load_mnist_labels``.
    """
    def in_dir(filename):
        return os.path.join(path, filename)

    # Loaded in the same order as they are returned.
    X_train = load_mnist_images(in_dir('train-images-idx3-ubyte'))
    y_train = load_mnist_labels(in_dir('train-labels-idx1-ubyte'))
    X_test = load_mnist_images(in_dir('t10k-images-idx3-ubyte'))
    y_test = load_mnist_labels(in_dir('t10k-labels-idx1-ubyte'))
    return X_train, y_train, X_test, y_test

This cell handles the MNIST setup end-to-end. It:

* **Downloads** the four compressed IDX files (`.gz`) from TensorFlow’s hosted dataset if they aren’t already present.
* **Extracts** them from gzip to raw binary IDX format so they can be read directly by your loader.
* Creates a `mnist_dataset` folder if it doesn’t exist.

Run this cell before calling `create_data_mnist()`, which reads the extracted files.


In [None]:
import urllib.request
import os
import gzip

def download_mnist(path):
    """Download the four gzipped MNIST IDX archives into ``path``.

    Files already present in ``path`` are not downloaded again. Each archive
    is first written to a ``.part`` file and renamed only once the transfer
    has finished, so an interrupted download cannot leave a truncated
    archive that later runs would mistake for a complete one.

    Args:
        path: destination directory; created if it does not exist.
    """
    base_url = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
    files = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    if path:
        # An empty path means the current directory, which makedirs rejects.
        os.makedirs(path, exist_ok=True)

    for filename in files:
        filepath = os.path.join(path, filename)
        if os.path.exists(filepath):
            continue

        print(f"Downloading {filename}...")
        partial = filepath + ".part"
        try:
            urllib.request.urlretrieve(base_url + filename, partial)
            os.replace(partial, filepath)
        finally:
            # After a successful rename there is nothing left to remove.
            if os.path.exists(partial):
                os.remove(partial)
        print("Done.")

def extract_mnist(path):
    """Gunzip each MNIST archive found in ``path`` next to the archive.

    The output file name is the archive name without its ``.gz`` suffix.
    Archives that are not present are skipped silently.

    Args:
        path: directory containing the ``.gz`` files.
    """
    # Function-scope import keeps this definition self-contained.
    import shutil

    files = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    for filename in files:
        filepath = os.path.join(path, filename)
        if not os.path.exists(filepath):
            continue

        print(f"Extracting {filename}...")
        with gzip.open(filepath, 'rb') as f_in, \
                open(filepath[:-3], 'wb') as f_out:
            # Copy in fixed-size chunks; iterating the stream "by line" would
            # split binary data at arbitrary newline bytes.
            shutil.copyfileobj(f_in, f_out)
        print("Done.")

# Directory that holds the downloaded archives and the extracted IDX files.
path = 'mnist_dataset'
# exist_ok avoids the check-then-create race of testing for the directory first.
os.makedirs(path, exist_ok=True)

download_mnist(path)
extract_mnist(path)

Trains a 3-layer neural network on MNIST using ReLU, softmax, cross-entropy loss, and Adam optimizer with mini-batch training, tracking accuracy and loss for training and validation.

In [None]:
import numpy as np

X, y, X_test, y_test = create_data_mnist('mnist_dataset')
# Flatten every image to one row and map uint8 pixels from [0, 255] to [-1, 1].
X = (X.reshape(X.shape[0], -1).astype(np.float32) - 127.5) / 127.5
X_test = (X_test.reshape(X_test.shape[0], -1).astype(np.float32) - 127.5) / 127.5

# Network: inputs -> 128 (ReLU) -> 64 (ReLU) -> 10 (softmax + cross-entropy).
dense1 = Layer_Dense(X.shape[1], 128)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(128, 64)
activation2 = Activation_ReLU()
dense3 = Layer_Dense(64, 10)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
optimizer = Optimizer_Adam(learning_rate=0.001, decay=1e-5)

EPOCHS, BATCH_SIZE = 90, 128
epoch_losses, epoch_accuracies = [], []


def forward_pass(inputs, targets):
    """Run the network on ``inputs``; return ``(mean loss, predicted classes)``.

    Every layer caches its inputs/outputs, so a backward pass may follow
    immediately after this call.
    """
    dense1.forward(inputs)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    batch_loss = loss_activation.forward(dense3.output, targets)
    return batch_loss, np.argmax(loss_activation.output, axis=1)


for epoch in range(EPOCHS):
    # Reshuffle the training set at the start of every epoch.
    keys = np.random.permutation(X.shape[0])
    X, y = X[keys], y[keys]
    steps = -(-X.shape[0] // BATCH_SIZE)  # ceiling division

    epoch_loss = epoch_accuracy = 0

    for step in range(steps):
        batch_X = X[step*BATCH_SIZE:(step+1)*BATCH_SIZE]
        batch_y = y[step*BATCH_SIZE:(step+1)*BATCH_SIZE]

        loss, predictions = forward_pass(batch_X, batch_y)
        accuracy = np.mean(predictions == batch_y)

        # Weight each batch by its size: the final batch can be smaller than
        # BATCH_SIZE, and a plain average of batch means would over-weight it.
        epoch_loss += loss * len(batch_X)
        epoch_accuracy += accuracy * len(batch_X)

        if step == 0:
            print(f'epoch: {epoch}, step: {step}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

        loss_activation.backward(loss_activation.output, batch_y)
        dense3.backward(loss_activation.dinputs)
        activation2.backward(dense3.dinputs)
        dense2.backward(activation2.dinputs)
        activation1.backward(dense2.dinputs)
        dense1.backward(activation1.dinputs)

        optimizer.pre_update_params()
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.update_params(dense3)
        optimizer.post_update_params()

    # Per-sample averages over the whole epoch.
    epoch_loss /= X.shape[0]
    epoch_accuracy /= X.shape[0]
    epoch_losses.append(epoch_loss)
    epoch_accuracies.append(epoch_accuracy)

    val_loss, val_preds = forward_pass(X_test, y_test)
    val_acc = np.mean(val_preds == y_test)

    print(f'Epoch {epoch} Validation -> acc: {val_acc:.3f}, loss: {val_loss:.3f}')

print("\nFinal test result:")
loss, predictions = forward_pass(X_test, y_test)
accuracy = np.mean(predictions == y_test)
print(f'Validation -> acc: {accuracy:.3f}, loss: {loss:.3f}')


Plots accuracy/loss over epochs and predicts the digit from an external image using the trained model.

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Two side-by-side panels: training accuracy (left) and training loss (right),
# one point per epoch.
history_panels = (
    (1, epoch_accuracies, 'Model Accuracy', 'Accuracy'),
    (2, epoch_losses, 'Model Loss', 'Loss'),
)

plt.figure(figsize=(12, 5))
for panel_index, series, panel_title, y_label in history_panels:
    plt.subplot(1, 2, panel_index)
    plt.plot(range(len(series)), series)
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')

plt.suptitle('Model Training History')
plt.show()

def predict_external_image(image_path, dense1, activation1, dense2, activation2, dense3):
    """Classify a single image file with the given layers.

    The image is read as grayscale, resized to 28x28, inverted when it is
    mostly light, scaled with ``(v - 127.5) / 127.5`` and passed through the
    layers in order.

    Args:
        image_path: path of the image file to classify.
        dense1, activation1, dense2, activation2, dense3: layers applied in
            this order; each must expose ``forward`` and ``output``.

    Returns:
        Index of the largest value in ``dense3.output`` for the image.

    Raises:
        OSError: if OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising,
        # which would otherwise surface as an obscure error in cv2.resize.
        raise OSError(f"Could not read image file: {image_path!r}")

    image = cv2.resize(image, (28, 28))
    # Invert mostly-light images; presumably the model expects light digits on
    # a dark background like the training data — TODO confirm.
    if np.mean(image) >= 127.5:
        image = 255 - image
    image_data = (image.reshape(1, -1).astype(np.float32) - 127.5) / 127.5

    dense1.forward(image_data)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    # The argmax is taken on the raw dense3 output; no softmax is applied here.
    return np.argmax(dense3.output, axis=1)[0]


Loads an image, runs it through the trained model, and prints the predicted digit.

In [None]:
# Image to classify with the trained layers.
image_file = '/content/download (2).png'

predicted_digit = predict_external_image(
    image_file,
    dense1, activation1,
    dense2, activation2,
    dense3,
)

print(f"The model predicts the digit is: {predicted_digit}")