In [10]:
# --- Data Loading ---
import gzip
import pickle
import numpy as np

def load_data(path='/content/mnist.pkl.gz'):
    """Load the pickled MNIST splits from a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the gzip-compressed pickle. Defaults to the path this
        notebook originally hard-coded; pass another path when the archive
        lives somewhere else.

    Returns
    -------
    tuple
        ``(training_data, validation_data, test_data)`` exactly as they are
        stored in the pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file -- only point this at an archive obtained from a trusted source.
    # The ``with`` block guarantees the handle is closed even if unpickling
    # or the three-way unpacking raises.
    with gzip.open(path, 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    return training_data, validation_data, test_data

def vectorized_result(j):
    """Return a (10, 1) column vector holding 1.0 at index ``j`` and 0.0 elsewhere."""
    one_hot = np.zeros(shape=(10, 1))
    one_hot[j] = 1.0
    return one_hot

def load_data_wrapper():
    """Return the MNIST splits as lists of ``(input, label)`` pairs.

    Every input is reshaped to a ``(784, 1)`` column vector. Training labels
    are passed through ``vectorized_result``; validation and test labels are
    paired with their inputs unchanged.
    """
    train_split, valid_split, test_split = load_data()

    def as_columns(images):
        # One (784, 1) column vector per image.
        return [np.reshape(image, (784, 1)) for image in images]

    encoded_labels = [vectorized_result(label) for label in train_split[1]]
    training_data = list(zip(as_columns(train_split[0]), encoded_labels))
    validation_data = list(zip(as_columns(valid_split[0]), valid_split[1]))
    test_data = list(zip(as_columns(test_split[0]), test_split[1]))
    return training_data, validation_data, test_data


In [11]:
import random
import numpy as np

# --- Cost Function ---
class CrossEntropyCost:
    """Cross-entropy cost and the matching output-layer error term."""

    @staticmethod
    def fn(a, y):
        """Return the summed cross-entropy between output ``a`` and target ``y``.

        ``np.nan_to_num`` replaces the NaN that arises from ``0 * log(0)``
        (e.g. when ``a`` and ``y`` are both exactly 1.0) with 0.0 before
        the terms are summed.
        """
        target_term = -y * np.log(a)
        complement_term = (1 - y) * np.log(1 - a)
        per_element = np.nan_to_num(target_term - complement_term)
        return np.sum(per_element)

    @staticmethod
    def delta(z, a, y):
        """Return the output-layer error ``a - y``.

        ``z`` is accepted but not used by this cost.
        """
        output_error = a - y
        return output_error


# --- Network Class ---
class Network:
    """Feedforward network of sigmoid layers trained with mini-batch SGD.

    ``sizes`` lists the number of neurons in each layer, input layer first
    (e.g. ``[784, 30, 10]``). ``cost`` is an object exposing ``fn(a, y)``
    and ``delta(z, a, y)``; it is used for monitoring and for the
    output-layer error in backpropagation respectively.
    """

    def __init__(self, sizes, cost=CrossEntropyCost):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost

    def large_weight_initializer(self):
        """Draw all biases and weights from an unscaled standard normal."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def default_weight_initializer(self):
        """Draw biases from a standard normal and scale weights by 1/sqrt(x).

        ``x`` is the size of the layer feeding into the weights, so each
        weight matrix is a standard normal divided by the square root of its
        number of input connections.
        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x) / np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def feedforward(self, a):
        """Return the network output for input ``a``."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda=0.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False):
        """Train with mini-batch stochastic gradient descent.

        Parameters
        ----------
        training_data : iterable of (x, y)
            Training pairs; ``y`` is compared via ``np.argmax`` when training
            accuracy is monitored, so it is expected to be a one-hot vector.
            The caller's sequence is not modified.
        epochs : int
            Number of passes over ``training_data``.
        mini_batch_size : int
            Number of examples per gradient step.
        eta : float
            Learning rate.
        lmbda : float, optional
            L2 regularization strength (0.0 disables weight decay).
        evaluation_data : iterable of (x, y), optional
            Held-out pairs whose ``y`` is an integer class label. Required
            when either ``monitor_evaluation_*`` flag is set.
        monitor_* : bool, optional
            Select which per-epoch metrics are computed, printed and recorded.

        Returns
        -------
        tuple of four lists
            ``(evaluation_cost, evaluation_accuracy, training_cost,
            training_accuracy)``, one entry per epoch for every metric that
            was monitored and empty lists for the others.

        Raises
        ------
        ValueError
            If evaluation monitoring is requested without evaluation data.
        """
        # Shuffle a private copy: random.shuffle works in place and would
        # otherwise reorder the list object owned by the caller. Copying
        # also lets callers pass any iterable (e.g. a ``zip`` object).
        training_data = list(training_data)
        n = len(training_data)

        if evaluation_data is not None:
            evaluation_data = list(evaluation_data)
        if (monitor_evaluation_cost or monitor_evaluation_accuracy) and not evaluation_data:
            # Fail before training instead of after the first full epoch.
            raise ValueError(
                "evaluation_data must be provided (and non-empty) when "
                "monitor_evaluation_cost or monitor_evaluation_accuracy is set")
        n_data = len(evaluation_data) if evaluation_data else 0

        evaluation_cost, evaluation_accuracy = [], []
        training_cost, training_accuracy = [], []

        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]

            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)

            print(f"Epoch {j} training complete")

            if monitor_training_cost:
                cost = self.total_cost(training_data, lmbda)
                training_cost.append(cost)
                print(f"Cost on training data: {cost}")

            if monitor_training_accuracy:
                # Training labels are vectors, hence convert=True.
                accuracy = self.accuracy(training_data, convert=True)
                training_accuracy.append(accuracy)
                print(f"Accuracy on training data: {accuracy} / {n}")

            if monitor_evaluation_cost:
                # Evaluation labels are class indices, hence convert=True
                # so total_cost turns them into vectors first.
                cost = self.total_cost(evaluation_data, lmbda, convert=True)
                evaluation_cost.append(cost)
                print(f"Cost on evaluation data: {cost}")

            if monitor_evaluation_accuracy:
                accuracy = self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                print(f"Accuracy on evaluation data: {accuracy} / {n_data}")

        return evaluation_cost, evaluation_accuracy, training_cost, training_accuracy

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one gradient step computed from ``mini_batch``.

        ``n`` is the total training-set size; it scales the L2 weight-decay
        factor ``1 - eta * lmbda / n``. Biases are not regularized.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Accumulate the per-example gradients over the mini-batch.
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        self.weights = [(1 - eta * (lmbda / n)) * w - (eta / len(mini_batch)) * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / len(mini_batch)) * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the cost gradient for one example.

        Both are lists of arrays shaped like ``self.biases`` and
        ``self.weights`` respectively.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Forward pass: remember every weighted input z and activation.
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Backward pass: the output-layer error comes from the cost object.
        delta = self.cost.delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].T)
        # l counts layers from the end: l = 2 is the second-to-last layer.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].T, delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].T)
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Return the number of examples in ``data`` classified correctly.

        The prediction is the index of the largest output activation. With
        ``convert=True`` the label ``y`` is a vector and its argmax is used;
        otherwise ``y`` is compared to the prediction directly.
        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y)) for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y) for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the average cost over ``data`` plus the L2 penalty.

        With ``convert=True`` each label is first turned into a vector via
        ``vectorized_result``. The penalty is
        ``0.5 * (lmbda / len(data))`` times the sum of squared weight norms.
        """
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert: y = vectorized_result(y)
            cost += self.cost.fn(a, y) / len(data)
        cost += 0.5 * (lmbda / len(data)) * sum(np.linalg.norm(w)**2 for w in self.weights)
        return cost


def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, elementwise for array input."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Derivative of the logistic function: ``sigmoid(z) * (1 - sigmoid(z))``."""
    # Evaluate the sigmoid (and its exp) once instead of twice; the product
    # is numerically identical to calling sigmoid(z) for each factor.
    s = sigmoid(z)
    return s * (1 - s)


# Experiment 3: Overfitting on a small dataset (first 1000 training examples, no regularization)


In [9]:


# Fetch the three MNIST splits as lists of (input, label) pairs.
training_data, validation_data, test_data = load_data_wrapper()

# 784-30-10 network; replace the default initialization with unscaled
# standard-normal weights.
net = Network(sizes=[784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()

# 400 epochs over only the first 1000 training examples, without
# regularization, tracking test accuracy next to training cost/accuracy.
net.SGD(
    training_data=training_data[:1000],
    epochs=400,
    mini_batch_size=10,
    eta=0.5,
    evaluation_data=test_data,
    monitor_evaluation_accuracy=True,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
)


Epoch 0 training complete
Cost on training data: 1.9161820362261717
Accuracy on training data: 651 / 1000
Accuracy on evaluation data: 5259 / 10000
Epoch 1 training complete
Cost on training data: 1.451017926394901
Accuracy on training data: 755 / 1000
Accuracy on evaluation data: 6368 / 10000
Epoch 2 training complete
Cost on training data: 1.173382667331476
Accuracy on training data: 825 / 1000
Accuracy on evaluation data: 7097 / 10000
Epoch 3 training complete
Cost on training data: 0.9988949276506093
Accuracy on training data: 864 / 1000
Accuracy on evaluation data: 7273 / 10000
Epoch 4 training complete
Cost on training data: 0.8567673403871416
Accuracy on training data: 877 / 1000
Accuracy on evaluation data: 7472 / 10000
Epoch 5 training complete
Cost on training data: 0.7437641726265786
Accuracy on training data: 920 / 1000
Accuracy on evaluation data: 7584 / 10000
Epoch 6 training complete
Cost on training data: 0.6474265216331955
Accuracy on training data: 929 / 1000
Accuracy

([],
 [5259,
  6368,
  7097,
  7273,
  7472,
  7584,
  7739,
  7732,
  7744,
  7795,
  7908,
  7865,
  7903,
  7938,
  7957,
  7944,
  7972,
  7971,
  7948,
  8024,
  7989,
  8041,
  8004,
  8040,
  8035,
  8047,
  8073,
  8083,
  8085,
  8074,
  8066,
  8108,
  8086,
  8084,
  8094,
  8106,
  8106,
  8101,
  8079,
  8100,
  8114,
  8110,
  8118,
  8119,
  8138,
  8119,
  8125,
  8126,
  8129,
  8126,
  8128,
  8134,
  8124,
  8116,
  8146,
  8136,
  8153,
  8147,
  8160,
  8147,
  8159,
  8154,
  8154,
  8155,
  8157,
  8167,
  8155,
  8163,
  8164,
  8166,
  8167,
  8172,
  8172,
  8171,
  8174,
  8183,
  8175,
  8176,
  8171,
  8181,
  8179,
  8175,
  8177,
  8181,
  8179,
  8178,
  8183,
  8178,
  8183,
  8184,
  8183,
  8185,
  8180,
  8184,
  8179,
  8188,
  8186,
  8186,
  8187,
  8189,
  8189,
  8188,
  8191,
  8189,
  8188,
  8197,
  8189,
  8188,
  8192,
  8190,
  8186,
  8193,
  8183,
  8193,
  8191,
  8193,
  8195,
  8189,
  8195,
  8192,
  8194,
  8197,
  8196,
  8192,
  8

# Experiment 4: Using L2 Regularization

In [12]:


# Reload the MNIST splits so this cell does not depend on earlier runs.
training_data, validation_data, test_data = load_data_wrapper()

# Same 784-30-10 architecture and unscaled standard-normal initialization
# as the previous experiment.
net = Network(sizes=[784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()

# Same 1000-example run, now with L2 weight decay (lmbda=0.1) and with the
# evaluation cost monitored as well.
net.SGD(
    training_data=training_data[:1000],
    epochs=400,
    mini_batch_size=10,
    eta=0.5,
    evaluation_data=test_data,
    lmbda=0.1,
    monitor_evaluation_cost=True,
    monitor_evaluation_accuracy=True,
    monitor_training_cost=True,
    monitor_training_accuracy=True,
)


Epoch 0 training complete
Cost on training data: 2.8953866641040014
Accuracy on training data: 689 / 1000
Cost on evaluation data: 2.23381313794047
Accuracy on evaluation data: 5791 / 10000
Epoch 1 training complete
Cost on training data: 2.4922795980388854
Accuracy on training data: 781 / 1000
Cost on evaluation data: 1.8578071766480555
Accuracy on evaluation data: 6688 / 10000
Epoch 2 training complete
Cost on training data: 2.1619762221974304
Accuracy on training data: 848 / 1000
Cost on evaluation data: 1.639313396100369
Accuracy on evaluation data: 7131 / 10000
Epoch 3 training complete
Cost on training data: 1.970081800136849
Accuracy on training data: 881 / 1000
Cost on evaluation data: 1.5096697678438278
Accuracy on evaluation data: 7503 / 10000
Epoch 4 training complete
Cost on training data: 1.8894258463009241
Accuracy on training data: 899 / 1000
Cost on evaluation data: 1.4870589068329607
Accuracy on evaluation data: 7571 / 10000
Epoch 5 training complete
Cost on training d

([np.float64(2.23381313794047),
  np.float64(1.8578071766480555),
  np.float64(1.639313396100369),
  np.float64(1.5096697678438278),
  np.float64(1.4870589068329607),
  np.float64(1.4311532854817384),
  np.float64(1.42436012666953),
  np.float64(1.3170151458303538),
  np.float64(1.3683020768708563),
  np.float64(1.3049757569838412),
  np.float64(1.2951656321224811),
  np.float64(1.2950842331027985),
  np.float64(1.3218352380285252),
  np.float64(1.257383076836807),
  np.float64(1.257967751666464),
  np.float64(1.2379148815197436),
  np.float64(1.243457634629964),
  np.float64(1.233259475079607),
  np.float64(1.256568703881618),
  np.float64(1.269423278896301),
  np.float64(1.2477405216453608),
  np.float64(1.251242321703052),
  np.float64(1.2345774477007871),
  np.float64(1.2442849592080831),
  np.float64(1.2375750891637558),
  np.float64(1.2343621866485242),
  np.float64(1.2324581426868653),
  np.float64(1.249356905463297),
  np.float64(1.2344422092740273),
  np.float64(1.230614364707

# Experiment 5a: Full MNIST without regularization


In [13]:

# Reload the MNIST splits.
training_data, validation_data, test_data = load_data_wrapper()

# 784-30-10 network with unscaled standard-normal initialization.
net = Network(sizes=[784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()

# 30 epochs over the complete training set, no regularization; accuracy is
# tracked on both the training and the test data.
net.SGD(
    training_data=training_data,
    epochs=30,
    mini_batch_size=10,
    eta=0.5,
    evaluation_data=test_data,
    monitor_evaluation_accuracy=True,
    monitor_training_accuracy=True,
)


Epoch 0 training complete
Accuracy on training data: 45790 / 50000
Accuracy on evaluation data: 9157 / 10000
Epoch 1 training complete
Accuracy on training data: 46518 / 50000
Accuracy on evaluation data: 9276 / 10000
Epoch 2 training complete
Accuracy on training data: 47076 / 50000
Accuracy on evaluation data: 9338 / 10000
Epoch 3 training complete
Accuracy on training data: 47357 / 50000
Accuracy on evaluation data: 9413 / 10000
Epoch 4 training complete
Accuracy on training data: 47524 / 50000
Accuracy on evaluation data: 9426 / 10000
Epoch 5 training complete
Accuracy on training data: 47704 / 50000
Accuracy on evaluation data: 9442 / 10000
Epoch 6 training complete
Accuracy on training data: 47860 / 50000
Accuracy on evaluation data: 9486 / 10000
Epoch 7 training complete
Accuracy on training data: 47893 / 50000
Accuracy on evaluation data: 9469 / 10000
Epoch 8 training complete
Accuracy on training data: 48138 / 50000
Accuracy on evaluation data: 9514 / 10000
Epoch 9 training co

([],
 [9157,
  9276,
  9338,
  9413,
  9426,
  9442,
  9486,
  9469,
  9514,
  9492,
  9494,
  9499,
  9502,
  9502,
  9510,
  9540,
  9529,
  9506,
  9510,
  9525,
  9538,
  9533,
  9539,
  9516,
  9518,
  9516,
  9521,
  9509,
  9521,
  9528],
 [],
 [45790,
  46518,
  47076,
  47357,
  47524,
  47704,
  47860,
  47893,
  48138,
  48045,
  48204,
  48262,
  48346,
  48364,
  48446,
  48574,
  48611,
  48504,
  48622,
  48628,
  48635,
  48751,
  48740,
  48818,
  48789,
  48792,
  48852,
  48882,
  48848,
  48959])

# Experiment 5b: Full MNIST with L2 regularization


In [14]:



# Reload the MNIST splits.
training_data, validation_data, test_data = load_data_wrapper()

# 784-30-10 network with unscaled standard-normal initialization.
net = Network(sizes=[784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()

# Same full-data run as the unregularized experiment, now with L2 weight
# decay (lmbda=5.0).
net.SGD(
    training_data=training_data,
    epochs=30,
    mini_batch_size=10,
    eta=0.5,
    lmbda=5.0,
    evaluation_data=test_data,
    monitor_evaluation_accuracy=True,
    monitor_training_accuracy=True,
)


Epoch 0 training complete
Accuracy on training data: 45638 / 50000
Accuracy on evaluation data: 9108 / 10000
Epoch 1 training complete
Accuracy on training data: 46631 / 50000
Accuracy on evaluation data: 9298 / 10000
Epoch 2 training complete
Accuracy on training data: 47378 / 50000
Accuracy on evaluation data: 9444 / 10000
Epoch 3 training complete
Accuracy on training data: 47640 / 50000
Accuracy on evaluation data: 9426 / 10000
Epoch 4 training complete
Accuracy on training data: 47987 / 50000
Accuracy on evaluation data: 9540 / 10000
Epoch 5 training complete
Accuracy on training data: 47931 / 50000
Accuracy on evaluation data: 9523 / 10000
Epoch 6 training complete
Accuracy on training data: 48100 / 50000
Accuracy on evaluation data: 9532 / 10000
Epoch 7 training complete
Accuracy on training data: 48104 / 50000
Accuracy on evaluation data: 9550 / 10000
Epoch 8 training complete
Accuracy on training data: 48023 / 50000
Accuracy on evaluation data: 9519 / 10000
Epoch 9 training co

([],
 [9108,
  9298,
  9444,
  9426,
  9540,
  9523,
  9532,
  9550,
  9519,
  9536,
  9548,
  9517,
  9597,
  9554,
  9592,
  9580,
  9574,
  9575,
  9572,
  9615,
  9552,
  9595,
  9563,
  9622,
  9570,
  9616,
  9563,
  9647,
  9619,
  9613],
 [],
 [45638,
  46631,
  47378,
  47640,
  47987,
  47931,
  48100,
  48104,
  48023,
  48065,
  48093,
  48077,
  48319,
  48195,
  48310,
  48289,
  48316,
  48160,
  48233,
  48368,
  48090,
  48353,
  48163,
  48464,
  48365,
  48425,
  48270,
  48528,
  48473,
  48365])

# Experiment 5c: 100 hidden neurons with L2 regularization (evaluated on the validation set)




In [15]:


# Reload the MNIST splits.
training_data, validation_data, test_data = load_data_wrapper()

# Wider hidden layer: 784-100-10, unscaled standard-normal initialization.
net = Network(sizes=[784, 100, 10], cost=CrossEntropyCost)
net.large_weight_initializer()

# 30 epochs on the full training set with L2 weight decay (lmbda=5.0);
# accuracy is monitored on the validation split rather than the test split.
net.SGD(
    training_data=training_data,
    epochs=30,
    mini_batch_size=10,
    eta=0.5,
    lmbda=5.0,
    evaluation_data=validation_data,
    monitor_evaluation_accuracy=True,
)


Epoch 0 training complete
Accuracy on evaluation data: 9389 / 10000
Epoch 1 training complete
Accuracy on evaluation data: 9488 / 10000
Epoch 2 training complete
Accuracy on evaluation data: 9608 / 10000
Epoch 3 training complete
Accuracy on evaluation data: 9649 / 10000
Epoch 4 training complete
Accuracy on evaluation data: 9699 / 10000
Epoch 5 training complete
Accuracy on evaluation data: 9703 / 10000
Epoch 6 training complete
Accuracy on evaluation data: 9713 / 10000
Epoch 7 training complete
Accuracy on evaluation data: 9718 / 10000
Epoch 8 training complete
Accuracy on evaluation data: 9768 / 10000
Epoch 9 training complete
Accuracy on evaluation data: 9736 / 10000
Epoch 10 training complete
Accuracy on evaluation data: 9652 / 10000
Epoch 11 training complete
Accuracy on evaluation data: 9734 / 10000
Epoch 12 training complete
Accuracy on evaluation data: 9735 / 10000
Epoch 13 training complete
Accuracy on evaluation data: 9652 / 10000
Epoch 14 training complete
Accuracy on evalu

([],
 [9389,
  9488,
  9608,
  9649,
  9699,
  9703,
  9713,
  9718,
  9768,
  9736,
  9652,
  9734,
  9735,
  9652,
  9755,
  9723,
  9743,
  9729,
  9773,
  9770,
  9769,
  9764,
  9771,
  9662,
  9769,
  9766,
  9740,
  9759,
  9747,
  9754],
 [],
 [])