In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
import random

In [2]:
# Open file to store model parameters in
file = open("c:\\Users\\XT\\Documents\\MNIST\\stored parameters", "w")

In [3]:
(trainImgs, trainLabels), (testImgs, testLabels) = mnist.load_data()

In [4]:
trainImgs = trainImgs/255
testImgs = testImgs/255

In [6]:
# Flatten each 28x28 image into one row
flatTrainImgs = trainImgs.reshape(60000, 28**2)
flatTestImgs = testImgs.reshape(10000, 28**2)

In [5]:
class Model:
    def __init__(self, nodeCounts, activationFuncs):
        # Input and output node count
        self.inCount = nodeCounts[0]
        self.outCount = nodeCounts[-1]

        # Number of layers
        self.layers = len(nodeCounts)

        # The activation function for each layer
        self.activationFuncs = activationFuncs
        
        # Initialize weights and biases
        self.params = [np.random.randn(nodeCounts[i]+1, nodeCounts[i+1]) for i in range(len(nodeCounts)-1)]
    
    # Run the model on a set of inputs
    def run(self, inArray):
        array1 = inArray

        nodeValues = [array1]
        for i in range(self.layers- 1):
            currentParams = self.params[i]

            array2 = np.matmul(array1, currentParams[:-1])
            array2 += currentParams[-1]
            
            array1 = self.activationFunc(array2, self.activationFuncs[i])
            
            nodeValues.append(array1)

        return nodeValues # Return list of node values at each layer
    

    # Calculate the gradient for a set of inputs using backpropagation
    def gradient(self, inArray, label):
        nodeValues = self.run(inArray)

        # Correct output values
        labelArray = np.zeros(self.outCount)
        labelArray[label] = 1

        # Derivatives of cost function w.r.t. weights and biases
        grad = []
        # Derivatives of cost function w.r.t. node values
        nodeGrad = [2*(nodeValues[-1] - labelArray)]
        
        for i in range(-1,-self.layers,-1):
            prevLayer = nodeValues[i]
            weights = self.params[i][:-1]

            activationDerivs = self.activationDeriv(prevLayer, self.activationFuncs[i])

            nodeDerivs = np.sum(weights * activationDerivs[None] * nodeGrad[-1][None], axis=1)
            
            nodeGrad.append(nodeDerivs)
            
            # Derivatives of node values w.r.t. weights and biases
            paramDerivs = np.concatenate((nodeValues[i-1], np.array(1)[None]))[:,None] @ activationDerivs[None]

            grad.append(paramDerivs * nodeGrad[-2][None])
        
        grad.reverse()
        # Return gradient and cost
        return grad, np.sum((nodeValues[-1] - labelArray)**2)
    

    # Change the weights and biases based on the gradient and given step size
    def adjustParams(self, gradient, stepSize=1):
        for i in range(self.layers-1):
            self.params[i] -= gradient[i] * stepSize
    
    # Adjust model parameters based on samples of training data
    def train(self, inData, labelData, sampSize, epochs, stepSize=0.001):
        for epoch in range(epochs):
            meanCost = 0
            totalGrad = [0] * (self.layers-1)

            for i in random.sample(range(len(labelData)), sampSize):
                gradient, cost = self.gradient(inData[i], labelData[i])
                
                # Add this individual sample's gradient to the total gradient
                totalGrad = list(map(lambda x,y: x+y, gradient, totalGrad))
                
                meanCost += cost
            
            self.adjustParams(totalGrad, stepSize)

            meanCost /= sampSize
            
            print("Epoch {} completed. Avg cost: {}".format(epoch+1, meanCost))
    

    # The activation function
    def activationFunc(self, rawValues, function):
        if function.lower() == "sigmoid": return self.sigmoid(rawValues)
        elif function.lower() == "relu": return self.reluDeriv(rawValues)
        elif function.lower() == "leaky relu": return self.leakyReluDeriv(rawValues)
        elif function.lower() == "identity": return rawValues

        raise ValueError('''Specified activation function is not supported
        Supported functions: identity, sigmoid, relu, leaky relu''')
    
    # Derivative of the activation function
    def activationDeriv(self, nodeValues, function):
        if function.lower() == "sigmoid": return self.sigmoidDeriv(nodeValues)
        elif function.lower() == "relu": return self.reluDeriv(nodeValues)
        elif function.lower() == "leaky relu": return self.leakyReluDeriv(nodeValues)
        elif function.lower() == "identity": return 1

        raise ValueError('''Specified activation function is not supported
        Supported functions: identity, sigmoid, relu, leaky relu''')


    def sigmoid(self, rawValues):
        return np.reciprocal(1 + np.exp(-rawValues))
    
    def sigmoidDeriv(self, nodeValues):
        return nodeValues - np.power(nodeValues, 2)

    def relu(self, rawValues):
        rawValues[rawValues < 0] = 0
        return rawValues
    
    def reluDeriv(self, nodeValues):
        nodeValues[nodeValues > 0] = 1
        return nodeValues

    def leakyRelu(self, rawValues):
        rawValues[rawValues < 0] *= 0.01
        return rawValues
    
    def leakyReluDeriv(self, nodeValues):
        nodeValues[nodeValues >= 0] = 1
        nodeValues[nodeValues < 0] = 0.01
        return nodeValues


    def getParams(self):
        return self.params

    def setParams(self, newParams):
        assert len(newParams) == len(self.params)
        for i, k in zip(self.params, newParams):
            assert i.shape == k.shape
        
        self.params = newParams

In [7]:
model = Model([784, 64, 32, 10], ["leaky relu", "leaky relu", "sigmoid"])

In [22]:
i = random.choice(range(60000))

out = model.run(flatTrainImgs[i])[-1]

print("Predicted", np.where(np.isclose(out, np.max(out)))[0][0])
print("Actual:", trainLabels[i])
out

Predicted 3
Actual: 2


array([5.24704821e-02, 4.29173610e-01, 2.57460858e-01, 9.84686727e-01,
       1.85983027e-01, 7.79884237e-01, 2.54532969e-01, 5.75545345e-04,
       1.14067036e-01, 6.08445881e-01])

In [11]:
model.train(flatTrainImgs, trainLabels, 5000, 500, 5e-6)

Epoch 1 completed. Avg cost: 0.9842435746636904
Epoch 2 completed. Avg cost: 0.9842474093264307
Epoch 3 completed. Avg cost: 0.9860955883877559
Epoch 4 completed. Avg cost: 0.9851923487918915
Epoch 5 completed. Avg cost: 0.9780748031957311
Epoch 6 completed. Avg cost: 0.9785718806350069
Epoch 7 completed. Avg cost: 0.9773189328933777
Epoch 8 completed. Avg cost: 0.9782231863360168
Epoch 9 completed. Avg cost: 0.9775062809586438
Epoch 10 completed. Avg cost: 0.975958610456053
Epoch 11 completed. Avg cost: 0.9760067134015693
Epoch 12 completed. Avg cost: 0.9758122109606134
Epoch 13 completed. Avg cost: 0.975730068434028
Epoch 14 completed. Avg cost: 0.9754751769793735
Epoch 15 completed. Avg cost: 0.9755925614999041
Epoch 16 completed. Avg cost: 0.9750829042743105
Epoch 17 completed. Avg cost: 0.9753046197194826
Epoch 18 completed. Avg cost: 0.9753615876825663
Epoch 19 completed. Avg cost: 0.9726776219941732
Epoch 20 completed. Avg cost: 0.9697686203951719
Epoch 21 completed. Avg cost: 0

In [45]:
flatTestImgs[random.sample(range(len(testLabels)), 5)]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])