In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
import random

In [2]:
# Open file to store model parameters in
file = open("c:\\Users\\XT\\Documents\\MNIST\\stored parameters", "w")

In [2]:
(trainImgs, trainLabels), (testImgs, testLabels) = mnist.load_data()

In [3]:
trainImgs = trainImgs/255
testImgs = testImgs/255

In [4]:
# Flatten each 28x28 image into one row
flatTrainImgs = trainImgs.reshape(60000, 28**2)
flatTestImgs = testImgs.reshape(10000, 28**2)

In [32]:
class Model:
    def __init__(self, nodeCounts, activationFuncs):
        # Input and output node count
        self.inCount = nodeCounts[0]
        self.outCount = nodeCounts[-1]

        # Number of layers
        self.layers = len(nodeCounts)

        # The activation function for each layer
        self.activationFuncs = activationFuncs
        
        # Initialize weights and biases
        self.params = [np.random.randn(nodeCounts[i]+1, nodeCounts[i+1]) for i in range(len(nodeCounts)-1)]
    
    # Run the model on a set of inputs
    def run(self, inArray):
        array1 = inArray

        nodeValues = [array1]
        for i in range(self.layers- 1):
            currentParams = self.params[i]

            array2 = np.matmul(array1, currentParams[:-1])
            array2 += currentParams[-1]
            
            array1 = self.activationFunc(array2, self.activationFuncs[i])
            
            nodeValues.append(array1)

        return nodeValues # Return list of node values at each layer
    

    # Calculate the gradient for a set of inputs using backpropagation
    def gradient(self, inArray, label):
        nodeValues = self.run(inArray)

        # Correct output values
        labelArray = np.zeros(self.outCount)
        labelArray[label] = 1

        # Derivatives of cost function w.r.t. weights and biases
        grad = []
        # Derivatives of cost function w.r.t. node values
        nodeGrad = [2*(nodeValues[-1] - labelArray)]
        
        for i in range(-1,-len(nodeValues),-1):
            prevLayer = nodeValues[i]
            weights = self.params[i][:-1]

            activationDerivs = self.activationDeriv(prevLayer, self.activationFuncs[i])

            nodeDerivs = np.sum(weights * activationDerivs[None] * nodeGrad[-1][None], axis=1)
            
            nodeGrad.append(nodeDerivs)
            
            # Derivatives of node values w.r.t. weights and biases
            paramDerivs = np.concatenate((nodeValues[i-1], np.array(1)[None]))[:,None] @ activationDerivs[None]

            grad.append(paramDerivs * nodeGrad[-2][None])
        
        grad.reverse()
        # Return gradient and cost
        return grad, np.sum((nodeValues[-1] - labelArray)**2)
    

    # Change the weights and biases based on the gradient and given step size
    def adjustParams(self, gradient, stepSize=1):
        for i in range(self.layers-1):
            self.params[i] -= gradient[i] * stepSize
    
    # Sample from training data and adjust weights and biases
    def train(self, inData, labelData, sampSize, epochs, stepSize=0.001, printCost=True):
        meanGrad = [0] * self.layers
        meanCost = 0
        
        for epoch in range(epochs):
            for i in random.sample(range(len(labelData)), sampSize):
                gradient, cost = self.gradient(inData[i], labelData[i])
                
                # Adjust parameters
                self.adjustParams(gradient, stepSize)
                
                # Add each image's contribution to the cost
                meanCost += cost
            
            meanCost /= sampSize
            
            if printCost:
                print("Epoch {} completed. Avg cost: {}".format(epoch+1, meanCost))
    

    # The activation function
    def activationFunc(self, rawValues, function):
        if function.lower() == "sigmoid": return self.sigmoid(rawValues)
        elif function.lower() == "relu": return self.reluDeriv(rawValues)
        elif function.lower() == "leaky_relu": return self.leakyReluDeriv(rawValues)
        elif function.lower() == "identity": return rawValues

        raise ValueError('''Specified activation function is not supported
        Supported functions: identity, sigmoid, relu, leaky relu''')
    
    # Derivative of the activation function
    def activationDeriv(self, nodeValues, function):
        if function.lower() == "sigmoid": return self.sigmoidDeriv(nodeValues)
        elif function.lower() == "relu": return self.reluDeriv(nodeValues)
        elif function.lower() == "leaky_relu": return self.leakyReluDeriv(nodeValues)
        elif function.lower() == "identity": return 1

        raise ValueError('''Specified activation function is not supported
        Supported functions: identity, sigmoid, relu, leaky relu''')


    def sigmoid(self, rawValues):
        return np.reciprocal(1 + np.exp(-rawValues))
    
    def sigmoidDeriv(self, nodeValues):
        return nodeValues - np.power(nodeValues, 2)

    def relu(self, rawValues):
        rawValues[rawValues < 0] = 0
        return rawValues
    
    def reluDeriv(self, nodeValues):
        nodeValues[nodeValues > 0] = 1
        return nodeValues

    def leakyRelu(self, rawValues):
        rawValues[rawValues < 0] *= 0.01
        return rawValues
    
    def leakyReluDeriv(self, nodeValues):
        nodeValues[nodeValues >= 0] = 1
        nodeValues[nodeValues < 0] = 0.01
        return nodeValues


    def getParams(self):
        return self.params

    def setParams(self, newParams):
        assert len(newParams) == len(self.params)
        for i, k in zip(self.params, newParams):
            assert i.shape == k.shape
        
        self.params = newParams

In [33]:
model = Model([784, 64, 32, 10], ["relu", "relu", "sigmoid"])

In [20]:
i = random.choice(range(60000))

out = model.run(flatTrainImgs[i])[-1]

print("Predicted", np.where(np.isclose(out, np.max(out)))[0][0])
print("Actual:", trainLabels[i])
out

Predicted 0
Actual: 1


array([1.00000000e+000, 1.00000000e+000, 3.33327156e-127, 4.69111708e-004,
       1.00000000e+000, 1.00000000e+000, 1.00000000e+000, 2.41092665e-014,
       1.36687312e-055, 1.00000000e+000])

In [34]:
model.train(flatTrainImgs, trainLabels, 1000, 1)

[ 2.98736299e-13  2.97477071e-13  2.50776973e-13  4.05841206e-14
 -1.33378231e-12 -1.24765727e-12 -9.11261014e-13  7.74661162e-13
  1.37997689e-12 -7.76042115e-13  5.91495442e-13 -6.97219368e-14
 -1.48793044e-13 -4.44308156e-15  4.92387596e-13  3.06822618e-13
 -3.70801154e-13 -1.16377808e-13 -3.15474294e-14 -1.21233432e-12
 -1.22905131e-12 -9.06229265e-13  7.75324762e-13 -2.45054922e-13
 -7.24515831e-13 -3.51455412e-13 -2.00843185e-13 -9.00846922e-13
  5.25983435e-13  1.69179680e-13 -1.22117215e-13 -7.32782217e-13]


ValueError: operands could not be broadcast together with shapes (64,32) (1,10) 

In [45]:
flatTestImgs[random.sample(range(len(testLabels)), 5)]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])