# Backpropagation Neural Network with MNIST image processing dataset

by Torey Hilbert (November 2017)

I used 4 layers in my network, with 784 neurons in the first, 30 neurons in the second and third, and 10 in the output layer. During training, I split the data into batches of 25 rows and ran each batch four times, updating the weights after each run. I also varied a training step parameter, which controls how much the weights are changed on each update. I measured error using the sum of squared differences between the network output and the target.

Data: MNIST is a dataset of handwritten digits with labels. The full MNIST dataset has 60,000 training examples and 10,000 testing examples. A subset is available in the sklearn library (the digits dataset). I used the full dataset, which is available at: https://www.kaggle.com/zxstest/handwritten-digits/code

In [1]:
import math
import random
from __future__ import division

In [2]:
def smoothf(t):
    """Activation ("smooth") function applied to a node's weighted input.

    Implemented as the hyperbolic tangent, so the result lies between
    -1 and 1 (not between 0 and 1, as a logistic sigmoid would give).
    """
    squashed = math.tanh(t)
    return squashed
def d_smoothf(t):
    """Derivative of smoothf, expressed in terms of smoothf's OUTPUT.

    Returns 1 - t*t.  Because d/dz tanh(z) = 1 - tanh(z)**2, this is the
    derivative of the tanh activation when `t` is the already-activated
    value tanh(z), not the raw pre-activation z.
    """
    squared = t * t
    return 1 - squared

In [3]:
class Vector:
    """Simple numeric vector backed by a Python list.

    Attributes:
        n:    number of elements (kept in sync with `vals` by set_vals).
        vals: list holding the element values.

    `+` and `-` are element-wise, `*` between two vectors is the dot
    product.  These operators pair elements with zip(), so if the operands
    have different lengths the extra elements of the longer one are ignored.
    """

    def __init__(self, size):
        """Create a vector of `size` zeros."""
        self.n = size
        self.vals = [0] * size

    def set_vals(self, l):
        """Use the list `l` (not a copy) as this vector's storage."""
        self.vals = l
        # Keep the cached length honest so len(), set_rand() and the size
        # check in matr_mult() agree with the data actually stored.
        self.n = len(l)

    def set_rand(self, minx, maxx):
        """Overwrite every element with a uniform random value in [minx, maxx]."""
        for i in range(self.n):
            self.vals[i] = random.uniform(minx, maxx)

    def scale(self, c):
        """Return a new vector with every element multiplied by `c`."""
        vec = Vector(0)
        vec.set_vals([ai * c for ai in self.vals])
        return vec

    def __neg__(self):
        """Return a new vector with every element negated."""
        vec = Vector(0)
        vec.set_vals([ai * (-1) for ai in self.vals])
        return vec

    def __add__(self, other):
        """Return the element-wise sum as a new vector."""
        vec = Vector(0)
        vec.set_vals([ai + bi for (ai, bi) in zip(self.vals, other.vals)])
        return vec
    __radd__ = __add__

    def __sub__(self, other):
        """Return the element-wise difference (self - other) as a new vector."""
        vec = Vector(0)
        vec.set_vals([ai - bi for (ai, bi) in zip(self.vals, other.vals)])
        return vec

    def __mul__(self, other):
        """Return the dot product of the two vectors (a scalar)."""
        return sum([ai * bi for (ai, bi) in zip(self.vals, other.vals)])
    __rmul__ = __mul__

    def __getitem__(self, i):
        return self.vals[i]

    def __setitem__(self, i, val):
        self.vals[i] = val

    def __len__(self):
        return self.n

    def __eq__(self, other):
        """Two vectors are equal when they have the same length and elements."""
        try:
            theirs = other.vals
        except AttributeError:
            return NotImplemented
        mine = self.vals
        # zip() alone would stop at the shorter operand and report a vector
        # as equal to any longer vector sharing its prefix.
        if len(mine) != len(theirs):
            return False
        return not any(ai != bi for (ai, bi) in zip(mine, theirs))

    def __ne__(self, other):
        return not (self == other)

    def __str__(self):
        return str(self.vals)

    def sumSqs(self):
        """Return the sum of the squares of the elements."""
        return self * self

    def matr_mult(self, mat):
        """Return the row-vector product self * mat as a new vector.

        `mat` must expose `m` (row count), `n` (column count) and `rows`
        (list of row lists).  The result has `mat.n` elements.

        Raises:
            ValueError: if this vector's length differs from `mat.m`.
        """
        if self.n != mat.m:
            raise ValueError(
                "Vector length must equal the number of Matrix rows!")
        s = Vector(mat.n)
        for x in range(mat.n):
            total = 0
            for j in range(mat.m):
                total += self.vals[j] * mat.rows[j][x]
            s[x] = total
        return s

    def smooth(self):
        """Return a new vector with smoothf applied to every element."""
        vec = Vector(len(self))
        for i in range(len(self)):
            vec[i] = smoothf(self[i])
        return vec

In [4]:
class Matrix:
    """Dense m-by-n matrix stored as a list of row lists.

    Attributes:
        m:    number of rows.
        n:    number of columns.
        rows: list of `m` lists, each holding `n` values.
    """

    def __init__(self, m, n):
        """Create an m-by-n matrix filled with zeros."""
        self.m = m
        self.n = n
        self.rows = [[0] * n for _ in range(m)]

    def vec_mult(self, vec):
        """Return the matrix-vector product self * vec as a Vector of length m.

        Raises:
            ValueError: if len(vec) differs from the number of columns.
        """
        # Validate the operand itself; the result always has length m, so
        # checking the result's length could never detect a mismatch.
        if len(vec) != self.n:
            raise ValueError(
                "Vector length must equal the number of Matrix columns!")
        s = Vector(self.m)
        for x in range(self.m):
            row = self.rows[x]
            total = 0
            for j in range(self.n):
                total += row[j] * vec[j]
            s[x] = total
        return s

    def set_vals(self, rows):
        """Use `rows` (a list of row lists, not copied) as the storage."""
        self.rows = rows
        # Keep the cached dimensions in sync with the data actually stored.
        self.m = len(rows)
        self.n = len(rows[0]) if rows else 0

    def set_rand(self, minx, maxx):
        """Overwrite every entry with a uniform random value in [minx, maxx]."""
        for i in range(self.m):
            for j in range(self.n):
                self.rows[i][j] = random.uniform(minx, maxx)

    def __len__(self):
        """Return the number of rows."""
        return len(self.rows)

    def scale(self, c):
        """Return a new matrix with every entry multiplied by `c`."""
        s = Matrix(self.m, self.n)
        for i in range(self.m):
            for j in range(self.n):
                s.rows[i][j] = c * self.rows[i][j]
        return s

    def __add__(self, other):
        """Return the element-wise sum; raises ValueError on a shape mismatch."""
        if (self.m != other.m or self.n != other.n):
            raise ValueError("The dimensions of the Matricies are not the same!")
        s = Matrix(self.m, self.n)
        for i in range(self.m):
            for j in range(self.n):
                s.rows[i][j] = self.rows[i][j] + other.rows[i][j]
        return s

    __radd__ = __add__

    def __sub__(self, other):
        """Return self - other element-wise; raises ValueError on a shape mismatch."""
        if (self.m != other.m or self.n != other.n):
            raise ValueError("The dimensions of the Matricies are not the same!")
        s = Matrix(self.m, self.n)
        for i in range(self.m):
            for j in range(self.n):
                s.rows[i][j] = self.rows[i][j] - other.rows[i][j]
        return s

    def __mul__(self, other):
        """Return the matrix product self * other.

        Raises:
            ValueError: if self.n differs from other.m.
        """
        if (self.n != other.m):
            raise ValueError("The inner dimensions must be the same!")
        s = Matrix(self.m, other.n)
        for i in range(self.m):
            left = self.rows[i]
            for j in range(other.n):
                total = 0
                for x in range(self.n):
                    total += left[x] * other.rows[x][j]
                s.rows[i][j] = total
        return s

    # NOTE(review): matrix multiplication is not commutative, so this alias
    # evaluates self * other even when Python asked for other * self.  Kept
    # for interface compatibility; do not rely on it.
    __rmul__ = __mul__

    def __getitem__(self, i):
        """Return row `i` (the underlying list, so it can be mutated)."""
        return self.rows[i]

    def __setitem__(self, i, row):
        self.rows[i] = row

    def __str__(self):
        return str(self.rows)

In [5]:
class Layer:
    """One fully connected layer: a weight matrix plus a bias per node.

    Attributes:
        numInputs: number of values fed into the layer.
        numNodes:  number of nodes (outputs) in the layer.
        weights:   Matrix with numNodes rows and numInputs columns.
        biases:    Vector with numNodes entries.
        z, output: set by feedForward (pre-activation and activated values).
    """

    def __init__ (self, inputs, outputs):
        """Create the layer with weights and biases drawn from [-0.5, 0.5]."""
        self.numInputs = inputs
        self.numNodes = outputs
        # Weights are randomised before biases; keep this order so a seeded
        # random stream produces the same initial parameters.
        weightMatrix = Matrix(self.numNodes, self.numInputs)
        self.weights = weightMatrix
        weightMatrix.set_rand(-0.5, 0.5)
        biasVector = Vector(self.numNodes)
        self.biases = biasVector
        biasVector.set_rand(-0.5,0.5)

    def __len__(self):
        """Return the number of nodes in the layer."""
        return self.numNodes

    def feedForward(self, inputs):
        """Compute, remember and return the layer's activated output."""
        weighted = self.weights.vec_mult(inputs)
        self.z = weighted + self.biases
        activated = self.z.smooth()
        self.output = activated
        return activated

    def write(self, writer):
        """Write the weight rows, a "D" row, the biases, then a "W" row.

        `writer` only needs a writerow() method (e.g. a csv writer).
        """
        for weightRow in self.weights.rows:
            writer.writerow(weightRow)
        writer.writerow("D")
        writer.writerow(self.biases.vals)
        writer.writerow("W")

    def load(self, rows, biases):
        """Replace the layer's weights and biases with the given values."""
        self.weights.set_vals(rows)
        self.biases.set_vals(biases)

In [6]:
class Network:
    """Feed-forward network trained with backpropagation.

    `layers[0]` is a placeholder for the input layer: feedForward only stores
    the raw inputs in its `output` attribute and never uses its weights.
    Layers 1..numLayers-1 are the trainable layers.

    Typical round: beginRound(), then feedForward() + backPropagate() for
    every sample of the batch, then update().
    """

    def __init__(self, layerSizes):
        """Build one Layer per entry of `layerSizes` (node counts per layer)."""
        self.numLayers = len(layerSizes)
        # NOTE(review): the input placeholder still allocates a full
        # layerSizes[0] x layerSizes[0] random weight matrix that is never
        # used.  Left as-is so the layer's shape and the random stream
        # consumed during construction stay unchanged.
        self.layers = [Layer(layerSizes[0], layerSizes[0])]
        for l in range(1, len(layerSizes)):
            self.layers.append(Layer(layerSizes[l - 1], layerSizes[l]))

    def load(self, weights, biases):
        """Load stored parameters: weights[i] and biases[i] go to layers[i]."""
        for w, b, layer in zip(weights, biases, self.layers):
            layer.load(w, b)

    def beginRound(self):
        """Reset the accumulated gradients to zero before a batch."""
        # weightGrads is a list of Matrices holding the gradient to each weight
        # biasGrads is a list of Vectors holding the gradient to each bias
        self.weightGrads = []
        self.biasGrads = []
        for i in range(self.numLayers):
            layer = self.layers[i]
            self.weightGrads.append(Matrix(layer.numNodes, layer.numInputs))
            self.biasGrads.append(Vector(layer.numNodes))

    def feedForward(self, inputs):
        """Run `inputs` through every layer and return the final output."""
        prevOutput = inputs
        # Backpropagation reads layers[0].output as the input activations.
        self.layers[0].output = inputs
        for i in range(1, self.numLayers):
            prevOutput = self.layers[i].feedForward(prevOutput)
        return prevOutput

    def _accumulateLayerGrads(self, l, dC_dal):
        """Add one sample's weight and bias gradients for layer `l`.

        `dC_dal` holds the derivative of the cost with respect to each output
        of layer `l`.
        """
        layer = self.layers[l]
        prevOutput = self.layers[l - 1].output
        weightGrad = self.weightGrads[l]
        biasGrad = self.biasGrads[l]
        for i in range(layer.numNodes):
            # d_smoothf takes the activated output, not the pre-activation.
            dail_dzil = d_smoothf(layer.output[i])
            gradRow = weightGrad[i]
            for j in range(layer.numInputs):
                gradRow[j] += prevOutput[j] * dail_dzil * dC_dal[i]
            biasGrad[i] += dail_dzil * dC_dal[i]

    def backPropagate(self, inputs, output, goal):
        """Accumulate the gradients of the squared error for one sample.

        Must be called after feedForward() for the same sample (it reads the
        outputs stored on each layer) and after beginRound().

        Args:
            inputs: the sample's inputs (unused; kept for the interface).
            output: the network output returned by feedForward().
            goal:   the target vector for the sample.
        """
        last = self.numLayers - 1

        # Cost is the sum of (output - goal)^2, so dC/da for the last layer
        # is 2 * (output - goal).
        dC_daL = Vector(len(goal))
        for i in range(len(goal)):
            dC_daL[i] = 2 * (output[i] - goal[i])

        # dC_da[l] holds dC/da for layer l; hidden entries are filled in as
        # the loop below walks backwards through the network.
        dC_da = [None] * last
        dC_da.append(dC_daL)

        # Output layer gradients.
        self._accumulateLayerGrads(last, dC_da[last])

        # Loops from L-1 down to 1, since l = 0 doesn't have any weights.
        for l in range(last - 1, 0, -1):
            nextLayer = self.layers[l + 1]
            # We know dC_da[l + 1]; first use it to calculate dC_da[l].
            vec = Vector(len(nextLayer))
            for k in range(len(vec)):
                vec[k] = d_smoothf(nextLayer.output[k]) * dC_da[l + 1][k]
            dC_da[l] = vec.matr_mult(nextLayer.weights)
            # Using dC_da[l], accumulate the gradient of every weight/bias.
            self._accumulateLayerGrads(l, dC_da[l])

    def update(self, n, trainStep):
        """Apply the accumulated gradients, averaged over `n` samples.

        Every trainable layer's parameters move by
        -(trainStep / n) * accumulated gradient.
        """
        rate = trainStep / n
        # Layer 0 is the input placeholder: backPropagate never accumulates
        # gradients for it, so rewriting its weights would change nothing.
        for i in range(1, self.numLayers):
            self.layers[i].biases -= self.biasGrads[i].scale(rate)
            self.layers[i].weights -= self.weightGrads[i].scale(rate)

In [7]:
import csv

# Read the training CSV: the first column is the digit label, the remaining
# columns are the pixel values.  The header row (first cell 'label') is skipped.
ans = []
vals = []
with open("mnist_train.csv", "r") as f_obj:
    reader = csv.reader(f_obj)

    for row in reader:
        if (row[0] != 'label'):
            ans.append(int(row[0]))
            vals.append(row[1:])

# Convert the pixel strings to floats (scaled by 1/256) in place and wrap
# each image in a Vector that shares the converted list.
data = []
for pixels in vals:
    for j, raw in enumerate(pixels):
        pixels[j] = int(raw) / 256.0
    image = Vector(len(pixels))
    image.set_vals(pixels)
    data.append(image)

# One-hot encode the labels: 1.0 at the index of the digit, 0.0 elsewhere.
target = []
for i in range(len(vals)):
    oneHot = Vector(10)
    for j in range(10):
        oneHot[j] = 1.0 if (j == ans[i]) else 0.0
    target.append(oneHot)
    # Show the first ten encoded targets as a quick sanity check.
    if (i < 10):
        print(oneHot)

[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [25]:
# Train a 784-30-30-10 network on strided mini-batches of the training data.
network = Network([784, 30, 30, 10])
trainstep = 0.2
s = []      # s[trial] counts the correctly classified samples of that trial
error = []  # error[trial] accumulates the sum-of-squares error of that trial
trialRatio = 1680  # stride between the rows that make up one batch


trialSize = round(len(data) / trialRatio)
trialRepeats = 4
print("Batch Size: " + str(trialSize))
print("Total # of rows: " + str(len(data)))
print("Batch # || Trial # || % correct in trial || Total Error for trial")
for trialNumber in range(250):
    # Lower the training step once the first 50 batches are done.
    if (trialNumber == 50):
        trainstep = 0.1
    for x in range(trialRepeats):
        trialIndex = trialNumber * trialRepeats + x
        s.append(0)
        error.append(0)
        # Gradients are reset here before every pass; an additional reset
        # once per batch would only allocate zero matrices that are
        # discarded immediately.
        network.beginRound()
        # Batch = rows trialNumber, trialNumber + trialRatio, ...  Stopping at
        # len(data) keeps the index in range even when the row count is not
        # an exact multiple of trialRatio.
        for i in range(trialNumber, len(data), trialRatio):
            out = network.feedForward(data[i])
            if (out.vals.index(max(out.vals)) == target[i].vals.index(1.0)):
                s[trialIndex] += 1
            network.backPropagate(data[i], out, target[i])
            error[trialIndex] += (out - target[i]).sumSqs()

        print(trialNumber, trialIndex, s[trialIndex] / trialSize, error[trialIndex] / trialSize)
        network.update(trialSize, trainstep)

Batch Size: 25.0
Total # of rows: 42000
Batch # || Trial # || % correct in trial || Total Error for trial
(0, 0, 0.2, 5.088534399930568)
(0, 1, 0.32, 3.8129271075372135)
(0, 2, 0.16, 4.556722491845366)
(0, 3, 0.32, 4.797455702546021)
(1, 4, 0.08, 5.632404469981659)
(1, 5, 0.2, 4.682956291480551)
(1, 6, 0.16, 4.083381060951901)
(1, 7, 0.32, 3.914831406626422)
(2, 8, 0.08, 4.202109302769989)
(2, 9, 0.6, 4.142028395650725)
(2, 10, 0.64, 4.0693434304323315)
(2, 11, 0.64, 3.7415630186717364)
(3, 12, 0.28, 2.4659705490688393)
(3, 13, 0.52, 1.5679819574165268)
(3, 14, 0.52, 1.8683907345352446)
(3, 15, 0.36, 2.753685690075027)
(4, 16, 0.24, 3.550180002586105)
(4, 17, 0.44, 2.569732218463711)
(4, 18, 0.36, 2.0936845801704225)
(4, 19, 0.32, 2.0207424050048166)
(5, 20, 0.2, 2.827312996294215)
(5, 21, 0.6, 1.8523988099766908)
(5, 22, 0.44, 1.4831206569850155)
(5, 23, 0.8, 1.4661575643329263)
(6, 24, 0.28, 2.347743711212619)
(6, 25, 0.28, 1.6971106106004572)
(6, 26, 0.72, 1.5940234276591527)
(6, 27