In [1]:
import numpy as np
import random

In [2]:
import mnist_loader
import network

In [3]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^(-z)); applied elementwise when z is an array."""
    decay = np.exp(-z)
    return 1.0 / (1.0 + decay)

def sigmoid_prime(z):
    """Derivative of the sigmoid function: sigmoid(z) * (1 - sigmoid(z)).

    Works elementwise on arrays, exactly like `sigmoid`.
    """
    # Evaluate the sigmoid once and reuse it; the original computed it twice.
    s = sigmoid(z)
    return s * (1 - s)

## Training using the normal network

In [4]:

# Baseline run with the "normal" network module (network.py).
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
net = network.Network([784,30,10])  # presumably the layer sizes: 784 -> 30 -> 10
# Positional arguments are presumably (training_data, epochs, mini_batch_size, eta, test_data)
# -- TODO confirm against network.py. The recorded output shows 5 epochs, each scored out of 10000.
net.SGD(training_data,5,16,3.0,test_data)

Epoch 0: 8960/10000
Epoch 1: 9187/10000
Epoch 2: 9272/10000
Epoch 3: 9282/10000
Epoch 4: 9338/10000


## Feedforward with network_matx

In [9]:
# Load data: load_data_wrapper() yields the (training, validation, test) splits
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
# Create Net: its weights and biases are read by the step-by-step cells that follow
net = network.Network([784,30,10])

In [10]:
# Materialise the datasets as lists so they can be measured, shuffled and sliced.
training_data = list(training_data)
n = len(training_data)
if test_data:
    test_data = list(test_data)
    n_test = len(test_data)

# Create mini-batch: shuffle in place, then keep the first three (input, label) pairs.
random.shuffle(training_data)
mini_batch = training_data[:3]
X = [sample[0] for sample in mini_batch]
Y = [sample[1] for sample in mini_batch]
print("X shape: {},{},{} | Y shape: {},{},{}".format(
    len(X), len(X[0]), len(X[0][0]),
    len(Y), len(Y[0]), len(Y[0][0]),
))

X shape: 3,784,1 | Y shape: 3,10,1


In [11]:
# Initiate: zero-filled gradient accumulators, one per bias vector / weight matrix.
nabla_b = [np.zeros(bias.shape) for bias in net.biases]
nabla_w = [np.zeros(weight.shape) for weight in net.weights]
# Report (rows x cols) of the first two accumulators of each kind.
print("weights shape: ({}x{}),({}x{}) | biases shape: ({}x{}),({}x{})".format(
    nabla_w[0].shape[0], nabla_w[0].shape[1],
    nabla_w[1].shape[0], nabla_w[1].shape[1],
    nabla_b[0].shape[0], nabla_b[0].shape[1],
    nabla_b[1].shape[0], nabla_b[1].shape[1],
))


weights shape: (30x784),(10x30) | biases shape: (30x1),(10x1)


In [12]:
# feedforward
# Stack the mini-batch into a single ndarray. The original kept X as a Python list,
# so activations[0] was a list while every later entry was an ndarray.
activation = np.asarray(X) # shape = (mini_batch_size, 784, 1)
activations = [activation] # list to store all activations, layer-by-layer
Zs = [] # list to store all z vectors, layer-by-layer
for b,w in zip(net.biases,net.weights):
    # np.dot(w, activation) comes out as (n_out, mini_batch_size, 1); move the batch
    # axis back to the front so that b, of shape (n_out, 1), broadcasts over the batch.
    Z = np.transpose(np.dot(w, activation),(1,0,2)) + b
    Zs.append(Z)
    activation = sigmoid(Z)
    activations.append(activation)
    print("Shape of activation: {}".format(activation.shape))


Shape of activation: (3, 30, 1)
Shape of activation: (3, 10, 1)


Shape of nabla_b: ((10, 1))


## Figuring out the dimensions to get the weight vector
Don't run the cells below!!

In [9]:
# Figuring out the dimensions to get the weight vector
# `delta` has to be the per-example output error of shape (mini_batch_size, 10, 1).
# Compute it here so the cell does not rely on a value left over from
# out-of-order execution.
delta = net.cost_derivative(activations[-1],Y) * sigmoid_prime(Zs[-1])
dum1 = np.transpose(delta,(2,1,0)) # (1, 10, mini_batch_size)
dum2 = np.transpose(activations[-2],(2,0,1)) # (1, mini_batch_size, 30)
prod = np.dot(dum1, dum2) # contracts (sums) over the mini-batch axis
#nabla_w[-1] = np.dot(delta, activations[-2])
dum1.shape,dum2.shape,prod.shape,np.squeeze(prod).shape

((1, 10, 3), (1, 3, 30), (1, 10, 1, 30), (10, 30))

In [13]:
# check if the above code sums the mini-batch examples correctly
_dum1 = np.squeeze(np.transpose(delta,(2,1,0)))
_dum2 = np.squeeze(np.transpose(activations[-2],(2,0,1)))
# Multiply the squeezed 2-D operands. The original multiplied dum1/dum2 again, so
# _prod was just a second copy of prod and the check could never fail.
# The plain matrix product is (10, 30); the two unit axes are re-inserted so that
# _prod has the same (1, 10, 1, 30) layout as prod for an element-wise comparison.
_prod = np.dot(_dum1, _dum2)[np.newaxis, :, np.newaxis, :]
_dum1.shape, _dum2.shape,_prod.shape

((10, 3), (3, 30), (1, 10, 1, 30))

In [None]:
# Finding an easier way --> collapse the batch axis of delta first (WRONG ACTUALLY)
_per_example_delta = net.cost_derivative(activations[-1], Y) * sigmoid_prime(Zs[-1])
delta = np.sum(_per_example_delta, axis=0)
print("Shape of Delta: {}".format(delta.shape))

nabla_b[-1] = delta
# Previous-layer activations as row vectors, then summed over the batch as well.
_prev_rows = np.transpose(activations[-2], (0, 2, 1))
nabla_w[-1] = np.dot(delta, np.sum(_prev_rows, axis=0))
print("Shape of nabla_w: {}".format(nabla_w[-1].shape))
_prev_rows.shape

Shape of Delta: (10, 1)
Shape of nabla_w: (10, 30)


(3, 1, 30)

In [12]:
# Count matching entries on the squeezed (10, 30) matrices. Without the squeeze the
# 4-D `prod` broadcasts against the 2-D `nabla_w[-1]`, so the count would cover
# 10x10x30 pairings instead of the 300 real entries. np.isclose replaces exact float ==.
_prod_2d = np.squeeze(prod)
np.sum(np.isclose(_prod_2d, np.squeeze(_prod), rtol=1e-9, atol=0.0)),np.sum(np.isclose(_prod_2d, nabla_w[-1], rtol=1e-9, atol=0.0))



(300, 0)

As the comparison above shows, summing `delta` over the mini-batch *before* the dot product does not give the same matrix as letting the dot product do the summation when computing `nabla_w`. To see why, consider multiplying delta and the activations as shown below:

delta --> Matrix A (10x3): [
    [A0_1, A0_2, A0_3], 
    [A1_1, A1_2, A1_3], 
    [A2_1, A2_2, A2_3], 
            ...
    [A9_1, A9_2, A9_3]       
]

activation (30x3): [
    [B0_1, B0_2, B0_3],
    [B1_1, B1_2, B1_3],
    [B2_1, B2_2, B2_3],
            ...
    [B29_1, B29_2, B29_3]       
]

activation_transposed --> Matrix B (3x30): [
    [B0_1, B1_1, B2_1 ... B29_1],
    [B0_2, B1_2, B2_2 ... B29_2],
    [B0_3, B1_3, B2_3 ... B29_3]
]

The desired dot product is shown (10x30): [
    [(A0_1*B0_1 + A0_2*B0_2 + A0_3*B0_3), (A0_1*B1_1 + A0_2*B1_2 + A0_3*B1_3) ... (A0_1*B29_1 + A0_2*B29_2 + A0_3*B29_3)],
    [(A1_1*B0_1 + A1_2*B0_2 + A1_3*B0_3), (A1_1*B1_1 + A1_2*B1_2 + A1_3*B1_3) ... (A1_1*B29_1 + A1_2*B29_2 + A1_3*B29_3)],
                                                        ... ...
    [(A9_1*B0_1 + A9_2*B0_2 + A9_3*B0_3), (A9_1*B1_1 + A9_2*B1_2 + A9_3*B1_3) ... (A9_1*B29_1 + A9_2*B29_2 + A9_3*B29_3)]
]

If delta and the activations are each summed over the mini-batch before the dot product, the result will instead be as follows: [
    [(A0_1 + A0_2 + A0_3)(B0_1 + B0_2 + B0_3), (A0_1 + A0_2 + A0_3)(B1_1 + B1_2 + B1_3) ... (A0_1 + A0_2 + A0_3)(B29_1 + B29_2 + B29_3)],
    [(A1_1 + A1_2 + A1_3)(B0_1 + B0_2 + B0_3), (A1_1 + A1_2 + A1_3)(B1_1 + B1_2 + B1_3) ... (A1_1 + A1_2 + A1_3)(B29_1 + B29_2 + B29_3)],
                                                        ... ...
    [(A9_1 + A9_2 + A9_3)(B0_1 + B0_2 + B0_3), (A9_1 + A9_2 + A9_3)(B1_1 + B1_2 + B1_3) ... (A9_1 + A9_2 + A9_3)(B29_1 + B29_2 + B29_3)]
]

Thus, the sum of the `nabla_w` must only be taken after the dot product.

## Backward Pass

In [29]:
# Correct backward pass
print("Last layer")
# Per-example output error, kept un-summed so that the sum over the mini-batch
# happens inside the dot product below.
delta_batch = net.cost_derivative(activations[-1],Y) * \
        sigmoid_prime(Zs[-1]) # shape = (mini_batch_size,10,1)
print("Shape of delta_batch: {}".format(delta_batch.shape))
nabla_b[-1] = np.sum(delta_batch, axis=0)
print("Shape of nabla_b: {}".format(nabla_b[-1].shape))
# The dot product has shape (1, n_out, 1, n_in). Squeeze only those two known unit
# axes: a bare np.squeeze would also drop n_out or n_in if a layer had one neuron.
nabla_w[-1] = np.squeeze(
    np.dot(np.transpose(delta_batch,(2,1,0)), np.transpose(activations[-2],(2,0,1))),
    axis=(0,2))
print("Shape of nabla_w: {}".format(nabla_w[-1].shape))

# Subsequent backward pass
print("2nd last layer")
Z = Zs[-2]
sp = sigmoid_prime(Z)

# Propagate the error back through the last weight matrix; np.dot yields
# (n_hidden, mini_batch_size, 1), so the batch axis is moved back to the front.
delta_batch = np.transpose(np.dot(net.weights[-1].transpose(), delta_batch),(1,0,2)) * sp
print("Shape of delta_batch: {}".format(delta_batch.shape))
nabla_b[-2] = np.sum(delta_batch,axis=0)
print("Shape of nabla_b: {}".format(nabla_b[-2].shape))
# activations[-3] is the layer feeding the 2nd-last layer (here the input batch).
nabla_w[-2] = np.squeeze(
    np.dot(np.transpose(delta_batch,(2,1,0)), np.transpose(activations[-3],(2,0,1))),
    axis=(0,2))
print("Shape of nabla_w: {}".format(nabla_w[-2].shape))


Last layer
Shape of delta_batch: (3, 10, 1)
Shape of nabla_b: (10, 1)
Shape of nabla_w: (10, 30)
2nd last layer
Shape of delta_batch: (3, 30, 1)
Shape of nabla_b: (30, 1)
Shape of nabla_w: (30, 784)


## The Final Test

In [2]:
import numpy as np
import random
import mnist_loader
import network_matx

In [3]:
# Load data: load_data_wrapper() yields the (training, validation, test) splits
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
# Create Net: this time from network_matx, the matrix-based implementation under test
net = network_matx.Network([784,30,10])

In [4]:
# Train the matrix-based network with the same positional arguments as the baseline run
# (presumably epochs=5, mini_batch_size=16, eta=3.0 -- TODO confirm against network_matx.py).
net.SGD(training_data,5,16,3.0,test_data)

Epoch 0: 8172/10000
Epoch 1: 8310/10000
Epoch 2: 8420/10000
Epoch 3: 9328/10000
Epoch 4: 9363/10000


## The Comparison Time Test
Yay it works! Now to see if the matrix-based approach has actually made the network faster...

In [12]:
import numpy as np
import random
import mnist_loader
import network
import network_matx
import time

# Number of timed training runs that each benchmark below averages over.
COUNT = 3

Without the matrix-based approach

In [13]:
# Benchmark the loop-based network: time COUNT independent training runs.
time_diff = []
for _run in range(COUNT):
    # Fresh data and a fresh net for every run; only the SGD call itself is timed.
    training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
    net = network.Network([784, 30, 10])
    tic = time.perf_counter()
    net.SGD(training_data, 5, 16, 3.0, test_data)
    toc = time.perf_counter()
    elapsed = toc - tic
    time_diff.append(elapsed)
    print(elapsed)

print("Time taken for 5 epochs of SGD w/o matrix: {:0.4f}".format(np.average(time_diff)))

Epoch 0: 8971/10000
Epoch 1: 9151/10000
Epoch 2: 9220/10000
Epoch 3: 9301/10000
Epoch 4: 9321/10000
Epoch 0: 8169/10000
Epoch 1: 8347/10000
Epoch 2: 8381/10000
Epoch 3: 8444/10000
Epoch 4: 8497/10000
Epoch 0: 9013/10000
Epoch 1: 9159/10000
Epoch 2: 9225/10000
Epoch 3: 9272/10000
Epoch 4: 9330/10000
Time taken for 5 epochs of SGD w/o matrix: 12.1394


With Matrix-based approach

In [14]:
# Benchmark the matrix-based network: time COUNT independent training runs.
time_diff = []
for _run in range(COUNT):
    # Fresh data and a fresh net for every run; only the SGD call itself is timed.
    training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
    net = network_matx.Network([784, 30, 10])
    tic = time.perf_counter()
    net.SGD(training_data, 5, 16, 3.0, test_data)
    toc = time.perf_counter()
    elapsed = toc - tic
    time_diff.append(elapsed)
    print(elapsed)
print("Time taken for 5 epochs of SGD w/ matrix: {:0.4f}".format(np.average(time_diff)))

Epoch 0: 8976/10000
Epoch 1: 9152/10000
Epoch 2: 9240/10000
Epoch 3: 9254/10000
Epoch 4: 9311/10000
8.865893532998598
Epoch 0: 8886/10000
Epoch 1: 9156/10000
Epoch 2: 9233/10000
Epoch 3: 9331/10000
Epoch 4: 9395/10000
8.823941388000094
Epoch 0: 7124/10000
Epoch 1: 7259/10000
Epoch 2: 7328/10000
Epoch 3: 9250/10000
Epoch 4: 9309/10000
8.894066030999966
Time taken for 5 epochs of SGD w/ matrix: 8.8613


As shown above, for 5 epochs of SGD, using the matrix-based approach for each mini-batch is 12.1394 - 8.8613 = 3.2781 seconds faster, which is approximately a 27% decrease in time taken. This is a significant speed-up for SGD.

The following test will use different numbers of epochs to observe how the difference in time taken between the network without the matrix-based approach and the network with it changes.