In [11]:
# Two hidden Layers
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
np.random.seed(1671)  # seed NumPy's global RNG for reproducibility
# NOTE(review): only NumPy is seeded here; the Keras backend may keep its own
# RNG that needs separate seeding for bit-identical runs -- confirm.

# --- network and training configuration ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 128         # samples per gradient update
VERBOSE = 1              # verbosity level handed to fit() and evaluate()
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # stochastic gradient descent with default settings
N_HIDDEN = 128           # units in the first hidden layer
N_HIDDEN_LAYERS = 2      # total number of hidden layers in this experiment
VALIDATION_SPLIT = 0.2   # fraction of TRAIN that is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 pixels flattened into a single vector

# --- data: MNIST, already split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a RESHAPED-long float32 vector and normalize
# by 255. The row count is read from the array itself instead of being
# hard-coded, so the cell keeps working on a subset of the data.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary (one-hot) class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# First hidden layer: N_HIDDEN units. Each further hidden layer: NB_CLASSES
# units. Output layer: NB_CLASSES units followed by softmax.
# NOTE(review): the hidden layers after the first are only NB_CLASSES wide,
# not N_HIDDEN -- confirm this narrow width is intended.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
for _hidden_idx in range(N_HIDDEN_LAYERS - 1):
    model.add(Dense(NB_CLASSES))
    model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train and evaluate ---
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)
# score[0] is the loss compiled above (lower is better); score[1] is accuracy.
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])


60000 train samples
10000 test samples
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_35 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 10)                1290      
_________________________________________________________________
activation_36 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 10)                110       
_________________________________________________________________
activation_37 (Activation)   (None, 10)                0         
Total params: 101,880
Trainable params: 101,880
Non-trainable params: 0
_________

In [12]:
# Three Hidden Layers
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
np.random.seed(1671)  # seed NumPy's global RNG for reproducibility
# NOTE(review): only NumPy is seeded here; the Keras backend may keep its own
# RNG that needs separate seeding for bit-identical runs -- confirm.

# --- network and training configuration ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 128         # samples per gradient update
VERBOSE = 1              # verbosity level handed to fit() and evaluate()
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # stochastic gradient descent with default settings
N_HIDDEN = 128           # units in the first hidden layer
N_HIDDEN_LAYERS = 3      # total number of hidden layers in this experiment
VALIDATION_SPLIT = 0.2   # fraction of TRAIN that is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 pixels flattened into a single vector

# --- data: MNIST, already split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a RESHAPED-long float32 vector and normalize
# by 255. The row count is read from the array itself instead of being
# hard-coded, so the cell keeps working on a subset of the data.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary (one-hot) class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# First hidden layer: N_HIDDEN units. Each further hidden layer: NB_CLASSES
# units. Output layer: NB_CLASSES units followed by softmax.
# NOTE(review): the hidden layers after the first are only NB_CLASSES wide,
# not N_HIDDEN -- confirm this narrow width is intended.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
for _hidden_idx in range(N_HIDDEN_LAYERS - 1):
    model.add(Dense(NB_CLASSES))
    model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train and evaluate ---
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)
# score[0] is the loss compiled above (lower is better); score[1] is accuracy.
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

60000 train samples
10000 test samples
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_38 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_38 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 10)                1290      
_________________________________________________________________
activation_39 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_40 (Dense)             (None, 10)                110       
_________________________________________________________________
activation_40 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_41 (Dense

In [13]:
# Five Hidden Layers
from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
np.random.seed(1671)  # seed NumPy's global RNG for reproducibility
# NOTE(review): only NumPy is seeded here; the Keras backend may keep its own
# RNG that needs separate seeding for bit-identical runs -- confirm.

# --- network and training configuration ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 128         # samples per gradient update
VERBOSE = 1              # verbosity level handed to fit() and evaluate()
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # stochastic gradient descent with default settings
N_HIDDEN = 128           # units in the first hidden layer
N_HIDDEN_LAYERS = 5      # total number of hidden layers in this experiment
VALIDATION_SPLIT = 0.2   # fraction of TRAIN that is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 pixels flattened into a single vector

# --- data: MNIST, already split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every 28x28 image into a RESHAPED-long float32 vector and normalize
# by 255. The row count is read from the array itself instead of being
# hard-coded, so the cell keeps working on a subset of the data.
X_train = X_train.reshape(X_train.shape[0], RESHAPED).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary (one-hot) class matrices
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model ---
# First hidden layer: N_HIDDEN units. Each further hidden layer: NB_CLASSES
# units. Output layer: NB_CLASSES units followed by softmax.
# NOTE(review): the hidden layers after the first are only NB_CLASSES wide,
# not N_HIDDEN -- confirm this narrow width is intended.
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
for _hidden_idx in range(N_HIDDEN_LAYERS - 1):
    model.add(Dense(NB_CLASSES))
    model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()

# --- train and evaluate ---
model.compile(loss='categorical_crossentropy',
              optimizer=OPTIMIZER,
              metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=BATCH_SIZE, epochs=NB_EPOCH,
                    verbose=VERBOSE, validation_split=VALIDATION_SPLIT)
# score[0] is the loss compiled above (lower is better); score[1] is accuracy.
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

60000 train samples
10000 test samples
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 128)               100480    
_________________________________________________________________
activation_42 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 10)                1290      
_________________________________________________________________
activation_43 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_44 (Dense)             (None, 10)                110       
_________________________________________________________________
activation_44 (Activation)   (None, 10)                0         
_________________________________________________________________
dense_45 (Dense

Through my testing I found that:

Two Hidden Layers
Test score: 0.18953929247111082
Test accuracy: 0.9437000155448914

Three Hidden Layers
Test score: 0.17509758788496255
Test accuracy: 0.9510999917984009

Five Hidden Layers
Test score: 0.18540319982618095
Test accuracy: 0.9469000101089478


Between Two and Three Hidden Layers (2 - 3): the test score is the cross-entropy loss, so lower is better. Two hidden layers had the higher (worse) test score and also the lower (worse) test accuracy.
Test Score Difference = 0.01444 (Worse)
Test Accuracy Difference = -0.00740 (Worse)

Between Two and Five Hidden Layers (2 - 5): the test score is the cross-entropy loss, so lower is better. Two hidden layers had the higher (worse) test score and also the lower (worse) test accuracy.
Test Score Difference = 0.00414 (Worse)
Test Accuracy Difference = -0.00320 (Worse)

Between Three and Five Hidden Layers (3 - 5): the test score is the cross-entropy loss, so lower is better. Three hidden layers had the lower (better) test score and also the higher (better) test accuracy.
Test Score Difference = -0.01031 (Better)
Test Accuracy Difference = 0.00420 (Better)

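From these numbers, adding hidden layers did not improve the results monotonically. Three hidden layers gave both the lowest test score (the loss, where lower is better) and the highest test accuracy; five hidden layers came second on both measures; and two hidden layers came last on both. Going from two to three layers helped, but going on to five did not help further. The differences are small (under 0.015 in test score and under 0.008 in test accuracy) and each comes from a single training run, so part of the gap may be run-to-run variation rather than an effect of depth.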