# Reading data

In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from sklearn.preprocessing import OneHotEncoder
from time import time

In [2]:
file_name = "notMNIST.pickle"
def make_datasets (file, n_training_samples=0, n_dev_samples=0, 
                   n_testing_samples=0, one_hot=False):
    with open (file,'rb') as f:
        dataset = pickle.load(f,encoding='latin1')
        f.close

    train_dataset = dataset['train_dataset']
    train_labels = dataset['train_labels']
    dev_dataset = dataset['valid_dataset']
    dev_labels = dataset['valid_labels']
    test_dataset = dataset['test_dataset']
    test_labels = dataset['test_labels']

    #Prepare training, dev (validation) and final testing data. 
    #It has to be reshaped since (n_samples, n_fatures) are expected

    all_training_samples, width, height = train_dataset.shape
    train_attributes = np.reshape(train_dataset, (all_training_samples, 
                                                  width * height))
    if (n_training_samples != 0):
        train_attributes = train_attributes[0:n_training_samples]
        train_labels = train_labels[0:n_training_samples]

    all_dev_samples, width, height = dev_dataset.shape
    dev_attributes = np.reshape(dev_dataset,
                                       (all_dev_samples, width * height))
    if (n_dev_samples != 0):
        dev_attributes = dev_attributes[0:n_dev_samples]
        dev_labels = dev_labels[0:n_dev_samples]

    all_testing_samples, width, height = test_dataset.shape
    test_attributes = np.reshape(test_dataset, (all_testing_samples, width * height))
    if (n_testing_samples != 0):
        test_attributes = test_attributes[0:n_testing_samples]
        test_labels = test_labels[0:n_testing_samples]

    # If one-hot encoding is requested, then funtion OneHotEcoding 
    # from SciKit-Learn is called    
    if one_hot:
        enc = OneHotEncoder(sparse=False)
        # Labels are one-dimensional vectors, 
        # and are reshaped to matrices of one column
        train_labels = enc.fit_transform(train_labels.reshape(len(train_labels),1))
        dev_labels = enc.fit_transform(dev_labels.reshape(len(dev_labels), 1))
        test_labels = enc.fit_transform(test_labels.reshape(len(test_labels), 1))

    return (train_attributes, train_labels, dev_attributes, 
            dev_labels, test_attributes, test_labels)

In [3]:
NUM_TRAINING_SAMPLES = 10000
NUM_DEV_SAMPLES = 1000
NUM_TESTING_SAMPLES = 1000

In [4]:
x_train, y_train, x_dev, y_dev, x_test, y_test = make_datasets(file_name, 
                                 n_training_samples=NUM_TRAINING_SAMPLES,
                                 n_dev_samples=NUM_DEV_SAMPLES, 
                                 n_testing_samples=NUM_TESTING_SAMPLES,
                                 one_hot=True)

# Building the deep neural network


Hyper-paramenters configuration:

In [5]:
n_epochs = 10000
epochs_to_display = 500
batch_size = 200
initial_learning_rate = 0.0075

n_inputs = len(x_train[0])
#Fibonacci numbers: 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1.597   
n_hidden1 = 377
n_hidden2 = 233
n_hidden3 = 144
n_hidden4 = 89
n_hidden5 = 55
n_hidden6 = 34
n_hidden7 = 21

n_outputs = len(y_train[0])


First, the input __X__ and target __t__ matrices are defined as placeholders:

In [6]:
with tf.name_scope("io"):
    X = tf.placeholder(dtype=tf.float32, shape=(None,n_inputs), name="X")
    t = tf.placeholder(dtype=tf.float32, shape=(None,n_outputs), name="t")

Then, the neural network topology is defined: A full-connected 28x28-300-200-100-10 deep neural network. Note that ReLU is the activation function for the hidden layers, and linear logits with softmax for the output. net_out represents the logits of the output layer.

In [7]:
he_init = tf.contrib.layers.variance_scaling_initializer()

training= tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate_big = 0.40
dropout_rate_small = 0.25
                                      
X_drop = tf.layers.dropout(X, dropout_rate_small, training=training)

def leaky_relu(net, name=None):
    return tf.maximum(0.01 * net, net, name=name)


with tf.name_scope("dnn"):
    #hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1",reuse=tf.AUTO_REUSE,kernel_initializer = he_init)
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=leaky_relu, name="hidden1",reuse=tf.AUTO_REUSE,kernel_initializer = he_init)
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate_big, training=training)
                                      
    #hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=leaky_relu, name="hidden2",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate_big, training=training)
                                      
    #hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden3 = tf.layers.dense(hidden2_drop, n_hidden3, activation=leaky_relu, name="hidden3",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden3_drop = tf.layers.dropout(hidden3, dropout_rate_big, training=training)
    
    hidden4 = tf.layers.dense(hidden3_drop, n_hidden4, activation=leaky_relu, name="hidden4",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden4_drop = tf.layers.dropout(hidden4, dropout_rate_small, training=training)
    
    hidden5 = tf.layers.dense(hidden4_drop, n_hidden5, activation=leaky_relu, name="hidden5",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden5_drop = tf.layers.dropout(hidden5, dropout_rate_small, training=training)
    
    hidden6 = tf.layers.dense(hidden5_drop, n_hidden6, activation=leaky_relu, name="hidden6",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden6_drop = tf.layers.dropout(hidden6, dropout_rate_small, training=training)
    
    hidden7 = tf.layers.dense(hidden6_drop, n_hidden7, activation=leaky_relu, name="hidden7",reuse=tf.AUTO_REUSE, kernel_initializer = he_init)
    hidden7_drop = tf.layers.dropout(hidden7, dropout_rate_small, training=training)
                                      
    net_out = tf.layers.dense(hidden7_drop, n_outputs, name="net_out",reuse=tf.AUTO_REUSE)
    y = tf.nn.softmax(logits=net_out, name="y")
    rounded_y = tf.round(y)

# Loss and cost functions with cross entropy and log-loss

In [8]:
with tf.name_scope("loss"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=t, logits=net_out)
    mean_log_loss = tf.reduce_mean(cross_entropy, name="mean_loss")

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



# Defining the learning algorithm: gradient descent with back-prop

In [9]:
#Momentum
#decay_steps = 10000
#decay_rate = 0.80
#global_step = tf.Variable(0, trainable=False, name="global_step")
#learning_rate = tf.train.exponential_decay(initial_learning_rate,global_step, decay_steps,decay_rate)
#optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95, use_nesterov=True)
#train_step = optimizer.minimize(mean_log_loss, global_step=global_step)

#Adam
decay_momentum = 0.875
decay_scaling = 0.99898989898
epsilon = 6.626e-4   #Planck constant: 6.626069934(89)×10−34
optimizer = tf.train.AdamOptimizer(learning_rate=initial_learning_rate,
                                   beta1=decay_momentum,
                                   beta2=decay_scaling,
                                   epsilon=epsilon,)
train_step = optimizer.minimize(mean_log_loss)

#Orig
#train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(mean_log_loss)

# Evaluating the model

In [10]:
correct_predictions = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions,tf.float32))

# Executing the model

In [11]:
init = tf.global_variables_initializer()

print("Parameters:\n",
    "Batch size: ",batch_size,"\n",
    "Initial learning rate: ",initial_learning_rate,"\n",
    "Neurons in layer 1: ",n_hidden1,"\n",
    "Neurons in layer 2: ",n_hidden2,"\n",
    "Neurons in layer 3: ",n_hidden3,"\n",
    "Neurons in layer 4: ",n_hidden4,"\n",
    "Neurons in layer 5: ",n_hidden5,"\n",
    "Neurons in layer 6: ",n_hidden6,"\n",
    "Neurons in layer 7: ",n_hidden7,"\n",
    "Activation: leaky_relu","\n",
    "Drop rate 'big': ",dropout_rate_big,"\n",
    "Drop rate 'small': ",dropout_rate_small,"\n",
    "Decay M: ",decay_momentum,"\n",
    "Decay S: ",decay_scaling,"\n",
    "Epsilon: ",epsilon,"\n"  
    #"Decay rate: ",decay_rate,"\n",
    #"Momentum: 0.95"
    )

Parameters:
 Batch size:  200 
 Initial learning rate:  0.0075 
 Neurons in layer 1:  377 
 Neurons in layer 2:  233 
 Neurons in layer 3:  144 
 Neurons in layer 4:  89 
 Neurons in layer 5:  55 
 Neurons in layer 6:  34 
 Neurons in layer 7:  21 
 Activation: leaky_relu 
 Drop rate 'big':  0.4 
 Drop rate 'small':  0.25 
 Decay M:  0.875 
 Decay S:  0.99898989898 
 Epsilon:  0.0006626 



In [12]:
start_time = time()

#extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    sess.run(init)
    for epoch in range (int(n_epochs / epochs_to_display)):
        for iteration in range (epochs_to_display):
            offset = (iteration * epoch * batch_size) % (y_train.shape[0] - batch_size)
            sess.run(train_step, feed_dict={training: True, X: x_train, t: y_train})
            #sess.run([train_step,extra_update_ops], feed_dict={training: True, X: x_train, t: y_train})
            
        accuracy_train = accuracy.eval(feed_dict={training: False, X: x_train, t: y_train})
        accuracy_dev = accuracy.eval(feed_dict={training: False, X: x_dev, t: y_dev})
        print((epoch+1)*epochs_to_display, "Train accuracy: ", accuracy_train, 
              "Development accuracy: ", accuracy_dev)

    accuracy_test = accuracy.eval(feed_dict={X: x_test, t: y_test})
    print ("Test accuracy: ", accuracy_test)
    
    print ("Target values:\n", y_test[0:10], "\nComputed values:\n", 
           rounded_y.eval(feed_dict={X: x_test[0:10]}))
    print ("First 100 Predictions: ", 
           correct_predictions.eval(feed_dict={X: x_test[0:100], t: y_test[0:100]}))
print ("Elapsed time: ", time()-start_time, "secs.")

500 Train accuracy:  0.9412 Development accuracy:  0.862
1000 Train accuracy:  0.9816 Development accuracy:  0.862
1500 Train accuracy:  0.9923 Development accuracy:  0.863
2000 Train accuracy:  0.9949 Development accuracy:  0.868
2500 Train accuracy:  0.9956 Development accuracy:  0.867
3000 Train accuracy:  0.9963 Development accuracy:  0.868
3500 Train accuracy:  0.9967 Development accuracy:  0.865
4000 Train accuracy:  0.9965 Development accuracy:  0.874
4500 Train accuracy:  0.9975 Development accuracy:  0.867
5000 Train accuracy:  0.9968 Development accuracy:  0.871
5500 Train accuracy:  0.9973 Development accuracy:  0.871
6000 Train accuracy:  0.9973 Development accuracy:  0.875
6500 Train accuracy:  0.9974 Development accuracy:  0.872
7000 Train accuracy:  0.9975 Development accuracy:  0.874
7500 Train accuracy:  0.9977 Development accuracy:  0.872
8000 Train accuracy:  0.9974 Development accuracy:  0.868
8500 Train accuracy:  0.9976 Development accuracy:  0.867
9000 Train accu