# Neural Network with TensorFlow

In [5]:
import tensorflow as tf
import numpy as np

In [2]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [95]:
reset_graph()

In [3]:
from datetime import datetime

def log_dir(prefix="", model=False, timestamp=False, subdirname=""):
    """
    - model - set True if model is saved (e.g. saver.save())
    - timestamp - set True to create unique directory
    """
    time = ""
    root_dir = "../models" if model else "../tf_logs"
    if prefix:
        prefix += "-"
    if timestamp:
        time = "-" + datetime.utcnow().strftime("%Y%m%d%H%M%S")
    name = prefix + "model" + time
    logdir = "{}/{}".format(root_dir, name) if not subdirname else "{}/{}/{}".format(root_dir, subdirname, name)
    return logdir

In [96]:
n_inputs = 28 * 28  # MNIST image
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

In [97]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

In [99]:
# simple function for a neuron layer - actually we don't need it since TF has its own implementation
def neuron_layer(X, n_neurons, name, activation=None):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        stddev = 2 / np.sqrt(n_inputs + n_neurons)
        init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)
        W = tf.Variable(init, name="kernel")
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")
        Z = tf.matmul(X, W) + b
        if activation is not None:
            return activation(Z)
        else:
            return Z

In [100]:
with tf.name_scope("dnn"):
    hidden1 = neuron_layer(X, n_hidden1, name="hidden1", activation=tf.nn.relu)
    hidden2 = neuron_layer(hidden1, n_hidden2, name="hidden2", activation=tf.nn.relu)
    logits = neuron_layer(hidden2, n_outputs, name="outputs")
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

In [101]:
learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [102]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [103]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [104]:
file_writer = tf.summary.FileWriter(log_dir(prefix="mnist_NN"), tf.get_default_graph())
file_writer.close()

Our graph is ready. Let's start a learning phase:

Prepare the data:

In [6]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
print(X_train.shape, X_train.dtype)
print(y_train.shape, y_train.dtype)
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0   # reshape and scale from 0.0 to 1.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_test = y_test.astype(np.int32)
y_train = y_train.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

(60000, 28, 28) uint8
(60000,) uint8


Start learning:

In [105]:
n_epochs = 40
batch_size = 50

In [82]:
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch

In [81]:
a = np.arange(10)
np.array_split(a, 5)

[array([0, 1]), array([2, 3]), array([4, 5]), array([6, 7]), array([8, 9])]

In [106]:
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tBatch accuracy: {:.5f}\tVal accuracy: {:.5f}".format(epoch, acc_train, acc_val))
    save_path = saver.save(sess, log_dir(prefix="mnist_NN", model=True, subdirname="mnist_NN_model"))

Epoch  0	Batch accuracy: 0.90000	Val accuracy: 0.91460
Epoch  1	Batch accuracy: 0.94000	Val accuracy: 0.93240
Epoch  2	Batch accuracy: 0.94000	Val accuracy: 0.94260
Epoch  3	Batch accuracy: 0.90000	Val accuracy: 0.94900
Epoch  4	Batch accuracy: 0.96000	Val accuracy: 0.95500
Epoch  5	Batch accuracy: 0.94000	Val accuracy: 0.95560
Epoch  6	Batch accuracy: 1.00000	Val accuracy: 0.96080
Epoch  7	Batch accuracy: 0.94000	Val accuracy: 0.96180
Epoch  8	Batch accuracy: 0.98000	Val accuracy: 0.96340
Epoch  9	Batch accuracy: 0.96000	Val accuracy: 0.96380
Epoch 10	Batch accuracy: 0.92000	Val accuracy: 0.96640
Epoch 11	Batch accuracy: 0.98000	Val accuracy: 0.96740
Epoch 12	Batch accuracy: 0.98000	Val accuracy: 0.96720
Epoch 13	Batch accuracy: 0.98000	Val accuracy: 0.96960
Epoch 14	Batch accuracy: 1.00000	Val accuracy: 0.97140
Epoch 15	Batch accuracy: 0.96000	Val accuracy: 0.97240
Epoch 16	Batch accuracy: 1.00000	Val accuracy: 0.97380
Epoch 17	Batch accuracy: 1.00000	Val accuracy: 0.97380
Epoch 18	B

**Use model:**

In [107]:
with tf.Session() as sess:
    saver.restore(sess, save_path)
    X_new = X_test[:20]
    Z = logits.eval(feed_dict={X: X_new})
    y_pred = np.argmax(Z, axis=1)

INFO:tensorflow:Restoring parameters from ../models/mnist_NN_model/mnist_NN-model


In [108]:
import pandas as pd

results = pd.DataFrame({"Actual": y_test[:20], "Prediction": y_pred})
results["Correct"] = results["Actual"] == results["Prediction"]
results

Unnamed: 0,Actual,Prediction,Correct
0,7,7,True
1,2,2,True
2,1,1,True
3,0,0,True
4,4,4,True
5,1,1,True
6,4,4,True
7,9,9,True
8,5,5,True
9,9,9,True


### Using ``tf.layer.dense`` in place of ``neuron_layer``

In [109]:
reset_graph()

In [110]:
n_inputs = 28 * 28  # MNIST image
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.01

In [111]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.relu)
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=tf.nn.relu)
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

file_writer = tf.summary.FileWriter(log_dir(prefix="TF_dense-mnistNN"), tf.get_default_graph())

In [112]:
n_epochs = 40
batch_size = 50

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tBatch accuracy: {:.5f}\tVal accuracy: {:.5f}".format(epoch, accuracy_batch, accuracy_val))
    save_path = saver.save(sess, log_dir("TF_dense-mnistNN", model=True, subdirname="TF_dense_mnistNN"))

Epoch  0	Batch accuracy: 0.90000	Val accuracy: 0.90240
Epoch  1	Batch accuracy: 0.92000	Val accuracy: 0.92540
Epoch  2	Batch accuracy: 0.94000	Val accuracy: 0.93720
Epoch  3	Batch accuracy: 0.90000	Val accuracy: 0.94160
Epoch  4	Batch accuracy: 0.94000	Val accuracy: 0.94700
Epoch  5	Batch accuracy: 0.94000	Val accuracy: 0.95140
Epoch  6	Batch accuracy: 1.00000	Val accuracy: 0.95480
Epoch  7	Batch accuracy: 0.94000	Val accuracy: 0.96100
Epoch  8	Batch accuracy: 0.96000	Val accuracy: 0.96220
Epoch  9	Batch accuracy: 0.94000	Val accuracy: 0.96500
Epoch 10	Batch accuracy: 0.92000	Val accuracy: 0.96540
Epoch 11	Batch accuracy: 0.98000	Val accuracy: 0.96660
Epoch 12	Batch accuracy: 0.98000	Val accuracy: 0.96820
Epoch 13	Batch accuracy: 0.98000	Val accuracy: 0.97040
Epoch 14	Batch accuracy: 1.00000	Val accuracy: 0.96960
Epoch 15	Batch accuracy: 0.94000	Val accuracy: 0.97180
Epoch 16	Batch accuracy: 0.98000	Val accuracy: 0.97280
Epoch 17	Batch accuracy: 1.00000	Val accuracy: 0.97320
Epoch 18	B

## Batch Normalization

1. $\mu_B = \dfrac{1}{m_B}\sum\limits_{i=1}^{m_B}\mathbf{x}^{(i)}$
2. $\sigma_B^2 = \dfrac{1}{m_B}\sum\limits_{i=1}^{m_B}(\mathbf{x}^{(i)}-\mu_B)^2$
3. $\hat{\mathbf{x}}^{(i)}=\dfrac{\mathbf{x}^{(i)}-\mu_B}{\sqrt{\sigma_B^2+\varepsilon}}$
4. $\mathbf{z}^{(i)} = \gamma\hat{\mathbf{x}}^{(i)} + \beta$


- $\mu_B$ - empirical average value (counted on mini-batch B)
- $\sigma_B$ - empirical standart deviation (counted on mini-batch B)
- $m_B$ - number of samples in mini-batch B
- $\hat{\mathbf{x}}^{(i)}$ - centered around 0 and normalized input
- $\gamma$ - scalability parameter for a layer
- $\beta$ - shift (bias) value for a layer
- $\varepsilon$ - smoothing term (usually $10^{-5}$)
- $\mathbf{z}^{(i)}$ - BN operation output (scaled and shifted version of inputs)

In [113]:
reset_graph()

Creating a graph:

In [32]:
from functools import partial

In [114]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
outputs = 10
learning_rate = 0.01

In [115]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name="training")

with tf.name_scope("dnn"):
    my_batch_norm_layer = partial(tf.layers.batch_normalization, training=training, momentum=0.9)
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
    bn1 = my_batch_norm_layer(hidden1)
    bn1_act = tf.nn.elu(bn1)
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
    bn2 = my_batch_norm_layer(hidden2)
    bn2_act = tf.nn.elu(bn2)
    logits_before_bn = tf.layers.dense(bn2_act, outputs, name="logits")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

file_writer = tf.summary.FileWriter(log_dir("BatchNormalized_model"), tf.get_default_graph())

Start learning:

In [116]:
n_epochs = 20
batch_size = 200

To update $\gamma$ and $\beta$ we have to evaluate some extra operations to calculate them (in addition to training_op node evaluation)

In [117]:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops], feed_dict={training: True, X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tAccuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("BatchNorm", model=True, subdirname="BatchNorm_model"))

Epoch  0	Accuracy: 0.88240
Epoch  1	Accuracy: 0.90940
Epoch  2	Accuracy: 0.91960
Epoch  3	Accuracy: 0.92820
Epoch  4	Accuracy: 0.93180
Epoch  5	Accuracy: 0.93760
Epoch  6	Accuracy: 0.94380
Epoch  7	Accuracy: 0.94700
Epoch  8	Accuracy: 0.94920
Epoch  9	Accuracy: 0.95200
Epoch 10	Accuracy: 0.95360
Epoch 11	Accuracy: 0.95520
Epoch 12	Accuracy: 0.95760
Epoch 13	Accuracy: 0.96220
Epoch 14	Accuracy: 0.96320
Epoch 15	Accuracy: 0.96280
Epoch 16	Accuracy: 0.96580
Epoch 17	Accuracy: 0.96760
Epoch 18	Accuracy: 0.96780
Epoch 19	Accuracy: 0.96860


## Gradient Clipping

In [118]:
reset_graph()

In [119]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

learning_rate = 0.01

In [120]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name="hidden5")
    logits = tf.layers.dense(hidden5, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
with tf.name_scope("train"):
    threshold = 1.0
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var) for grad, var in grads_and_vars]
    training_op = optimizer.apply_gradients(capped_gvs)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

init = tf.global_variables_initializer()
saver = tf.train.Saver()

file_writer = tf.summary.FileWriter(log_dir("GradientClipping_model"), tf.get_default_graph())

In [121]:
n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping", model=True, subdirname="GradClipp_model"))

Epoch  0	Valid accuracy: 0.28760
Epoch  1	Valid accuracy: 0.79400
Epoch  2	Valid accuracy: 0.87980
Epoch  3	Valid accuracy: 0.90620
Epoch  4	Valid accuracy: 0.91640
Epoch  5	Valid accuracy: 0.92240
Epoch  6	Valid accuracy: 0.92940
Epoch  7	Valid accuracy: 0.93560
Epoch  8	Valid accuracy: 0.93840
Epoch  9	Valid accuracy: 0.94180
Epoch 10	Valid accuracy: 0.94580
Epoch 11	Valid accuracy: 0.94720
Epoch 12	Valid accuracy: 0.94760
Epoch 13	Valid accuracy: 0.95340
Epoch 14	Valid accuracy: 0.95660
Epoch 15	Valid accuracy: 0.95640
Epoch 16	Valid accuracy: 0.95760
Epoch 17	Valid accuracy: 0.95920
Epoch 18	Valid accuracy: 0.96260
Epoch 19	Valid accuracy: 0.96140


## Reusing TensorFlow models

In [69]:
import time

In [31]:
reset_graph()

In [32]:
saver = tf.train.import_meta_graph("../models/GradClipp_model/GradientClipping-model.meta")  # Upload whole graph

In [33]:
for op in tf.get_default_graph().get_operations():
    print(op.name)

X
y
hidden1/kernel/Initializer/random_uniform/shape
hidden1/kernel/Initializer/random_uniform/min
hidden1/kernel/Initializer/random_uniform/max
hidden1/kernel/Initializer/random_uniform/RandomUniform
hidden1/kernel/Initializer/random_uniform/sub
hidden1/kernel/Initializer/random_uniform/mul
hidden1/kernel/Initializer/random_uniform
hidden1/kernel
hidden1/kernel/Assign
hidden1/kernel/read
hidden1/bias/Initializer/zeros
hidden1/bias
hidden1/bias/Assign
hidden1/bias/read
dnn/hidden1/MatMul
dnn/hidden1/BiasAdd
dnn/hidden1/Relu
hidden2/kernel/Initializer/random_uniform/shape
hidden2/kernel/Initializer/random_uniform/min
hidden2/kernel/Initializer/random_uniform/max
hidden2/kernel/Initializer/random_uniform/RandomUniform
hidden2/kernel/Initializer/random_uniform/sub
hidden2/kernel/Initializer/random_uniform/mul
hidden2/kernel/Initializer/random_uniform
hidden2/kernel
hidden2/kernel/Assign
hidden2/kernel/read
hidden2/bias/Initializer/zeros
hidden2/bias
hidden2/bias/Assign
hidden2/bias/read
dn

Then we have to create descriptors for the train phase:

In [34]:
X = tf.get_default_graph().get_tensor_by_name("X:0")
y = tf.get_default_graph().get_tensor_by_name("y:0")
accuracy = tf.get_default_graph().get_tensor_by_name("eval/accuracy:0")
training_op = tf.get_default_graph().get_operation_by_name("train/GradientDescent")

And start the train phase:

In [35]:
with tf.Session() as sess:
    saver.restore(sess, "../models/GradClipp_model/GradientClipping-model")  # Restore the model
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping_NEW", model=True, subdirname="GradClipp_model"))

INFO:tensorflow:Restoring parameters from ../models/GradClipp_model/GradientClipping-model
Epoch  0	Valid accuracy: 0.98060
Epoch  1	Valid accuracy: 0.98100
Epoch  2	Valid accuracy: 0.97940
Epoch  3	Valid accuracy: 0.98160
Epoch  4	Valid accuracy: 0.98040
Epoch  5	Valid accuracy: 0.98080
Epoch  6	Valid accuracy: 0.98200
Epoch  7	Valid accuracy: 0.98120
Epoch  8	Valid accuracy: 0.98100
Epoch  9	Valid accuracy: 0.98120
Epoch 10	Valid accuracy: 0.98040
Epoch 11	Valid accuracy: 0.98140
Epoch 12	Valid accuracy: 0.98040
Epoch 13	Valid accuracy: 0.98100
Epoch 14	Valid accuracy: 0.98040
Epoch 15	Valid accuracy: 0.98040
Epoch 16	Valid accuracy: 0.97960
Epoch 17	Valid accuracy: 0.98020
Epoch 18	Valid accuracy: 0.98120
Epoch 19	Valid accuracy: 0.98060


Also we can use only specific parts of the uploaded graph:

In [82]:
reset_graph()

In [83]:
learning_rate = 0.1

In [84]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 20 # new
n_outputs = 10 # new

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")  # new
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")  # new
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

We've created a new graph and we want to use hidden1, hidden2 and hidden3 layers from the uploaded model:

In [85]:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")  #RegExp
restore_saver = tf.train.Saver(reuse_vars)  # to restore layers 1-3 only

t0 = time.time()
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "../models/GradClipp_model/GradientClipping-model")
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping_NEW", model=True, subdirname="GradClipp_model"))
t1 = time.time()
print("Total training time: {:.2f}".format(t1 - t0))

INFO:tensorflow:Restoring parameters from ../models/GradClipp_model/GradientClipping-model
Epoch  0	Valid accuracy: 0.97200
Epoch  1	Valid accuracy: 0.97820
Epoch  2	Valid accuracy: 0.97780
Epoch  3	Valid accuracy: 0.97940
Epoch  4	Valid accuracy: 0.97940
Epoch  5	Valid accuracy: 0.98000
Epoch  6	Valid accuracy: 0.97940
Epoch  7	Valid accuracy: 0.98000
Epoch  8	Valid accuracy: 0.98140
Epoch  9	Valid accuracy: 0.98080
Epoch 10	Valid accuracy: 0.97980
Epoch 11	Valid accuracy: 0.98080
Epoch 12	Valid accuracy: 0.97780
Epoch 13	Valid accuracy: 0.98040
Epoch 14	Valid accuracy: 0.98020
Epoch 15	Valid accuracy: 0.98000
Epoch 16	Valid accuracy: 0.97940
Epoch 17	Valid accuracy: 0.97980
Epoch 18	Valid accuracy: 0.97980
Epoch 19	Valid accuracy: 0.98000
Total training time: 28.16


## Freezing the Lower Layers

In [77]:
reset_graph()

In [78]:
learning_rate = 0.1

In [79]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 20 # new
n_outputs = 10 # new

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")  # new
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")  # new
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="hidden[34]|outputs")  # layers to train
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss, var_list=train_vars)  # train only specific layers

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [80]:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")  #RegExp
restore_saver = tf.train.Saver(reuse_vars)  # to restore layers 1-3 only

In [81]:
t0 = time.time()
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "../models/GradClipp_model/GradientClipping-model")
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping_NEW", model=True, subdirname="GradClipp_model"))
t1 = time.time()
print("Total training time: {:.2f}".format(t1 - t0))

INFO:tensorflow:Restoring parameters from ../models/GradClipp_model/GradientClipping-model
Epoch  0	Valid accuracy: 0.97340
Epoch  1	Valid accuracy: 0.97620
Epoch  2	Valid accuracy: 0.97720
Epoch  3	Valid accuracy: 0.97880
Epoch  4	Valid accuracy: 0.97880
Epoch  5	Valid accuracy: 0.97980
Epoch  6	Valid accuracy: 0.97940
Epoch  7	Valid accuracy: 0.97900
Epoch  8	Valid accuracy: 0.97940
Epoch  9	Valid accuracy: 0.97940
Epoch 10	Valid accuracy: 0.97940
Epoch 11	Valid accuracy: 0.97940
Epoch 12	Valid accuracy: 0.97940
Epoch 13	Valid accuracy: 0.97900
Epoch 14	Valid accuracy: 0.98020
Epoch 15	Valid accuracy: 0.98000
Epoch 16	Valid accuracy: 0.97920
Epoch 17	Valid accuracy: 0.97960
Epoch 18	Valid accuracy: 0.97940
Epoch 19	Valid accuracy: 0.98020
Total training time: 14.63


As we can see with two freezed layers the training process goes ~2 times faster than in previous case

Also we can use ``stop_gradient`` method to freeze all layers below:

In [103]:
reset_graph()

In [104]:
learning_rate = 0.1
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 20 # new
n_outputs = 10 # new

In [105]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")  # freezed
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")  # freezed
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [106]:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")  #RegExp
restore_saver = tf.train.Saver(reuse_vars)  # to restore layers 1-3 only

In [107]:
t0 = time.time()
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "../models/GradClipp_model/GradientClipping-model")
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping_NEW", model=True, subdirname="GradClipp_model"))
t1 = time.time()
print("Total training time: {:.2f}".format(t1 - t0))

INFO:tensorflow:Restoring parameters from ../models/GradClipp_model/GradientClipping-model
Epoch  0	Valid accuracy: 0.97300
Epoch  1	Valid accuracy: 0.97680
Epoch  2	Valid accuracy: 0.97720
Epoch  3	Valid accuracy: 0.97780
Epoch  4	Valid accuracy: 0.97820
Epoch  5	Valid accuracy: 0.97940
Epoch  6	Valid accuracy: 0.97940
Epoch  7	Valid accuracy: 0.97920
Epoch  8	Valid accuracy: 0.97980
Epoch  9	Valid accuracy: 0.97960
Epoch 10	Valid accuracy: 0.97960
Epoch 11	Valid accuracy: 0.98040
Epoch 12	Valid accuracy: 0.97980
Epoch 13	Valid accuracy: 0.97920
Epoch 14	Valid accuracy: 0.97960
Epoch 15	Valid accuracy: 0.98060
Epoch 16	Valid accuracy: 0.98020
Epoch 17	Valid accuracy: 0.98040
Epoch 18	Valid accuracy: 0.98040
Epoch 19	Valid accuracy: 0.98020
Total training time: 14.22


## Caching the Frozen Layers

In [108]:
reset_graph()

In [4]:
learning_rate = 0.1
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 20 # new
n_outputs = 10 # new

In [110]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")  # freezed
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")  # freezed
    hidden2_stop = tf.stop_gradient(hidden2)
    hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu, name="hidden3")
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name="hidden4")
    logits = tf.layers.dense(hidden4, n_outputs, name="outputs")
    
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [111]:
reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden[123]")  #RegExp
restore_saver = tf.train.Saver(reuse_vars)  # to restore layers 1-3 only

In [113]:
t0 = time.time()
with tf.Session() as sess:
    init.run()
    restore_saver.restore(sess, "../models/GradClipp_model/GradientClipping-model")
    for epoch in range(n_epochs):
        h2_cache = sess.run(hidden2, feed_dict={X: X_train})
        h2_cache_valid = sess.run(hidden2, feed_dict={X: X_valid})
        for hidden2_batch, y_batch in shuffle_batch(h2_cache, y_train, batch_size):
            sess.run(training_op, feed_dict={hidden2: hidden2_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={hidden2: h2_cache_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("GradientClipping_NEW", model=True, subdirname="GradClipp_model"))
t1 = time.time()
print("Total training time: {:.2f}".format(t1 - t0))

INFO:tensorflow:Restoring parameters from ../models/GradClipp_model/GradientClipping-model
Epoch  0	Valid accuracy: 0.97420
Epoch  1	Valid accuracy: 0.97600
Epoch  2	Valid accuracy: 0.97820
Epoch  3	Valid accuracy: 0.97820
Epoch  4	Valid accuracy: 0.97880
Epoch  5	Valid accuracy: 0.97860
Epoch  6	Valid accuracy: 0.98000
Epoch  7	Valid accuracy: 0.97920
Epoch  8	Valid accuracy: 0.97980
Epoch  9	Valid accuracy: 0.97880
Epoch 10	Valid accuracy: 0.97940
Epoch 11	Valid accuracy: 0.98020
Epoch 12	Valid accuracy: 0.98040
Epoch 13	Valid accuracy: 0.98020
Epoch 14	Valid accuracy: 0.97960
Epoch 15	Valid accuracy: 0.98020
Epoch 16	Valid accuracy: 0.98040
Epoch 17	Valid accuracy: 0.98020
Epoch 18	Valid accuracy: 0.98060
Epoch 19	Valid accuracy: 0.97920
Total training time: 13.01


# Fast Optimizers

## Momentum Optimization

1. $\mathbf{m}\leftarrow\beta\mathbf{m}-\eta\nabla_{\theta}J(\theta)$
2. $\theta\leftarrow\theta+\mathbf{m}$

When $\nabla_{\theta}J(\theta)=\text{const}$:

$m=\eta\dfrac{1}{1 - \beta}\text{const}$ - e.g. if $\beta=0.9$ the speed will be 10 times higher than normal GD

In [114]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, 
                                       momentum=0.9)

## Nesterov Acceleration Gradient (NAG)

1. $\mathbf{m}\leftarrow\beta\mathbf{m}-\eta\nabla_{\theta}J(\theta + \beta\mathbf{m})$
2. $\theta\leftarrow\theta+\mathbf{m}$

In [115]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9, use_nesterov=True)

## AdaGrad

**Highly not recommended for DNN - can stop too early (before global optimum is reached)**

1. $\mathbf{s}\leftarrow\mathbf{s} + \nabla_{\theta}J(\theta)\otimes\nabla_{\theta}J(\theta)$ ($\otimes$ - elementwise multiplication)
2. $\theta\leftarrow\theta-\eta\nabla_{\theta}J(\theta)\oslash\sqrt{\mathbf{s}+\varepsilon}$ ($\oslash$ - elementwise division)

$s_i\leftarrow s_i + (\partial J(\theta)\:/\:\partial\theta_i)^2$

$\theta_i\leftarrow\theta_i-\eta\partial J(\theta)\:/\:\partial\theta_i\:/\:\sqrt{s_i + \varepsilon}$

In [6]:
optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)

## RMSProp

1. $\mathbf{s}\leftarrow\beta\mathbf{s} + (1-\beta)\nabla_{\theta}J(\theta)\otimes\nabla_{\theta}J(\theta)$
2. $\theta\leftarrow\theta-\eta\nabla_{\theta}J(\theta)\oslash\sqrt{\mathbf{s}+\varepsilon}$

In [5]:
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, 
                                      momentum=0.9, decay=0.9, epsilon=1e-10)

## Adam (adaptive moment estimation)

1. $\mathbf{m}\leftarrow\beta_1\mathbf{m}-(1-\beta_1)\nabla_{\theta}J(\theta)$
2. $\mathbf{s}\leftarrow\beta_2\mathbf{s} + (1-\beta_2)\nabla_{\theta}J(\theta)\otimes\nabla_{\theta}J(\theta)$
3. $\mathbf{m}\leftarrow\dfrac{\mathbf{m}}{1-{\beta_1}^t}$
4. $\mathbf{s}\leftarrow\dfrac{\mathbf{s}}{1-{\beta_2}^t}$
5. $\theta\leftarrow\theta-\eta\mathbf{m}\oslash\sqrt{\mathbf{s}+\varepsilon}$

Ususally $\beta_1 = 0.9$ and $\beta_2 = 0.999$

In [8]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

## Learning Rate Scheduling

In [10]:
reset_graph()

In [11]:
n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32), name="accuracy")

In [13]:
with tf.name_scope("train"):
    initial_learining_rate = 0.1
    decay_steps = 10000
    decay_rate = 1/10
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(initial_learining_rate, global_step, decay_steps, decay_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss, global_step=global_step)

In [14]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [22]:
n_epochs = 5
batch_size = 50

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("LRSchedule", model=True, subdirname="LearningRateSched_model"))

Epoch  0	Valid accuracy: 0.91800
Epoch  1	Valid accuracy: 0.95080
Epoch  2	Valid accuracy: 0.96400
Epoch  3	Valid accuracy: 0.96880
Epoch  4	Valid accuracy: 0.97240


# DNN Regularization
## $\ell_1$ and $\ell_2$ Regularizations

Manually:

In [None]:
reset_graph()

In [10]:
n_inputs = 28 * 28
n_hidden1 = 300
n_outputs = 10
learning_rate = 0.01

In [9]:
X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    logits = tf.layers.dense(hidden1, n_outputs, name="outputs")

In [11]:
W1 = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
W2 = tf.get_default_graph().get_tensor_by_name("outputs/kernel:0")

scale = 0.001

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")
    reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))
    loss = tf.add(base_loss, scale * reg_losses, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32), name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [12]:
n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("L1_Reg", model=True, subdirname="L1_Reg_model"))

Epoch  0	Valid accuracy: 0.79000
Epoch  1	Valid accuracy: 0.84300
Epoch  2	Valid accuracy: 0.86440
Epoch  3	Valid accuracy: 0.87800
Epoch  4	Valid accuracy: 0.88480
Epoch  5	Valid accuracy: 0.88880
Epoch  6	Valid accuracy: 0.89180
Epoch  7	Valid accuracy: 0.89560
Epoch  8	Valid accuracy: 0.89860
Epoch  9	Valid accuracy: 0.90220
Epoch 10	Valid accuracy: 0.90400
Epoch 11	Valid accuracy: 0.90580
Epoch 12	Valid accuracy: 0.90580
Epoch 13	Valid accuracy: 0.90720
Epoch 14	Valid accuracy: 0.90700
Epoch 15	Valid accuracy: 0.90740
Epoch 16	Valid accuracy: 0.90860
Epoch 17	Valid accuracy: 0.90940
Epoch 18	Valid accuracy: 0.90920
Epoch 19	Valid accuracy: 0.90940


Or by using TensorFlow:

In [16]:
from functools import partial

In [88]:
reset_graph()

In [89]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
learning_rate = 0.01

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

In [90]:
scale = 0.001

In [91]:
my_dense_layer = partial(tf.layers.dense, activation=tf.nn.relu, 
                         kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))

with tf.name_scope("dnn"):
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    hidden2 = my_dense_layer(hidden1, n_hidden2, name="hidden2")
    logits = my_dense_layer(hidden2, n_outputs, activation=None, name="outputs")

In [92]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_loss] + reg_losses, name="loss")

In [93]:
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32), name="accuracy")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
file_writer = tf.summary.FileWriter(log_dir("L1_Regul"), tf.get_default_graph())

In [94]:
n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("L1_Reg", model=True, subdirname="L1_Reg_model"))

Epoch  0	Valid accuracy: 0.82740
Epoch  1	Valid accuracy: 0.87660
Epoch  2	Valid accuracy: 0.89520
Epoch  3	Valid accuracy: 0.90180
Epoch  4	Valid accuracy: 0.90840
Epoch  5	Valid accuracy: 0.90960
Epoch  6	Valid accuracy: 0.91260
Epoch  7	Valid accuracy: 0.91540
Epoch  8	Valid accuracy: 0.91780
Epoch  9	Valid accuracy: 0.91900
Epoch 10	Valid accuracy: 0.92000
Epoch 11	Valid accuracy: 0.92240
Epoch 12	Valid accuracy: 0.92120
Epoch 13	Valid accuracy: 0.92280
Epoch 14	Valid accuracy: 0.92240
Epoch 15	Valid accuracy: 0.92160
Epoch 16	Valid accuracy: 0.92180
Epoch 17	Valid accuracy: 0.92280
Epoch 18	Valid accuracy: 0.92160
Epoch 19	Valid accuracy: 0.92140


## Dropout

In [83]:
reset_graph()

In [84]:
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10
learning_rate = 0.01

X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")

In [85]:
training = tf.placeholder_with_default(False, shape=(), name="training")

dropout_rate = 0.5  # == 1 - keep_prob
X_drop = tf.layers.dropout(X, rate=dropout_rate, training=training)  # dropout

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden1_drop = tf.layers.dropout(hidden1, rate=dropout_rate, training=training)  # dropout
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden2_drop = tf.layers.dropout(hidden2, rate=dropout_rate, training=training)  # dropout
    logits = tf.layers.dense(hidden2_drop, n_outputs, name="outputs")

In [86]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32), name="accuracy")
    
with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
file_writer = tf.summary.FileWriter(log_dir("Dropout"), tf.get_default_graph())

In [87]:
n_epochs = 20
batch_size = 200

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print("Epoch {:2}\tValid accuracy: {:.5f}".format(epoch, accuracy_val))
    save_path = saver.save(sess, log_dir("Dropout", model=True, subdirname="Dropout_model"))

Epoch  0	Valid accuracy: 0.92300
Epoch  1	Valid accuracy: 0.94420
Epoch  2	Valid accuracy: 0.95000
Epoch  3	Valid accuracy: 0.96140
Epoch  4	Valid accuracy: 0.96500
Epoch  5	Valid accuracy: 0.96960
Epoch  6	Valid accuracy: 0.97240
Epoch  7	Valid accuracy: 0.97340
Epoch  8	Valid accuracy: 0.97580
Epoch  9	Valid accuracy: 0.97520
Epoch 10	Valid accuracy: 0.97680
Epoch 11	Valid accuracy: 0.97820
Epoch 12	Valid accuracy: 0.97640
Epoch 13	Valid accuracy: 0.97860
Epoch 14	Valid accuracy: 0.97760
Epoch 15	Valid accuracy: 0.98020
Epoch 16	Valid accuracy: 0.97980
Epoch 17	Valid accuracy: 0.97960
Epoch 18	Valid accuracy: 0.98080
Epoch 19	Valid accuracy: 0.98060
