## Trainning (Not using TFRecords)

### What you will see
Basically how to train a network on tensorflow.
1. Model definition
2. Loss definition
3. Session build

In [1]:
import tensorflow as tf
import sys
sys.path.append('../')
import model
import os
import subprocess
import glob
from driving_data import HandleData


# Regularization value
L2NormConst = 0.001
start_lr = 0.001
batch_size=100
epochs = 600
input_train_hdf5 = '../Track1_LMDB_Unbalanced'
input_val_hdf5 = '../DatasetLMDB_Wheel_Test'
logs_path = '../logs'
save_dir = '../save'
iter_disp = 10
gpu_fraction = 0.3

os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

### Define the model

In [2]:
# Open Model
driving_model = model.DrivingModel()
    
# Get placeholders from model
model_in = driving_model.input
model_out = driving_model.output
labels_in = driving_model.label_in
model_drop = driving_model.dropout_control

### Create the Session
Basically ask tensorflow to build the graph

In [3]:
# Avoid allocating the whole memory
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

### Define Loss function

In [4]:
train_vars = tf.trainable_variables()
with tf.name_scope("MSE_Loss_L2Reg"):
    loss = tf.reduce_mean(tf.square(tf.subtract(labels_in, model_out))) + tf.add_n(
        [tf.nn.l2_loss(v) for v in train_vars]) * L2NormConst

# Add model accuracy
with tf.name_scope("Loss_Validation"):
    loss_val = tf.reduce_mean(tf.square(tf.subtract(labels_in, model_out)))

### Define the solver
We want to use the Adam solver to minimize or loss function.

In [5]:
# Solver configuration
# Get ops to update moving_mean and moving_variance from batch_norm
# Reference: https://www.tensorflow.org/api_docs/python/tf/contrib/layers/batch_norm
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.name_scope("Solver"):
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = start_lr
    # decay every 10000 steps with a base of 0.96
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               1000, 0.9, staircase=True)

    # Basically update the batch_norm moving averages before the training step
    # http://ruishu.io/2016/12/27/batchnorm/
    with tf.control_dependencies(update_ops):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

### Initialize the values (Random values of weights)

In [6]:
# Initialize all random variables (Weights/Bias)
sess.run(tf.global_variables_initializer())

### Define saver object to save checkpoints

In [7]:
# Define saver object to save all the variables of the drivingModel graph
saver = tf.train.Saver()

### Add some variables to be observed on Tensorboard

In [8]:
# Create histogram for labels
tf.summary.histogram("steer_angle", labels_in)
# Add input image/steering angle on summary
tf.summary.image("input_image", model_in, 10)

# Monitor loss, learning_rate, global_step, etc...
tf.summary.scalar("loss_train", loss)
tf.summary.scalar("learning_rate", learning_rate)
tf.summary.scalar("global_step", global_step)
# merge all summaries into a single op
merged_summary_op = tf.summary.merge_all()

# Configure where to save the logs for tensorboard
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

### Load the dataset

In [9]:
data = HandleData(path=input_train_hdf5, path_val=input_val_hdf5)
num_images_epoch = int(data.get_num_images() / batch_size)
print('Num samples',data.get_num_images(), 'Iterations per epoch:', num_images_epoch, 'batch size:', batch_size)

Loading training data
LMDB file
Load validation dataset
Number training images: 1040
Number validation images: 1480
Num samples 1040 Iterations per epoch: 10 batch size: 100


### Do training

In [10]:
# For each epoch
for epoch in range(epochs):
    for i in range(int(data.get_num_images() / batch_size)):
        # Get training batch
        xs_train, ys_train = data.LoadTrainBatch(batch_size, should_augment=True)

        # Send training batch to tensorflow graph (Dropout enabled)
        train_step.run(feed_dict={model_in: xs_train, labels_in: ys_train, model_drop: 0.8})

        # Display some information each x iterations
        if i % iter_disp == 0:
            # Get validation batch
            xs, ys = data.LoadValBatch(batch_size)
            # Send validation batch to tensorflow graph (Dropout disabled)
            loss_value = loss_val.eval(feed_dict={model_in: xs, labels_in: ys, model_drop: 1.0})
            print("Epoch: %d, Step: %d, Loss(Val): %g" % (epoch, epoch * batch_size + i, loss_value))

        # write logs at every iteration
        summary = merged_summary_op.eval(feed_dict={model_in: xs_train, labels_in: ys_train, model_drop: 1.0})
        summary_writer.add_summary(summary, epoch * batch_size + i)

    # Save checkpoint after each epoch
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    checkpoint_path = os.path.join(save_dir, "model")
    filename = saver.save(sess, checkpoint_path, global_step=epoch)
    print("Model saved in file: %s" % filename)

    # Shuffle data at each epoch end
    print("Shuffle data")
    data.shuffleData()

Epoch: 0, Step: 0, Loss(Val): 0.158225
Model saved in file: ../save/model-0
Shuffle data
Epoch: 1, Step: 100, Loss(Val): 0.0594684
Model saved in file: ../save/model-1
Shuffle data
Epoch: 2, Step: 200, Loss(Val): 0.0848938
Model saved in file: ../save/model-2
Shuffle data
Epoch: 3, Step: 300, Loss(Val): 0.0765659
Model saved in file: ../save/model-3
Shuffle data
Epoch: 4, Step: 400, Loss(Val): 0.0576228
Model saved in file: ../save/model-4
Shuffle data
Epoch: 5, Step: 500, Loss(Val): 0.0784099
Model saved in file: ../save/model-5
Shuffle data
Epoch: 6, Step: 600, Loss(Val): 0.0810263
Model saved in file: ../save/model-6
Shuffle data
Epoch: 7, Step: 700, Loss(Val): 0.0541961
Model saved in file: ../save/model-7
Shuffle data
Epoch: 8, Step: 800, Loss(Val): 0.0680012
Model saved in file: ../save/model-8
Shuffle data
Epoch: 9, Step: 900, Loss(Val): 0.0700402
Model saved in file: ../save/model-9
Shuffle data
Epoch: 10, Step: 1000, Loss(Val): 0.0632122
Model saved in file: ../save/model-10
S