In [1]:
import sys
import time
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import itertools
import os 
import matplotlib.pyplot as plt
import random
import tensorflow as tf
from tensorflow.keras.mixed_precision import experimental as mixed_precision
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.config.optimizer.set_jit(True) # Enable accelerated linear algebra 
import math
from Helper.DataLoader import Loader , batchSize ,genBranch
from Helper.ImitationLearning import Branches , BranchCommands ,create_network , dropoutVec
from Helper.ImageAug import images_aug
from tensorflow.core.protobuf import saver_pb2

Using TensorFlow backend.


In [2]:
trainfromScratch =  True
epochs = 1000
MAX_LR_COUNTER = 3 # model has to perform worse for this number of cases to decrement learning rate 
memory_fraction = 0.5
MAX_LEARNING_RATE = 2e-4
MIN_LEARNING_RATE = 1e-7
LEARNING_RATE_DECAY = 0.5
LEARNING_RATE =  [MAX_LEARNING_RATE] * len(Branches)

In [3]:
train_loader = Loader('/home/pankaj/CARLA_0.8.4/Collected_data/train/*/' ,'training_data',Branches , BranchCommands)
val_loader = Loader('/home/pankaj/CARLA_0.8.4/Collected_data/train/*/' , 'validation_data',Branches , BranchCommands)
dir_path = os.getcwd()
contents= os.listdir(dir_path)
model_path= os.path.join(dir_path, 'models')
logs_path= os.path.join(dir_path ,'logs')
if 'models' not in contents:
    os.mkdir(model_path)
if 'logs'not in contents:
    os.mkdir(logs_path)

for branch in Branches:#,'speed', 'intent'
    print(f"Training points in {branch} is {train_loader.dict[branch]['Count']}")
    print(f"Validation points in {branch} is {val_loader.dict[branch]['Count']}")

total_train = sum(train_loader.dict[branch]['Count'] for branch in Branches if branch not in ["Speed" , "Intent"])
total_val =  sum(val_loader.dict[branch]['Count'] for branch in Branches if branch not in ["Speed" , "Intent"] )
steps_per_epoch = total_train//(batchSize * len(Branches))
print("steps_per_epoch: ",steps_per_epoch)
VAL_STEPS = total_val//(batchSize*len(Branches))
print("VAL_STEPS:", VAL_STEPS)

Training points in Follow_Lane is 82322
Validation points in Follow_Lane is 82322
Training points in Left is 82322
Validation points in Left is 82322
Training points in Right is 82322
Validation points in Right is 82322
Training points in Straight is 82322
Validation points in Straight is 82322
Training points in Speed is 82322
Validation points in Speed is 82322
Training points in Intent is 82322
Validation points in Intent is 82322
steps_per_epoch:  1715
VAL_STEPS: 1715


In [4]:
batchListGenTrain = []
batchListGenVal = []
for branch in Branches:
    miniBatchGen = genBranch(branch = train_loader.dict[branch],command = branch ,batchSize = batchSize)
    batchListGenTrain.append(miniBatchGen)
    miniBatchGen = genBranch(branch = val_loader.dict[branch],command = branch, batchSize = batchSize)
    batchListGenVal.append(miniBatchGen)

from IPython.display import clear_output
while True:
    for j in range(len(Branches)):
        xs , ys = next(batchListGenTrain[j])
        #xs = images_aug(xs)
        xs = np.multiply(xs , 1.0/255.0)
        command = np.eye(len(Branches))[ys[0,24].astype(np.int8)].reshape(1,-1)
        for i in range(batchSize):
            plt.imshow(xs[i])
            plt.show()
            print(f"Steer: {ys[i][0]} Throttle: {ys[i][1]} Brake: {ys[i][2]} TimeStamp: {ys[i][11] }")
            print(f"Speed: {ys[i][10]} Directions: {ys[i][24]} Branch:{Branches[j]} Iterator: {i}") 
            print(f"Ped: {ys[i][25]} Veh: {ys[i][26]} Tra: {ys[i][27]} Command:{command}")
            clear_output(wait = True)        

In [5]:
# Setup tensorflow 
tf.reset_default_graph()
sessGraph = tf.Graph()
# use many gpus
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = memory_fraction 
with sessGraph.as_default():
    sess = tf.Session(graph=sessGraph, config=config)
    with sess.as_default():
        nettensors = create_network()
        sess.run(tf.global_variables_initializer())
        # merge all summaries into a single op
        merged_summary_op = tf.summary.merge_all()
        saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V2)
        if not (trainfromScratch):
            print("loading base model from " , model_path)
            saver.restore(sess, model_path+"/model.ckpt")  # restore trained parameters   
        min_epoch_loss = np.array([[float('inf')]*len(Branches)])
        summary_writer = tf.summary.FileWriter(logs_path, graph=sessGraph)
        tboard_counter = 0
        lr_counter = [0] * len(Branches)
        for epoch in range(epochs): #1st loop for epochs 
            start_time=time.time()
            print(f'Starting epoch: {epoch}')
            #epoch_loss=0
            for step in range(steps_per_epoch):# second loop for each step in a epoch
                #step_loss=0
                for j in range(len(Branches)):# each step will update all braches one at a time  
                    xs , ys = next(batchListGenTrain[j])
                    if step%100 == 0:
                        xs = images_aug(xs)
                    xs = np.multiply(xs , 1.0/255.0)
                    command = np.eye(len(Branches))[ys[0,24].astype(np.int8)].reshape(1,-1)
                    contSolver = nettensors['optimizers']
                    contLoss = nettensors['losses']
                    log = nettensors['Logger']
                    feedDict = {nettensors['inputs'][0]: xs, 
                                nettensors['inputs'][1][0]: command,
                                nettensors['inputs'][1][1]:ys[:,10].reshape([batchSize,1]),
                                nettensors['droput']: dropoutVec, 
                                nettensors['targets'][0]: ys[:,10].reshape([batchSize,1]),
                                nettensors['targets'][1]: ys[:,0:3],
                                nettensors['targets'][2]: ys[:,25:28] ,
                                nettensors['learning_rate']: LEARNING_RATE[j]
                               }  #
                    _,loss,log    = sess.run([contSolver, contLoss, log ], feed_dict = feedDict)
                    #print(log)
                #time.sleep(20)
                summary = merged_summary_op.eval(feed_dict=feedDict)
                summary_writer.add_summary(summary, tboard_counter)
                tboard_counter+=1
            print("Running Validation")
            epoch_loss = np.zeros((1,len(Branches)))
            for step in range(VAL_STEPS):
                step_loss = [0]*len(Branches) 
                for j in range(len(branchConfig)):
                    xs, ys = next(batchListGenVal[j])
                    xs =  np.multiply(xs , 1.0/255.0)
                    contLoss = nettensors['losses'] 
                    log = nettensors['Logger']
                    command = np.eye(len(Branches))[ys[0,24].astype(np.int8)].reshape(1,-1)
                    feedDict = {
                            nettensors['inputs'][0]: xs, 
                            nettensors['inputs'][1][0]: command,
                            nettensors['inputs'][1][1]:ys[:,10].reshape([batchSize,1]),
                            nettensors['droput']:[1] * len(dropoutVec),  
                            nettensors['targets'][0]: ys[:,10].reshape([batchSize,1]),
                            nettensors['targets'][1]: ys[:,0:3],
                            nettensors['targets'][2]: ys[:,25:28],
                            }  
                    loss,log = sess.run([contLoss , log], feed_dict = feedDict)
                    #print(f"Validation--> Step:: {step}  Branch: {branch_map[j]} loss: {loss}" )
                    #print(log)
                    step_loss[j] = loss 
                epoch_loss+=step_loss 
            epoch_loss /= VAL_STEPS    
            branchImprovement = list((epoch_loss < min_epoch_loss)[0])
            print(f"Epoch no. {epoch} took {(time.time() - start_time)//60} minutes")
            print(f"branch improvement: {branchImprovement}")
            print(f"branch losses:{epoch_loss}")
            print(f"Minimum epoch loss: {min_epoch_loss}")
            if np.sum(epoch_loss < min_epoch_loss) > len(Branches)/2:# Loss has decreased in more than half the branches
                min_epoch_loss = epoch_loss               
                print(f"Found better model saving  checkpoint")
                checkpoint_path=os.path.join(model_path , "model.ckpt")
                file_name= saver.save(sess , checkpoint_path)
                for j , imp in enumerate(branchImprovement):
                    if imp:
                        lr_counter[j] = 0                       
            else: # Did not find a better model
                for j , imp in enumerate(branchImprovement):
                    if not imp:
                        lr_counter[j] += 1 # Increment counter only for which improvement was not found    
            for j , imp in enumerate(branchImprovement):            
                if lr_counter[j] ==  MAX_LR_COUNTER:
                    LEARNING_RATE[j] *= LEARNING_RATE_DECAY
                    if LEARNING_RATE[j] <= MIN_LEARNING_RATE:
                        print(f"Last learning rate achieved for {Branches[j]}")
                        LEARNING_RATE[j] = MAX_LEARNING_RATE
                    print(f"Updated learning rate for {Branches[j] }: {LEARNING_RATE[j]} ", )
                    lr_counter[j] = 0
            print(f"lr counter : {lr_counter}")
            print(f"Current learning rate : {LEARNING_RATE}")             
        print("Saving last model")
        checkpoint_path=os.path.join(model_path , "model.ckpt")
        file_name= saver.save(sess , checkpoint_path)




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Tensor("Network/Branch_0/fc_8:0", shape=(?, 3), dtype=float32, device=/device:GPU:0)
Tensor("Network/Branch_1/fc_11:0", shape=(?, 3), dtype=float32, device=/device:GPU:0)
Tensor("Network/Branch_2/fc_14:0", shape=(?, 3), dtype=float32, device=/device:GPU:0)
Tensor("Network/Branch_3/fc_17:0", shape=(?, 3), dtype=float32, device=/device:GPU:0)
Tensor("Network/Branch_4/fc_20:0", shape=(?, 1), dtype=float32, device=/device:GPU:0)
Tensor("Network/Branch_5/fc_23:0", shape=(?, 3), dtype=float32, device=/device:GPU:0)




Starting epoch: 0
Running Validation


NameError: name 'branchConfig' is not defined