## Region Proposal Network

**Program Flow**

1) Preprocess data to fit the format of Faster R-CNN.
   - Extract information from the image and label files of Pascal VOC.
   - Generate anchors.
   - Keep track of list of anchors that should be ignored (out of boundary anchors).
   - Determine positive and negative anchors for each image data.
   - Determine the difference between positive anchors and the corresponding ground-truth objects based on the given formula.
   - Sample anchors for training (by using the list to keep track of ignored anchors earlier).

2) Build RPN model.
    - Make sure the kernel size selected is the same with the kernel size used when generating anchors earlier.
    - Make sure the ignored anchors do not contribute to loss.
    
3) Post-Processing.
    - Choose prediction anchors that are above a certain confidence level to represent the bounding boxes.
    - Use the correct anchors (generated ones) to apply the regression results to. 
    - Use Non-Max Suppression technique to filter out multiple bounding box predictions on one object.

In [1]:
#Imports
import numpy as np
import os
import glob
import cv2
import xmltodict
import tensorflow as tf
import math
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data_images_path     = '../../VOCdevkit/VOC2012/JPEGImages'
data_annotation_path = '../../VOCdevkit/VOC2012/Annotations'
image_height = 224
image_width  = 224
image_depth  = 3
rpn_kernel_size = 3
subsampled_ratio = 8
anchor_sizes = [32,64,128]
anchor_aspect_ratio = [[1,1],[1,2],[2,1]]
num_anchors_in_box = len(anchor_sizes)*len(anchor_aspect_ratio)
neg_threshold = 0.3
pos_threshold = 0.7
anchor_sampling_amount = 128

In [3]:
#Get the image and annotation file paths
list_images      = sorted([x for x in glob.glob(data_images_path + '/**')])     #length : 17125
list_annotations = sorted([x for x in glob.glob(data_annotation_path + '/**')]) #length : 17125
total_images = len(list_images)

### DATA PREPROCESSING

In [4]:
def get_classes(xml_files=list_annotations):
    '''
    Output: All the distinct classes for this dataset.
    
    '''
    classes = []
    
    for file in xml_files: 

        f = open(file)
        doc = xmltodict.parse(f.read()) #parse the xml file to python dict.

        #Images in the dataset might contain either 1 object or more than 1 object. For images with 1 object, the annotation for the object
        #in the xml file will be located in 'annotation' -> 'object' -> 'name'. For images with more than 1 object, the annotations for the objects
        #will be nested in 'annotation' -> 'object' thus requiring a loop to iterate through them. (Pascal VOC format)

        try: 
            #try iterating through the tag. (For images with more than 1 obj.)
            for obj in doc['annotation']['object']:
                classes.append(obj['name'].lower()) #append the lowercased string.

        except TypeError as e: #iterating through non-nested tags would throw a TypeError.
            classes.append(doc['annotation']['object']['name'].lower()) #append the lowercased string.

        f.close()

    classes = list(set(classes)) #remove duplicates.
    classes.sort()

    #returns a list containing the names of classes after being sorted.
    return classes

In [5]:
classes = get_classes()
num_of_class = len(classes)

In [6]:
def get_labels_from_xml(xml_file_path, num_of_class = num_of_class):
    '''
    Input : A SINGLE xml file and the total number of classes in the dataset. 
    Output: Labels in numpy array format (Object classes their corresponding bounding box coordinates).

    Desc : This function parses a single xml file and outputs the objects classes and their corresponding bounding box coordinates
           [top-left-x, top-left-y, btm-right-x, btm-right-y] on the resized image.

    '''

    f = open(xml_file_path)
    doc = xmltodict.parse(f.read()) #parse the xml file to python dict.

    #get the original image height and width. Images have different height and width from each other.
    ori_img_height = float(doc['annotation']['size']['height'])
    ori_img_width  = float(doc['annotation']['size']['width'])


    class_label = [] #init for keeping track objects' labels.
    bbox_label  = [] #init for keeping track of objects' bounding box (bb).


    #Images in the dataset might contain either 1 object or more than 1 object. For images with 1 object, the annotation for the object
    #in the xml file will be located in 'annotation' -> 'object' -> 'name'. For images with more than 1 object, the annotations for the objects
    #will be nested in 'annotation' -> 'object' thus requiring a loop to iterate through them. (Pascal VOC format)
    try:
        #Try iterating through the tag (For images with more than 1 obj).
        for each_obj in doc['annotation']['object']:

            obj_class = each_obj['name'].lower() #get the label for the object and lowercase the string.

            #Pascal VOC's format to denote bounding boxes are to denote the top left part of the box and the bottom right of the box.
            #the coordinates are in terms of x and y axis for both part of the box.
            x_min = float(each_obj['bndbox']['xmin']) #top left x-axis coordinate.
            x_max = float(each_obj['bndbox']['xmax']) #bottom right x-axis coordinate.
            y_min = float(each_obj['bndbox']['ymin']) #top left y-axis coordinate.
            y_max = float(each_obj['bndbox']['ymax']) #bottom right y-axis coordinate.

        ##################################################################################
        #We want to make sure the coordinates are resized according to the resized image.#
        ##################################################################################

            #All the images will be resized to a fixed size in order to be fixed-size inputs to the neural network model.
            #Therefore, we need to resize the coordinates as well since the coordinates above is based on the original size of the images.

            #In order to find the resized coordinates, we must multiply the ratio of the resized image compared to its original to the coordinates.
            x_min = float((image_width/ori_img_width)*x_min)
            y_min = float((image_height/ori_img_height)*y_min)
            x_max = float((image_width/ori_img_width)*x_max)
            y_max = float((image_height/ori_img_height)*y_max)

            generated_box_info = [x_min, y_min, x_max, y_max]


            index = classes.index(obj_class) #get the index of the object's class.

            #append each object's class label and the bounding box label (converted to Faster R-CNN format) into the list initialized earlier.
            class_label.append(index)
            bbox_label.append(np.asarray(generated_box_info, dtype='float32'))

    except TypeError as e : #happens when the iteration through the tag fails due to only 1 object being in the image.

        #SAME PROCEDURE AS ABOVE !  

        #Getting these information from the XML file differs compared to above,
        obj_class = doc['annotation']['object']['name']
        x_min = float(doc['annotation']['object']['bndbox']['xmin']) 
        x_max = float(doc['annotation']['object']['bndbox']['xmax']) 
        y_min = float(doc['annotation']['object']['bndbox']['ymin']) 
        y_max = float(doc['annotation']['object']['bndbox']['ymax']) 

        x_min = float((image_width/ori_img_width)*x_min)
        y_min = float((image_height/ori_img_height)*y_min)
        x_max = float((image_width/ori_img_width)*x_max)
        y_max = float((image_height/ori_img_height)*y_max)

        generated_box_info = [x_min, y_min, x_max, y_max]

        #Get the index of the class
        index = classes.index(obj_class) 

        class_label.append(index)
        bbox_label.append(np.asarray(generated_box_info, dtype='float32'))


    return class_label, np.asarray(bbox_label)

In [7]:
def generate_anchors(rpn_kernel_size=rpn_kernel_size, subsampled_ratio=subsampled_ratio,
                     anchor_sizes=anchor_sizes, anchor_aspect_ratio=anchor_aspect_ratio):

    '''
    Since Faster R-CNN works by placing different sized anchors throughout an image on different positions, we need to generate the coordinates
    for each of those anchors first before generating the labels required by Faster R-CNN. This function generates the anchors based on number
    of anchors, sizes of anchors, aspect ratios of anchors, RPN's kernel size and subsampled size of the image.

    Output : list of anchors (each anchor is denoted by (x,y,w,h)) and the list if an anchor should be ignored or not.

    '''

    list_of_anchors = []
    anchor_booleans = [] #This is to keep track of an anchor's status. Anchors that are out of boundary are meant to be ignored.

    #get the feature map's height and width after convolutions and poolings.


    #the anchor's center is always at the middle grid of the RPN kernel. Therefore, we need to imitate the kernel slide to get all the
    #centers of the anchor. If a kernel is 3x3, the center will start from (1,1). If a kernel is 5x5, the center will start from (2,2).
    #we use divmod to get the first coordinate of the RPN kernel's middle point.
    starting_center = divmod(rpn_kernel_size, 2)[0]

    anchor_center = [starting_center - 1,starting_center] #-1 on the x-coor because the increment comes first in the while loop below.

    #We want to imitate the kernel sliding with stride 1 until it reaches the ending center. Since index starts with 0, we subtract 1 more to the 
    #width and height of the subsampled image.
    subsampled_height = image_height/subsampled_ratio
    subsampled_width = image_width/subsampled_ratio
    
    while (anchor_center != [subsampled_width - (1 + starting_center), subsampled_height - (1 + starting_center)]): 


        anchor_center[0] += 1 #Increment the x-axis by 1.

        #If the sliding window has reached the last central point at the right side, increase the y-axis by 1 and 
        #reset x-axis to 0.
        if anchor_center[0] > subsampled_width - (1 + starting_center):

            anchor_center[1] += 1
            anchor_center[0] = starting_center

        #Even though we calculate the anchor on the feature map, the anchors are still referenced to the original image. Therefore, 
        #once we obtain the position of the center of the anchor on the feature map, we multiply is by the downsampling ratio to obtain its 
        #center position referenced to the original input image.
        anchor_center_on_image = [anchor_center[0]*subsampled_ratio, anchor_center[1]*subsampled_ratio]

        #We want to calculate the anchor's height and width on all the different variations of aspect ratio and sizes.
        #Iterate through every size defined for each anchor center's position.
        for size in anchor_sizes:

            #Iterate through every aspect ratio for each size.
            for a_ratio in anchor_aspect_ratio:

                #[x,y,w,h] of an anchor.
                anchor_info = [anchor_center_on_image[0], anchor_center_on_image[1], size*a_ratio[0], size*a_ratio[1]]

                #Perform check if a given anchor crosses the boundary of the image or not. Such anchors are to be ignored and will be labelled as 0.
                #Else the anchor will be labelled as 1 (meaning good to go).
                if (anchor_info[0] - anchor_info[2]/2 < 0 or anchor_info[0] + anchor_info[2]/2 > image_width or 
                                        anchor_info[1] - anchor_info[3]/2 < 0 or anchor_info[1] + anchor_info[3]/2 > image_height) :

                    anchor_booleans.append([0.0])

                else:

                    anchor_booleans.append([1.0])

                list_of_anchors.append(anchor_info)

    return list_of_anchors, anchor_booleans

In [8]:
def generate_label(class_labels, ground_truth_boxes, anchors, anchor_booleans, num_class=num_of_class,
                    neg_anchor_thresh = neg_threshold, pos_anchor_thresh = pos_threshold):
    '''
    Each and every anchor that was generated (except for the ignored ones) must be labelled with either positive or negative depending on how
    much the particular anchor box intersects with a ground-truth box. If an anchor is positive, the difference between the anchor's [x,y,w,h] and the
    ground-truth box's [x,y,w,h] must be calculated in order for the network to learn how much to regress later on.

    Input  : Classes, Ground truth box(es) belonging to one image [top-left-x, top-left-y, btm-right-x, btm-right-y], all the anchors and anchor booleans.
    Output : Anchor Booleans (to know which anchor to ignore), Objectness array, Coordinate difference array.
    '''


    number_of_anchors = len(anchors) #Get the total number of anchors.

    #For every anchor, we want a 1-D array that denotes whether the anchor should contribute to the loss or not. By default, all of them contributes.
    #Since we already have the anchor booleans list, we will convert that into a numpy array and reshape it accordingly.
    anchor_boolean_array   = np.reshape(np.asarray(anchor_booleans),(number_of_anchors, 1))
    #For every anchor, we want a 2-D array that denotes whether the IoU of the anchor with a ground-truth object is more than certain threshold or not.
    objectness_label_array = np.zeros((number_of_anchors, 2), dtype=np.float32)
    #For every anchor, we want a 4-D array that denotes how much the anchor should regress in order to fit an object. (Only for positive anchors)
    box_regression_array   = np.zeros((number_of_anchors, 4), dtype=np.float32)
    #For every anchor, we want a num_of_class-D array that denotes which class does the object belongs to (Only if the anchor is positive)
    class_array            = np.zeros((number_of_anchors, num_class), dtype=np.float32)

    #We want to iterate through every ground truth box.
    for j in range(ground_truth_boxes.shape[0]):

        #get the class label
        class_label = class_labels[j]

        #Get the ground truth box's coordinates.
        gt_box_top_left_x = ground_truth_boxes[j][0]
        gt_box_top_left_y = ground_truth_boxes[j][1]
        gt_box_btm_rght_x = ground_truth_boxes[j][2]
        gt_box_btm_rght_y = ground_truth_boxes[j][3]

        #Calculate the area of the original bounding box.1 is added since the index starts from 0 not 1.
        gt_box_area = (gt_box_btm_rght_x - gt_box_top_left_x + 1)*(gt_box_btm_rght_y - gt_box_top_left_y + 1)

        #Loop through the anchors.
    
        for i in range(number_of_anchors):

            #Check if the anchor should be ignored or not. If it is to be ignored, skip this i-th loop.
            if int(anchor_boolean_array[i][0]) == 0:

                continue

            anchor = anchors[i] #Select the i-th anchor [x,y,w,h]

            #Since our anchors are in [x,y,w,h] format, we want to convert them to the [top-left-x, top-left-y, btm-right-x, btm-right-y] first.
            anchor_top_left_x = anchor[0] - anchor[2]/2
            anchor_top_left_y = anchor[1] - anchor[3]/2
            anchor_btm_rght_x = anchor[0] + anchor[2]/2
            anchor_btm_rght_y = anchor[1] + anchor[3]/2

            #Get the area of the bounding box.
            anchor_box_area = (anchor_btm_rght_x - anchor_top_left_x + 1)*(anchor_btm_rght_y - anchor_top_left_y + 1)

            #Determine the intersection rectangle.
            int_rect_top_left_x = max(gt_box_top_left_x, anchor_top_left_x)
            int_rect_top_left_y = max(gt_box_top_left_y, anchor_top_left_y)
            int_rect_btm_rght_x = min(gt_box_btm_rght_x, anchor_btm_rght_x)
            int_rect_btm_rght_y = min(gt_box_btm_rght_y, anchor_btm_rght_y)

            #if the boxes do not intersect, the difference will be < 0. Hence we pick 0 in those cases.
            int_rect_area = max(0, int_rect_btm_rght_x - int_rect_top_left_x + 1)*max(0, int_rect_btm_rght_y - int_rect_top_left_y)

            #Calculate the IoU.
            intersect_over_union = float(int_rect_area / (gt_box_area + anchor_box_area - int_rect_area))

            #If the IoU is above or equal to the set threshold, then we want to label it as a positive anchor.
            #If it is lower or equal to the set threshold, we want to label it as a negative anchor.
            #Normally, there are going to be more negative anchors than the positive ones.
            #In an image, there might be more than 1 object. Therefore, an anchor which was labelled as positive for the first object
            #can be labelled as negative for the second object. We do not want the positive anchors to be overwritten since positive anchors
            #are naturally lower in number. Therefore, if an anchor is already labelled positively, we're not going to label it as negative nor neutral
            #on the following objects.
            if intersect_over_union >= pos_anchor_thresh:


                objectness_label_array[i][0] = 1.0 #positive label is on the left
                objectness_label_array[i][1] = 0.0 #overwrite the negative label in case this anchor is labelled negatively for previous object(s).

                class_array[i][int(class_label)] = 1.0 #Denote the label of the class in the array.

                ##################################################################
                #We want to calculate the regression values for positive anchors.#
                ##################################################################

                #Get the ground-truth box's [x,y,w,h]
                gt_box_center_x = ground_truth_boxes[j][0] + ground_truth_boxes[j][2]/2
                gt_box_center_y = ground_truth_boxes[j][1] + ground_truth_boxes[j][3]/2
                gt_box_width    = ground_truth_boxes[j][2] - ground_truth_boxes[j][0]
                gt_box_height   = ground_truth_boxes[j][3] - ground_truth_boxes[j][1]

                #Regression loss according to the paper.
                delta_x = (gt_box_center_x - anchor[0])/anchor[2]
                delta_y = (gt_box_center_y - anchor[1])/anchor[3]
                delta_w = math.log(gt_box_width/anchor[2])
                delta_h = math.log(gt_box_height/anchor[3])

                #Fill in the calculated values in the array.
                box_regression_array[i][0] = delta_x
                box_regression_array[i][1] = delta_y
                box_regression_array[i][2] = delta_w
                box_regression_array[i][3] = delta_h

            if intersect_over_union <= neg_anchor_thresh:

                #Check if the anchor is already labelled positive or not.
                if int(objectness_label_array[i][0]) == 0:

                    objectness_label_array[i][1] = 1.0

            #These are neutral anchors.
            if intersect_over_union > neg_anchor_thresh and intersect_over_union < pos_anchor_thresh:

                #We do not want to label either the negative or the positive anchors as neutral.
                if int(objectness_label_array[i][0]) == 0 and int(objectness_label_array[i][1]) == 0:
                    anchor_boolean_array[i][0] = 0.0 #Neutral anchors are to be ignored.


    return anchor_boolean_array, objectness_label_array, box_regression_array, class_array

In [9]:
def anchor_sampling(anchor_booleans, objectness_label, anchor_sampling_amount=anchor_sampling_amount):

    '''
    Faster R-CNN randomly samples a fixed amount of negative anchors and positive anchors for training. If we use all the neg and pos anchors,
    our model will overfit on the negative ones as negative anchors are larger in amount compared to the positive anchors.

    Input : anchor booleans and objectness label
    Output: Updated anchor booleans. 

    '''

    positive_count = 0
    negative_count = 0
    

    #Iterate through every label.
    for i in range(objectness_label.shape[0]):

        if int(objectness_label[i][0]) == 1: #If the anchor is positively labelled.

            if positive_count > anchor_sampling_amount: #If the positive anchors are more than the threshold amount, set the boolean to 0.

                anchor_booleans[i][0] = 0.0

            positive_count += 1

        if int(objectness_label[i][1]) == 1: #If the anchor is negatively labelled.

            if negative_count > anchor_sampling_amount: #If the negative anchors are more than the threshold amount, set the boolean to 0.

                anchor_booleans[i][0] = 0.0

            negative_count += 1

    #Return the updated booleans. REMEMBER! This array was passed by reference. 
    return anchor_booleans


In [10]:
def generate_dataset(first_index, last_index, anchors, anchor_booleans):
        '''
        This function generates the dataset using the functions defined earlier for the given number of data. If the write status is True,
        the output of this function will be written on disk.

        Input : starting index and final index of the dataset to be generated.
        Output: Anchor booleans, Objectness Label and Regression Label in specified batches.

        '''
        num_of_anchors = len(anchors)
        
        batch_anchor_booleans   = []
        batch_objectness_array  = []
        batch_regression_array  = []
        batch_class_label_array = []

        for i in range(first_index, last_index):

            #Get the true labels and the ground truth boxes [x,y,w,h] for every file.
            true_labels, ground_truth_boxes = get_labels_from_xml(xml_file_path=list_annotations[i])

            #Get the updated anchor booleans, objectness label and regression label.

            anchor_bools, objectness_label_array, box_regression_array, class_array = generate_label(true_labels, ground_truth_boxes, 
                                                                                                        anchors, anchor_booleans)

            #Get the updated anchor bools based on the sampling.
            anchor_bools = anchor_sampling(anchor_bools, objectness_label_array)

            batch_anchor_booleans.append(anchor_bools)
            batch_objectness_array.append(objectness_label_array)
            batch_regression_array.append(box_regression_array)
            batch_class_label_array.append(class_array)

        batch_anchor_booleans   = np.reshape(np.asarray(batch_anchor_booleans), (-1,num_of_anchors))
        batch_objectness_array  = np.asarray(batch_objectness_array)
        batch_regression_array  = np.asarray(batch_regression_array)
        batch_class_label_array = np.asarray(batch_class_label_array)

        return (batch_anchor_booleans, batch_objectness_array, batch_regression_array, batch_class_label_array)

In [11]:
def read_images(first_index, last_index):
    '''
    Read the image files, resize, normalize the images and return them in a numpy array.
    Input : first and last index.
    Output: Numpy array of images.
    '''
    images_list = []
    
    for i in range(first_index, last_index):
        
        im = cv2.imread(list_images[i])
        im = cv2.resize(im, (image_height, image_width))/255
        
        images_list.append(im)
    
    return np.asarray(images_list)

In [12]:
anchors, an_bools = generate_anchors() #We only need to generate the anchors and the anchor booleans once.
num_of_anchors = len(anchors)

In [13]:
a,b,c,d = generate_dataset(0,1, anchors, an_bools)
a.shape

(1, 6084)

### MODEL BUILDING

In [14]:
learning_rate = 1e-5
epoch = 1000
batch_size = 10
model_checkpoint = './model_ckpt/model.ckpt'
decay_steps = 10000
decay_rate = 0.99
lambda_value = 10

In [15]:
def smooth_func(t):
    
    t = tf.abs(t)
    
    comparison_tensor = tf.ones((num_of_anchors, 4))
    smoothed = tf.where(tf.less(t, comparison_tensor), 0.5*tf.pow(t,2), t - 0.5)
    
    return smoothed

In [16]:
def smooth_L1(pred_box, truth_box):
    
    diff = pred_box - truth_box
    
    smoothed = tf.map_fn(smooth_func, diff)
    
    return smoothed

In [17]:
X       = tf.placeholder(tf.float32, shape=(None, image_height, image_width, image_depth)) 
Y_obj   = tf.placeholder(tf.float32, shape=(None, num_of_anchors,2))
Y_coor  = tf.placeholder(tf.float32, shape=(None, num_of_anchors,4))
anch_bool = tf.placeholder(tf.float32, shape=(None, num_of_anchors))

conv1 = tf.contrib.layers.conv2d(X, num_outputs=64, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=64, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv2_pool = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')


conv3 = tf.contrib.layers.conv2d(conv2_pool, num_outputs=128, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv4 = tf.contrib.layers.conv2d(conv3, num_outputs=128, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv4_pool = tf.nn.max_pool(conv4, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')

conv5 = tf.contrib.layers.conv2d(conv4_pool, num_outputs=512, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv6 = tf.contrib.layers.conv2d(conv5, num_outputs=256, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv7 = tf.contrib.layers.conv2d(conv6, num_outputs=256, kernel_size=3, stride=1, 
                                 padding='SAME', activation_fn=tf.nn.relu)
conv7_pool = tf.nn.max_pool(conv7, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')

In [18]:
rpn_conv = tf.contrib.layers.conv2d(conv7_pool, num_outputs=512, kernel_size=3, stride=1, 
                                    padding='VALID', activation_fn=tf.nn.relu)

obj_conv = tf.contrib.layers.conv2d(rpn_conv, num_outputs=18, kernel_size=1, stride=1, padding='VALID', activation_fn=None)
bb_conv = tf.contrib.layers.conv2d(rpn_conv, num_outputs=36, kernel_size=1, stride=1, padding='VALID', activation_fn=None)

class_conv_reshape = tf.reshape(obj_conv, (-1, num_of_anchors, 2))
anchor_conv_reshape = tf.reshape(bb_conv, (-1, num_of_anchors, 4))

logits = tf.nn.softmax(class_conv_reshape)

global_step = tf.Variable(0, trainable=False)
decayed_lr = tf.train.exponential_decay(learning_rate,
                                            global_step, decay_steps,
                                            decay_rate, staircase=True)



loss1 = 1/256*tf.reduce_sum(anch_bool*(tf.nn.softmax_cross_entropy_with_logits(labels=Y_obj, logits=class_conv_reshape)))
loss2 = lambda_value*(1/128)*tf.reduce_sum((tf.reshape(Y_obj[:,:,0], (-1,num_of_anchors,1)))*smooth_L1(anchor_conv_reshape, Y_coor))

total_loss = loss1 + loss2

optimizer = tf.train.AdamOptimizer(decayed_lr).minimize(total_loss, global_step=global_step)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [19]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

try:
    saver.restore(sess, model_checkpoint)
    print("Model has been loaded!")
    
except:
    
    print("Model is not loaded!")

INFO:tensorflow:Restoring parameters from ./model_ckpt/model.ckpt
Model is not loaded!


In [20]:
#TRAINING 

for epoch_idx in range(epoch): #Each epoch.
    
    #Loop through the whole dataset in batches.
    for start_idx in tqdm(range(0, total_images, batch_size)):
        
        end_idx = start_idx + batch_size
        
        if end_idx >= total_images : end_idx = total_images - 1 #In case the end index exceeded the dataset.
            
        images = read_images(start_idx, end_idx) #Read images.
        
        #Get the labels needed.
        batch_anchor_booleans, batch_objectness_array, batch_regression_array, _ = \
                                                generate_dataset(start_idx,end_idx, anchors, an_bools)
        #Optimize the model.
        _, theloss = sess.run([optimizer, total_loss], feed_dict={X: images,
                                                                  Y_obj:batch_objectness_array,
                                                                  Y_coor: batch_regression_array,
                                                                  anch_bool: batch_anchor_booleans})
    #Save the model periodically.
    if epoch%10 == 0:
        saver.save(sess, model_checkpoint)
    
    print("Epoch : %d, Loss : %g"%(epoch_idx, theloss))

  0%|          | 7/1713 [00:16<1:06:54,  2.35s/it]

KeyboardInterrupt: 