In [1]:
import numpy as np
import os
import glob
import cv2
import xmltodict
import tensorflow as tf

In [2]:
#Dataset locations, relative to the notebook's working directory.
_voc_root            = 'VOCdevkit/VOC2012'
data_images_path     = _voc_root + '/JPEGImages'
data_annotation_path = _voc_root + '/Annotations'

#Network input size: height x width x channels.
image_height, image_width, image_depth = 448, 448, 3

In [3]:
#The image files and annotation files of the Pascal VOC dataset share the same base name and differ only in
#the extension, so sorting both lists puts an image and its annotation file at the same index position.
#Without recursive=True, glob treats '**' exactly like '*', so a plain '*' states what is really matched.
list_images      = sorted(glob.glob(data_images_path + '/*'))     #length : 17125
list_annotations = sorted(glob.glob(data_annotation_path + '/*')) #length : 17125

#Everything downstream pairs the two lists by index, so a stray or missing file would silently attach the
#wrong label to every later image. Fail loudly here instead.
_image_stems      = [os.path.splitext(os.path.basename(p))[0] for p in list_images]
_annotation_stems = [os.path.splitext(os.path.basename(p))[0] for p in list_annotations]
assert _image_stems == _annotation_stems, 'JPEGImages and Annotations do not contain matching file names'

In [4]:
#Example
print(list_images[400:405])
print(list_annotations[400:405])

['VOCdevkit/VOC2012/JPEGImages/2007_005702.jpg', 'VOCdevkit/VOC2012/JPEGImages/2007_005705.jpg', 'VOCdevkit/VOC2012/JPEGImages/2007_005748.jpg', 'VOCdevkit/VOC2012/JPEGImages/2007_005759.jpg', 'VOCdevkit/VOC2012/JPEGImages/2007_005764.jpg']
['VOCdevkit/VOC2012/Annotations/2007_005702.xml', 'VOCdevkit/VOC2012/Annotations/2007_005705.xml', 'VOCdevkit/VOC2012/Annotations/2007_005748.xml', 'VOCdevkit/VOC2012/Annotations/2007_005759.xml', 'VOCdevkit/VOC2012/Annotations/2007_005764.xml']


In [5]:
def get_total_classes(xml_files=None):
    '''Collect every class name that appears in a set of Pascal VOC annotation files.
       The result is used later to build the one-hot class vector.

       Parameter
       ---------
       xml_files : a list containing the paths to the xml files. When omitted, the module-level
                   `list_annotations` is used (looked up at call time, so re-running the cell that
                   builds the list is picked up without redefining this function).

       Returns
       -------
       A list of the unique class names, sorted in ascending order.
    '''

    if xml_files is None:
        xml_files = list_annotations

    classes = set() #a set removes duplicates as we go

    for file in xml_files: #iterate through every xml file

        #'with' closes the file even when parsing raises.
        with open(file) as f:
            doc = xmltodict.parse(f.read()) #parse the xml file

        #xmltodict yields a single dict when there is exactly one <object> tag and a list when there are
        #several. Normalise to a list instead of relying on a TypeError, which could hide unrelated errors.
        #A file without any <object> tag contributes no classes.
        objects = doc['annotation'].get('object', [])
        if not isinstance(objects, list):
            objects = [objects]

        classes.update(obj['name'] for obj in objects)

    return sorted(classes)

In [6]:
#Build the class vocabulary from every annotation file and show it.
classes = get_total_classes(xml_files=list_annotations)
print(classes)

['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']


In [7]:
#YOLO grid hyper-parameters.
S, B = 7, 2          #S x S grid cells per image ; B bounding boxes predicted per cell
C    = len(classes)  #number of object classes (20)

In [8]:
def _encode_object(obj, img_width, img_height):
    '''Convert one parsed <object> entry into (cell index, label row of length 5+C).

       Parameter
       ---------
       obj        : dict for a single <object> tag as produced by xmltodict
       img_width  : width of the annotated image in pixels
       img_height : height of the annotated image in pixels
    '''

    box = obj['bndbox']
    #float() accepts the usual integer strings and also tolerates fractional coordinates,
    #on which int() would raise a ValueError.
    x_min, x_max = float(box['xmin']), float(box['xmax'])
    y_min, y_max = float(box['ymin']), float(box['ymax'])

    #center of the box (midpoint of the two corners, NOT max - min, which is the box size).
    center_x = (x_min + x_max) / 2
    center_y = (y_min + y_max) / 2

    #center expressed in grid units: the integer part is the responsible cell, the fractional
    #part is the offset inside that cell, already normalized to [0, 1].
    grid_x = center_x * S / img_width
    grid_y = center_y * S / img_height

    #'If the center of an object falls into a grid cell, that grid cell is responsible for detecting
    #that object.' (YOLO paper). Clamp so a center lying exactly on the right/bottom border still maps
    #to the last cell instead of indexing past the grid.
    col = min(max(int(grid_x), 0), S - 1)
    row = min(max(int(grid_y), 0), S - 1)

    x = grid_x - col
    y = grid_y - row

    #width and height of the bounding box relative to the entire image's width and height.
    w = (x_max - x_min) / img_width
    h = (y_max - y_min) / img_height

    #one-hot *list* for the class. classes.index raises ValueError for a name that is not in `classes`.
    one_hot_list = [0] * C
    one_hot_list[classes.index(obj['name'])] = 1.0

    #Round the floats to 2 decimal places. The fifth element is Pr(object) = 1.
    obj_info = [round(x, 2), round(y, 2), round(w, 2), round(h, 2), 1.0] + one_hot_list

    #cells are numbered left to right, top to bottom, starting from 0.
    return col + row * S, obj_info


def get_label(xml_file_path):
    '''Reads one file's annotation information and converts it to YOLO format.

       Each image gets one row per grid cell, S**2 rows in total. A row has the layout
       [x, y, w, h, Pr(object), Pr(C_0), ..., Pr(C_{C-1})], i.e. length 5 + C (25 for C = 20).
       Rows of cells that are not responsible for any object stay all zero. The network predicts
       B boxes per cell (length 5*B + C), but the label carries a single box per cell.
       If several objects fall into the same cell, the one listed last in the xml file wins.

       Parameter
       ---------
       xml_file_path : path to a Pascal VOC format xml file   | string

       Returns
       -------
       A list of S**2 lists, each of length 5 + C.

       Raises
       ------
       ValueError if an object's class name is not present in `classes`.
    '''

    #'with' closes the file even when parsing raises.
    with open(xml_file_path) as f:
        doc = xmltodict.parse(f.read()) #parse the xml file and convert it into python dict

    annotation = doc['annotation']
    height = int(annotation['size']['height'])
    width  = int(annotation['size']['width'])

    #one independent row of zeros per cell ([[0]*n]*m would alias a single row m times).
    label = [[0] * (5 + C) for _ in range(S**2)]

    #xmltodict yields a single dict when there is exactly one <object> tag and a list when there are
    #several. Normalise to a list so both cases share one code path.
    objects = annotation.get('object', [])
    if not isinstance(objects, list):
        objects = [objects]

    for obj in objects:
        box_position, obj_info = _encode_object(obj, width, height)
        label[box_position] = obj_info #replace the list of zeros

    return label #returns the label of an image

In [9]:
def load_dataset(first_index, last_index):
    '''Load a slice of the dataset: images as a numpy array and annotations in YOLO format.

       Images are read with OpenCV and resized to image_height x image_width.
       NOTE: cv2.imread returns channels in BGR order; no conversion is done here.

       Parameter
       ---------
       first_index : integer, index of the first sample to load (inclusive)
       last_index  : integer, index one past the last sample to load (exclusive)

       Returns
       -------
       (images, labels) : numpy arrays holding last_index - first_index samples each.

       Raises
       ------
       IOError if an image cannot be read.
    '''

    images = [] #initialize an empty list to append the images
    labels = [] #initialize an empty list to append the labels

    for i in range(first_index, last_index):

        im = cv2.imread(list_images[i])                 #read the image from the path
        if im is None:
            #cv2.imread signals failure by returning None instead of raising.
            raise IOError('Could not read image: ' + list_images[i])

        #cv2.resize expects dsize as (width, height).
        im = cv2.resize(im, (image_width, image_height)) #resize the image to 448x448x3
        images.append(im)                                #append the image into the list

        labels.append(get_label(list_annotations[i]))    #label list for the same index

    images = np.asarray(images) #convert the images list into np array
    labels = np.asarray(labels) #convert the label list into np array

    return (images, labels)

In [10]:
#Small sample batch (10 images) used to exercise the graph below.
images, labels = load_dataset(first_index=100, last_index=110)

In [None]:
def iou(pred, truth):
    '''Intersection-over-Union between predicted and ground-truth boxes.

       NOTE(review): this cell was an empty stub, so the box format is an assumption to be confirmed:
       each box is [x_center, y_center, width, height] along the last axis, and all four values of
       both boxes are expressed in the same coordinate frame. Labels from get_label store x, y
       relative to the grid cell and w, h relative to the image, so they must be converted first.

       Parameter
       ---------
       pred  : array-like of shape (..., 4)
       truth : array-like of shape (..., 4), broadcastable against pred

       Returns
       -------
       numpy array of IoU values in [0, 1] with the leading shape of the inputs
       (a 0-d array for a single pair of boxes). Pairs with zero union give 0.
    '''

    pred  = np.asarray(pred, dtype=np.float64)
    truth = np.asarray(truth, dtype=np.float64)

    #corner coordinates (min corner, max corner) of both boxes
    pred_lo,  pred_hi  = pred[..., 0:2]  - pred[..., 2:4] / 2,  pred[..., 0:2]  + pred[..., 2:4] / 2
    truth_lo, truth_hi = truth[..., 0:2] - truth[..., 2:4] / 2, truth[..., 0:2] + truth[..., 2:4] / 2

    #overlap along x and y; negative values mean the boxes do not touch on that axis
    overlap = np.clip(np.minimum(pred_hi, truth_hi) - np.maximum(pred_lo, truth_lo), 0, None)
    intersection = overlap[..., 0] * overlap[..., 1]

    union = pred[..., 2] * pred[..., 3] + truth[..., 2] * truth[..., 3] - intersection

    #avoid dividing by zero for degenerate (zero-area) pairs
    safe_union = np.where(union > 0, union, 1.0)
    return np.where(union > 0, intersection / safe_union, 0.0)

In [17]:
#Graph inputs.
X       = tf.placeholder(tf.float32, shape=(None, image_height, image_width, image_depth)) #(None, 448, 448, 3)
Y       = tf.placeholder(tf.float32, shape=(None, S**2, 5+C)) #(None, 49, 25)
#Fed to tf.nn.dropout as its second positional argument.
#NOTE(review): in TF 1.x that argument is keep_prob (probability of KEEPING a unit) - confirm for your version.
dropout = tf.placeholder(tf.float32)


def _conv(inputs, num_outputs, kernel_size, stride=1):
    '''SAME-padded convolution followed by a leaky-ReLU activation.'''
    return tf.contrib.layers.conv2d(inputs, num_outputs=num_outputs, kernel_size=kernel_size,
                                    stride=stride, padding='SAME', activation_fn=tf.nn.leaky_relu)


def _pool(inputs):
    '''2x2 max-pooling with stride 2 (halves height and width).'''
    return tf.nn.max_pool(inputs, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')


#Convolutional feature extractor. The comment next to each layer gives its output size.
conv1      = _conv(X, 64, 7, stride=2)   #(None, 224, 224, 64)
conv1_pool = _pool(conv1)                #(None, 112, 112, 64)

conv2      = _conv(conv1_pool, 128, 3)   #(None, 112, 112, 128)
conv2_pool = _pool(conv2)                #(None, 56, 56, 128)

conv3      = _conv(conv2_pool, 192, 1)   #(None, 56, 56, 192)
conv4      = _conv(conv3, 256, 3)        #(None, 56, 56, 256)
conv5      = _conv(conv4, 256, 1)        #(None, 56, 56, 256)
conv5_pool = _pool(conv5)                #(None, 28, 28, 256)

conv6      = _conv(conv5_pool, 512, 3)   #(None, 28, 28, 512)
conv7      = _conv(conv6, 512, 1)        #(None, 28, 28, 512)
conv7_pool = _pool(conv7)                #(None, 14, 14, 512)

conv8      = _conv(conv7_pool, 600, 3)   #(None, 14, 14, 600)
conv8_pool = _pool(conv8)                #(None, 7, 7, 600)

final_conv = _conv(conv8_pool, 600, 3)   #(None, 7, 7, 600)

#Flatten the last feature map. Feature vector shape : (None, 29400)
output_shape   = 7*7*600
feature_vector = tf.reshape(final_conv, (-1, output_shape))

#Weight and bias variables for the fully connected layers.
W1 = tf.Variable(tf.truncated_normal([output_shape, 2048], stddev=0.1))
B1 = tf.Variable(tf.constant(1.0, shape=[2048]))
W2 = tf.Variable(tf.truncated_normal([2048, 7*7*30], stddev=0.1))
B2 = tf.Variable(tf.constant(1.0, shape=[7*7*30]))

#First fully-connected layer with a non-linear activation.
fc1      = tf.add(tf.matmul(feature_vector, W1), B1)
fc1_actv = tf.nn.leaky_relu(fc1)

#Dropout between the two fully-connected layers.
dropout_layer = tf.nn.dropout(fc1_actv, dropout)

#Second fully-connected layer; the sigmoid squashes every output into (0, 1).
fc2    = tf.add(tf.matmul(dropout_layer, W2), B2)
Y_pred = tf.nn.sigmoid(fc2) #shape : [batch_size, 7*7*30]

In [None]:
#Loss function (work in progress).
#Only the two localization terms of the YOLO loss are built here, and only against the first predicted
#box of each cell (channels 0-3 of the prediction). The confidence and classification terms are not
#implemented yet; lambda_noobj is defined for them but not used in this cell.

#constants
lambda_coord = 5
lambda_noobj = 0.5

#(batch, 7*7*30) -> (batch, 49, 30): one row per grid cell.
prediction = tf.reshape(Y_pred, (-1, S**2, 5*B + C))

#Ground-truth Pr(object): 1 for cells responsible for an object, 0 otherwise. It masks out empty cells.
obj_mask = Y[:,:,4]

#Squared error of the box center (x, y) for the responsible cells.
first_part_loss = lambda_coord * tf.reduce_sum(obj_mask * ((prediction[:,:,0] - Y[:,:,0])**2 +
                                                           (prediction[:,:,1] - Y[:,:,1])**2))

#Squared error of the square roots of the box size (w, h) for the responsible cells.
#The square applies to the whole difference for both w and h.
second_part_loss = lambda_coord * tf.reduce_sum(obj_mask * ((tf.sqrt(prediction[:,:,2]) - tf.sqrt(Y[:,:,2]))**2 +
                                                            (tf.sqrt(prediction[:,:,3]) - tf.sqrt(Y[:,:,3]))**2))



In [12]:
#Start a session and initialize every variable of the graph.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

#Smoke test: push the sample batch through the first three layers, one sess.run per layer.
batch_feed = {X: images}
res1, res2, res3 = [sess.run(layer, feed_dict=batch_feed) for layer in (conv1, conv1_pool, conv2)]