# Bounding Box Prediction

$b_x=\sigma(t_x)+c_x$  

$b_y=\sigma(t_y)+c_y$  

$b_w=p_w e^{t_w}$  

$b_h=p_h e^{t_h}$

![title](imgs/bb-predictions.png)

# Object Score

YOLOv3 predicts an objectness score for each bounding box using logistic regression. 
* This should be 1 if the bounding box prior overlaps a ground truth object by more than any other bounding box prior.
* If a bounding box prior is not the best but does overlap a ground truth object by more than some threshold, the prediction is ignored.
* If a bounding box prior is not assigned to a ground truth object and is not ignored, it incurs no loss for coordinate or class predictions, only for objectness.

# Feature Extractor 

![title](imgs/darknet53.png)

# Predicting bounding boxes across feature maps of different scales

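* After 5 stride-2 downsampling steps (overall stride 32; Darknet-53 downsamples with strided convolutions rather than max pooling), the default (416,416) input gives a (13,13) feature map.
* After 4 downsampling steps (overall stride 16), the default (416,416) input gives a (26,26) feature map.
* After 3 downsampling steps (overall stride 8), the default (416,416) input gives a (52,52) feature map.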
* For the default input image size (416,416), there are $(13\times13+26\times26+52\times52)\times num\_anchors$ bounding box detectors in YOLOv3, where $num\_anchors$ is the number of anchors per scale.
* The smaller feature maps (with larger receptive fields) are used to predict the bounding boxes that refer to the big anchors.
* YOLOv3 upsamples the final (13,13) feature map to (26,26) and (52,52), and concatenates the upsampled maps with feature maps from shallower convolutional layers (which contain more fine-grained detail) in order to predict small objects.

# Code 

In [1]:
from keras import backend as K
import tensorflow as tf
import numpy as np
from keras.models import Model

## yolo loss calculation

In [2]:
def box_iou(b1,b2):
    '''Return the pairwise IoU tensor between two sets of boxes.

    This function is later used in yolo_loss to decide which bounding box
    detectors can be ignored.

    Parameters
    b1:tensor,shape=(i1,...,iN,4), boxes as (x,y,w,h) with (x,y) the box centre
    b2:tensor,shape=(j,4), boxes as (x,y,w,h) with (x,y) the box centre

    Returns
    iou:tensor,shape=(i1,...,iN,j), iou[...,k] is the IoU of each b1 box with b2[k]
    '''
    # Insert a singleton axis so that every b1 box broadcasts against all j boxes of b2.
    b1 = K.expand_dims(b1,axis=-2) #(i1,...,iN,1,4)
    b1_xy = b1[...,0:2]
    b1_wh = b1[...,2:4]
    b1_wh_half = b1_wh/2.0
    b1_mins = b1_xy-b1_wh_half #(i1,...,iN,1,2)
    b1_maxes = b1_xy+b1_wh_half #(i1,...,iN,1,2)

    b2_xy = b2[...,0:2] #(j,2)
    b2_wh = b2[...,2:4]
    b2_wh_half = b2_wh/2.0
    b2_mins = b2_xy-b2_wh_half #(j,2)
    b2_maxes = b2_xy+b2_wh_half #(j,2)

    # Corners of the intersection rectangle; a negative extent means no overlap and is clamped to 0.
    intersect_mins = K.maximum(b1_mins,b2_mins) #(i1,...,iN,j,2)
    intersect_maxes = K.minimum(b1_maxes,b2_maxes) #(i1,...,iN,j,2)
    intersect_wh = K.maximum(intersect_maxes-intersect_mins,0) #(i1,...,iN,j,2)
    intersect_area = intersect_wh[...,0]*intersect_wh[...,1] #(i1,...,iN,j)
    b1_area = b1_wh[...,0]*b1_wh[...,1] #(i1,...,iN,1)
    b2_area = b2_wh[...,0]*b2_wh[...,1] #(j,)
    # union = area(b1) + area(b2) - intersection
    iou = intersect_area/(b1_area+b2_area-intersect_area) #(i1,...,iN,j)
    return iou

def yolo_loss(args,anchors,num_classes,ignore_thresh=0.5):
    '''Return the YOLOv3 loss (a scalar tensor) summed over all output scales.

        args:list of tensors, the yolo_outputs followed by the y_trues
        (len(anchors)//3 tensors of each kind, in the same scale order).

        yolo_outputs:list of tensor, the output of the different convolutional layers of yolo_body.
        Each one is reshaped here to [batch_size,grid_h,grid_w,num_anchors,(5+num_classes)]
        (the xywh are relative values calculated by the equations above).

        y_true: list of tensors represent the true value, shape = [batch_size,grid_h,grid_w,num_anchors,5+num_classes]
        (the xywh are normalized by the size of input image).

        anchors:2-dims array,shape=(N,2); it is indexed with a list of indices,
        so it must support numpy-style fancy indexing.

        num_classes:integer

        ignore_thresh:float, the iou threshold whether to ignore object confidence loss
    '''
    num_layers = len(anchors)//3
    yolo_outputs = args[:num_layers]
    y_trues = args[num_layers:]
    # One group of three anchors per output scale; the first output uses the largest anchors.
    anchor_masks = [[6,7,8],[3,4,5],[0,1,2]] if num_layers==3 else [[3,4,5],[0,1,2]]
    # The first output is treated as the stride-32 feature map, so input size = grid size * 32.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3]*32,dtype=K.dtype(y_trues[0])) # (2,) as (h,w)
    loss = 0

    for layer in range(num_layers):

        yolo_output = yolo_outputs[layer] # [batch_size,grid_h,grid_w,num_anchors*(5+num_classes)]
        y_true = y_trues[layer] # [batch_size,grid_h,grid_w,num_anchor,5+num_classes]
        m = K.shape(yolo_output)[0] # batch size (integer tensor)
        mf = K.cast(m,K.dtype(yolo_output))

        # Keep the integer grid size for reshape/arange/tile and a float copy for arithmetic.
        grid_shape_int = K.shape(yolo_output)[1:3] # (2,) as (h,w)
        grid_h,grid_w = grid_shape_int[0],grid_shape_int[1]
        grid_shape = K.cast(grid_shape_int,dtype=K.dtype(y_true)) # (2,)

        anchors_data = anchors[anchor_masks[layer]]
        num_anchors = len(anchors_data)
        anchors_tensor = K.reshape(K.constant(anchors_data),[1,1,1,num_anchors,2]) # [1,1,1,num_anchors,2]

        raw_pred = K.reshape(yolo_output,[-1,grid_h,grid_w,num_anchors,5+num_classes]) # [batch_size,grid_h,grid_w,num_anchors,5+num_classes]
        object_mask = y_true[...,4:5] # [batch_size,grid_h,grid_w,num_anchors,1]
        true_class_probs = y_true[...,5:] # [batch_size,grid_h,grid_w,num_anchors,num_classes]

        # calculate the xywh loss
        # ** First generate the cell-offset matrix (c_x,c_y); think of np.meshgrid to understand the following code
        grid_y = K.arange(0,stop=grid_h) #(grid_h,)
        grid_y = K.expand_dims(grid_y,axis=1) #(grid_h,1)
        grid_y = K.tile(grid_y,[1,grid_w]) #(grid_h,grid_w)
        grid_x = K.arange(0,stop=grid_w) #(grid_w,)
        grid_x = K.expand_dims(grid_x,axis=0) #(1,grid_w)
        grid_x = K.tile(grid_x,[grid_h,1]) #(grid_h,grid_w)
        grid = K.stack([grid_x,grid_y],axis=2) #(grid_h,grid_w,2)
        grid = K.expand_dims(grid,axis=-2) #(grid_h,grid_w,1,2)
        grid = K.expand_dims(grid,axis=0) #(1,grid_h,grid_w,1,2)
        grid = K.cast(grid,K.dtype(raw_pred)) # arange yields integers; the arithmetic below needs floats
        # ** Second convert the bounding box parameters of y_true into the raw (t_x,t_y,t_w,t_h) space
        # ** by inverting the equations above
        raw_true_xy = y_true[...,:2]*grid_shape[::-1]-grid #[batch_size,grid_h,grid_w,num_anchors,2]
        raw_true_wh = K.log(y_true[...,2:4]*input_shape[::-1]/anchors_tensor) #[batch_size,grid_h,grid_w,num_anchors,2]
        raw_true_wh = K.switch(object_mask,raw_true_wh,K.zeros_like(raw_true_wh))# [batch_size,grid_h,grid_w,num_anchors,2] (replace the -inf from log(0) in empty cells)
        # Weight small boxes more: the scale is 2 - w*h with w,h normalized to [0,1].
        box_loss_scale = 2-y_true[...,2:3]*y_true[...,3:4] # [batch_size,grid_h,grid_w,num_anchors,1]
        # ** calculate the xywh loss
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(raw_true_xy,raw_pred[...,0:2],from_logits=True)
        wh_loss = object_mask * box_loss_scale * K.square(raw_true_wh-raw_pred[...,2:4])
        xy_loss = K.sum(xy_loss)/mf
        wh_loss = K.sum(wh_loss)/mf
        loss += xy_loss
        loss += wh_loss

        # calculate the confidence loss
        # ** First, calculate the iou between all the predicted bounding boxes and all the ground truth boxes,
        # ** and for a predicted box that is not responsible for predicting any ground truth boxes,
        # ** if the predicted bounding box does overlap a ground truth object by more than a threshold,
        # ** the predicted box should be ignored when calculating the confidence loss.
        pred_xy = (K.sigmoid(raw_pred[...,0:2])+grid)/K.cast(grid_shape[::-1],K.dtype(raw_pred)) # [batch_size,grid_h,grid_w,num_anchors,2]
        pred_wh = K.exp(raw_pred[...,2:4])*anchors_tensor/K.cast(input_shape[::-1],K.dtype(raw_pred)) # [batch_size,grid_h,grid_w,num_anchors,2]
        pred_box = K.concatenate([pred_xy,pred_wh],axis=-1) # [batch_size,grid_h,grid_w,num_anchors,4]
        ignore_mask = tf.TensorArray(K.dtype(y_true),size=1,dynamic_size=True)
        object_mask_bool = K.cast(object_mask,'bool')
        # The number of ground truth boxes differs per image, so the batch is processed one image at a time.
        def loop_body(b,ignore_mask):
            true_box = tf.boolean_mask(y_true[b,...,0:4],object_mask_bool[b,...,0]) # [j,4]
            iou = box_iou(pred_box[b,...,0:4],true_box)# [grid_h,grid_w,num_anchors,j]
            best_iou = K.max(iou,axis=-1) #[grid_h,grid_w,num_anchors]
            # 1 where the prediction overlaps no ground truth box by ignore_thresh or more (keep its no-object loss), else 0
            ignore_mask = ignore_mask.write(b,K.cast(best_iou<ignore_thresh,K.dtype(true_box)))#list of tensor-[grid_h,grid_w,num_anchors]
            return b+1,ignore_mask
        _, ignore_mask = tf.while_loop(lambda b,*args: b<m, loop_body, [0, ignore_mask])#list of tensor-[grid_h,grid_w,num_anchors]
        ignore_mask = ignore_mask.stack()#[batch_size,grid_h,grid_w,num_anchors]
        ignore_mask = K.expand_dims(ignore_mask,axis=-1) #[batch_size,grid_h,grid_w,num_anchors,1]
        confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
        (1-object_mask) * ignore_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)
        confidence_loss = K.sum(confidence_loss) / mf
        loss += confidence_loss

        # calculate the class loss
        class_loss = object_mask*K.binary_crossentropy(true_class_probs,raw_pred[...,5:],from_logits=True)
        class_loss = K.sum(class_loss)/mf
        loss += class_loss

    return loss

## YOLOv3 Creation

In [3]:
from yolov3_body import YOLOv3

Using TensorFlow backend.


In [4]:
def freeze_layers(model,num_layers=185):
    '''Freeze the first `num_layers` layers of `model`; all later layers are set trainable.

        model: object exposing a `layers` sequence whose items have a `trainable` attribute

        num_layers: integer
        a positive value is the number of leading layers to freeze;
        a zero or negative value is counted from the end, i.e. len(model.layers)+num_layers
        layers are frozen (so 0 freezes every layer).
        185:for the darknet body
        -3:for everything except the three prediction layers
    '''
    num_layers = num_layers if num_layers > 0 else len(model.layers)+num_layers
    for i,layer in enumerate(model.layers):
        # Layers before the cut-off are frozen, the remaining ones stay trainable.
        layer.trainable = i >= num_layers

In [None]:
# Build the YOLOv3 object from the configuration below.
# (416,416) is the default input size assumed throughout these notes.
input_shape = (416,416)
# Paths handed to the YOLOv3 constructor - presumably the anchor list, the class-name list
# and the pretrained weights; verify against yolov3_body.
anchor_path = 'model_data/yolo_anchors.txt'
classes_path = 'model_data/classes.txt'
weights_path = 'model_data/yolo.h5'
# NOTE(review): whether the constructor loads the weights eagerly is not visible here - confirm in yolov3_body.
yolov3 = YOLOv3(input_shape,anchor_path,classes_path,weights_path)