In [1]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import BatchNormalization, Conv2D, Input, ZeroPadding2D, LeakyReLU, UpSampling2D

In [2]:
def parse_cfg(cfgfile):
    """Parse a Darknet-style ``.cfg`` file into a list of block dictionaries.

    Every ``[section]`` header starts a new block whose ``'type'`` entry holds
    the section name. Each following ``key=value`` line is added to that block
    until the next header is reached.

    Args:
        cfgfile: Path of the configuration file to read.

    Returns:
        A list of dicts, one per section, in file order. All keys and values
        are whitespace-stripped strings. A file that contains no sections and
        no settings yields an empty list.

    Raises:
        ValueError: If a line is neither blank, a ``#`` comment, a
            ``[section]`` header nor a ``key=value`` pair.
    """
    blocks = []
    holder = {}
    with open(cfgfile, 'r') as file:
        for lineno, raw_line in enumerate(file, start=1):
            # Strip both ends so that indentation, trailing blanks and stray
            # carriage returns never end up inside keys, values or names.
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            if line.startswith('['):
                # A header closes the block collected so far.
                if holder:
                    blocks.append(holder)
                    holder = {}
                # Keep only the text between '[' and the first ']'.
                holder['type'] = line[1:].split(']', 1)[0].strip()
                continue

            # Split on the first '=' only, so values may themselves contain '='.
            key, separator, value = line.partition('=')
            if not separator:
                raise ValueError(
                    "%s:%d: expected 'key=value', got %r" % (cfgfile, lineno, line))
            holder[key.strip()] = value.strip()

    if holder:
        blocks.append(holder)
    return blocks

In [3]:
def _resolve_layer_index(ref, current):
    """Turn a cfg layer reference into an absolute layer index.

    Negative references are relative to the layer being built (``-1`` is the
    previous layer); non-negative references are already absolute.
    """
    ref = int(ref)
    return current + ref if ref < 0 else ref


def _decode_yolo_output(feature_map, anchors, num_classes, input_image):
    """Decode one raw detection feature map into box predictions.

    Args:
        feature_map: Rank-4 tensor; axes 1 and 2 are used as the grid height
            and width, and the last axis must hold
            ``len(anchors) * (5 + num_classes)`` values for the reshape below.
        anchors: List of ``(width, height)`` anchor pairs for this scale.
        num_classes: Number of class scores per box.
        input_image: The model input tensor; its axes 1 and 2 are divided by
            the grid size to obtain the stride of this scale.

    Returns:
        Tensor of shape ``[batch, grid_h * grid_w * len(anchors),
        5 + num_classes]`` laid out as ``(center_x, center_y, width, height,
        confidence, class scores...)``.
    """
    n_anchors = len(anchors)
    out_shape = feature_map.get_shape().as_list()
    grid_h, grid_w = out_shape[1], out_shape[2]

    # Flatten to one row per (cell, anchor); the row order is row-major over
    # (grid row, grid column, anchor).
    raw = tf.reshape(feature_map,
                     [-1, n_anchors * grid_h * grid_w, 5 + num_classes])

    # Sigmoid squashes the center offsets, confidence and class scores to 0..1.
    box_centers = tf.nn.sigmoid(raw[:, :, 0:2])
    box_shapes = raw[:, :, 2:4]
    confidence = tf.nn.sigmoid(raw[:, :, 4:5])
    classes = tf.nn.sigmoid(raw[:, :, 5:num_classes + 5])

    # Box sizes are exponentials scaled by the anchor assigned to each row.
    tiled_anchors = tf.tile(anchors, [grid_h * grid_w, 1])
    box_shapes = tf.exp(box_shapes) * tf.cast(tiled_anchors, dtype=tf.float32)

    # Cell offsets. With meshgrid's default 'xy' indexing the results have
    # shape (len(y), len(x)), so x must span the grid width and y the grid
    # height for the flattened offsets to follow the same row-major order as
    # `raw`. (Using height for x only happens to work for square grids.)
    x = tf.range(grid_w, dtype=tf.float32)
    y = tf.range(grid_h, dtype=tf.float32)
    cx, cy = tf.meshgrid(x, y)
    cx = tf.reshape(cx, (-1, 1))
    cy = tf.reshape(cy, (-1, 1))
    cxy = tf.concat([cx, cy], axis=-1)
    # Repeat each cell offset once per anchor.
    cxy = tf.tile(cxy, [1, n_anchors])
    cxy = tf.reshape(cxy, [1, -1, 2])

    # Scale grid units to input units, ordered (x, y) like the box centers.
    grid_strides = (input_image.shape[2] // grid_w,
                    input_image.shape[1] // grid_h)
    box_centers = (box_centers + cxy) * grid_strides

    return tf.concat([box_centers, box_shapes, confidence, classes], axis=-1)


def YOLOv3Net(cfgfile, model_size, num_classes):
    """Build a Keras model from a Darknet YOLOv3 configuration file.

    The first block of the cfg is skipped (in stock YOLOv3 cfgs this is the
    ``[net]`` hyper-parameter section); every following block is turned into
    layers. Five block types are handled: ``convolutional``, ``upsample``,
    ``route``, ``shortcut`` and ``yolo``. Blocks of any other type add nothing
    and pass the current tensor through.

    Args:
        cfgfile: Path to the ``.cfg`` file, read with ``parse_cfg``.
        model_size: Shape passed to ``Input(shape=...)``; axes 1 and 2 of the
            resulting batched tensor are treated as height and width.
        num_classes: Number of object classes predicted per box.

    Returns:
        A ``tf.keras.Model`` mapping the input image to the predictions of
        all ``yolo`` blocks concatenated along axis 1, i.e. a tensor of shape
        ``[batch, total_boxes, 5 + num_classes]``. The model summary is
        printed before returning.

    Raises:
        ValueError: If the configuration contains no ``yolo`` block.
    """
    blocks = parse_cfg(cfgfile)

    # Output tensor of every processed block, keyed by its layer index, so
    # route and shortcut blocks can look up earlier feature maps.
    outputs = {}
    predictions = []

    inputs = input_image = Input(shape=model_size)
    inputs = inputs / 255.0

    for i, block in enumerate(blocks[1:]):
        block_type = block['type']

        # Convolution, optionally followed by batch normalization and a
        # Leaky ReLU activation.
        if block_type == 'convolutional':
            activation = block['activation']
            n_filters = int(block['filters'])
            kernel_size = int(block['size'])
            conv_stride = int(block['stride'])
            # The cfg stores this as an integer flag, so `batch_normalize=0`
            # must disable batch normalization rather than enable it.
            batch_norm = int(block.get('batch_normalize', 0)) != 0

            # A strided convolution downsamples: pad top/left explicitly and
            # use 'valid' padding instead of 'same'.
            if conv_stride > 1:
                inputs = ZeroPadding2D(((1, 0), (1, 0)))(inputs)

            inputs = Conv2D(n_filters,
                            kernel_size,
                            strides=conv_stride,
                            padding='valid' if conv_stride > 1 else 'same',
                            name='conv_' + str(i),
                            # The batch norm offset makes a conv bias redundant.
                            use_bias=not batch_norm)(inputs)

            if batch_norm:
                inputs = BatchNormalization(name='bnorm_' + str(i))(inputs)
            # Any activation other than 'leaky' is left linear.
            if activation == 'leaky':
                inputs = LeakyReLU(alpha=0.1, name='leaky_' + str(i))(inputs)

        # Upsample the previous feature map by a factor of 'stride', using
        # UpSampling2D's default interpolation.
        elif block_type == 'upsample':
            inputs = UpSampling2D(int(block['stride']))(inputs)

        # Route: re-emit one earlier feature map (e.g. `layers = -4`) or
        # concatenate several along the channel axis (e.g. `layers = -1, 61`).
        elif block_type == 'route':
            sources = [outputs[_resolve_layer_index(ref, i)]
                       for ref in block['layers'].split(',')]
            if len(sources) == 1:
                inputs = sources[0]
            else:
                inputs = tf.concat(sources, axis=-1)

        # Shortcut: skip connection adding the previous layer's feature map
        # to the one referenced by 'from' (e.g. -3 means three layers back).
        elif block_type == 'shortcut':
            skip_index = _resolve_layer_index(block['from'], i)
            inputs = outputs[i - 1] + outputs[skip_index]

        # Yolo: pick the anchors selected by 'mask' and decode the current
        # feature map into box predictions for this scale.
        elif block_type == 'yolo':
            mask = [int(m) for m in block['mask'].split(',')]
            flat = [int(a) for a in block['anchors'].split(',')]
            anchor_pairs = [(flat[j], flat[j + 1])
                            for j in range(0, len(flat), 2)]
            anchors = [anchor_pairs[j] for j in mask]
            predictions.append(
                _decode_yolo_output(inputs, anchors, num_classes, input_image))

        outputs[i] = inputs

    if not predictions:
        raise ValueError(
            "No 'yolo' block found in %r; the model would have no output"
            % (cfgfile,))

    # Stack the predictions of all detection scales along the box axis.
    if len(predictions) == 1:
        out_pred = predictions[0]
    else:
        out_pred = tf.concat(predictions, axis=1)

    model = Model(input_image, out_pred)
    model.summary()
    return model
