<a href="https://colab.research.google.com/github/VedVyapak/YoloV3/blob/main/YoloV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Layer, Conv2D, MaxPool2D, Activation, BatchNormalization, UpSampling2D, ZeroPadding2D, Concatenate, Add, Flatten, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
import random

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archives used later in
# this notebook are read from /content/drive/MyDrive/Okutama.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Number of classes predicted by the detector. Each detection head emits
# 3 * (NUM_CLASS + 5) channels (see YOLOv3), and compute_loss reshapes the raw
# head output with the same value.
NUM_CLASS = 13

In [None]:
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """Batch normalization that only runs in training mode when trainable.

    Freezing a layer (`layer.trainable = False`) and running it in inference
    mode are distinct notions in Keras. This subclass ties them together: the
    `training` flag forwarded to the parent layer is the logical AND of the
    requested flag and `self.trainable`, so a frozen layer always normalizes
    with its stored moving mean/variance and leaves `gamma`/`beta` untouched.
    """

    def call(self, x, training=False):
        # A falsy flag (False or None) is replaced by an explicit boolean
        # tensor so that tf.logical_and below always receives a bool value.
        if not training:
            training = tf.constant(False)
        effective_training = tf.logical_and(training, self.trainable)
        return super().call(x, effective_training)

def convolutional(input_layer, filters_shape, downsample=False, activate=True, bn=True):
    """Apply a Conv2D, optionally followed by batch norm and leaky ReLU.

    Args:
        input_layer: Input tensor of the block.
        filters_shape: Sequence whose first entry is used as the kernel size
            and whose last entry is used as the number of output filters; the
            entries in between are not read here.
        downsample: When true, the input is zero-padded by one pixel on the
            top and left and convolved with stride 2 and 'valid' padding;
            otherwise stride 1 with 'same' padding is used.
        activate: When equal to True, apply leaky ReLU with alpha 0.1.
        bn: When true, apply BatchNormalization and drop the conv bias.

    Returns:
        The output tensor of the block.
    """
    kernel_size = filters_shape[0]
    out_filters = filters_shape[-1]

    if downsample:
        input_layer = tf.keras.layers.ZeroPadding2D(((1, 0), (1, 0)))(input_layer)
    stride = 2 if downsample else 1
    pad_mode = 'valid' if downsample else 'same'

    conv_layer = tf.keras.layers.Conv2D(
        filters=out_filters,
        kernel_size=kernel_size,
        strides=stride,
        padding=pad_mode,
        # The bias is redundant when batch normalization follows the conv.
        use_bias=not bn,
        kernel_regularizer=tf.keras.regularizers.l2(0.0005),
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        bias_initializer=tf.constant_initializer(0.),
    )
    output = conv_layer(input_layer)

    if bn:
        output = BatchNormalization()(output)
    if activate == True:
        output = tf.nn.leaky_relu(output, alpha=0.1)

    return output

def residual_block(input_layer, input_channel, filter_num1, filter_num2):
    """Return `input_layer` plus a 1x1 -> 3x3 convolutional branch.

    Args:
        input_layer: Tensor entering the block; also used as the shortcut.
        input_channel: Passed through in the 1x1 conv's filter-shape tuple.
        filter_num1: Number of filters of the 1x1 convolution.
        filter_num2: Number of filters of the 3x3 convolution. The addition
            below requires this to be shape-compatible with `input_layer`.

    Returns:
        The element-wise sum of the shortcut and the branch output.
    """
    squeezed = convolutional(input_layer, filters_shape=(1, 1, input_channel, filter_num1))
    expanded = convolutional(squeezed, filters_shape=(3, 3, filter_num1, filter_num2))
    return input_layer + expanded

def upsample(input_layer):
    """Double the size of axes 1 and 2 using nearest-neighbour resizing."""
    height, width = input_layer.shape[1], input_layer.shape[2]
    return tf.image.resize(input_layer, (height * 2, width * 2), method='nearest')



In [None]:
def darknet53(input_data):
    """Build the Darknet-53 backbone on top of `input_data`.

    The network is a 32-filter stem convolution followed by five stages. Each
    stage is one stride-2 (downsampling) 3x3 convolution followed by a number
    of residual blocks.

    Returns:
        A tuple `(route_1, route_2, output)`: the outputs of the 256-filter
        stage, the 512-filter stage and the final 1024-filter stage.
    """
    # (output filters, number of residual blocks) for each stage, in order.
    stages = ((64, 1), (128, 2), (256, 8), (512, 8), (1024, 4))

    features = convolutional(input_data, (3, 3, 3, 32))
    in_filters = 32
    stage_outputs = {}

    for out_filters, num_blocks in stages:
        features = convolutional(features, (3, 3, in_filters, out_filters), downsample=True)
        for _ in range(num_blocks):
            features = residual_block(features, out_filters, out_filters // 2, out_filters)
        stage_outputs[out_filters] = features
        in_filters = out_filters

    return stage_outputs[256], stage_outputs[512], features



In [None]:
def YOLOv3(input_layer):
    """Build the three YOLOv3 detection heads on top of Darknet-53.

    Starting from the deepest backbone output, each scale runs a stack of five
    alternating 1x1/3x3 convolutions and a two-layer detection head. Before
    moving to the next scale the features are reduced by a 1x1 convolution,
    upsampled by 2 and concatenated with the matching backbone route.

    Returns:
        `[conv_sbbox, conv_mbbox, conv_lbbox]`: the raw (no batch norm, no
        activation) head outputs, each with 3 * (NUM_CLASS + 5) channels.
    """
    head_channels = 3 * (NUM_CLASS + 5)

    def conv_stack(x, in_ch, narrow, wide):
        # Five convolutions alternating 1x1 (narrow) and 3x3 (wide).
        x = convolutional(x, (1, 1, in_ch, narrow))
        x = convolutional(x, (3, 3, narrow, wide))
        x = convolutional(x, (1, 1, wide, narrow))
        x = convolutional(x, (3, 3, narrow, wide))
        return convolutional(x, (1, 1, wide, narrow))

    def detection_head(x, narrow, wide):
        # 3x3 conv followed by a linear 1x1 conv producing the raw output.
        branch = convolutional(x, (3, 3, narrow, wide))
        return convolutional(branch, (1, 1, wide, head_channels), activate=False, bn=False)

    def merge_with_route(x, narrow, route):
        # Halve the channels, upsample by 2, then concatenate with `route`.
        x = convolutional(x, (1, 1, narrow, narrow // 2))
        x = upsample(x)
        return tf.concat([x, route], axis=-1)

    route_1, route_2, features = darknet53(input_layer)

    features = conv_stack(features, 1024, 512, 1024)
    conv_lbbox = detection_head(features, 512, 1024)

    features = merge_with_route(features, 512, route_2)
    features = conv_stack(features, 768, 256, 512)
    conv_mbbox = detection_head(features, 256, 512)

    features = merge_with_route(features, 256, route_1)
    features = conv_stack(features, 384, 128, 256)
    conv_sbbox = detection_head(features, 128, 256)

    return [conv_sbbox, conv_mbbox, conv_lbbox]


In [None]:
# def decode(conv_output, i=0, STRIDES = [8, 16, 32], ANCHORS = [1.25,1.625, 2.0]):
#     """
#     return tensor of shape [batch_size, output_size, output_size, anchor_per_scale, 5 + num_classes]
#             contains (x, y, w, h, score, probability)
#     """

#     conv_shape       = tf.shape(conv_output)
#     batch_size       = conv_shape[0]
#     output_size      = conv_shape[1]

#     conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

#     conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
#     conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
#     conv_raw_conf = conv_output[:, :, :, :, 4:5]
#     conv_raw_prob = conv_output[:, :, :, :, 5: ]

#     y = tf.tile(tf.range(output_size, dtype=tf.int32)[:, tf.newaxis], [1, output_size])
#     x = tf.tile(tf.range(output_size, dtype=tf.int32)[tf.newaxis, :], [output_size, 1])

#     xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
#     xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, 3, 1])
#     xy_grid = tf.cast(xy_grid, tf.float32)

#     pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * STRIDES[i]
#     pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i]) * STRIDES[i]
#     pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)

#     pred_conf = tf.sigmoid(conv_raw_conf)
#     pred_prob = tf.sigmoid(conv_raw_prob)

#     return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)

In [None]:
# Symbolic 416x416 3-channel input, wired through the YOLOv3 graph.
# `output` is the list [conv_sbbox, conv_mbbox, conv_lbbox] returned by YOLOv3.
input_layer = Input(shape = (416, 416, 3))
output = YOLOv3(input_layer)

In [None]:
# output_tensors = []
# for i, conv_tensor in enumerate(output):
#     pred_tensor = decode(conv_tensor, i)
#     output_tensors.append(conv_tensor)
#     output_tensors.append(pred_tensor)

In [None]:
# print(output_tensors)

In [None]:
# Keras model mapping the image input to the three raw detection-head outputs.
model = Model(inputs = input_layer, outputs = output)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 416, 416, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 416, 416, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 416, 416, 32) 128         conv2d[0][0]                     
__________________________________________________________________________________________________
tf.nn.leaky_relu (TFOpLambda)   (None, 416, 416, 32) 0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [None]:
# Show the symbolic output tensors (one per detection scale) and their shapes.
print(model.output)

[<KerasTensor: shape=(None, 52, 52, 54) dtype=float32 (created by layer 'conv2d_74')>, <KerasTensor: shape=(None, 26, 26, 54) dtype=float32 (created by layer 'conv2d_66')>, <KerasTensor: shape=(None, 13, 13, 54) dtype=float32 (created by layer 'conv2d_58')>]


In [None]:
import zipfile

# (archive on Google Drive, directory to extract into). The four image
# archives share one target directory; the labels are extracted to /content.
_OKUTAMA_ARCHIVES = [
    ("/content/drive/MyDrive/Okutama/Okutama_1.zip", "/content/Okutama_images"),
    ("/content/drive/MyDrive/Okutama/Okutama_2.zip", "/content/Okutama_images"),
    ("/content/drive/MyDrive/Okutama/Okutama_3.zip", "/content/Okutama_images"),
    ("/content/drive/MyDrive/Okutama/Okutama_4.zip", "/content/Okutama_images"),
    ("/content/drive/MyDrive/Okutama/okutama_labels.zip", "/content"),
]

for _archive_path, _target_dir in _OKUTAMA_ARCHIVES:
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(_archive_path, 'r') as zip_ref:
        zip_ref.extractall(_target_dir)

In [None]:
import os

# Root directory of the extracted frames (one sub-directory per sequence) and
# root directory of the annotation files (one "<sequence>.txt" per sequence).
_IMG_ROOT = "/content/Okutama_images/Extracted-Frames-1280x720"
_ANN_ROOT = "/content/3840x2160"

# (sequence name, exclusive upper bound on the frame number to keep).
# Frames numbered at or above the bound are skipped.
# NOTE(review): presumably the bound is the number of annotated frames in the
# sequence -- confirm against the annotation files.
_SEQUENCES = [
    ("1.1.1", 2272),
    ("1.1.3", 1966),
    ("1.1.4", 1950),
    ("1.1.5", 1560),
    ("1.1.6", 2382),
    ("1.1.11", 604),
    ("1.2.2", 1098),
    ("1.2.4", 1973),
    ("1.2.5", 1028),
    ("1.2.6", 1014),
    ("1.2.7", 1846),
    ("1.2.8", 1541),
    ("1.2.9", 1399),
    ("1.2.11", 1822),
    ("2.1.1", 1252),
    ("2.1.2", 1398),
    ("2.1.3", 2878),
    ("2.1.4", 2108),
    ("2.1.5", 1851),
    ("2.1.6", 2519),
    ("2.1.7", 2517),
    ("2.1.10", 2713),
    ("2.2.5", 1062),
    ("2.2.8", 1772),
]

# Parallel lists: for sample k, img_paths[k] is the frame image,
# ann_paths[k] the annotation file of its sequence and frame_no[k] the frame
# number parsed from the image file name.
img_paths = []
ann_paths = []
frame_no = []

for _sequence, _frame_limit in _SEQUENCES:
    img_dir = _IMG_ROOT + "/" + _sequence
    ann_dir = _ANN_ROOT + "/" + _sequence + ".txt"
    for _file_name in os.listdir(img_dir):
        # File names are "<frame number>" plus a 4-character extension.
        frame = int(_file_name[:-4])
        if frame < _frame_limit:
            img_paths.append(img_dir + '/' + _file_name)
            ann_paths.append(ann_dir)
            frame_no.append(frame)

In [None]:
# Sanity check: dump the annotation path recorded for every collected frame.
print(ann_paths)

['/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/content/3840x2160/1.1.1.txt', '/conte

In [None]:
# Convert the three parallel lists to NumPy arrays so they can be sliced
# together into train/validation subsets.
x_data = np.array(img_paths)
y_data = np.array(ann_paths)
# Note: this rebinds `frame_no` from a Python list to an array.
frame_no = np.array(frame_no)

In [None]:
# Number of collected samples.
print(x_data.shape)

(42525,)


In [None]:
def load_annotation(filename, frame):
    """Return the annotated boxes of one frame from an annotation file.

    Each non-blank line of the file is split on single spaces. Field 5 is the
    frame number, fields 1-4 are the integer box corners
    (xmin, ymin, xmax, ymax) and field 10 is the first action label.

    Label handling:
      * a missing or empty label maps to class "None";
      * the token '"Hand' (first half of a two-word label) maps to
        "Handshaking";
      * anything else is matched case-insensitively against the class list,
        ignoring surrounding whitespace so that a label is recognised whether
        or not it is the last token on its line.

    Args:
        filename: Path of the annotation text file.
        frame: Frame number whose boxes are wanted.

    Returns:
        A list of `[xmin, ymin, xmax, ymax, class_index]` entries in file
        order, with corners divided by 3840 (x) and 2160 (y). The list is
        empty when the file has no rows for `frame`.

    Raises:
        ValueError: If a label of the requested frame is not a known class.
    """
    classes = ['"None"', '"Handshaking"', '"Hugging"', '"Reading"', '"Drinking"',
               '"Pushing/Pulling"', '"Carrying"', '"Calling"', '"Running"',
               '"Walking"', '"Lying"', '"Sitting"', '"Standing"']
    classes_lower = [s.lower() for s in classes]

    boxes = []
    with open(filename, 'r') as fp:
        for line in fp:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.split(' ')
            if int(fields[5]) != frame:
                continue

            raw_label = fields[10].strip() if len(fields) > 10 else ''
            if not raw_label:
                label = '"none"'
            elif raw_label == '"Hand':
                label = '"handshaking"'
            else:
                label = raw_label.lower()

            xmin, ymin, xmax, ymax = (int(v) for v in fields[1:5])
            boxes.append([
                xmin / 3840.0,
                ymin / 2160.0,
                xmax / 3840.0,
                ymax / 2160.0,
                classes_lower.index(label),
            ])

    return boxes

In [None]:
def transform_corner_to_center(boxes):
    """Convert a corner-format box to centre/size format.

    The first two entries of `boxes` are treated as one corner and the
    remaining entries as the opposite corner. The result is the concatenation
    (via `tf.concat` on the last axis) of their midpoint and their difference.
    """
    first_corner = boxes[:2]
    second_corner = boxes[2:]
    midpoint = (first_corner + second_corner) / 2
    extent = second_corner - first_corner
    return tf.concat([midpoint, extent], axis=-1)

def transform_center_to_corner(boxes):
    """Convert a centre/size box to corner format.

    The first two entries of `boxes` are treated as the centre and the
    remaining entries as the full size. The result is the NumPy concatenation
    (last axis) of centre - size/2 and centre + size/2.
    """
    center = boxes[:2]
    half_size = boxes[2:] / 2
    lower_corner = center - half_size
    upper_corner = center + half_size
    return np.concatenate([lower_corner, upper_corner], axis=-1)

In [None]:
def bbox_iou(boxes1, boxes2):
    """Intersection-over-union of boxes given as (x_center, y_center, w, h).

    The last axis of both inputs holds the four box values; all leading axes
    are broadcast against each other.

    Returns:
        A tensor of IoU values with the broadcast leading shape. When the
        union area is zero (both boxes degenerate) the result is 0 instead of
        NaN.
    """
    boxes1_area = boxes1[..., 2] * boxes1[..., 3]
    boxes2_area = boxes2[..., 2] * boxes2[..., 3]

    # Convert centre/size to (xmin, ymin, xmax, ymax).
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    # Clamp at zero so non-overlapping boxes get an empty intersection.
    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # divide_no_nan returns 0 where the denominator is 0, which avoids NaN
    # for zero-sized boxes (e.g. zero-padded box slots).
    return tf.math.divide_no_nan(inter_area, union_area)

In [None]:
def bbox_giou(boxes1, boxes2):
    """Generalized IoU of boxes given as (x_center, y_center, w, h).

    GIoU = IoU - (enclosing_area - union_area) / enclosing_area, where the
    enclosing box is the smallest axis-aligned box containing both inputs.
    The last axis of both inputs holds the four box values; leading axes are
    broadcast.

    Returns:
        A tensor of GIoU values. Ratios with a zero denominator (degenerate
        boxes) evaluate to 0 instead of NaN.
    """
    # Convert centre/size to (xmin, ymin, xmax, ymax).
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # Re-order the corners so min <= max even if a width/height is negative.
    boxes1 = tf.concat([tf.minimum(boxes1[..., :2], boxes1[..., 2:]),
                        tf.maximum(boxes1[..., :2], boxes1[..., 2:])], axis=-1)
    boxes2 = tf.concat([tf.minimum(boxes2[..., :2], boxes2[..., 2:]),
                        tf.maximum(boxes2[..., :2], boxes2[..., 2:])], axis=-1)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area
    # divide_no_nan yields 0 where the denominator is 0, so a NaN cannot leak
    # into a loss that later multiplies this value by a 0/1 mask.
    iou = tf.math.divide_no_nan(inter_area, union_area)

    enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
    enclose_area = enclose[..., 0] * enclose[..., 1]
    giou = iou - tf.math.divide_no_nan(enclose_area - union_area, enclose_area)

    return giou

In [None]:
# Grid sizes of the three detection scales (small, medium, large objects).
train_output_sizes = [52, 26, 13]
# Nine (width, height) anchors, reshaped to (scale, anchor, 2).
anchors = np.array([12,16, 19,36, 40,28, 36,75, 76,55, 72,146, 142,110, 192,243, 459,401])
anchors = anchors.reshape(3, 3, 2)
xyscale = [1.2, 1.1, 1.05]


def preprocess_true_boxes(bboxes):
    """Encode ground-truth boxes into per-scale YOLO training targets.

    Args:
        bboxes: Iterable of `[xmin, ymin, xmax, ymax, class_index]` entries.

    Returns:
        `(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes)`.
        Each `label_*` array has shape (grid, grid, 3, 5 + 13) and holds, for
        every responsible anchor, the box as (x_center, y_center, w, h), an
        objectness of 1 and the label-smoothed class vector. Each `*bboxes`
        array has shape (150, 4) and lists the boxes assigned to that scale
        (slots are reused cyclically once more than 150 are assigned).

    NOTE(review): the boxes are divided by the strides (8, 16, 32) and compared
    with `anchors`, both of which look like pixel units at the network input
    size, while load_annotation returns corners normalised to [0, 1]. Confirm
    whether the boxes must be scaled to the input size before this call.
    """
    num_classes = 13
    max_bbox_per_scale = 150
    strides = np.array([8, 16, 32])

    label = [
        np.zeros((size, size, 3, 5 + num_classes))
        for size in train_output_sizes
    ]
    bboxes_xywh = [np.zeros((max_bbox_per_scale, 4)) for _ in range(3)]
    bbox_count = np.zeros((3,))

    for bbox in bboxes:
        bbox_coor = np.array(bbox[:4], dtype=np.float32)
        bbox_class_ind = int(bbox[4])

        # Label smoothing: mix the one-hot vector with a uniform distribution.
        # (np.float was removed from NumPy; np.float64 is the same dtype.)
        onehot = np.zeros(num_classes, dtype=np.float64)
        onehot[bbox_class_ind] = 1.0
        uniform_distribution = np.full(num_classes, 1.0 / num_classes)
        deta = 0.01
        smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

        # Corner format -> (x_center, y_center, w, h). The input is in corner
        # format, so the corner-to-centre conversion is the one required here.
        bbox_xywh = np.concatenate(
            [
                (bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                bbox_coor[2:] - bbox_coor[:2],
            ],
            axis=-1,
        )
        # Row i is the box expressed in grid cells of scale i.
        bbox_xywh_scaled = (
            1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]
        )

        iou = []
        exist_positive = False
        for i in range(3):
            # Place the three anchors of this scale at the centre of the cell
            # that contains the box centre.
            anchors_xywh = np.zeros((3, 4))
            anchors_xywh[:, 0:2] = (
                np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
            )
            anchors_xywh[:, 2:4] = anchors[i]

            # bbox_iou may return a TensorFlow tensor; convert it so the mask
            # below is a plain boolean ndarray usable for NumPy indexing.
            iou_scale = np.asarray(
                bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
            )
            iou.append(iou_scale)
            iou_mask = iou_scale > 0.3

            if np.any(iou_mask):
                xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(
                    np.int32
                )

                label[i][yind, xind, iou_mask, :] = 0
                label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                label[i][yind, xind, iou_mask, 4:5] = 1.0
                label[i][yind, xind, iou_mask, 5:] = smooth_onehot

                # This is a module-level function, so there is no `self`; use
                # the local capacity constant.
                bbox_ind = int(bbox_count[i] % max_bbox_per_scale)
                bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
                bbox_count[i] += 1

                exist_positive = True

        if not exist_positive:
            # No anchor passed the IoU threshold: fall back to the single best
            # anchor over all scales.
            best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
            best_detect = int(best_anchor_ind / 3)
            best_anchor = int(best_anchor_ind % 3)
            xind, yind = np.floor(
                bbox_xywh_scaled[best_detect, 0:2]
            ).astype(np.int32)

            label[best_detect][yind, xind, best_anchor, :] = 0
            label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
            label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
            label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot

            bbox_ind = int(bbox_count[best_detect] % max_bbox_per_scale)
            bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
            bbox_count[best_detect] += 1

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh
    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

In [None]:
import numpy as np
import cv2
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import os
import json


class DataGenerator(Sequence):
    """Keras Sequence yielding batches of images and YOLO training targets.

    Sample k is described by three parallel arrays: `x_data[k]` (image path),
    `y_data[k]` (annotation file path) and `frame_no[k]` (frame number inside
    that annotation file).

    Args:
        x_data: Array of image paths.
        y_data: Array of annotation file paths.
        frame_no: Array of frame numbers.
        batch_size: Number of samples per batch.
        dim: Stored on the instance; images are always resized to 416x416.
        strides: Accepted for compatibility; the strides are fixed at
            [8, 16, 32].
        anchors: Flat array of 18 values, reshaped to (3, 3, 2).
        num_classes: Accepted for compatibility; the class count is fixed at
            13 to match preprocess_true_boxes.
        shuffle: Accepted for compatibility; the generator itself performs no
            shuffling.
    """

    def __init__(self, x_data, y_data, frame_no,
                 batch_size=32, dim=(416,416), strides = [8, 16, 32], anchors = np.array([12,16, 19,36, 40,28, 36,75, 76,55, 72,146, 142,110, 192,243, 459,401]), num_classes = 13, 
                 shuffle=True):
        super().__init__()
        self.x_data = x_data
        self.y_data = y_data
        self.frame_no = frame_no
        self.batch_size = batch_size
        self.dim = dim
        self.strides = [8, 16, 32]
        self.anchors = anchors.reshape(3, 3, 2)
        self.train_input_size = 416
        self.num_classes = 13
        self.anchor_per_scale = 3
        self.max_bbox_per_scale = 150
        self.train_output_sizes = [52, 26, 13]

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.x_data) / self.batch_size))

    def _empty_labels(self, scale):
        """Zero label array for one detection scale of the current batch size."""
        size = self.train_output_sizes[scale]
        return np.zeros(
            (self.batch_size, size, size, 3, 5 + self.num_classes),
            dtype=np.float32,
        )

    def _empty_boxes(self):
        """Zero box-list array for one detection scale."""
        return np.zeros(
            (self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32
        )

    def __getitem__(self, index):
        """Build batch number `index`.

        Returns:
            `(images, (small, medium, large))` where `images` has shape
            (batch_size, 416, 416, 3) with values scaled to [0, 1], and each
            of `small`/`medium`/`large` is a `(labels, boxes)` pair as
            produced by preprocess_true_boxes, stacked over the batch.

        Raises:
            FileNotFoundError: If an image of the batch cannot be read.
        """
        # The batch covers samples start_index .. start_index+batch_size-1;
        # indices wrap around the end of the data.
        start_index = index * self.batch_size

        batch_image = np.zeros((self.batch_size, 416, 416, 3), dtype=np.float32)
        batch_label_sbbox = self._empty_labels(0)
        batch_label_mbbox = self._empty_labels(1)
        batch_label_lbbox = self._empty_labels(2)
        batch_sbboxes = self._empty_boxes()
        batch_mbboxes = self._empty_boxes()
        batch_lbboxes = self._empty_boxes()

        for num in range(self.batch_size):
            i = start_index + num

            img_path = self.x_data[i % len(self.x_data)]
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None.
                raise FileNotFoundError(f"Could not read image: {img_path}")
            img = cv2.resize(img, (416, 416))
            img = img / 255.0

            boxes = load_annotation(
                filename=self.y_data[i % len(self.y_data)],
                frame=self.frame_no[i % len(self.frame_no)],
            )
            (label_sbbox, label_mbbox, label_lbbox,
             sbboxes, mbboxes, lbboxes) = preprocess_true_boxes(bboxes=boxes)

            batch_image[num, :, :, :] = img
            batch_label_sbbox[num, :, :, :, :] = label_sbbox
            batch_label_mbbox[num, :, :, :, :] = label_mbbox
            batch_label_lbbox[num, :, :, :, :] = label_lbbox
            batch_sbboxes[num, :, :] = sbboxes
            batch_mbboxes[num, :, :] = mbboxes
            batch_lbboxes[num, :, :] = lbboxes

        batch_smaller_target = batch_label_sbbox, batch_sbboxes
        batch_medium_target = batch_label_mbbox, batch_mbboxes
        batch_larger_target = batch_label_lbbox, batch_lbboxes

        return (batch_image, (batch_smaller_target, batch_medium_target, batch_larger_target))

In [None]:
# Training generator over the first 34020 samples, 4 samples per batch.
# The split is positional: samples keep the order in which the sequences
# were listed, so no shuffling happens before splitting.
train_data = DataGenerator(x_data[:34020], y_data[:34020], frame_no[:34020], batch_size=4)

In [None]:
# Validation generator over the remaining samples (index 34020 onwards),
# 4 samples per batch.
validation_data = DataGenerator(x_data[34020:], y_data[34020:], frame_no[34020:], batch_size=4)

In [None]:
# Smoke test: fetch one random batch. The index is drawn from the valid batch
# range instead of a hard-coded bound larger than the number of batches.
a = train_data[random.randrange(len(train_data))]

In [None]:
# print(a[1][0][0])

In [None]:
def compute_loss(pred, conv, label, bboxes, i=0):
    """Compute the box, confidence and class loss terms for one output scale.

    Args:
        pred: Decoded predictions; the last axis is read as box (0:4) and
            confidence (4:5).
        conv: Raw head output. It is reshaped here to
            (batch, size, size, 3, 5 + NUM_CLASS); the last axis is read as
            confidence logit (4:5) and class logits (5:).
        label: Ground-truth grid; the last axis is read as box (0:4),
            object mask (4:5) and class targets (5:).
        bboxes: Ground-truth boxes for the batch, indexed as
            [batch, box, coords] and compared against every prediction.
        i: Not used by this function; kept for call-site compatibility.

    Returns:
        Tuple ``(giou_loss, conf_loss, prob_loss)``. Each term is summed
        over axes 1-4 and then averaged over the batch axis.
    """

    def per_sample_mean(term):
        # Sum everything except the batch axis, then average over the batch.
        return tf.reduce_mean(tf.reduce_sum(term, axis=[1, 2, 3, 4]))

    raw_shape = tf.shape(conv)
    n_batch, grid = raw_shape[0], raw_shape[1]
    conv = tf.reshape(conv, (n_batch, grid, grid, 3, 5 + NUM_CLASS))

    conf_logits = conv[..., 4:5]
    class_logits = conv[..., 5:]

    pred_xywh = pred[..., 0:4]
    pred_conf = pred[..., 4:5]

    label_xywh = label[..., 0:4]
    respond_bbox = label[..., 4:5]
    label_prob = label[..., 5:]

    # --- box regression term -------------------------------------------
    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    image_size = tf.cast(416, tf.float32)
    box_w = label_xywh[..., 2:3]
    box_h = label_xywh[..., 3:4]
    # The weight shrinks from 2.0 as the label box area grows relative to
    # the 416x416 input area.
    bbox_loss_scale = 2.0 - 1.0 * box_w * box_h / (image_size ** 2)
    giou_term = respond_bbox * bbox_loss_scale * (1 - giou)

    # --- confidence term -----------------------------------------------
    # IoU of every prediction against every ground-truth box of its image.
    iou = bbox_iou(
        pred_xywh[..., np.newaxis, :],
        bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :],
    )
    max_iou = tf.reduce_max(iou, axis=-1, keepdims=True)

    # Background cells: no object assigned and best IoU below the
    # hard-coded 0.5 threshold.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < 0.5, tf.float32)

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)
    conf_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=respond_bbox, logits=conf_logits
    )
    conf_term = conf_focal * (respond_bbox * conf_xent + respond_bgd * conf_xent)

    # --- classification term -------------------------------------------
    prob_term = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label_prob, logits=class_logits
    )

    return (
        per_sample_mean(giou_term),
        per_sample_mean(conf_term),
        per_sample_mean(prob_term),
    )


In [None]:
def decode_train(conv_output, output_size, NUM_CLASS, STRIDES, ANCHORS, i=0, XYSCALE=(1, 1, 1)):
    """Decode one raw detection-head output into boxes, confidence and classes.

    Args:
        conv_output: Raw head output; reshaped here to
            (batch, output_size, output_size, 3, 5 + NUM_CLASS).
        output_size: Grid side length used for the reshape and the cell grid.
        NUM_CLASS: Number of class channels following the 5 box/conf channels.
        STRIDES: Multiplier applied to the grid-relative box centres.
        ANCHORS: Indexable by ``i``; ``ANCHORS[i]`` scales ``exp`` of the raw
            width/height channels.
        i: Index selecting the entry of ``ANCHORS`` and ``XYSCALE`` to use.
        XYSCALE: Per-scale factor applied to the sigmoid of the raw centre
            offsets. The default is an immutable tuple so it cannot be
            modified between calls.

    Returns:
        Tensor concatenating, along the last axis, the decoded box
        (centre x/y, width/height), the sigmoid confidence and the sigmoid
        class scores.
    """
    batch = tf.shape(conv_output)[0]
    conv_output = tf.reshape(conv_output,
                             (batch, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_dxdy, conv_raw_dwdh, conv_raw_conf, conv_raw_prob = tf.split(
        conv_output, (2, 2, 1, NUM_CLASS), axis=-1)

    # Build the per-cell (x, y) offsets and repeat them for every image in
    # the batch and for each of the 3 anchors.
    xy_grid = tf.meshgrid(tf.range(output_size), tf.range(output_size))
    xy_grid = tf.expand_dims(tf.stack(xy_grid, axis=-1), axis=2)  # [gx, gy, 1, 2]
    xy_grid = tf.tile(tf.expand_dims(xy_grid, axis=0), [batch, 1, 1, 3, 1])
    xy_grid = tf.cast(xy_grid, tf.float32)

    # Centre: scaled sigmoid offset, re-centred, plus the cell index, then
    # multiplied by STRIDES.
    pred_xy = ((tf.sigmoid(conv_raw_dxdy) * XYSCALE[i]) - 0.5 * (XYSCALE[i] - 1) + xy_grid) * \
              STRIDES
    # Size: exponential of the raw value times the anchor for this scale.
    pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i])
    pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)

    pred_conf = tf.sigmoid(conv_raw_conf)
    pred_prob = tf.sigmoid(conv_raw_prob)

    return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)

In [None]:
# Training configuration shared by the model-building code and the
# train/test step functions defined below in this cell.
trainset = train_data
testset = validation_data
logdir = "/content/drive/MyDrive/Logs/YoloV3"
# One optimizer step is taken per batch, so this is the step count per epoch.
steps_per_epoch = len(trainset)
# The two stage lengths are only used here, to size total_steps.
first_stage_epochs = 20
second_stage_epochs = 30
# Step counter; starts at 1 and is advanced by train_step.
global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
# The learning-rate warmup lasts two epochs' worth of steps.
warmup_steps = 2 * steps_per_epoch
total_steps = (first_stage_epochs + second_stage_epochs) * steps_per_epoch
# Nine (w, h) anchor pairs, regrouped as 3 sets x 3 anchors x 2 values.
anchors = np.array([12,16, 19,36, 40,28, 36,75, 76,55, 72,146, 142,110, 192,243, 459,401])
anchors = anchors.reshape(3, 3, 2)
# One centre-offset scale factor per set; passed to decode_train as XYSCALE.
xyscale = [1.2, 1.1, 1.05]
input_layer = tf.keras.layers.Input([416, 416, 3])
# NOTE(review): not referenced anywhere in the visible code; compute_loss
# hard-codes its own 0.5 threshold. Confirm whether this should be wired in.
IOU_LOSS_THRESH = 0.5


feature_maps = YOLOv3(input_layer)

# (grid size, stride) for the first two heads; any later head uses the
# coarsest setting (13, 32).
_GRID_AND_STRIDE = {0: (52, 8), 1: (26, 16)}

# The model outputs alternate raw head output and decoded boxes:
# [raw_0, decoded_0, raw_1, decoded_1, raw_2, decoded_2].
bbox_tensors = []
for i, fm in enumerate(feature_maps):
    grid_size, stride = _GRID_AND_STRIDE.get(i, (13, 32))
    # Use the shared NUM_CLASS constant rather than a literal so decoding
    # stays in step with compute_loss, which reshapes with NUM_CLASS too.
    bbox_tensor = decode_train(fm, grid_size, NUM_CLASS, stride,
                               ANCHORS=anchors, i=i, XYSCALE=xyscale)
    bbox_tensors.append(fm)
    bbox_tensors.append(bbox_tensor)

model = tf.keras.Model(input_layer, bbox_tensors)
optimizer = tf.keras.optimizers.Adam()
# Start from an empty log directory so old event files do not mix with this run.
if os.path.exists(logdir):
    shutil.rmtree(logdir)
writer = tf.summary.create_file_writer(logdir)

def _scheduled_learning_rate(step):
    """Return the learning rate for optimizer step ``step``.

    Linear warmup from 0 towards 0.001 over ``warmup_steps``, then a cosine
    decay from 0.001 down to 0.000001 over the remaining ``total_steps``.
    """
    lr_init, lr_end = 0.001, 0.000001
    if step < warmup_steps:
        return step / warmup_steps * lr_init
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return lr_end + 0.5 * (lr_init - lr_end) * (1 + np.cos(progress * np.pi))


def train_step(image_data, target):
    """Run one optimisation step on a batch and log the result.

    Args:
        image_data: Batch of input images fed to ``model``.
        target: Indexable with one entry per output scale; each entry is a
            pair ``(label, bboxes)`` passed on to ``compute_loss``.

    Side effects:
        Sets the optimizer learning rate, updates the model weights, prints
        a progress line, writes TensorBoard scalars and advances
        ``global_steps`` by one.
    """
    step = int(global_steps.numpy())

    # Apply the schedule BEFORE the update. Previously the rate was only set
    # after apply_gradients, so the very first step ran at the optimizer's
    # default rate instead of the warmup value.
    optimizer.learning_rate.assign(_scheduled_learning_rate(step))

    with tf.GradientTape() as tape:
        pred_result = model(image_data, training=True)
        giou_loss = conf_loss = prob_loss = 0

        # Model outputs alternate raw head output and decoded predictions.
        for i in range(3):
            conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
            loss_items = compute_loss(pred, conv, target[i][0], target[i][1], i=i)
            giou_loss += loss_items[0]
            conf_loss += loss_items[1]
            prob_loss += loss_items[2]

        total_loss = giou_loss + conf_loss + prob_loss

    # Only the forward pass needs to be recorded by the tape.
    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    tf.print("=> STEP %4d/%4d   lr: %.6f   giou_loss: %4.2f   conf_loss: %4.2f   "
              "prob_loss: %4.2f   total_loss: %4.2f" % (step, steps_per_epoch,
                                                        optimizer.learning_rate.numpy(),
                                                        giou_loss, conf_loss,
                                                        prob_loss, total_loss))

    # Summaries are written at the step that produced them, then the counter
    # is advanced.
    with writer.as_default():
        tf.summary.scalar("lr", optimizer.learning_rate, step=global_steps)
        tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
        tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
        tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
        tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
    writer.flush()

    global_steps.assign_add(1)
        
def test_step(image_data, target):
    """Evaluate one batch and print its loss terms.

    The model is called with ``training=False`` so that layers run in
    inference mode: batch normalisation uses its stored moving statistics
    and does not update them from validation data. No gradient tape is
    opened because no gradients are computed here.

    Args:
        image_data: Batch of input images fed to ``model``.
        target: Indexable with one entry per output scale; each entry is a
            pair ``(label, bboxes)`` passed on to ``compute_loss``.
    """
    pred_result = model(image_data, training=False)
    giou_loss = conf_loss = prob_loss = 0

    # Model outputs alternate raw head output and decoded predictions.
    for i in range(3):
        conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
        loss_items = compute_loss(pred, conv, target[i][0], target[i][1], i=i)
        giou_loss += loss_items[0]
        conf_loss += loss_items[1]
        prob_loss += loss_items[2]

    total_loss = giou_loss + conf_loss + prob_loss

    tf.print("=> TEST STEP %4d   giou_loss: %4.2f   conf_loss: %4.2f   "
              "prob_loss: %4.2f   total_loss: %4.2f" % (global_steps, giou_loss, conf_loss,
                                                        prob_loss, total_loss))

# Main loop: each epoch trains on every training batch, then evaluates every
# validation batch, then saves the weights.
# NOTE(review): only 2 epochs run here, while total_steps above is sized for
# first_stage_epochs + second_stage_epochs; confirm this is intentional.
for epoch in range(2):
    for image_data, target in trainset:
        train_step(image_data, target)
    for image_data, target in testset:
        test_step(image_data, target)
    # The same file is overwritten at the end of every epoch.
    model.save_weights("/content/drive/MyDrive/YoloV3_1.h5")

=> STEP    1/8505   lr: 0.001000   giou_loss: 2.98   conf_loss: 1923.28   prob_loss: 11.18   total_loss: 1937.44
=> STEP    2/8505   lr: 0.000000   giou_loss: 3.96   conf_loss: 2027.76   prob_loss: 13.02   total_loss: 2044.73
=> STEP    3/8505   lr: 0.000000   giou_loss: 3.96   conf_loss: 1944.96   prob_loss: 13.09   total_loss: 1962.01
=> STEP    4/8505   lr: 0.000000   giou_loss: 4.94   conf_loss: 1961.23   prob_loss: 15.36   total_loss: 1981.53
=> STEP    5/8505   lr: 0.000000   giou_loss: 3.96   conf_loss: 1928.23   prob_loss: 13.15   total_loss: 1945.34
=> STEP    6/8505   lr: 0.000000   giou_loss: 3.95   conf_loss: 1991.95   prob_loss: 13.03   total_loss: 2008.93
=> STEP    7/8505   lr: 0.000000   giou_loss: 4.93   conf_loss: 1988.39   prob_loss: 15.23   total_loss: 2008.55
=> STEP    8/8505   lr: 0.000000   giou_loss: 4.94   conf_loss: 1983.96   prob_loss: 15.24   total_loss: 2004.14
=> STEP    9/8505   lr: 0.000001   giou_loss: 3.95   conf_loss: 2007.67   prob_loss: 12.92   tot

KeyboardInterrupt: ignored