# Object detection using the YOLO V4 pre-trained model

*by Georgios K. Ouzounis*

In this exercise, we will experiment with object detection in a streaming video using the YOLO V4 pre-trained model. This is only a demo, and it will run slowly owing to the virtual environment. For substantially better performance, collect all the relevant code into a .py file and run it locally.

## Setup

In [None]:
# import the relevant libraries
import numpy as np
#import cv2 # openCV
from google.colab.patches import cv2_imshow

In [None]:
!pip install -r requirements.txt

## Get the model



In [None]:
# first, create a directory to store the model
%mkdir model

In [None]:
# enter the directory and download the necessary files 
%cd model
!wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights
!wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4.cfg
!wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/data/coco.names
%cd ..

## Customize the YOLO detector

class labels:

In [None]:
# Read the class names, one label per line. The context manager closes the
# file handle as soon as the labels have been loaded.
class_labels_path = "/content/model/coco.names"
with open(class_labels_path, encoding="utf-8") as labels_file:
    class_labels = labels_file.read().strip().split("\n")
class_labels

bounding box color definitions: two options

In [None]:
# Repeating bounding-box palette: five base colors cycled over the classes.
# Channel triplets, in order:
#   (255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)
# NOTE(review): these were described as RGB (red, green, blue, yellow,
# magenta); if they are drawn with OpenCV on BGR frames, red and blue would
# be swapped - confirm the intended channel order.
#
# The 5-row palette is tiled 16 times to give 80 rows, i.e. as many colors as
# there are classes. If you want a unique color for each class, randomize the
# color generation or set the colors manually.
class_colors = np.tile(
    np.array(
        [
            [255, 0, 0],
            [0, 255, 0],
            [0, 0, 255],
            [255, 255, 0],
            [255, 0, 255],
        ],
        dtype="int",
    ),
    (16, 1),
)

or random colors:

In [None]:
# One random color per class label. The upper bound of randint is exclusive,
# so 256 is required for channels to cover the full 0-255 range.
class_colors = np.random.randint(0, 256, size=(len(class_labels), 3), dtype="uint8")

Declare remaining parameters

In [None]:
# Image-to-blob conversion: pixel multiplier and the target blob size.
scalefactor = 1 / 255
new_size = (416, 416)

# Detection filtering: minimum confidence score and the NMS threshold.
score_threshold, nms_threshold = 0.5, 0.4

In [None]:
import cv2

## Load the model

In [None]:
# Load the pre-trained model: build the network from the Darknet config and
# weights files stored in the model/ directory (paths are relative to the
# current working directory).
yolo_model = cv2.dnn.readNetFromDarknet('model/yolov4.cfg','model/yolov4.weights')

In [None]:
# Read the names of the network layers/components. The YOLO V4 neural network has 379 components.
# They consist of convolutional layers (conv), rectified linear units (relu), etc.:

model_layers = yolo_model.getLayerNames()

In [None]:
# Collect the names of the output layers (the layers with unconnected outputs).
# Asking the network for the names directly avoids indexing the layer-name list
# with the result of getUnconnectedOutLayers(), whose shape (flat vs. Nx1)
# differs between OpenCV releases.
output_layers = list(yolo_model.getUnconnectedOutLayersNames())

## Run the model on the live video feed using NMS


Install the following two packages to access video content from www.youtube.com.

Pick any video. We have selected this particular one because it shows views of city life.

In [None]:
import pafy

# YouTube video to analyse; any other video URL can be substituted here
url = "https://www.youtube.com/watch?v=_MMpKnfT5oU"
# look the video up and select a stream, preferring the mp4 format;
# best.url is what is later passed to cv2.VideoCapture
video = pafy.new(url)
best = video.getbest(preftype="mp4")

In [None]:
def object_detection_analysis_with_nms(test_img, class_labels, class_colors, obj_detections_in_layers, score_threshold, nms_threshold):
    """Draw the NMS-filtered object detections on a copy of an image.

    Args:
        test_img: image array; shape[0] is read as the height and shape[1]
            as the width. The input itself is not modified.
        class_labels: sequence of label strings indexed by class id.
        class_colors: per-class colors indexed by class id; the selected
            entry is converted to a list of ints before drawing.
        obj_detections_in_layers: iterable with one collection of detections
            per output layer. Each detection is read as [0:4] for the box
            (center x, center y, width, height, scaled here by the image
            width/height) and [5:] for the per-class scores.
        score_threshold: a detection is kept only if its best class score is
            strictly greater than this value; also passed to NMSBoxes.
        nms_threshold: threshold passed to cv2.dnn.NMSBoxes.

    Returns:
        A tuple (result, winner_box_list): the annotated copy of test_img
        and the list of boxes that survived NMS, each as
        [start_x, start_y, width, height].
    """
    # get the image dimensions
    img_height, img_width = test_img.shape[:2]

    result = test_img.copy()

    # lists for the arguments of interest: classID, bbox info, and detection confidences
    class_ids_list = []
    boxes_list = []
    confidences_list = []

    # scaling from the detection box values to pixel units (loop invariant)
    box_scale = np.array([img_width, img_height, img_width, img_height])

    # loop over each output layer, then over the detections in that layer
    for object_detections_in_single_layer in obj_detections_in_layers:
        for object_detection in object_detections_in_single_layer:
            # scores of all the classes for this bounding box
            prediction_scores = object_detection[5:]
            # the class with the highest score wins; its score is the confidence
            predicted_class_id = np.argmax(prediction_scores)
            prediction_confidence = prediction_scores[predicted_class_id]

            # only consider detections with a confidence above the threshold
            if prediction_confidence <= score_threshold:
                continue

            # compute the bounding box coordinates scaled for the input image
            bounding_box = object_detection[0:4] * box_scale
            (box_center_x_pt, box_center_y_pt, box_width, box_height) = bounding_box.astype("int")
            start_x_pt = max(0, int(box_center_x_pt - (box_width / 2)))
            start_y_pt = max(0, int(box_center_y_pt - (box_height / 2)))

            # update the 3 lists for NMS processing
            # - confidence is needed as a float
            # - the bbox info has the [x, y, width, height] layout
            class_ids_list.append(int(predicted_class_id))
            confidences_list.append(float(prediction_confidence))
            boxes_list.append([start_x_pt, start_y_pt, int(box_width), int(box_height)])

    # nothing passed the score threshold: there is nothing to suppress or draw
    if not boxes_list:
        return result, []

    # NMS for a set of overlapping bboxes returns the ID of the one with highest
    # confidence score while suppressing all others (non-maxima)
    # - score_threshold: a threshold used to filter boxes by score
    # - nms_threshold: a threshold used in non maximum suppression.
    winner_ids = cv2.dnn.NMSBoxes(boxes_list, confidences_list, score_threshold, nms_threshold)

    # create a list of winner boxes
    winner_box_list = []

    # flatten so that both a flat result and an Nx1 result yield plain indices
    for winner_id in np.array(winner_ids).flatten():
        winner_id = int(winner_id)
        box = boxes_list[winner_id]
        start_x_pt, start_y_pt, box_width, box_height = box
        winner_box_list.append(box)

        # get the predicted class id, label and confidence
        predicted_class_id = class_ids_list[winner_id]
        predicted_class_label = class_labels[predicted_class_id]
        prediction_confidence = confidences_list[winner_id]

        # obtain the bounding box end coordinates
        end_x_pt = start_x_pt + box_width
        end_y_pt = start_y_pt + box_height

        # look up the color of the predicted class and convert it to a list
        # of plain ints before applying it to the text and the box
        box_color = [int(c) for c in class_colors[predicted_class_id]]

        # print the prediction in console
        predicted_class_label = "{}: {:.2f}%".format(predicted_class_label, prediction_confidence * 100)
        print("predicted object {}".format(predicted_class_label))

        # draw rectangle and text in the image
        cv2.rectangle(result, (start_x_pt, start_y_pt), (end_x_pt, end_y_pt), box_color, 1)
        cv2.putText(result, predicted_class_label, (start_x_pt, start_y_pt-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 1)

    return result, winner_box_list



In [None]:
def _draw_labeled_box(image, box, label, color):
    """Draw an [x, y, width, height] box with a label above it and return its corners."""
    start_x, start_y = box[0], box[1]
    end_x, end_y = box[0] + box[2], box[1] + box[3]
    cv2.rectangle(image, (start_x, start_y), (end_x, end_y), color, 2)
    cv2.putText(image, label, (start_x, start_y-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
    return start_x, start_y, end_x, end_y


def object_detection_iou(iou_image, detection_box, gt_box):
    """Draw a predicted and a ground-truth box and compute their IoU.

    Args:
        iou_image: image to draw on; it is modified in place.
        detection_box: predicted box as [x, y, width, height].
        gt_box: ground-truth box as [x, y, width, height].

    Returns:
        A tuple (iou_image, iou_value): the annotated image and the
        intersection-over-union of the two boxes. The value is 0.0 when the
        union area is not positive (degenerate boxes) instead of raising
        ZeroDivisionError.
    """
    # draw both boxes and get their corner coordinates
    start_pt_x_box_a, start_pt_y_box_a, end_pt_x_box_a, end_pt_y_box_a = _draw_labeled_box(
        iou_image, detection_box, "predicted bbox", (0, 255, 0))
    start_pt_x_box_b, start_pt_y_box_b, end_pt_x_box_b, end_pt_y_box_b = _draw_labeled_box(
        iou_image, gt_box, "ground truth bbox", (0, 0, 255))

    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(start_pt_x_box_a, start_pt_x_box_b)
    yA = max(start_pt_y_box_a, start_pt_y_box_b)
    xB = min(end_pt_x_box_a, end_pt_x_box_b)
    yB = min(end_pt_y_box_a, end_pt_y_box_b)

    # compute the area of the intersection rectangle (the +1 counts both edge
    # pixels); it is clamped to 0 when the boxes do not overlap
    intersection_area = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    # compute the areas of both rectangles separately
    detArea = (end_pt_x_box_a - start_pt_x_box_a + 1) * (end_pt_y_box_a - start_pt_y_box_a + 1)
    gtArea = (end_pt_x_box_b - start_pt_x_box_b + 1) * (end_pt_y_box_b - start_pt_y_box_b + 1)
    unionArea = detArea + gtArea - intersection_area

    # compute the intersection over union, guarding against a non-positive union
    iou_value = intersection_area / float(unionArea) if unionArea > 0 else 0.0
    cv2.putText(iou_image, "IoU: {:.4f}".format(iou_value), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    print("iou = {:.4f}".format(iou_value))

    # return the annotated image and the intersection over union value
    return iou_image, iou_value

mount your Google Drive and get the following file (customize the path; the file is included in the git repo):

In [None]:
#%cp /content/drive/MyDrive/object_detection/object_detection_functions.py .

In [None]:
#from object_detection_functions import object_detection_analysis_with_nms

**WARNING:** this will be a very slow loop, in part owing to the cv2_imshow() command. Every processed frame will be displayed after the previous one. To break this loop, go to Runtime->Interrupt Execution.

---




In [None]:
# Open the video stream and run the detector on every frame until the stream
# ends, the 'q' key is pressed, or the execution is interrupted.
cap = cv2.VideoCapture(best.url)

# size every frame is resized to before detection and display
new_width = 640
new_height = 480
dim = (new_width, new_height)

if not cap.isOpened():
    print("unable to open the video stream")
else:
    # try/finally guarantees the stream is released even when the loop is
    # stopped through Runtime->Interrupt Execution or by an error
    try:
        while True:
            # get the current frame from the video stream
            ret, frame = cap.read()

            # no frame was returned (end of stream or read failure): stop
            # instead of passing an empty frame to cv2.resize
            if not ret:
                break

            frame = cv2.resize(frame, dim, interpolation=cv2.INTER_AREA)

            blob = cv2.dnn.blobFromImage(frame, scalefactor, new_size, swapRB=True, crop=False)

            # input preprocessed blob into the model
            yolo_model.setInput(blob)

            # compute the forward pass for the input, storing the results per output layer in a list
            obj_detections_in_layers = yolo_model.forward(output_layers)

            # get the object detections drawn on the frame
            frame, winner_boxes = object_detection_analysis_with_nms(frame, class_labels, class_colors, obj_detections_in_layers, score_threshold, nms_threshold)

            # display the frame
            cv2_imshow(frame)
            # if running outside Colab notebooks use:
            # cv2.imshow("frame", frame)

            # terminate while loop if the 'q' key is pressed - applicable outside the notebooks
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # releasing the stream and closing any display windows
        cap.release()
        cv2.destroyAllWindows()




In [None]:
# Mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')