### OBJECT DETECTION AND TRACKING WITH YOLOv5

In [None]:
# Install the libraries this notebook uses into the environment of the running kernel.
# `{sys.executable} -m pip` invokes pip through the interpreter that backs this kernel,
# rather than whichever `pip` executable happens to be first on PATH.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
import sys
# PyTorch packages; pip additionally searches the PyTorch wheel index given below
# (the `cu116` index — presumably CUDA 11.6 builds; confirm it matches the local setup).
!{sys.executable} -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
# Remaining packages (data handling, plotting, progress bars, OpenCV with contrib modules).
!{sys.executable} -m pip install pandas pyyaml tqdm seaborn numpy matplotlib opencv-contrib-python

In [None]:
# Clone the YOLOv5 repository into ./yolov5, relative to the notebook's working directory.
# NOTE(review): this cell is not idempotent — once ./yolov5 exists, `git clone` refuses to
# overwrite it and reports an error on re-run.
# NOTE(review): no visible cell reads from the local clone (the model is loaded through
# torch.hub by repository name) — confirm whether this step is still needed.
!git clone https://github.com/ultralytics/yolov5  # clone

In [None]:
# import libraries
import cv2
import torch
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load the `yolov5s` entry point from the ultralytics/yolov5 hub repository
# (reference: https://github.com/ultralytics/yolov5).
# Other entry points can be selected via `model=`: yolov5n - yolov5x6, or custom.
model = torch.hub.load(
    repo_or_dir='ultralytics/yolov5',
    model='yolov5s',
)

#### WITH IMAGES

In [None]:
# Run object detection on a single image file and display the annotated result.
IMAGE_PATH = 'images/img-4.jpg'
# detections below this confidence are printed but not drawn
CONFIDENCE_THRESHOLD = 0.5

# get image (OpenCV loads images in BGR channel order)
img = cv2.imread(IMAGE_PATH)
# cv2.imread signals failure by returning None instead of raising, so fail early
# with a clear message rather than letting the model choke on None
if img is None:
    raise FileNotFoundError(f"Unable to read image: {IMAGE_PATH}")

# make inference
# the YOLOv5 hub model expects RGB channel order for numpy input, so convert the
# BGR array first; cvtColor returns a new array, leaving `img` (BGR) for drawing
results = model(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# get the results as a pandas dataframe
# (columns used below: xmin, ymin, xmax, ymax, confidence, name)
result_df = results.pandas().xyxy[0]

# filter out object detections
# Filter on the strings found in the "name" column; `model.names` lists the exact
# labels of the loaded checkpoint. The stock yolov5s weights use COCO labels, e.g.
# "person", "car", "airplane", "motorcycle", "couch", "tv".
objects = result_df
# objects = result_df[result_df["name"] == "person"]

# loop over the detections
for _, object_row in objects.iterrows():
    # bounding-box corners as integer pixel coordinates
    start_x = int(object_row["xmin"])
    start_y = int(object_row["ymin"])
    end_x = int(object_row["xmax"])
    end_y = int(object_row["ymax"])
    # scale to percent BEFORE rounding: round(c, 2) * 100 yields float artefacts
    # such as 56.99999999999999 for c = 0.57
    confidence = round(float(object_row["confidence"]) * 100)

    # print object name, coords and confidence
    print(f"name: {object_row['name']} |", f"coords: {start_x, start_y, end_x, end_y}", f"| confidence: {confidence}%")

    # skip results with confidence below the threshold
    if object_row["confidence"] < CONFIDENCE_THRESHOLD:
        continue

    # draw the bounding box; OpenCV drawing functions modify the array in place,
    # so no per-detection copy of the image is needed
    cv2.rectangle(
        img=img,
        pt1=(start_x, start_y),
        pt2=(end_x, end_y),
        color=(255, 0, 0),
        thickness=3,
        lineType=cv2.LINE_AA
    )

    # write the name of the object on its bounding box
    cv2.putText(
        img=img,
        text=f"{object_row['name']} ({confidence}%)",
        org=(start_x, start_y),
        fontFace=cv2.FONT_HERSHEY_COMPLEX,
        fontScale=1,
        color=(0, 255, 0),
        thickness=3,
        lineType=cv2.LINE_AA
    )

# convert the annotated image from BGR to RGB, the order matplotlib expects
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# show resulting image
plt.figure(figsize=(15, 15))
plt.imshow(img)
plt.show()

#### WITH VIDEOS

In [79]:
# Run object detection on a live camera stream and show the annotated frames in
# an OpenCV window until ESC is pressed or the stream ends.
# detections below this confidence are not drawn
CONFIDENCE_THRESHOLD = 0.5
# key code compared against the cv2.waitKey result to stop the loop
ESC_KEY = 27

# start video capture (device index 0)
capture = cv2.VideoCapture(0)

# check whether the capture was opened successfully
if not capture.isOpened():
    print("Unable to start camera")

# initialize frame tracker and frame holders
# NOTE(review): these are not read anywhere in this cell; kept in case later
# cells rely on them — TODO confirm and remove if unused
frames_tracker = 0
previous_frame, current_frame, next_frame = None, None, None

# try/finally guarantees the camera and the window are released on every exit
# path (ESC, end of stream, failed read, exception, kernel interrupt)
try:
    while capture.isOpened():
        # capture the next frame
        ret, frame = capture.read()

        # stop when the read failed or no frame is available
        if not ret or frame is None:
            break

        ############################### BEGIN OBJECT DETECTION

        # make inference
        # OpenCV delivers BGR frames while the YOLOv5 hub model expects RGB for
        # numpy input; cvtColor returns a new array, so `frame` stays BGR for
        # drawing and cv2.imshow
        results = model(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # get the results as a pandas dataframe
        # (columns used below: xmin, ymin, xmax, ymax, confidence, name)
        result_df = results.pandas().xyxy[0]

        # filter out object detections
        # Filter on the strings found in the "name" column; `model.names` lists
        # the exact labels of the loaded checkpoint. The stock yolov5s weights
        # use COCO labels, e.g. "person", "car", "airplane", "motorcycle", "tv".
        objects = result_df
        # objects = result_df[result_df["name"] == "person"]

        # loop over the detections
        for _, object_row in objects.iterrows():
            # skip results with confidence below the threshold
            if object_row["confidence"] < CONFIDENCE_THRESHOLD:
                continue

            # bounding-box corners as integer pixel coordinates
            start_x = int(object_row["xmin"])
            start_y = int(object_row["ymin"])
            end_x = int(object_row["xmax"])
            end_y = int(object_row["ymax"])
            # scale to percent BEFORE rounding: round(c, 2) * 100 yields float
            # artefacts such as 56.99999999999999 for c = 0.57
            confidence = round(float(object_row["confidence"]) * 100)

            # draw the bounding box; OpenCV drawing functions modify the array
            # in place, so no per-detection copy of the frame is needed
            cv2.rectangle(
                img=frame,
                pt1=(start_x, start_y),
                pt2=(end_x, end_y),
                color=(255, 0, 0),
                thickness=3,
                lineType=cv2.LINE_AA
            )

            # write the name of the object on its bounding box
            cv2.putText(
                img=frame,
                text=f"{object_row['name']} ({confidence}%)",
                org=(start_x, start_y),
                fontFace=cv2.FONT_HERSHEY_COMPLEX,
                fontScale=1,
                color=(0, 255, 0),
                thickness=3,
                lineType=cv2.LINE_AA
            )

        ############################### END OBJECT DETECTION

        # show the annotated frame
        cv2.imshow("Video", frame)

        # wait 30 milliseconds for a key press event; keep only the low byte,
        # since some platforms set higher bits in the returned key code
        keypressed = cv2.waitKey(30) & 0xFF
        # if the ESC key is pressed, leave the loop (cleanup happens in `finally`)
        if keypressed == ESC_KEY:
            break
finally:
    # release camera
    capture.release()
    # destroy all windows
    cv2.destroyAllWindows()