In [88]:
import sys
import torch

import cv2
import time
import random
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path
from collections import OrderedDict, namedtuple

In [89]:
# Exporting the ONNX model for onnxruntime

# Don't forget to alter the image dimensions accordingly

# !py "../export.py" --weights "./yolov7-tiny.pt" --grid --end2end --simplify --topk-all 100 --iou-thres 0.65 --conf-thres 0.35 --img-size 640 640 --dynamic-batch --max-wh 7680

In [90]:
cuda = True  # Request the CUDA (GPU) execution provider in addition to the CPU one
weights = "./yolov7-tiny.onnx"

# Paths of the images to run detection on
image_paths = [
    "../images/img1.jpg",
    # "../images/img2.jpg",
]

# cv2.imread returns None (it does not raise) when a file is missing or cannot be decoded,
# so each result is validated here instead of failing later with an unrelated-looking error.
imgList = []
for image_path in image_paths:
    loaded_image = cv2.imread(image_path)
    if loaded_image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    imgList.append(loaded_image)

In [91]:
# Execution providers handed to onnxruntime: CUDA followed by CPU when `cuda` is set, CPU only otherwise
if cuda:
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]

# Inference session wrapping the pre-built ONNX model stored at `weights`
session = ort.InferenceSession(weights, providers=providers)

In [92]:
# Class names, indexed by the class id found in each prediction row
all_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]

# Subset of classes whose detections should be drawn
classes = ["person", "car"]

# One random 3-component color (each component in 0..255) per target class, used for its bounding boxes
colors = {
    target: tuple(random.randint(0, 255) for _ in range(3))
    for target in classes
}

print("Generated random colors for object bounding boxes")
colors

Generated random colors for object bounding boxes


{'person': (212, 181, 212), 'car': (6, 188, 245)}

In [93]:
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize an image with its aspect ratio preserved, then pad it with a constant color.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Border value passed to cv2.copyMakeBorder.
        auto: When True, the padding is reduced modulo ``stride`` instead of
            filling the whole target shape.
        scaleup: When False, the scale ratio is capped at 1.0 (shrink only).
        stride: Modulus applied to the padding when ``auto`` is True.

    Returns:
        Tuple ``(image, ratio, (dw, dh))`` where ``ratio`` is the scale factor
        applied and ``dw`` / ``dh`` are the per-side (half of total) horizontal
        and vertical padding.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Single scale factor (new / old) so the image fits inside the target on both axes
    r = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        r = min(r, 1.0)

    # Size after scaling, and the total padding still needed on each axis
    resized_w = int(round(src_w * r))
    resized_h = int(round(src_h * r))
    dw = dst_w - resized_w
    dh = dst_h - resized_h
    if auto:
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)

    # Padding is split across the two sides of each axis
    dw, dh = dw / 2, dh / 2

    if (src_w, src_h) != (resized_w, resized_h):
        im = cv2.resize(im, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 nudges send the extra pixel of an odd total padding to the bottom / right side
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, r, (dw, dh)

In [94]:
# Preprocess every loaded image into a model-ready tensor

rgb_images = []   # RGB versions of the input images
resize_data = []  # (input tensor, scale ratio, per-side padding) for each image

for bgr_image in imgList:

    # Convert the channel order from BGR to RGB before feeding the model
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    rgb_images.append(rgb_image)

    # Letterbox a copy to the default 640x640 target; `ratio` and `dwdh` are needed
    # later to map predicted boxes back onto the original image
    padded, ratio, dwdh = letterbox(rgb_image.copy(), auto=False)

    # Move the channel axis first, add a leading batch axis of size 1,
    # make the memory contiguous, then cast to float32
    batched = np.expand_dims(padded.transpose((2, 0, 1)), 0)
    input_tensor = np.ascontiguousarray(batched).astype(np.float32)

    resize_data.append((input_tensor, ratio, dwdh))

In [95]:
# Collect the output and input node names reported by the ONNX session

outname = [node.name for node in session.get_outputs()]
inname = [node.name for node in session.get_inputs()]

outname, inname

(['output'], ['images'])

In [96]:
# Running batch 1 inference

# Divide by 255 to normalize the pixel values of the first preprocessed image
image = np.ascontiguousarray(resize_data[0][0] / 255)

# Feed the tensor under the input name reported by the session (`inname`)
# rather than a hard-coded "images", so a model exported with another input name still runs
prediction = session.run(outname, {inname[0]: image})

In [None]:
thickness = 2          # Bounding box line thickness in pixels
score_threshold = 0.4  # Detections at or below this score are not drawn

# Inference was run on resize_data[0], so take the letterbox parameters stored with that
# entry. Relying on the `ratio` / `dwdh` variables left over from the preprocessing loop
# would use the values of the *last* image, which is wrong as soon as imgList holds more than one.
ratio, dwdh = resize_data[0][1], resize_data[0][2]

# (dw, dh, dw, dh): padding to remove from (x0, y0, x1, y1)
pad_offsets = np.array(dwdh * 2)

for prediction_array in prediction:

    # Each row unpacks to (batch_id, x0, y0, x1, y1, cls_id, score)
    for (batch_id, x0, y0, x1, y1, cls_id, score) in prediction_array:

        class_name = all_classes[int(cls_id)]

        if class_name not in classes or score <= score_threshold:
            continue

        class_color = colors[class_name]

        # Reversing the paddings and other transformations applied during letterbox
        box = np.array([x0, y0, x1, y1])
        box -= pad_offsets
        box /= ratio
        box = box.round().astype(np.int32).tolist()

        cv2.rectangle(imgList[0], tuple(box[:2]), tuple(box[2:]), class_color, thickness)

cv2.imshow("Image", imgList[0])
cv2.waitKey(0)
cv2.destroyAllWindows()

In [2]:
# OpenCV Live feed version

import cv2
import time
import random
import numpy as np
import onnxruntime as ort

cuda = True  # Request the CUDA (GPU) execution provider in addition to the CPU one
weights = "./yolov7-tiny.onnx"

# Execution providers handed to onnxruntime: CUDA followed by CPU when `cuda` is set, CPU only otherwise
if cuda:
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]

# Inference session wrapping the pre-built ONNX model stored at `weights`
session = ort.InferenceSession(weights, providers=providers)

# Class names, indexed by the class id found in each prediction row
all_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]

# Classes whose detections should be drawn; here every known class is a target
# classes = ["person", "car"]
classes = all_classes

# One random 3-component color (each component in 0..255) per target class, used for its bounding boxes
colors = {
    target: tuple(random.randint(0, 255) for _ in range(3))
    for target in classes
}

print("Generated random colors for object bounding boxes")

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize an image with its aspect ratio preserved, then pad it with a constant color.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Border value passed to cv2.copyMakeBorder.
        auto: When True, the padding is reduced modulo ``stride`` instead of
            filling the whole target shape.
        scaleup: When False, the scale ratio is capped at 1.0 (shrink only).
        stride: Modulus applied to the padding when ``auto`` is True.

    Returns:
        Tuple ``(image, ratio, (dw, dh))`` where ``ratio`` is the scale factor
        applied and ``dw`` / ``dh`` are the per-side (half of total) horizontal
        and vertical padding.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Single scale factor (new / old) so the image fits inside the target on both axes
    r = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        r = min(r, 1.0)

    # Size after scaling, and the total padding still needed on each axis
    resized_w = int(round(src_w * r))
    resized_h = int(round(src_h * r))
    dw = dst_w - resized_w
    dh = dst_h - resized_h
    if auto:
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)

    # Padding is split across the two sides of each axis
    dw, dh = dw / 2, dh / 2

    if (src_w, src_h) != (resized_w, resized_h):
        im = cv2.resize(im, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 nudges send the extra pixel of an odd total padding to the bottom / right side
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, r, (dw, dh)

# Starting OpenCV Video Capture

capture = cv2.VideoCapture(0)

if not capture.isOpened():
    capture.release()
    # Raise instead of calling exit(): exit() would end the whole interpreter/kernel session,
    # whereas an exception only stops this cell and keeps the notebook state alive.
    raise RuntimeError("Camera being used by another application, unable to gain access")

outname = [i.name for i in session.get_outputs()]
inname = [i.name for i in session.get_inputs()]
thickness = 2
score_threshold = 0.4  # Detections scoring below this are skipped
screen_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
screen_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

# For OpenCV fonts
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1.0

# Number of frames fully processed and shown; starts at 0 so the average below
# divides by the real frame count
img_counter = 0
start_time = time.time()

# try/finally guarantees the camera and the window are released even when a frame
# fails to process or the cell is interrupted
try:
    # Create a named window for full screen display
    cv2.namedWindow("Live Footage", cv2.WINDOW_NORMAL)
    cv2.setWindowProperty("Live Footage", cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)

    # Calculating time taken to process each frame
    start_time = time.time()

    while True:

        ret, frame = capture.read()
        if not ret or frame is None:
            # A failed read yields no usable frame; stop instead of crashing in cvtColor
            print("Failed to read a frame from the camera, stopping")
            break

        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR frames to RGB for the model

        image_cpy, ratio, dwdh = letterbox(image, auto=False)

        # Channel axis first, add a batch axis of size 1, contiguous float32 layout
        image_cpy = image_cpy.transpose((2, 0, 1))
        image_cpy = np.expand_dims(image_cpy, 0)
        image_cpy = np.ascontiguousarray(image_cpy)
        image_cpy = image_cpy.astype(np.float32)

        # Running batch 1 inference

        image = np.ascontiguousarray(image_cpy / 255)  # Normalizing the image
        # Use the input name reported by the session instead of a hard-coded "images"
        prediction = session.run(outname, {inname[0]: image})

        # (dw, dh, dw, dh): padding to remove from (x0, y0, x1, y1)
        pad_offsets = np.array(dwdh * 2)

        for prediction_array in prediction:

            for (batch_id, x0, y0, x1, y1, cls_id, score) in prediction_array:
                # Coordinates are of top left and bottom right

                if score < score_threshold:
                    continue

                class_name = all_classes[int(cls_id)]

                if class_name not in classes:
                    continue

                class_color = colors[class_name]

                # Reversing the paddings and other transformations applied during letterbox
                box = np.array([x0, y0, x1, y1])
                box -= pad_offsets
                box /= ratio
                box = box.round().astype(np.int32).tolist()

                top_left, bottom_right = tuple(box[:2]), tuple(box[2:])
                cv2.rectangle(frame, top_left, bottom_right, class_color, thickness)
                cv2.putText(frame, class_name, top_left, font, font_scale, class_color, thickness)

        cv2.imshow("Live Footage", frame)
        img_counter += 1

        # Mask to the low byte so the comparison with ord("q") also holds when
        # waitKey returns extra high bits
        if (cv2.waitKey(1) & 0xFF) == ord("q"):
            break
finally:
    end_time = time.time()
    if img_counter:
        print("Avg frame processing time (time taken/ frames processed): ", (end_time - start_time) / img_counter)
    capture.release()
    cv2.destroyAllWindows()

Generated random colors for object bounding boxes
Avg frame processing time:  0.1386938817360822


In [None]:
# 0.16311155995236168 sec, all
# 0.163175533979367 sec, just 2 classes