# Detection Methods

In [1]:
from collections import OrderedDict
from PIL import ImageColor

import json
import cv2

In [21]:
class Point:
    def __init__(self, raw_point):
        self.x = raw_point[0]
        self.y = raw_point[1]

        def to_string(self): '(' + str(self.x) + ', ' + str(self.y) + ')'

        def to_dict(self): return {'x': self.x, 'y': self.y}


class Box:
    def __init__(self, class_name, confidence, raw_corner_points, color, track_id=None):
        self.class_name = class_name
        self.confidence = confidence
        self.raw_corner_points = raw_corner_points
        self.top_left_point = Point(raw_corner_points[0])
        self.bottom_right_point = Point(raw_corner_points[1])
        self.width = self.bottom_right_point.x - self.top_left_point.x
        self.height = self.bottom_right_point.y - self.top_left_point.y
        self.color = color
        self.track_id = track_id

    def to_dict(self):
        box = OrderedDict([
            ('class', self.class_name),
            ('confidence', self.confidence),
            ('x', self.top_left_point.x),
            ('y', self.top_left_point.y),
            ('width', self.width),
            ('height', self.height),
            ('color', self.color)
        ])
        if self.track_id is not None:
            box['id'] = self.track_id
        return box


class Detections:
    def __init__(self, raw_detection, classes, tracking=False):
        self.__raw_detection = raw_detection
        self.__classes = classes
        self.__boxes = []
        self.__tracking = tracking
        self.__point1_index = 0
        self.__point2_index = 1
        self.__point3_index = 2
        self.__point4_index = 3
        self.__tracking_index = 4
        self.__class_index = 5 if tracking else 5
        self.__confidence_index = 6 if tracking else 4
        self.__extract_boxes()

    def __extract_boxes(self):
        for raw_box in self.__raw_detection:
            track_id = None
            if self.__tracking:
                track_id = int(raw_box[self.__tracking_index])
            class_id = int(raw_box[self.__class_index])
            raw_corner_points = (int(raw_box[self.__point1_index]), int(raw_box[self.__point2_index])), (
                int(raw_box[self.__point3_index]), int(raw_box[self.__point4_index]))
            confidence = raw_box[self.__confidence_index]
            dataset_class = self.__classes[class_id]
            class_name = dataset_class['name']
            class_color = dataset_class['color']
            box = Box(class_name, confidence, raw_corner_points, class_color, track_id=track_id)
            self.__boxes.append(box)

    def get_boxes(self):
        return self.__boxes

    def to_dict(self):
        boxes = []
        for box in self.__boxes:
            boxes.append(box.to_dict())
        return boxes

    def to_json(self):
        boxes = self.to_dict()
        return json.dumps(boxes, indent=4)


def plot_box(image, top_left_point, bottom_right_point, width, height, label, num_people, color=(210, 240, 0), padding=6,
             font_scale=0.35):
    label = label.upper()

    cv2.rectangle(image, (top_left_point['x'] - 1, top_left_point['y']),
                  (bottom_right_point['x'], bottom_right_point['y']), color, thickness=2, lineType=cv2.LINE_AA)
    res_scale = (image.shape[0] + image.shape[1]) / 1600
    font_scale = font_scale * res_scale
    font_width, font_height = 0, 0
    font_face = cv2.FONT_HERSHEY_DUPLEX
    text_size = cv2.getTextSize(label, font_face, fontScale=font_scale, thickness=1)[0]

    if text_size[0] > font_width:
        font_width = text_size[0]
    if text_size[1] > font_height:
        font_height = text_size[1]
    if top_left_point['x'] - 1 < 0:
        top_left_point['x'] = 1
    if top_left_point['x'] + font_width + padding * 2 > image.shape[1]:
        top_left_point['x'] = image.shape[1] - font_width - padding * 2
    if top_left_point['y'] - font_height - padding * 2 < 0:
        top_left_point['y'] = font_height + padding * 2

    p3 = top_left_point['x'] + font_width + padding * 2, top_left_point['y'] - font_height - padding * 2
    cv2.rectangle(image, (top_left_point['x'] - 2, top_left_point['y']), p3, color, -1, lineType=cv2.LINE_AA)
    x = top_left_point['x'] + padding
    y = top_left_point['y'] - padding
    cv2.putText(image, label, (x, y), font_face, font_scale, [0, 0, 0], thickness=1, lineType=cv2.LINE_AA)
    return image

def draw(image, detections):
    image_copy = image.copy()
    for box in detections:
        class_name = box['class']
        conf = box['confidence']
        text = ''
        if 'text' in box:
            text = box['text']
            if len(text) > 50:
                text = text[:50] + ' ...'
        label = (str(box['id']) + '. ' if 'id' in box else '') + class_name + ' ' + str(int(conf * 100)) + '%' + (
            (' | ' + text) if ('text' in box and len(box['text']) > 0 and not box['text'].isspace()) else '')
        width = box['width']
        height = box['height']
        color = box['color']

        if isinstance(color, str):
            color = ImageColor.getrgb(color)
            color = (color[2], color[1], color[0])

        top_left_point = {'x': box['x'], 'y': box['y']}
        bottom_right_point = {'x': box['x'] + width, 'y': box['y'] + height}
        image_copy = plot_box(image_copy, top_left_point, bottom_right_point, width, height, label, color=color)
    return image_copy

def counter(image, detections):
    num_people = 0
    for box in detections:
        if box['class'] == 'person':
            num_people += 1

    cv2.putText(image, str(num_people), (50, 75), fontScale=2, fontFace=cv2.FONT_HERSHEY_DUPLEX, color=(0, 0, 255), thickness=1, lineType=cv2.LINE_AA)
    return image

# Object Detector

In [22]:
import argparse
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import sys

import warnings

warnings.filterwarnings("ignore")

import cv2
from tqdm import tqdm

from detections import draw
from obj_detector import YOLOv7

sys.path.insert(0, './yolov7')
sys.argv=['']
del sys


def track_objects(weights, classes, device, video, output):
    yolov7 = YOLOv7()
    yolov7.load(weights_path=weights, classes=classes, device=device)

    vid = cv2.VideoCapture(video)
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    frames_count = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    output = cv2.VideoWriter(output, fourcc, fps, (width, height))

    if vid.isOpened() == False:
        print('Error Opening Video!')

    print('Tracking Objects...\n')
    pbar = tqdm(total=frames_count, unit=' frames', dynamic_ncols=True, position=0, leave=True)

    try:
        while vid.isOpened():
            ret, frame = vid.read()
            if ret:
                detections = yolov7.detect(frame, track=True)
                detected_frame = draw(frame, detections)
                detected_frame = counter(detected_frame, detections)
                output.write(detected_frame)
                pbar.update(1)
            else:
                break
    except KeyboardInterrupt:
        pass

    pbar.close()
    vid.release()
    output.release()
    yolov7.unload()


WEIGHTS_PATH = os.path.join(os.getcwd(), 'yolov7/yolov7.pt')
CLASSES_PATH = os.path.join(os.getcwd(), 'track_classes.yaml')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='track_objects.py')
    parser.add_argument('--weights', nargs='+', type=str, default='yolov7/yolov7.pt', help='model.pt path(s)')
    parser.add_argument('--classes', type=str, default='track_classes.yaml',
                        help='YAML File of Object(s) to Track (See '
                             'Documentation)')
    parser.add_argument('--device', type=str, default='cpu', help='CPU or GPU')
    parser.add_argument('--video', type=str, help='Video Path')
    parser.add_argument('--output', type=str, help='(output_video_name).mp4')
    opt = parser.parse_args()
    print(opt)

    track_objects(opt.weights, opt.classes, opt.device, opt.video, opt.output)


Namespace(weights='yolov7/yolov7.pt', classes='track_classes.yaml', device='cpu', video=None, output=None)
Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
Error Opening Video!
Tracking Objects...



0 frames [00:00, ? frames/s]


Note that in the cell block above, the block is executed by cleaning `sys.argv` by evaluating 
`import sys; sys.argv=['']; del sys`

For single frame inferences

In [23]:
import argparse
import sys

import warnings
warnings.filterwarnings("ignore")

import cv2

from detections import draw
from obj_detector import YOLOv7

sys.path.insert(0, './yolov7')
sys.argv=['']
del sys


def track_single_frame(weights, classes, device, image_file, output):
    yolov7 = YOLOv7()
    yolov7.load(weights_path=weights, classes=classes, device=device)

    image = cv2.imread(image_file)
    detections = yolov7.detect(image, track=True)
    detected_frame = draw(image, detections)
    detected_frame = counter(detected_frame, detections)

    num_people = 0
    for detected_obj in detections:
        if detected_obj['class'] == 'person':
            num_people += 1

    print(f'Number of People in Frame: {num_people}')

    cv2.imwrite(output, detected_frame)

    print(detections)
    yolov7.unload()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='track_single_frame.py')
    parser.add_argument('--weights', nargs='+', type=str, default='yolov7/yolov7.pt', help='model.pt path(s)')
    parser.add_argument('--classes', type=str, default='track_classes.yaml',
                        help='YAML File of Object(s) to Track (See '
                             'Documentation)')
    parser.add_argument('--device', type=str, default='cpu', help='CPU or GPU')
    parser.add_argument('--image_file', type=str, default='test_frame.png', help='Image File Path')
    parser.add_argument('--output', type=str, default='detected_test_frame.png', help='(output_image_name).png')
    opt = parser.parse_args()
    print(opt)

    track_single_frame(opt.weights, opt.classes, opt.device, opt.image_file, opt.output)


Namespace(weights='yolov7/yolov7.pt', classes='track_classes.yaml', device='cpu', image_file='test_frame.png', output='detected_test_frame.png')
Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
Number of People in Frame: 1
[OrderedDict([('class', 'person'), ('confidence', 0.63), ('x', 0), ('y', 670), ('width', 63), ('height', 132), ('color', '#00ffc3'), ('id', 17)]), OrderedDict([('class', 'car'), ('confidence', 0.94), ('x', 618), ('y', 467), ('width', 250), ('height', 179), ('color', '#00ffc3'), ('id', 18)])]


We now have three pythons scripts (four including the main object detector class) that can be used via command-line interfaces to perform object tracking on videos with the script being easily modifiable to switch to a live-video feed for analytics. For more single frame inferences, the `track_single_frame.py` script can be utilized.