In [None]:
import cv2
import argparse

from ultralytics import YOLO
import supervision as sv
import numpy as np

def parse_arguments() -> argparse.Namespace:
    """Build the option namespace for the live YOLOv8 demo.

    The parser is always fed an empty argument list, so the real process
    command line is never inspected and the declared defaults are what the
    caller receives.

    Returns:
        Namespace whose ``webcam_resolution`` attribute holds two integers
        (``[1280, 720]`` by default).
    """
    cli = argparse.ArgumentParser(description="YOLOv8 live")
    resolution_option = {"default": [1280, 720], "nargs": 2, "type": int}
    cli.add_argument("--webcam-resolution", **resolution_option)
    # Explicit empty argv: sys.argv is intentionally not consulted.
    return cli.parse_args(args=[])


def main():
    """Run YOLOv8 detection on webcam frames and display annotated output.

    Opens capture device 0, requests the resolution returned by
    ``parse_arguments()``, runs the ``yolov8n.pt`` model on each frame and
    shows the annotated frame in a window titled ``"yolov8"``. The loop ends
    when Esc (key code 27) is pressed or when a frame can no longer be read.

    Raises:
        RuntimeError: If the capture device cannot be opened.
    """
    args = parse_arguments()
    frame_width, frame_height = args.webcam_resolution

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open webcam (device index 0)")

        cap.set(cv2.CAP_PROP_FRAME_WIDTH, frame_width)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_height)

        model = YOLO("yolov8n.pt")
        # Class-id -> name mapping does not change between frames.
        class_names = model.model.names

        box_annotator = sv.BoxAnnotator(
            thickness=2,
            text_thickness=2,
            text_scale=1,
        )

        while True:
            ret, frame = cap.read()
            if not ret:
                # Grab failed (device unplugged / stream ended): stop instead
                # of handing an invalid frame to the model and annotator.
                break

            result = model(frame, agnostic_nms=True)[0]
            detections = sv.Detections.from_yolov8(result)
            # NOTE(review): the 5-field unpacking matches what iterating
            # `detections` yields in the supervision version this was written
            # for -- confirm against the installed version.
            labels = [
                f"{class_names[class_id]} {confidence:0.2f}"
                for _, _, confidence, class_id, _
                in detections
            ]
            frame = box_annotator.annotate(
                scene=frame,
                detections=detections,
                labels=labels,
            )

            cv2.imshow("yolov8", frame)

            # Mask to the low byte so the Esc comparison is not affected by
            # any higher bits in the returned key code.
            if (cv2.waitKey(30) & 0xFF) == 27:
                break
    finally:
        # Always free the camera and close the preview window, including
        # when an exception interrupts the loop.
        cap.release()
        cv2.destroyAllWindows()


# Start the live detection loop only when this code is executed directly
# (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


0: 384x640 1 person, 190.0ms
Speed: 4.6ms preprocess, 190.0ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 164.9ms
Speed: 4.2ms preprocess, 164.9ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 170.5ms
Speed: 6.1ms preprocess, 170.5ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 153.2ms
Speed: 4.4ms preprocess, 153.2ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 149.8ms
Speed: 3.6ms preprocess, 149.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 156.9ms
Speed: 6.9ms preprocess, 156.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 144.2ms
Speed: 4.6ms preprocess, 144.2ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 163.6ms
Speed: 3.8ms preprocess, 163.6ms inference, 0.0ms postprocess per image at


0: 384x640 1 person, 127.7ms
Speed: 4.5ms preprocess, 127.7ms inference, 3.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 130.2ms
Speed: 2.9ms preprocess, 130.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 132.6ms
Speed: 2.8ms preprocess, 132.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 129.7ms
Speed: 3.0ms preprocess, 129.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 127.0ms
Speed: 3.4ms preprocess, 127.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 127.6ms
Speed: 3.9ms preprocess, 127.6ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 129.1ms
Speed: 3.7ms preprocess, 129.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 130.3ms
Speed: 2.5ms preprocess, 130.3ms inference, 3.4ms postprocess per image a


0: 384x640 1 person, 135.3ms
Speed: 1.6ms preprocess, 135.3ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 206.9ms
Speed: 2.8ms preprocess, 206.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 124.1ms
Speed: 5.0ms preprocess, 124.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 131.7ms
Speed: 3.7ms preprocess, 131.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 131.9ms
Speed: 4.7ms preprocess, 131.9ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 126.8ms
Speed: 3.0ms preprocess, 126.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 125.4ms
Speed: 3.4ms preprocess, 125.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 128.5ms
Speed: 4.7ms preprocess, 128.5ms inference, 0.0ms postprocess per image at


0: 384x640 1 person, 1 cell phone, 137.0ms
Speed: 3.6ms preprocess, 137.0ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 129.9ms
Speed: 3.7ms preprocess, 129.9ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 125.6ms
Speed: 2.1ms preprocess, 125.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 128.2ms
Speed: 3.6ms preprocess, 128.2ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 125.2ms
Speed: 5.1ms preprocess, 125.2ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 128.5ms
Speed: 2.2ms preprocess, 128.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 cell phone, 124.8ms
Speed: 3.2ms preprocess, 124.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 38