In [1]:
import cv2
import argparse

from ultralytics import YOLO
import supervision as sv
import numpy as np

def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="YOLOv8 live")
    parser.add_argument(
        "--webcam-resolution", 
        default=[1280, 720], 
        nargs=2, 
        type=int
    )
    args = parser.parse_args(args=[])
    return args


def main():
    args = parse_arguments()
    frame_width, frame_height = args.webcam_resolution

    cap = cv2.VideoCapture(0)

    cap.set(cv2.CAP_PROP_FRAME_WIDTH, frame_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, frame_height)

    model = YOLO("yolov8n.pt")

    box_annotator = sv.BoxAnnotator(
        thickness=2,
        text_thickness=2,
        text_scale=1
    )

    while True:
        ret, frame = cap.read()
        result = model(frame, agnostic_nms=True)[0]
        detections = sv.Detections.from_yolov8(result)
        print(detections)
        labels = [
            f"{model.model.names[class_id]} {confidence:0.2f}"
            for _, _, confidence, class_id, _
            in detections
        ]
        frame = box_annotator.annotate(
            scene=frame, 
            detections=detections, 
            labels=labels
        )
        fps=cv2.CAP_PROP_FPS
        print(detections)
        cv2.putText(frame, 'fps:'.format(fps), (100,100), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2, cv2.LINE_AA)
        cv2.imshow("yolov8", frame)

        if (cv2.waitKey(30) == 27):
            break


if __name__ == "__main__":
    main()


0: 384x640 4 persons, 196.5ms
Speed: 27.4ms preprocess, 196.5ms inference, 744.1ms postprocess per image at shape (1, 3, 384, 640)



Detections(xyxy=array([[     968.82,      576.72,        1161,         720],
       [     221.08,       621.6,      402.32,      719.92],
       [     563.24,      227.37,      1083.8,      716.14],
       [     227.58,      222.81,      1138.9,         720]], dtype=float32), mask=None, confidence=array([    0.66223,     0.59198,      0.5892,     0.25179], dtype=float32), class_id=array([0, 0, 0, 0]), tracker_id=None)
Detections(xyxy=array([[     968.82,      576.72,        1161,         720],
       [     221.08,       621.6,      402.32,      719.92],
       [     563.24,      227.37,      1083.8,      716.14],
       [     227.58,      222.81,      1138.9,         720]], dtype=float32), mask=None, confidence=array([    0.66223,     0.59198,      0.5892,     0.25179], dtype=float32), class_id=array([0, 0, 0, 0]), tracker_id=None)


0: 384x640 2 persons, 24.5ms
Speed: 11.2ms preprocess, 24.5ms inference, 15.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 30.6ms
Speed: 5.1ms preprocess, 30.6ms inference, 8.4ms postprocess per image at shape (1, 3, 384, 640)



Detections(xyxy=array([[      562.6,      225.85,      1134.2,      715.76],
       [     221.55,      621.63,      403.36,         720]], dtype=float32), mask=None, confidence=array([    0.74987,     0.71944], dtype=float32), class_id=array([0, 0]), tracker_id=None)
Detections(xyxy=array([[      562.6,      225.85,      1134.2,      715.76],
       [     221.55,      621.63,      403.36,         720]], dtype=float32), mask=None, confidence=array([    0.74987,     0.71944], dtype=float32), class_id=array([0, 0]), tracker_id=None)
Detections(xyxy=array([[     223.59,      621.65,      402.81,         720],
       [     559.89,      228.18,        1012,      715.53],
       [     940.83,      583.57,        1133,         720]], dtype=float32), mask=None, confidence=array([     0.7158,      0.7079,     0.57925], dtype=float32), class_id=array([0, 0, 0]), tracker_id=None)
Detections(xyxy=array([[     223.59,      621.65,      402.81,         720],
       [     559.89,      228.18,        1

0: 384x640 2 persons, 42.2ms
Speed: 0.0ms preprocess, 42.2ms inference, 8.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 27.4ms
Speed: 6.3ms preprocess, 27.4ms inference, 8.1ms postprocess per image at shape (1, 3, 384, 640)



Detections(xyxy=array([[     560.31,      228.05,        1133,      715.75],
       [     221.62,      621.57,      402.96,         720]], dtype=float32), mask=None, confidence=array([    0.69827,     0.69384], dtype=float32), class_id=array([0, 0]), tracker_id=None)
Detections(xyxy=array([[     560.31,      228.05,        1133,      715.75],
       [     221.62,      621.57,      402.96,         720]], dtype=float32), mask=None, confidence=array([    0.69827,     0.69384], dtype=float32), class_id=array([0, 0]), tracker_id=None)
Detections(xyxy=array([[     222.21,       621.5,      403.16,         720],
       [      560.1,      228.43,      1083.6,      715.53],
       [     942.36,      583.79,      1139.2,      719.74]], dtype=float32), mask=None, confidence=array([    0.68814,     0.67547,     0.30789], dtype=float32), class_id=array([0, 0, 0]), tracker_id=None)
Detections(xyxy=array([[     222.21,       621.5,      403.16,         720],
       [      560.1,      228.43,      108

0: 384x640 3 persons, 38.0ms
Speed: 0.0ms preprocess, 38.0ms inference, 8.0ms postprocess per image at shape (1, 3, 384, 640)



Detections(xyxy=array([[     221.22,      621.48,      402.86,         720],
       [     560.15,      228.56,      1010.8,         715],
       [     944.05,      583.04,      1139.8,         720]], dtype=float32), mask=None, confidence=array([    0.72203,     0.66212,     0.56252], dtype=float32), class_id=array([0, 0, 0]), tracker_id=None)
Detections(xyxy=array([[     221.22,      621.48,      402.86,         720],
       [     560.15,      228.56,      1010.8,         715],
       [     944.05,      583.04,      1139.8,         720]], dtype=float32), mask=None, confidence=array([    0.72203,     0.66212,     0.56252], dtype=float32), class_id=array([0, 0, 0]), tracker_id=None)


KeyboardInterrupt: 