In [18]:
import cv2
import supervision as sv
from ultralytics import YOLO


def main():
    
    # to save the video
    writer= cv2.VideoWriter('webcam_yolo.mp4', 
                            cv2.VideoWriter_fourcc(*'DIVX'), 
                            7, 
                            (1280, 720))
    
    # define resolution
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    # specify the model
    model = YOLO("yolov8n.pt")

    # customize the bounding box
    box_annotator = sv.BoxAnnotator(
        thickness=2,
        text_thickness=2,
        text_scale=1
    )

    # create the output file
    with open('yolo_labels.txt', 'w') as f:
        f.write('Detected labels:\n')
    
    while True:
        ret, frame = cap.read()
        result = model(frame, agnostic_nms=True)[0]
        detections = sv.Detections.from_yolov8(result)
        labels = [
            f"{model.model.names[class_id]} {confidence:0.2f}"
            for _, confidence, class_id, _
            in detections
        ]
        frame = box_annotator.annotate(
            scene=frame, 
            detections=detections, 
            labels=labels
        ) 
        
        writer.write(frame)
        
        cv2.imshow("yolov8", frame)

        # save labels to file
        with open('yolo_labels.txt', 'a') as f:
            for label in labels:
                f.write(label + '\n')
        
        if (cv2.waitKey(30) == 27): # break with escape key
            break
            
    cap.release()
    writer.release()
    cv2.destroyAllWindows()
    
if __name__ == "__main__":
    main()


0: 384x640 2 persons, 136.2ms
Speed: 1.0ms preprocess, 136.2ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 139.7ms
Speed: 1.9ms preprocess, 139.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 128.0ms
Speed: 0.7ms preprocess, 128.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 144.2ms
Speed: 1.9ms preprocess, 144.2ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 132.2ms
Speed: 1.3ms preprocess, 132.2ms inference, 4.3ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 123.2ms
Speed: 1.1ms preprocess, 123.2ms inference, 3.3ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 123.3ms
Speed: 1.7ms preprocess, 123.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 131.7ms
Speed: 2.0ms preprocess, 131.7ms inference, 3.0ms postprocess per 

Speed: 2.6ms preprocess, 123.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cell phone, 121.8ms
Speed: 0.5ms preprocess, 121.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 109.7ms
Speed: 1.0ms preprocess, 109.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 114.1ms
Speed: 1.3ms preprocess, 114.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cell phone, 111.6ms
Speed: 1.0ms preprocess, 111.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 123.5ms
Speed: 1.5ms preprocess, 123.5ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4 persons, 1 cell phone, 105.2ms
Speed: 1.0ms preprocess, 105.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 116.5ms
Speed: 1.8ms 


0: 384x640 3 persons, 1 cell phone, 136.4ms
Speed: 1.2ms preprocess, 136.4ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 118.5ms
Speed: 1.3ms preprocess, 118.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 121.2ms
Speed: 1.6ms preprocess, 121.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 136.2ms
Speed: 1.1ms preprocess, 136.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 121.8ms
Speed: 0.5ms preprocess, 121.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 124.4ms
Speed: 1.0ms preprocess, 124.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4 persons, 120.9ms
Speed: 1.6ms preprocess, 120.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4

Speed: 0.5ms preprocess, 125.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cell phone, 121.6ms
Speed: 1.6ms preprocess, 121.6ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 cell phone, 102.7ms
Speed: 1.1ms preprocess, 102.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 110.6ms
Speed: 1.4ms preprocess, 110.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 129.5ms
Speed: 1.2ms preprocess, 129.5ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 121.6ms
Speed: 1.0ms preprocess, 121.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 122.9ms
Speed: 2.2ms preprocess, 122.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 125.1ms
Speed: 1.0ms preprocess, 125.1ms inference, 3.0ms postprocess per imag


0: 384x640 3 persons, 1 cat, 1 cell phone, 118.7ms
Speed: 0.6ms preprocess, 118.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 119.4ms
Speed: 1.6ms preprocess, 119.4ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4 persons, 124.1ms
Speed: 0.5ms preprocess, 124.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cat, 1 laptop, 113.4ms
Speed: 1.3ms preprocess, 113.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cat, 108.8ms
Speed: 1.0ms preprocess, 108.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cat, 1 laptop, 120.5ms
Speed: 0.5ms preprocess, 120.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cat, 1 laptop, 107.8ms
Speed: 1.2ms preprocess, 107.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 pers


0: 384x640 2 persons, 1 cup, 111.9ms
Speed: 2.1ms preprocess, 111.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 refrigerator, 113.1ms
Speed: 1.9ms preprocess, 113.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 111.5ms
Speed: 1.2ms preprocess, 111.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 108.1ms
Speed: 1.2ms preprocess, 108.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 benchs, 110.7ms
Speed: 1.6ms preprocess, 110.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 cat, 112.4ms
Speed: 1.7ms preprocess, 112.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 cat, 132.6ms
Speed: 0.8ms preprocess, 132.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 cat, 107.7ms
Speed: 1.5ms preprocess, 1

Speed: 1.4ms preprocess, 112.7ms inference, 3.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 toilet, 116.3ms
Speed: 1.2ms preprocess, 116.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 toilet, 116.6ms
Speed: 1.1ms preprocess, 116.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 110.5ms
Speed: 1.6ms preprocess, 110.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 123.3ms
Speed: 1.1ms preprocess, 123.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 123.6ms
Speed: 1.3ms preprocess, 123.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 refrigerator, 117.3ms
Speed: 1.2ms preprocess, 117.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 refrigerator, 117.8ms
Speed: 1.9ms preprocess, 117.8ms inference, 2.0ms postprocess per im


0: 384x640 1 bicycle, 1 toilet, 135.9ms
Speed: 0.8ms preprocess, 135.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 125.2ms
Speed: 1.1ms preprocess, 125.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 1 toilet, 129.2ms
Speed: 1.5ms preprocess, 129.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 129.9ms
Speed: 1.0ms preprocess, 129.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 129.0ms
Speed: 1.6ms preprocess, 129.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 1 toilet, 125.5ms
Speed: 1.4ms preprocess, 125.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 141.1ms
Speed: 2.0ms preprocess, 141.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 bicycle, 1 toilet, 129.7ms
Speed: 1.1ms preprocess, 1


0: 384x640 2 persons, 133.8ms
Speed: 1.0ms preprocess, 133.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 121.3ms
Speed: 1.1ms preprocess, 121.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 106.7ms
Speed: 1.2ms preprocess, 106.7ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 126.9ms
Speed: 0.5ms preprocess, 126.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 119.9ms
Speed: 1.1ms preprocess, 119.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 118.1ms
Speed: 0.5ms preprocess, 118.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 112.2ms
Speed: 1.0ms preprocess, 112.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 (no detections), 132.8ms
Speed: 1.1ms preprocess, 132.8ms inference, 1.0ms postproces


0: 384x640 1 person, 114.7ms
Speed: 0.9ms preprocess, 114.7ms inference, 3.4ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 sports ball, 123.4ms
Speed: 1.1ms preprocess, 123.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 111.0ms
Speed: 1.4ms preprocess, 111.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 sports ball, 105.1ms
Speed: 2.0ms preprocess, 105.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 fork, 126.9ms
Speed: 1.0ms preprocess, 126.9ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 115.3ms
Speed: 1.7ms preprocess, 115.3ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 person, 1 fork, 113.0ms
Speed: 0.8ms preprocess, 113.0ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 131.2ms
Speed: 1.8ms preprocess


0: 384x640 2 persons, 1 bottle, 112.1ms
Speed: 1.7ms preprocess, 112.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 126.4ms
Speed: 1.1ms preprocess, 126.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 118.2ms
Speed: 1.4ms preprocess, 118.2ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 122.5ms
Speed: 1.0ms preprocess, 122.5ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 115.0ms
Speed: 1.0ms preprocess, 115.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 130.7ms
Speed: 1.0ms preprocess, 130.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 123.0ms
Speed: 1.0ms preprocess, 123.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bo

Speed: 2.2ms preprocess, 123.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 120.0ms
Speed: 1.1ms preprocess, 120.0ms inference, 3.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 134.7ms
Speed: 1.0ms preprocess, 134.7ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 121.6ms
Speed: 1.3ms preprocess, 121.6ms inference, 3.3ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 121.2ms
Speed: 0.5ms preprocess, 121.2ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 107.7ms
Speed: 1.2ms preprocess, 107.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 121.2ms
Speed: 1.0ms preprocess, 121.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 115.7ms
Speed: 1.4ms preprocess, 115.7ms inference, 1.0ms postprocess per i

Speed: 1.0ms preprocess, 109.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4 persons, 125.1ms
Speed: 1.1ms preprocess, 125.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 5 persons, 115.2ms
Speed: 1.0ms preprocess, 115.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 5 persons, 111.6ms
Speed: 1.2ms preprocess, 111.6ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 4 persons, 1 cell phone, 120.0ms
Speed: 0.6ms preprocess, 120.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 103.3ms
Speed: 1.4ms preprocess, 103.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 119.1ms
Speed: 1.0ms preprocess, 119.1ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 115.0ms
Speed: 0.9ms preprocess, 115.0ms inference, 3.0ms postprocess per image at shape (1

Speed: 1.7ms preprocess, 120.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 117.3ms
Speed: 1.0ms preprocess, 117.3ms inference, 2.9ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 105.8ms
Speed: 1.0ms preprocess, 105.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 113.0ms
Speed: 1.6ms preprocess, 113.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 132.6ms
Speed: 1.2ms preprocess, 132.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 126.3ms
Speed: 1.3ms preprocess, 126.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 bottle, 130.3ms
Speed: 1.0ms preprocess, 130.3ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 122.6ms
Speed: 0.5ms preprocess, 12

Speed: 1.9ms preprocess, 120.4ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 cup, 128.3ms
Speed: 0.5ms preprocess, 128.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 135.4ms
Speed: 1.6ms preprocess, 135.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 114.0ms
Speed: 1.0ms preprocess, 114.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 111.2ms
Speed: 1.1ms preprocess, 111.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 114.7ms
Speed: 1.4ms preprocess, 114.7ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 119.6ms
Speed: 1.2ms preprocess, 119.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 116.5ms
Speed: 1.0ms preprocess, 116.5ms inference, 2.0ms postprocess per image at shape (1, 3, 64


0: 384x640 2 persons, 1 bottle, 114.5ms
Speed: 1.0ms preprocess, 114.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 122.3ms
Speed: 1.5ms preprocess, 122.3ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 113.1ms
Speed: 1.9ms preprocess, 113.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 131.9ms
Speed: 1.1ms preprocess, 131.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 113.1ms
Speed: 1.3ms preprocess, 113.1ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 128.4ms
Speed: 1.1ms preprocess, 128.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 106.0ms
Speed: 1.3ms preprocess, 106.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bo

Speed: 1.1ms preprocess, 106.9ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 115.5ms
Speed: 1.0ms preprocess, 115.5ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 111.7ms
Speed: 1.0ms preprocess, 111.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 108.8ms
Speed: 0.5ms preprocess, 108.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 125.3ms
Speed: 0.9ms preprocess, 125.3ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 107.4ms
Speed: 1.3ms preprocess, 107.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 1 cell phone, 112.2ms
Speed: 1.0ms preprocess, 112.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 1 cell phone, 115.4ms


0: 384x640 2 persons, 1 bottle, 110.8ms
Speed: 1.0ms preprocess, 110.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 126.3ms
Speed: 2.0ms preprocess, 126.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 107.8ms
Speed: 1.2ms preprocess, 107.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 128.8ms
Speed: 1.0ms preprocess, 128.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 105.8ms
Speed: 1.3ms preprocess, 105.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 138.4ms
Speed: 1.6ms preprocess, 138.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 122.2ms
Speed: 1.5ms preprocess, 122.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bo


0: 384x640 2 persons, 1 bottle, 113.8ms
Speed: 1.3ms preprocess, 113.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 127.6ms
Speed: 1.0ms preprocess, 127.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 121.4ms
Speed: 1.2ms preprocess, 121.4ms inference, 3.6ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 121.8ms
Speed: 1.4ms preprocess, 121.8ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 110.7ms
Speed: 1.1ms preprocess, 110.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 143.8ms
Speed: 1.0ms preprocess, 143.8ms inference, 3.7ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 105.1ms
Speed: 1.4ms preprocess, 105.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bo

Speed: 1.9ms preprocess, 130.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 119.5ms
Speed: 1.1ms preprocess, 119.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 133.7ms
Speed: 1.0ms preprocess, 133.7ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 116.1ms
Speed: 2.0ms preprocess, 116.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 120.6ms
Speed: 0.5ms preprocess, 120.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 121.0ms
Speed: 1.0ms preprocess, 121.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 122.8ms
Speed: 1.5ms preprocess, 122.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 3 persons, 1 bottle, 115.5ms
Speed: 0.9ms preprocess, 11

Speed: 0.9ms preprocess, 106.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 118.8ms
Speed: 1.0ms preprocess, 118.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 116.4ms
Speed: 1.1ms preprocess, 116.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 108.1ms
Speed: 1.6ms preprocess, 108.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 113.1ms
Speed: 1.0ms preprocess, 113.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 105.2ms
Speed: 1.0ms preprocess, 105.2ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 110.5ms
Speed: 1.3ms preprocess, 110.5ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 2 persons, 1 bottle, 124.7ms
Speed: 1.0ms preprocess, 12

In [3]:
import cv2
import supervision as sv
from ultralytics import YOLO


def main():
    
    # to save the video
    writer = cv2.VideoWriter('webcam_yolo.mp4', 
                             cv2.VideoWriter_fourcc(*'DIVX'), 
                             7, 
                             (1280, 720))
    
    # define the video file
    video_file = "test_video.mp4"
    
    # open the video file
    cap = cv2.VideoCapture(video_file)
    
    # specify the model
    model = YOLO("yolov5s.pt")
    
    # customize the bounding box
    box_annotator = sv.BoxAnnotator(
        thickness=2,
        text_thickness=2,
        text_scale=1
    )
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # process the frame for object detection
        result = model(frame, agnostic=True)
        detections = sv.Detections.from_yolov5(result.pandas().xyxy[0].values)
        labels = [
            f"{model.model.names[int(class_id)]} {confidence:0.2f}"
            for _, confidence, class_id, _
            in detections
        ]
        frame = box_annotator.annotate(
            scene=frame, 
            detections=detections, 
            labels=labels
        )
        
        # save the annotated frame
        writer.write(frame)
        
        # show the annotated frame
        cv2.imshow("yolov5", frame)
        
        if cv2.waitKey(1) == ord("q"):
            break
    
    cap.release()
    writer.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()

PRO TIP  Replace 'model=yolov5s.pt' with new 'model=yolov5su.pt'.
YOLOv5 'u' models are trained with https://github.com/ultralytics/ultralytics and feature improved performance vs standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.



In [1]:
import cv2
import math
import argparse

def highlightFace(net, frame, conf_threshold=0.7):
    frameOpencvDnn=frame.copy()
    frameHeight=frameOpencvDnn.shape[0]
    frameWidth=frameOpencvDnn.shape[1]
    blob=cv2.dnn.blobFromImage(frameOpencvDnn, 1.0, (300, 300), [104, 117, 123], True, False)

    net.setInput(blob)
    detections=net.forward()
    faceBoxes=[]
    for i in range(detections.shape[2]):
        confidence=detections[0,0,i,2]
        if confidence>conf_threshold:
            x1=int(detections[0,0,i,3]*frameWidth)
            y1=int(detections[0,0,i,4]*frameHeight)
            x2=int(detections[0,0,i,5]*frameWidth)
            y2=int(detections[0,0,i,6]*frameHeight)
            faceBoxes.append([x1,y1,x2,y2])
            cv2.rectangle(frameOpencvDnn, (x1,y1), (x2,y2), (0,255,0), int(round(frameHeight/150)), 8)
    return frameOpencvDnn,faceBoxes


parser=argparse.ArgumentParser()
parser.add_argument('--image')

args=parser.parse_args()

faceProto="opencv_face_detector.pbtxt"
faceModel="opencv_face_detector_uint8.pb"
ageProto="age_deploy.prototxt"
ageModel="age_net.caffemodel"
genderProto="gender_deploy.prototxt"
genderModel="gender_net.caffemodel"

MODEL_MEAN_VALUES=(78.4263377603, 87.7689143744, 114.895847746)
ageList=['(0-2)', '(4-6)', '(8-12)', '(15-20)', '(25-32)', '(38-43)', '(48-53)', '(60-100)']
genderList=['Male','Female']

faceNet=cv2.dnn.readNet(faceModel,faceProto)
ageNet=cv2.dnn.readNet(ageModel,ageProto)
genderNet=cv2.dnn.readNet(genderModel,genderProto)

video=cv2.VideoCapture(args.image if args.image else 0)
padding=20
while cv2.waitKey(1)<0:
    hasFrame,frame=video.read()
    if not hasFrame:
        cv2.waitKey()
        break

    resultImg,faceBoxes=highlightFace(faceNet,frame)
    if not faceBoxes:
        print("No face detected")

    for faceBox in faceBoxes:
        face=frame[max(0,faceBox[1]-padding):
                   min(faceBox[3]+padding,frame.shape[0]-1),max(0,faceBox[0]-padding)
                   :min(faceBox[2]+padding, frame.shape[1]-1)]

        blob=cv2.dnn.blobFromImage(face, 1.0, (227,227), MODEL_MEAN_VALUES, swapRB=False)
        genderNet.setInput(blob)
        genderPreds=genderNet.forward()
        gender=genderList[genderPreds[0].argmax()]
        print(f'Gender: {gender}')

        ageNet.setInput(blob)
        agePreds=ageNet.forward()
        age=ageList[agePreds[0].argmax()]
        print(f'Age: {age[1:-1]} years')

        cv2.putText(resultImg, f'{gender}, {age}', (faceBox[0], faceBox[1]-10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,255), 2, cv2.LINE_AA)
        cv2.imshow("Detecting age and gender", resultImg)


usage: ipykernel_launcher.py [-h] [--image IMAGE]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\ASUS\AppData\Roaming\jupyter\runtime\kernel-7bf329bb-10d5-459e-8172-09b445253612.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [17]:
import cv2
import numpy as np
import pyautogui

def detect_objects(image, net):
    height, width = image.shape[:2]
    blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)

    net.setInput(blob)
    layer_names = net.getLayerNames()
    output_layers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
    outputs = net.forward(output_layers)

    boxes = []
    confidences = []
    class_ids = []

    for output in outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]

            if confidence > 0.5:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indices = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

    return boxes, confidences, class_ids, indices

def main():
    net = cv2.dnn.readNetFromDarknet("yolov8.cfg", "yolov8.weights")

    classes = []
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f.readlines()]

    colors = np.random.uniform(0, 255, size=(len(classes), 3))

    cap = cv2.VideoCapture(0)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        boxes, confidences, class_ids, indices = detect_objects(frame, net)

        for i in indices:
            i = i[0]
            box = boxes[i]
            x, y, w, h = box[0], box[1], box[2], box[3]
            label = classes[class_ids[i]]

            cv2.rectangle(frame, (x, y), (x + w, y + h), colors[class_ids[i]], 2)
            cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, colors[class_ids[i]], 2)

        cv2.imshow("Object Detection", frame)

        key = cv2.waitKey(1)
        if key == 27:  # Press 'Esc' key to exit
            break
        elif key == ord('c'):  # Press 'c' key to select and remove object
            mouse_x, mouse_y = pyautogui.position()
            for i in indices:
                i = i[0]
                box = boxes[i]
                x, y, w, h = box[0], box[1], box[2], box[3]
                if x <= mouse_x <= x + w and y <= mouse_y <= y + h:
                    frame[y:y+h, x:x+w] = cv2.imread("new_background.jpg")

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

error: OpenCV(4.7.0) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\darknet\darknet_importer.cpp:210: error: (-212:Parsing error) Failed to open NetParameter file: yolov8.cfg in function 'cv::dnn::dnn4_v20221220::readNetFromDarknet'
