In [1]:
from ultralytics import YOLO
import cv2
import math
import os

# Function to draw bounding boxes, labels, and confidence scores
def draw_boxes(img, results, class_names, objects_to_detect, confidence_threshold=0.5):
    """Draw a box and label on ``img`` for every wanted detection in ``results``.

    Args:
        img: Image handed to ``cv2.rectangle`` / ``cv2.putText``; it is
            annotated in place.
        results: Iterable of result objects. Each must expose ``boxes.data``,
            an iterable of rows indexed as
            ``x1, y1, x2, y2, confidence, class index``.
        class_names: Labels indexed by class index.
        objects_to_detect: Labels that should be drawn and reported.
        confidence_threshold: A detection is kept only when its raw
            confidence is strictly greater than this value.

    Returns:
        True if at least one detection was drawn, otherwise False.
    """
    object_detected = False

    for r in results:
        for box in r.boxes.data:
            class_index = int(box[5])

            # Skip (and report) class ids the label list cannot name.
            if class_index >= len(class_names):
                print(f"Detected class index out of range: {class_index}")
                continue

            # Compare the raw score with the threshold. Comparing the
            # rounded-up display value would let scores below the threshold
            # through (e.g. 0.501 -> 0.51 passing a 0.505 threshold).
            if not float(box[4]) > confidence_threshold:
                continue

            label = class_names[class_index]
            if label not in objects_to_detect:
                continue

            object_detected = True
            x1, y1, x2, y2 = map(int, box[:4])
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

            # The label shows the score rounded up to two decimals.
            shown_confidence = math.ceil(box[4] * 100) / 100
            cv2.putText(
                img,
                f"{label} {shown_confidence:.2f}",
                (x1, y1),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (255, 0, 0),
                2,
            )

    return object_detected

# Open camera index 0 and ask for 640x480 frames.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Load the detector from the local weights file.
model = YOLO("yolo-Weights/yolov8n.pt")

# object classes
# Entry i must be the label for class id i reported by the model. The
# pretrained yolov8n weights predict the 80 COCO classes, so this list has
# exactly 80 entries in COCO order; inserting extra labels shifts every later
# id onto the wrong name.
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

# NOTE(review): "mango", "potato" and "tomato" are not among the labels above,
# so they can never match until a model trained on those classes is used.
objects_to_detect = ["apple", "banana", "mango", "potato", "tomato"]

# Process frames until a wanted object is captured, the camera stops
# delivering frames, or the user presses 'q'. The camera and the window are
# released even if detection or drawing raises.
try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame was returned; do not hand an empty frame to the model.
            print("Failed to read a frame from the webcam.")
            break

        results = model(img, stream=True)
        object_detected = draw_boxes(img, results, classNames, objects_to_detect)

        cv2.imshow('Webcam', img)

        # Auto capture when an object is detected
        if object_detected:
            capture_path = "captured_images"
            os.makedirs(capture_path, exist_ok=True)
            capture_file = os.path.join(capture_path, "object_capture.jpg")
            # imwrite reports failure through its return value, not an exception.
            if cv2.imwrite(capture_file, img):
                print(f"Object detected! Image captured: {capture_file}")
            else:
                print(f"Object detected, but the image could not be written: {capture_file}")

            break

        if cv2.waitKey(1) == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

0: 480x640 1 person, 2 bottles, 58.0ms
Speed: 2.4ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 50.1ms
Speed: 2.2ms preprocess, 50.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 48.4ms
Speed: 2.4ms preprocess, 48.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 1 person, 2 bottles, 46.6ms
Speed: 1.5ms preprocess, 46.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 44.1ms
Speed: 1.3ms preprocess, 44.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 45.7ms
Speed: 1.1ms preprocess, 45.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 45.9ms
Speed: 1.3ms preprocess, 45.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)
0: 480x640 2 persons, 2 bottles, 42.8ms
Spe