In [None]:
'''YOLO (You Only Look Once) is one of the most widely used object detection algorithms
today. This exercise will explore its usage with a webcam to enable real-time object
detection.'''

In [1]:
#Set up the environment. OpenCV and YOLO need to be installed.
!pip install opencv-python



In [2]:
#We install the ultralytics library that makes working with YOLO very easy and hassle-free.
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.19-py3-none-any.whl.metadata (40 kB)
     ---------------------------------------- 0.0/40.7 kB ? eta -:--:--
     ------------------- ------------------ 20.5/40.7 kB 640.0 kB/s eta 0:00:01
     ------------------- ------------------ 20.5/40.7 kB 640.0 kB/s eta 0:00:01
     ---------------------------- --------- 30.7/40.7 kB 217.9 kB/s eta 0:00:01
     -------------------------------------- 40.7/40.7 kB 177.1 kB/s eta 0:00:00
Collecting torchvision>=0.9.0 (from ultralytics)
  Downloading torchvision-0.18.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading ultralytics-8.2.19-py3-none-any.whl (757 kB)
   ---------------------------------------- 0.0/757.9 kB ? eta -:--:--
   --- ------------------------------------ 61.4/757.9 kB 1.7 MB/s eta 0:00:01
   ------ --------------------------------- 122.9/757.9 kB 1.2 MB/s 

In [2]:
# Load the YOLO model through the ultralytics library, using the weights
# file stored at yolo-Weights/yolov8n.pt.
from ultralytics import YOLO

model = YOLO(
    "yolo-Weights/yolov8n.pt"
)

In [3]:
# Human-readable labels for the 80 COCO classes, indexed by the integer class
# id that the detector returns for each box. The spellings follow the names
# that the pretrained YOLOv8 COCO model itself reports (e.g. "wine glass",
# "motorcycle", "couch"), so the labels drawn on the frame agree with the
# per-frame summary the model logs.
classNames = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]


In [4]:
import cv2
import math

# Open the webcam at device index 0 and ask for a 640x480 frame size.
# CAP_PROP_FRAME_WIDTH / CAP_PROP_FRAME_HEIGHT are the named forms of the
# numeric property ids 3 and 4.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

True

In [None]:
# Main detection loop: read each frame from the webcam with cap.read(), pass it
# to the YOLO model for object detection (the detections are stored in the
# `results` variable), draw every detection on the frame, and show the frame in
# a window. Press 'q' in the window to stop.

# The setup cell opens device 0 as well. Release that handle first so two
# captures are not held on the same camera at once (NOTE(review): some camera
# backends may refuse a second open while the first is still held).
if "cap" in globals():
    cap.release()

# Start webcam
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Give up after this many consecutive failed reads instead of spinning forever
# on a camera that has gone away.
MAX_FAILED_READS = 30
failed_reads = 0

# Text style for the class label; constant for every box, so set it once.
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 1
color = (255, 0, 0)
thickness = 2

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device 0)")

    while True:
        success, img = cap.read()

        # Check if frame is read correctly; retry a bounded number of times.
        if not success:
            print("Failed to capture image")
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                break
            continue
        failed_reads = 0

        results = model(img, stream=True)

        for r in results:
            boxes = r.boxes

            for box in boxes:
                # Bounding box: integer pixel corners, drawn onto the frame
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

                # Confidence, rounded up to two decimal places
                confidence = math.ceil((box.conf[0] * 100)) / 100
                print("Confidence --->", confidence)

                # Class name looked up from the integer class id
                cls = int(box.cls[0])
                print("Class name -->", classNames[cls])

                # Label the box at its top-left corner
                org = (x1, y1)
                cv2.putText(img, classNames[cls], org, font, fontScale, color, thickness)

        cv2.imshow('Webcam', img)

        # Mask to the low byte so the comparison also works where waitKey
        # returns extra high bits alongside the key code.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release resources even if the loop is interrupted or raises, so the
    # camera is freed and the preview window is closed.
    cap.release()
    cv2.destroyAllWindows()




0: 480x640 3 persons, 1 chair, 482.7ms
Confidence ---> 0.89
Class name --> person
Confidence ---> 0.48
Class name --> person
Confidence ---> 0.44
Class name --> chair
Confidence ---> 0.34
Class name --> person
Speed: 24.5ms preprocess, 482.7ms inference, 2921.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 chair, 292.3ms
Confidence ---> 0.85
Class name --> person
Confidence ---> 0.6
Class name --> chair
Confidence ---> 0.56
Class name --> person
Confidence ---> 0.55
Class name --> person
Speed: 13.1ms preprocess, 292.3ms inference, 6.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 chair, 254.2ms
Confidence ---> 0.78
Class name --> person
Confidence ---> 0.54
Class name --> chair
Confidence ---> 0.44
Class name --> person
Confidence ---> 0.43
Class name --> person
Speed: 5.7ms preprocess, 254.2ms inference, 5.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 1 wine glass, 1 chair, 242.8ms
Confidence --->