Using Yolov5 Model

In [2]:
from ultralytics import YOLO
import cv2

# Load the YOLOv5 model
model = YOLO('yolov5s.pt')  # YOLOv5s is a small pre-trained model

# Open a video file or real-time camera
cap = cv2.VideoCapture(0)  # Use 0 for webcam or provide a video path
while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Perform object detection
    results = model.predict(source=frame, save=False)

    # Draw bounding boxes
    annotated_frame = results[0].plot()

    # Display the frame
    cv2.imshow('YOLOv5 Detection', annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break


cap.release()
cv2.destroyAllWindows()



PRO TIP  Replace 'model=yolov5s.pt' with new 'model=yolov5su.pt'.
YOLOv5 'u' models are trained with https://github.com/ultralytics/ultralytics and feature improved performance vs standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.


0: 480x640 1 person, 147.1ms
Speed: 0.0ms preprocess, 147.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 145.6ms
Speed: 0.0ms preprocess, 145.6ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 125.7ms
Speed: 0.0ms preprocess, 125.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 117.0ms
Speed: 0.0ms preprocess, 117.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 115.7ms
Speed: 0.0ms preprocess, 115.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 couch, 118.1ms
Speed: 2.3ms preprocess, 118.1ms inference, 0.0ms postprocess per

Using SSD Model

In [3]:
import cv2
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F, ToTensor
import numpy as np

# COCO class names
COCO_CLASSES = [
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign", "parking meter",
    "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "N/A", "backpack", "umbrella", "N/A", "N/A", "handbag", "tie", "suitcase",
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "N/A", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
    "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
    "bed", "N/A", "dining table", "N/A", "N/A", "toilet", "N/A", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "N/A", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
]

# Load the Faster R-CNN model
model = fasterrcnn_resnet50_fpn(pretrained=True).eval()

# Helper function to draw labels
def draw_label(frame, label, x1, y1):
    font_scale = 0.5
    font = cv2.FONT_HERSHEY_SIMPLEX
    thickness = 1
    color = (0, 255, 0)
    size = cv2.getTextSize(label, font, font_scale, thickness)[0]
    cv2.rectangle(frame, (x1, y1 - size[1]), (x1 + size[0], y1), color, cv2.FILLED)
    cv2.putText(frame, label, (x1, y1 - 5), font, font_scale, (0, 0, 0), thickness)

# Open video source
cap = cv2.VideoCapture(0)  # Use 0 for webcam or provide a video file path

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Convert frame to tensor
    image_tensor = ToTensor()(frame).unsqueeze(0)

    # Perform object detection
    with torch.no_grad():
        outputs = model(image_tensor)

    # Extract detections
    detections = outputs[0]
    boxes = detections['boxes'].cpu().numpy()
    scores = detections['scores'].cpu().numpy()
    labels = detections['labels'].cpu().numpy()
    

    # Loop through detected objects
    for i, box in enumerate(boxes):
        if scores[i] > 0.5:  # Confidence threshold
            x1, y1, x2, y2 = box.astype(int)
            label = COCO_CLASSES[labels[i]]  # Class name
            confidence = f"{scores[i]:.2f}"  # Confidence score
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # Draw box
            draw_label(frame, f"{label} ({confidence})", x1, y1)  # Add label

    # Display the annotated frame
    cv2.imshow('Faster R-CNN Detection (COCO)', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


