In [28]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# Load Faster R-CNN
rcnn_model = fasterrcnn_resnet50_fpn(pretrained=True)
rcnn_model.eval()


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [16]:
import sys
sys.path.append('yolov5')  # Add YOLOv5 path
import torch

# Load the YOLO model (pre-trained on COCO)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)  # Change 'yolov5s' to 'yolov5m' or 'yolov5l' for larger models
yolo_model.eval()



Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to C:\Users\subra/.cache\torch\hub\master.zip
YOLOv5  2024-10-2 Python-3.12.4 torch-2.4.0+cu124 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 4096MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|█████████████████████████████████████████████████████████████████████████████| 14.1M/14.1M [00:03<00:00, 3.90MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

In [18]:
import cv2
import numpy as np

def process_image(image_path):
    image = cv2.imread(image_path)
    
    # Run YOLO predictions
    yolo_results = yolo_model(image)
    
    # Run R-CNN predictions
    rcnn_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    rcnn_tensor = torch.from_numpy(rcnn_image).float().permute(2, 0, 1) / 255.0
    rcnn_tensor = rcnn_tensor.unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        rcnn_results = rcnn_model(rcnn_tensor)[0]

    # Merge predictions from both models
    final_results = merge_predictions(yolo_results, rcnn_results)
    
    # Annotate output
    annotated_image = annotate_output(image, final_results)
    
    return annotated_image



In [20]:
def merge_predictions(yolo_results, rcnn_results):
    final_detections = []

    # Process YOLO results
    for *box, conf, cls in yolo_results.xyxy[0]:  # YOLO results
        final_detections.append((box, int(cls.item()), conf.item()))

    # Process R-CNN results
    for box, label, score in zip(rcnn_results['boxes'], rcnn_results['labels'], rcnn_results['scores']):
        if score > 0.5:  # Only consider confident R-CNN detections
            final_detections.append((box, label.item(), score.item()))
    
    return final_detections


In [32]:
# COCO Class Names
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
    'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'TV', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]


In [34]:
def annotate_output(image, detections):
    for box, label, score in detections:
        box = list(map(int, box))  # Convert tensor to int
        cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 2)

        # Use class names from COCO_CLASSES
        if label < len(COCO_CLASSES):
            label_name = f"{COCO_CLASSES[label]}: {score:.2f}"  # Map label to class name
        else:
            label_name = f"Class {label}: {score:.2f}"

        cv2.putText(image, label_name, (box[0], box[1]-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
    return image


In [24]:
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        final_results = process_image(frame)
        
        cv2.imshow('Detection', final_results)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
            
    cap.release()
    cv2.destroyAllWindows()


In [40]:
# Process a single image
annotated_image = process_image('bird.jpg')
cv2.imshow('Annotated Image', annotated_image)
cv2.waitKey(0)
cv2.destroyAllWindows()




  with amp.autocast(autocast):


In [41]:
# Process a videoq
process_video(0)

KeyboardInterrupt: 