In [11]:
# Install the ultralytics library if not already installed
!pip install ultralytics

from ultralytics import YOLO

# Load YOLOv8 model (nano version, replace with desired version if needed)
model = YOLO('yolo11m.pt') # You can use yolov8n.pt, yolov8s.pt, yolov8m.pt, etc.

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11m.pt to 'yolo11m.pt'...


100%|██████████| 38.8M/38.8M [00:00<00:00, 248MB/s]


In [12]:
!pip install -q segment_anything
!pip install -q supervision

In [14]:
import cv2
import torch
import numpy as np
from segment_anything import SamPredictor, sam_model_registry
from ultralytics import YOLO
import matplotlib.pyplot as plt
from google.colab import files

# Download SAM checkpoint if not already downloaded
!wget -nc 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
sam_checkpoint = "sam_vit_h_4b8939.pth"
model_type = 'vit_h'

# Set device to CPU
device = 'cuda'

# Load YOLOv8 model
model = YOLO('yolo11m.pt')

File ‘sam_vit_h_4b8939.pth’ already there; not retrieving.



In [15]:
# Build the SAM network for the configured variant from its checkpoint,
# move it to the selected device, and wrap it in a predictor for box-prompted inference.
sam_builder = sam_model_registry[model_type]
sam = sam_builder(checkpoint=sam_checkpoint).to(device)
mask_predictor = SamPredictor(sam)

# Open video file
video_path = "/content/video70.mp4"  # Provide your input video path
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Raise instead of calling exit(): in a notebook exit() shuts down the kernel,
    # discarding the loaded models, whereas an exception just stops this cell.
    raise OSError(f"Cannot open video file {video_path}")

# Define output video writer
output_video_path = "output_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the output
# play slower than the source. Some containers report 0 (unknown) for FPS, which
# would produce an unplayable file, so fall back to a conventional 30 fps.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    fps = 30.0

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    # VideoWriter does not raise on failure; without this check every write()
    # would be silently dropped and the output file would be empty.
    cap.release()
    raise OSError(f"Cannot open video writer for {output_video_path}")

# Process the video frame by frame: detect objects with YOLO11, segment each detected
# box with SAM, and write the frame with the segmented pixels tinted green.
# try/finally guarantees the capture and writer are released (and the output file
# finalized) even if a frame raises part-way through the video.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # end of stream or read failure

        # OpenCV decodes frames as BGR; the RGB copy is what gets handed to SAM
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Run YOLO11 on the frame
        results = model.predict(source=frame, conf=0.25, verbose=False)
        predicted_boxes = results[0].boxes.xyxy.cpu().numpy()  # (x1, y1, x2, y2) per detection

        # Check if YOLO11 found any boxes
        if len(predicted_boxes) == 0:
            # Write the original frame if no objects are detected
            out.write(frame)
            continue

        # Transform the YOLO11 predicted boxes into SAM's input coordinate frame
        transformed_boxes = mask_predictor.transform.apply_boxes_torch(
            torch.tensor(predicted_boxes, device=device),
            frame.shape[:2]
        )

        # Run SAM on the frame, prompted only by the boxes (one mask per box)
        mask_predictor.set_image(frame_rgb)
        masks, scores, logits = mask_predictor.predict_torch(
            boxes=transformed_boxes,
            multimask_output=False,
            point_coords=None,
            point_labels=None
        )

        # Combine masks: union of every box's first mask, reduced on the device so
        # only a single array is copied back to the host instead of one per detection.
        final_mask = masks[:, 0].any(dim=0).cpu().numpy().astype(np.uint8)

        # Build a pure-green overlay covering the segmented pixels
        mask_overlay = np.zeros_like(frame, dtype=np.uint8)
        mask_overlay[:, :, 1] = final_mask * 255  # Green channel

        # Blend the overlay with the frame, then keep the blend only where the mask is set.
        # Blending the whole frame would darken the background to 70% on frames with
        # detections while detection-free frames are written untouched, causing flicker.
        tinted_frame = cv2.addWeighted(frame, 0.7, mask_overlay, 0.3, 0)
        blended_frame = np.where(final_mask[:, :, None].astype(bool), tinted_frame, frame)

        # Write the frame to the output video
        out.write(blended_frame)
finally:
    # Release resources
    cap.release()
    out.release()