## YOLOv11 + SAM2 Video Segmentation Pipeline
Processes video frames using YOLOv11 for detection and SAM2 for segmentation, saving:
- detection frames  
- segmentation masks  
- overlay frames  
Outputs a final annotated video and reports model size, FPS, inference time, and GPU memory usage.


In [None]:
import os
import cv2
import time
import glob
import numpy as np
import torch
from ultralytics import YOLO
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from PIL import Image

# ---- User configuration: fill these in before running ----
VIDEO_PATH = ""     # Input video source handed to cv2.VideoCapture
OUT_DIR = ""        # Root folder that receives every output artefact
YOLO_MODEL = ""     # YOLO weights file
SAM2_CFG = ""       # SAM2 (large) config file
SAM2_WEIGHTS = ""   # SAM2 (large) checkpoint file
IMG_SIZE = 960      # Inference image size
CONF_THRESH = 0.3   # Minimum detection confidence

# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One sub-folder per artefact type: binary masks, blended overlays, and
# frames annotated with detection boxes.
mask_dir, overlay_dir, det_dir = (
    os.path.join(OUT_DIR, sub_name)
    for sub_name in ('masks', 'overlays', 'detected_fires')
)
for _artefact_dir in (mask_dir, overlay_dir, det_dir):
    os.makedirs(_artefact_dir, exist_ok=True)

def load_sam(cfg, ckpt, device):
    """Build a SAM2 model and wrap it in an image predictor.

    Args:
        cfg: SAM2 config, forwarded to ``build_sam2``.
        ckpt: SAM2 checkpoint, forwarded to ``build_sam2``.
        device: Device the model is built on.

    Returns:
        A ``SAM2ImagePredictor`` wrapping the freshly built model.
    """
    return SAM2ImagePredictor(build_sam2(cfg, ckpt, device=device))


# Instantiate both networks on the selected device.
yolo_model = YOLO(YOLO_MODEL).to(DEVICE)
sam_predictor = load_sam(SAM2_CFG, SAM2_WEIGHTS, DEVICE)

# Parameter counts per network plus the combined weight footprint in MiB.
# Each tensor contributes numel * element_size() bytes, so the figure stays
# correct when weights are not stored as 4-byte float32 (e.g. fp16 / bf16).
_yolo_params = list(yolo_model.model.parameters())
_sam2_params = list(sam_predictor.model.parameters())
param_count_yolo = sum(p.numel() for p in _yolo_params)
param_count_sam2 = sum(p.numel() for p in _sam2_params)
model_size_mb = sum(
    p.numel() * p.element_size() for p in _yolo_params + _sam2_params
) / (1024**2)
print(f"Model Size: {model_size_mb:.2f} MB")
print(f"YOLO Parameters: {param_count_yolo:,}")
print(f"SAM2 Parameters: {param_count_sam2:,}")

# Open the input video and fail fast if it cannot be read; otherwise the frame
# loop would run zero times and the summary would divide by zero.
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Could not open input video: {VIDEO_PATH!r}")

fourcc = cv2.VideoWriter_fourcc(*'XVID')
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some containers report 0 (or NaN) for FPS; a writer created with such a
    # rate cannot produce a playable file, so fall back to a common default.
    fps = 30.0
frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out_video_path = os.path.join(OUT_DIR, "final_output.avi")
out_writer = cv2.VideoWriter(out_video_path, fourcc, fps, (frame_w, frame_h))
if not out_writer.isOpened():
    # Per-frame PNG outputs are still useful, so warn instead of aborting.
    print(f"Warning: could not open video writer for {out_video_path}; "
          "no output video will be produced")

# Running totals used for the efficiency report.
frame_count = 0
total_time = 0.0
# DEVICE may have fallen back to the CPU; only touch CUDA statistics when a
# GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# Per-frame pipeline: YOLO detection -> SAM2 box-prompted masks -> red overlay.
# Each frame writes a detection image, a binary mask and an overlay image, and
# appends the overlay to the output video.
# NOTE: the timer below spans detection, segmentation, drawing and all disk
# writes for the frame, not just the model forward passes.
red = None   # solid-red RGB layer, allocated once and reused for every frame
alpha = 0.4  # blend weight of the red layer inside the segmentation mask

while True:
    ret, frame = cap.read()
    if not ret:
        break
    frame_count += 1
    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img_pil = Image.fromarray(img_rgb)

    start_time = time.time()

    # --- Detection ---
    results = yolo_model.predict([img_pil], imgsz=IMG_SIZE, conf=CONF_THRESH, verbose=False)
    boxes_xyxy = results[0].boxes.xyxy.cpu().numpy()
    confs = results[0].boxes.conf.cpu().numpy()

    det_vis = img_rgb.copy()
    for box, conf in zip(boxes_xyxy, confs):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(det_vis, (x1, y1), (x2, y2), (0, 255, 0), 2)
        conf_text = f"{conf:.2f}"
        # The label sits to the left of the box; clamp x so it is not drawn
        # off-frame for boxes close to the left image border.
        cv2.putText(det_vis, conf_text, (max(x1 - 50, 0), y1 + 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    cv2.imwrite(os.path.join(det_dir, f"{frame_count:05d}.png"),
                cv2.cvtColor(det_vis, cv2.COLOR_RGB2BGR))

    # --- Segmentation ---
    # Union of the per-box masks; stays all-zero when nothing was detected.
    final_mask = np.zeros(img_rgb.shape[:2], dtype=np.uint8)
    if len(boxes_xyxy) > 0:
        # Only hand the frame to SAM2 when there is at least one box prompt.
        sam_predictor.set_image(img_rgb)
        for box in boxes_xyxy:
            masks, _, _ = sam_predictor.predict(
                point_coords=None,
                point_labels=None,
                box=box[None, :],
                multimask_output=False,
            )
            final_mask = np.maximum(final_mask, masks[0].astype(np.uint8))

    cv2.imwrite(os.path.join(mask_dir, f"{frame_count:05d}.png"), final_mask * 255)

    # --- Overlay ---
    if red is None or red.shape != img_rgb.shape:
        red = np.zeros_like(img_rgb)
        red[:] = (255, 0, 0)
    # Blend towards red only where the mask is set; other pixels are untouched.
    overlay = np.where(final_mask[..., None] == 1,
                       cv2.addWeighted(img_rgb, 1 - alpha, red, alpha, 0),
                       img_rgb)
    overlay_bgr = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)
    cv2.imwrite(os.path.join(overlay_dir, f"{frame_count:05d}.png"), overlay_bgr)

    out_writer.write(overlay_bgr)

    elapsed = time.time() - start_time
    total_time += elapsed
    print(f"[{frame_count}] Frame processed in {elapsed*1000:.2f} ms")

# Release the video handles before reporting so the output file is finalized.
cap.release()
out_writer.release()

# Guard both divisions: an unreadable or empty video leaves frame_count and
# total_time at zero, which previously raised ZeroDivisionError here.
if frame_count == 0:
    print("Warning: no frames were processed; timing statistics are zero.")
avg_time_ms = (total_time / frame_count) * 1000 if frame_count > 0 else 0.0
fps_actual = frame_count / total_time if total_time > 0 else 0.0
# Peak allocation is only meaningful (and only queried) when CUDA is available.
peak_mem_mb = (
    torch.cuda.max_memory_allocated() / (1024**2)
    if torch.cuda.is_available()
    else 0.0
)

print(f"\n--- Computational Efficiency ---")
print(f"Average Inference Time: {avg_time_ms:.2f} ms/frame")
print(f"FPS: {fps_actual:.2f}")
print(f"Peak GPU Memory: {peak_mem_mb:.2f} MB")
print(f"Model Size: {model_size_mb:.2f} MB")
print(f"Output video saved to: {out_video_path}")
print(f"Overlay images, masks, and detections saved to: {OUT_DIR}")
