## YOLOv11 + TinySAM Video Segmentation (Box Prompting)
Processes video frames with YOLOv11 for detection and TinySAM for segmentation using bounding box prompts.  
Saves:
- detection frames  
- segmentation masks  
- overlays  
Writes the final annotated video and reports the combined model size, FPS, average per-frame processing time, and peak GPU memory usage.


In [None]:
import os
import cv2
import time
import numpy as np
import torch
from PIL import Image
from ultralytics import YOLO

import sys
sys.path.append("..")
from tinysam import sam_model_registry, SamPredictor


# User-supplied settings; fill in the empty paths before running.
VIDEO_PATH   = ""   # Path to the input video (passed to cv2.VideoCapture)
YOLO_MODEL   = ""   # Path to YOLO model weights
TINY_CKPT    = ""   # Path to the TinySAM checkpoint (e.g. tinysam_42.3.pth)
OUT_DIR      = ""   # Output directory for results
IMG_SIZE     = 960  # Passed to YOLO predict as imgsz
CONF_THRESH  = 0.3  # Passed to YOLO predict as conf (confidence threshold)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the YOLO detector and place it on the selected device.
yolo_model = YOLO(YOLO_MODEL)
yolo_model = yolo_model.to(DEVICE)

# Build the "vit_t" TinySAM network from its checkpoint, move it to the same
# device, and wrap it in a predictor.
build_tinysam = sam_model_registry["vit_t"]
sam = build_tinysam(checkpoint=TINY_CKPT)
sam.to(device=DEVICE)
predictor = SamPredictor(sam)

# One sub-directory of OUT_DIR per kind of per-frame output.
mask_dir, overlay_dir, det_dir = (
    os.path.join(OUT_DIR, subdir_name)
    for subdir_name in ("masks", "overlays", "detected_fires")
)

# Create them up front; directories that already exist are left alone.
for out_subdir in (mask_dir, overlay_dir, det_dir):
    os.makedirs(out_subdir, exist_ok=True)

# Count the parameters of each network.
yolo_weights = yolo_model.model.parameters()
param_count_yolo = sum(weight.numel() for weight in yolo_weights)
sam_weights = sam.parameters()
param_count_tinysam = sum(weight.numel() for weight in sam_weights)

# Combined size estimate: 4 bytes per parameter, expressed in MiB.
total_param_count = param_count_yolo + param_count_tinysam
model_size_mb = total_param_count * 4 / (1024**2)

print(f"Model Size: {model_size_mb:.2f} MB")
print(f"YOLO Parameters: {param_count_yolo:,}")
print(f"TinySAM Parameters: {param_count_tinysam:,}")

# Open the input video and read the properties needed to mirror it on output.
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    # Fail early with a clear message instead of silently processing zero
    # frames and hitting a division by zero in the final report.
    raise RuntimeError(f"Could not open input video: {VIDEO_PATH!r}")

fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some backends report 0 (or NaN) when the frame rate is unknown; a writer
    # created with such a rate is unusable, so fall back to a common default.
    fps = 30.0
frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# The annotated video is written as XVID-encoded AVI at the source resolution.
fourcc = cv2.VideoWriter_fourcc(*"XVID")
out_video_path = os.path.join(OUT_DIR, "final_output.avi")
out_writer = cv2.VideoWriter(out_video_path, fourcc, fps, (frame_w, frame_h))
if not out_writer.isOpened():
    # Do not leave the capture handle open when the writer cannot be created.
    cap.release()
    raise RuntimeError(f"Could not open output video for writing: {out_video_path!r}")

# Running totals accumulated by the frame loop and used in the final report.
frame_count = 0
total_time = 0.0

# Reset the CUDA peak-memory counter so the reported peak covers only the
# processing that follows. Skipped on CPU-only machines, where the CUDA call
# would raise even though the script otherwise supports running on the CPU.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# Per-frame pipeline: detect with YOLO, segment every detected box with
# TinySAM, save the detection image, the merged mask and the overlay, and
# append the overlay to the output video. The try/finally guarantees that the
# capture and writer handles are released even if a frame fails.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        frame_name = f"{frame_count:05d}.png"
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img_pil = Image.fromarray(img_rgb)

        # The timed span covers detection, segmentation, and writing the
        # outputs. perf_counter is monotonic, unlike wall-clock time.time().
        start_time = time.perf_counter()

        results = yolo_model.predict([img_pil], imgsz=IMG_SIZE, conf=CONF_THRESH, verbose=False)
        boxes_xyxy = results[0].boxes.xyxy.cpu().numpy()
        confs = results[0].boxes.conf.cpu().numpy()

        # Detection visualisation: green box plus confidence label.
        det_vis = img_rgb.copy()
        for box, conf in zip(boxes_xyxy, confs):
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(det_vis, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Clamp the label so boxes near the left edge keep a visible label.
            cv2.putText(det_vis, f"{conf:.2f}", (max(x1 - 50, 0), y1 + 15),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
        cv2.imwrite(os.path.join(det_dir, frame_name),
                    cv2.cvtColor(det_vis, cv2.COLOR_RGB2BGR))

        # Union of the masks of all detections in this frame (values 0/1).
        final_mask = np.zeros(img_rgb.shape[:2], dtype=np.uint8)
        if len(boxes_xyxy) > 0:
            # Only set the image on the predictor when there is something to
            # segment; frames without detections keep an all-zero mask.
            predictor.set_image(img_rgb)
            for box in boxes_xyxy:
                masks, scores, _logits = predictor.predict(
                    point_coords=None,
                    point_labels=None,
                    box=box[None, :],
                )
                # Several candidate masks may be returned; keep the one with
                # the highest score rather than whichever comes first.
                best_mask = masks[int(np.argmax(scores))].astype(np.uint8)
                final_mask = np.maximum(final_mask, best_mask)

        cv2.imwrite(os.path.join(mask_dir, frame_name), final_mask * 255)

        # Blend 40% red into the masked pixels; other pixels stay unchanged.
        red = np.zeros_like(img_rgb)
        red[:] = (255, 0, 0)
        blended = cv2.addWeighted(img_rgb, 0.6, red, 0.4, 0)
        overlay = np.where(final_mask[..., None] == 1, blended, img_rgb)
        overlay_bgr = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(overlay_dir, frame_name), overlay_bgr)

        out_writer.write(overlay_bgr)

        elapsed = time.perf_counter() - start_time
        total_time += elapsed
        print(f"[{frame_count}] Frame processed in {elapsed*1000:.2f} ms")
finally:
    cap.release()
    out_writer.release()

# Final efficiency report. Guard the averages so an unreadable or empty video
# yields a clear message instead of a ZeroDivisionError.
if frame_count > 0 and total_time > 0:
    avg_time_ms = (total_time / frame_count) * 1000
    fps_actual = frame_count / total_time
else:
    print("Warning: no frames were processed; timing statistics are reported as 0.")
    avg_time_ms = 0.0
    fps_actual = 0.0

# Peak CUDA memory is only queried when CUDA is present; CPU-only runs report 0.
if torch.cuda.is_available():
    peak_mem_mb = torch.cuda.max_memory_allocated() / (1024**2)
else:
    peak_mem_mb = 0.0

print("\n--- Computational Efficiency ---")
print(f"Average Inference Time: {avg_time_ms:.2f} ms/frame")
print(f"FPS: {fps_actual:.2f}")
print(f"Peak GPU Memory: {peak_mem_mb:.2f} MB")
print(f"Model Size: {model_size_mb:.2f} MB")
print(f"Output video saved to: {out_video_path}")
print(f"Detection images, masks, and overlays saved to: {OUT_DIR}")
