In [1]:
import torch
import pandas as pd
from PIL import Image
import numpy as np
import os
import clip
from ultralytics import YOLO
import faiss
import json
import cv2  # For video processing
from tqdm.notebook import tqdm
import collections

# Visualization
%matplotlib inline
import matplotlib.pyplot as plt

print("Libraries imported.")

# Device setup: prefer Apple's MPS backend, then CUDA, and fall back to CPU.
# `getattr` guards torch builds that do not expose `torch.backends.mps`.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
    print("Using MPS (Apple Silicon GPU)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("No GPU backend (MPS/CUDA) found, using CPU")

Libraries imported.
Using MPS (Apple Silicon GPU)


In [2]:
# --- Paths ---
# All locations are relative to the notebook's working directory.
MODELS_DIR = '../models/'
DATA_DIR = '../data/'
VIDEO_DIR = '../videos/'

# --- Load Models (YOLO and CLIP) ---
# YOLO supplies the bounding boxes and CLIP embeds the cropped detections
# in the video-processing functions defined later in this notebook.
yolo_model = YOLO(os.path.join(MODELS_DIR, 'best.pt')).to(device)
clip_model, preprocess = clip.load("ViT-B/32", device=device)
print("YOLO and CLIP models loaded.")

# --- Load FAISS Index and Mapping ---
index = faiss.read_index(os.path.join(MODELS_DIR, "catalog_index.faiss"))
# JSON text is UTF-8; do not rely on the platform's default file encoding.
with open(os.path.join(MODELS_DIR, "product_id_map.json"), 'r', encoding='utf-8') as f:
    product_id_map = json.load(f)
print(f"FAISS index loaded with {index.ntotal} vectors.")

# Search results are positions in the index that are looked up in
# product_id_map, so a size mismatch means lookups can fail or hit the wrong product.
if len(product_id_map) != index.ntotal:
    print(
        f"Warning: product_id_map has {len(product_id_map)} entries but the "
        f"FAISS index has {index.ntotal} vectors; matches may be mislabeled."
    )

# --- Load Master Catalog to get Product Details ---
df_catalog = pd.read_csv(os.path.join(DATA_DIR, 'catalog_full.csv')).set_index('id')
print("Master catalog data loaded.")

YOLO and CLIP models loaded.
FAISS index loaded with 7922 vectors.
Master catalog data loaded.


In [3]:
def process_video_for_products(video_path, yolo_model, clip_model, preprocess, faiss_index, id_map, device, frame_rate=1):
    """
    Processes a video to find matching products from the catalog.

    Frames are sampled at roughly ``frame_rate`` frames per second. For every
    sampled frame the largest YOLO box is cropped, embedded with CLIP and
    searched in the FAISS index; up to 5 nearest catalog entries are recorded.

    :param video_path: Path of the video, passed to ``cv2.VideoCapture``.
    :param yolo_model: Detector called as ``yolo_model(image, verbose=False)``.
    :param clip_model: Model providing ``encode_image``.
    :param preprocess: Callable turning a PIL crop into a CLIP input tensor.
    :param faiss_index: Index queried with ``search(embedding, k)``.
    :param id_map: Lookup from FAISS result position to product ID. A list or a
        dict (with int or string keys, as produced by JSON) is accepted.
    :param device: Torch device the CLIP input tensor is moved to.
    :param frame_rate: How many frames per second to process. Must be positive.
    :return: A list of ``{'id': ..., 'similarity': ...}`` dicts, one per match.
    :raises ValueError: If ``frame_rate`` is not positive.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate}")

    k = 5  # Number of nearest neighbors to find
    detected_products = []

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            print(f"Warning: Could not open video: {video_path}")
            return detected_products

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            print("Warning: Could not determine video FPS. Assuming 30.")
            fps = 30
        # Clamp to 1 so a frame_rate above the video FPS cannot yield a zero
        # interval (which would make the modulo below divide by zero).
        frame_interval = max(1, int(fps / frame_rate))

        frame_index = -1
        while True:
            success, frame_bgr = vidcap.read()
            if not success:
                break  # End of video
            frame_index += 1

            # Process only at the desired frame rate
            if frame_index % frame_interval != 0:
                continue

            # Convert frame from BGR (OpenCV) to RGB (Pillow/CLIP)
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame_rgb)

            # 1. Detect with YOLO
            results = yolo_model(image, verbose=False)

            # 2. Keep the box with the largest area
            best_coords = None
            max_area = 0.0
            boxes = results[0].boxes
            if len(boxes) > 0:
                for box in boxes:
                    bx1, by1, bx2, by2 = box.xyxy[0].tolist()
                    area = (bx2 - bx1) * (by2 - by1)
                    if area > max_area:
                        max_area = area
                        best_coords = (bx1, by1, bx2, by2)
            if best_coords is None:
                continue

            # 3. Crop the detection; skip boxes that collapse to zero size
            #    once the coordinates are truncated to whole pixels.
            x1, y1, x2, y2 = map(int, best_coords)
            if x2 <= x1 or y2 <= y1:
                continue
            cropped_image = image.crop((x1, y1, x2, y2))

            # Get the L2-normalised CLIP embedding
            image_input = preprocess(cropped_image).unsqueeze(0).to(device)
            with torch.no_grad():
                image_features = clip_model.encode_image(image_input)
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # FAISS works on float32 vectors; the model may emit another float
            # dtype (e.g. half precision on GPU backends), so cast explicitly.
            embedding_np = image_features.float().cpu().numpy()

            # 4. Search FAISS
            distances, indices = faiss_index.search(embedding_np, k)

            # Store results
            for match_distance, match_index in zip(distances[0], indices[0]):
                match_index = int(match_index)
                if match_index < 0:
                    # FAISS pads with -1 when fewer than k neighbours exist;
                    # using it as an index would silently pick the last item.
                    continue
                # Convert L2 distance to a pseudo-similarity score (0-1)
                similarity = 1 / (1 + match_distance)

                if isinstance(id_map, dict):
                    # JSON objects have string keys; accept int keys as well.
                    product_id = id_map[match_index] if match_index in id_map else id_map[str(match_index)]
                else:
                    product_id = id_map[match_index]
                detected_products.append({'id': product_id, 'similarity': similarity})
    finally:
        # Always free the capture handle, even if a model call raises.
        vidcap.release()
    return detected_products

print("Video processing function is defined.")

Video processing function is defined.


In [4]:
import gc

def _match_frames_to_products(frames, yolo_model, clip_model, preprocess, faiss_index, id_map, device, k=5):
    """
    Runs detection, embedding and catalog search on one batch of frames.

    :param frames: List of PIL images to analyse together.
    :param k: Number of nearest neighbors requested from FAISS per frame.
    :return: A list of ``{'id': ..., 'similarity': ...}`` dicts for the batch.
    """
    matches = []
    if not frames:
        return matches

    # YOLO accepts a list of images and returns one result per image.
    results_batch = yolo_model(frames, verbose=False)

    for original_image, results in zip(frames, results_batch):
        # Keep the box with the largest area
        best_coords = None
        max_area = 0.0
        boxes = results.boxes
        if len(boxes) > 0:
            for box in boxes:
                bx1, by1, bx2, by2 = box.xyxy[0].tolist()
                area = (bx2 - bx1) * (by2 - by1)
                if area > max_area:
                    max_area = area
                    best_coords = (bx1, by1, bx2, by2)
        if best_coords is None:
            continue

        # Skip boxes that collapse to zero size after truncation to pixels.
        x1, y1, x2, y2 = map(int, best_coords)
        if x2 <= x1 or y2 <= y1:
            continue
        cropped_image = original_image.crop((x1, y1, x2, y2))

        image_input = preprocess(cropped_image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_features = clip_model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # FAISS works on float32 vectors; the model may emit another float
        # dtype (e.g. half precision on GPU backends), so cast explicitly.
        embedding_np = image_features.float().cpu().numpy()

        distances, indices = faiss_index.search(embedding_np, k)
        for match_distance, match_index in zip(distances[0], indices[0]):
            match_index = int(match_index)
            if match_index < 0:
                # FAISS pads with -1 when fewer than k neighbours exist.
                continue
            # Convert L2 distance to a pseudo-similarity score (0-1)
            similarity = 1 / (1 + match_distance)
            if isinstance(id_map, dict):
                # JSON objects have string keys; accept int keys as well.
                product_id = id_map[match_index] if match_index in id_map else id_map[str(match_index)]
            else:
                product_id = id_map[match_index]
            matches.append({'id': product_id, 'similarity': similarity})

    return matches


def process_video_in_batches(video_path, yolo_model, clip_model, preprocess, faiss_index, id_map, device, frame_rate=1, batch_size=10):
    """
    Processes a video in small batches to prevent memory-related kernel crashes.

    Sampled frames are buffered; whenever ``batch_size`` frames have been
    collected they are analysed together, the buffer is emptied and the
    garbage collector is run. Frames left over at the end of the video are
    processed as a final, smaller batch.

    :param video_path: Path of the video, passed to ``cv2.VideoCapture``.
    :param yolo_model: Detector called with a list of images.
    :param clip_model: Model providing ``encode_image``.
    :param preprocess: Callable turning a PIL crop into a CLIP input tensor.
    :param faiss_index: Index queried with ``search(embedding, k)``.
    :param id_map: Lookup from FAISS result position to product ID. A list or a
        dict (with int or string keys, as produced by JSON) is accepted.
    :param device: Torch device the CLIP input tensor is moved to.
    :param frame_rate: How many frames per second to sample from the video.
        Must be positive.
    :param batch_size: How many frames to process at a time before clearing memory.
    :return: A list of detected product IDs and their similarity scores.
    :raises ValueError: If ``frame_rate`` is not positive.
    """
    if frame_rate <= 0:
        raise ValueError(f"frame_rate must be positive, got {frame_rate}")

    detected_products = []
    frame_buffer = []  # Holds the frames for one batch

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            print(f"Warning: Could not open video: {video_path}")
            return detected_products

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            print("Warning: Could not determine video FPS. Assuming 30.")
            fps = 30

        # Clamp to 1 so a frame_rate above the video FPS samples every frame.
        frame_interval = max(1, int(fps / frame_rate))

        frame_index = -1
        while True:
            success, frame_bgr = vidcap.read()
            if not success:
                break
            frame_index += 1
            if frame_index % frame_interval != 0:
                continue

            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            frame_buffer.append(Image.fromarray(frame_rgb))

            # When the batch is full, process it
            if len(frame_buffer) >= batch_size:
                print(f"--- Processing batch of {len(frame_buffer)} frames ---")
                detected_products.extend(
                    _match_frames_to_products(
                        frame_buffer, yolo_model, clip_model, preprocess, faiss_index, id_map, device
                    )
                )

                # Memory cleanup: drop the frames (the YOLO results went out of
                # scope with the helper) and force a garbage-collection pass.
                frame_buffer.clear()
                gc.collect()
                print("--- Batch processed and memory cleared ---")

        # Process any leftover frames in the buffer after the loop finishes
        if frame_buffer:
            print(f"--- Processing final batch of {len(frame_buffer)} frames ---")
            detected_products.extend(
                _match_frames_to_products(
                    frame_buffer, yolo_model, clip_model, preprocess, faiss_index, id_map, device
                )
            )
            frame_buffer.clear()
    finally:
        # Always free the capture handle, even if a model call raises.
        vidcap.release()

    print("Video processing complete.")
    return detected_products

In [6]:
# --- Choose a video to test ---
test_video_name = '2025-05-28_13-42-32_UTC.mp4'
test_video_path = os.path.join(VIDEO_DIR, test_video_name)

# Start from an empty result so a missing file or a failed run never leaves
# detections from an earlier execution of this cell in scope.
all_detections = []

if os.path.exists(test_video_path):
    print(f"Processing video: {test_video_name} with batching...")

    # --- Run the batched pipeline ---
    all_detections = process_video_in_batches(
        video_path=test_video_path,
        yolo_model=yolo_model,
        clip_model=clip_model,
        preprocess=preprocess,
        faiss_index=index,
        id_map=product_id_map,
        device=device,
        frame_rate=1,  # Sample 1 frame per second of video
        batch_size=1   # Process one frame at a time before clearing memory
    )
    # --------------------------------

    print(f"Finished processing. Found {len(all_detections)} potential matches across all frames.")
else:
    print(f"Video file not found at {test_video_path}. Please check the file name and location.")

# --- Preview the first matches as a table ---
if all_detections:
    df_results = pd.DataFrame(all_detections)
    display(df_results.head(10))

--- STARTING YOLO-ONLY DEBUG RUN ---


YOLO Debug Run:   0%|          | 0/25 [00:00<?, ?it/s]

--- YOLO DEBUG COMPLETE ---
Processed video without crashing. Total boxes found: 32
