In [1]:
import os
import json
import cv2
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# ===== Load CLIP model from Hugging Face =====
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model weights and the matching pre-processor come from the same checkpoint;
# only the model is moved onto the selected device.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# ===== Generate CLIP embedding for one image =====
def get_image_embedding(image_path_or_pil):
    """Compute the CLIP image-feature vector for one image.

    Args:
        image_path_or_pil: Either a filesystem path (``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``) to an image file, or an
            already-loaded image object, which is handed to the processor
            unchanged.

    Returns:
        The first row of ``clip_model.get_image_features(...)`` copied to the
        CPU and converted to a NumPy array.
    """
    if isinstance(image_path_or_pil, (str, os.PathLike)):
        # Open inside a context manager so the file handle is released as soon
        # as the pixels are decoded; convert() returns an independent copy
        # that stays valid after the file is closed.
        with Image.open(image_path_or_pil) as opened:
            image = opened.convert("RGB")
    else:
        image = image_path_or_pil

    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
    # A single image was passed in, so take row 0 of the batch.
    return outputs[0].cpu().numpy()


# ===== Crop all detections for a given video =====
def crop_detected_objects(detection_json, frames_dir, output_dir):
    """Crop every detection listed in a detection JSON and save it as a JPEG.

    The JSON file must contain a list of objects of the form
    ``{"frame": <file name>, "detections": [{"bbox": [x, y, w, h]}, ...]}``.
    Each bbox is interpreted as center-x, center-y, width, height in pixels.

    Frames that cannot be read, boxes that lie entirely outside the frame
    (empty crop) and crops that fail to be written are skipped instead of
    raising or being counted.

    Args:
        detection_json: Path to the detection JSON file.
        frames_dir: Directory containing the frame images named in the JSON.
        output_dir: Directory the crops are written to (created if missing).

    Returns:
        List of the crop file paths that were written, in processing order.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(detection_json, "r") as f:
        detections = json.load(f)

    crop_paths = []
    skipped_frames = 0

    for frame_obj in detections:
        frame_name = frame_obj["frame"]
        frame_path = os.path.join(frames_dir, frame_name)
        image = cv2.imread(frame_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising.
            skipped_frames += 1
            continue

        img_h, img_w = image.shape[:2]
        # splitext handles extensions of any length (".jpg", ".jpeg", ...).
        stem = os.path.splitext(frame_name)[0]

        for i, det in enumerate(frame_obj["detections"]):
            x, y, w, h = map(int, det["bbox"])
            # Clamp every edge to [0, dimension]; a negative slice bound would
            # otherwise wrap around and select the wrong region.
            x1 = min(max(int(x - w / 2), 0), img_w)
            y1 = min(max(int(y - h / 2), 0), img_h)
            x2 = min(max(int(x + w / 2), 0), img_w)
            y2 = min(max(int(y + h / 2), 0), img_h)

            crop = image[y1:y2, x1:x2]
            if crop.size == 0:
                # Box has no overlap with the frame; nothing to save.
                continue

            crop_path = os.path.join(output_dir, f"{stem}_obj_{i}.jpg")
            # imwrite returns False on failure; only report files that exist.
            if cv2.imwrite(crop_path, crop):
                crop_paths.append(crop_path)

    if skipped_frames:
        print(f"⚠️ Skipped {skipped_frames} unreadable frame(s)")
    print(f"✅ Cropped {len(crop_paths)} objects")
    return crop_paths


# ====== MAIN RUN SECTION ======
if __name__ == "__main__":
    # Video to process; edit this to point at a different recording.
    video_id = "2025-05-22_08-25-12_UTC"  # change to your actual video ID

    # Per-video input and output locations derived from the ID.
    frames_dir = f"frames/{video_id}"
    detection_json = f"outputs/{video_id}.json"
    output_dir = f"cropped_objects/{video_id}"
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: write one crop per detection and collect the file paths.
    crops = crop_detected_objects(detection_json, frames_dir, output_dir)

    # Step 2: embed the first crop as a smoke test and show a preview.
    if len(crops) > 0:
        first_crop = crops[0]
        emb = get_image_embedding(first_crop)
        print("✅ Sample embedding shape:", emb.shape)
        print("🔢 Sample embedding (first 5 values):", emb[:5])


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ Cropped 3 objects
✅ Sample embedding shape: (512,)
🔢 Sample embedding (first 5 values): [-0.08477929 -0.47870237 -0.2567988  -0.34448606  0.22731078]


In [6]:
def crop_detected_objects(detection_json, frames_dir, output_dir, min_crop_size=50):
    """Crop fashion-class detections from video frames and save them as JPEGs.

    The JSON file must contain a list of objects of the form
    ``{"frame": <file name>, "detections": [{...}, ...]}``. A detection is
    kept only when its ``"label"`` (or, failing that, ``"class_name"``),
    lower-cased, is a member of the module-level ``FASHION_CLASSES``.

    NOTE(review): ``FASHION_CLASSES`` is not defined in this function; it must
    already exist in the notebook namespace when this is called - confirm the
    defining cell runs first.

    Bounding-box format is inferred from the values: a 4-element bbox whose
    values are all ``float`` is read as center ``(x, y, w, h)``; anything else
    is read as corners ``(x1, y1, x2, y2)`` using the first four values.
    NOTE(review): this type-based inference is fragile - verify it against
    the detector's actual output format.

    Missing or unreadable frames, detections without a usable bbox, crops
    smaller than ``min_crop_size`` in either dimension, and crops that fail
    to be written are skipped.

    Args:
        detection_json: Path to the detection JSON file.
        frames_dir: Directory containing the frame images named in the JSON.
        output_dir: Directory the crops are written to (created if missing).
        min_crop_size: Minimum crop height and width in pixels (default 50).

    Returns:
        List of the crop file paths that were written, in processing order.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(detection_json, "r") as f:
        detections = json.load(f)

    crop_paths = []
    for frame_obj in detections:
        frame_name = frame_obj["frame"]
        frame_path = os.path.join(frames_dir, frame_name)
        if not os.path.exists(frame_path):
            continue
        image = cv2.imread(frame_path)
        if image is None:
            # The file exists but could not be decoded; cv2.imread returns
            # None in that case instead of raising.
            continue

        img_h, img_w = image.shape[:2]
        # splitext handles extensions of any length (".jpg", ".jpeg", ...).
        stem = os.path.splitext(frame_name)[0]

        for i, det in enumerate(frame_obj["detections"]):
            # str() guards against null / non-string labels in the JSON.
            label = str(det.get("label", det.get("class_name", "unknown"))).lower()
            if label not in FASHION_CLASSES:
                continue

            bbox = det.get("bbox")
            if bbox is None or len(bbox) < 4:
                continue

            # If bbox is in center (x,y,w,h) format, convert to corners
            if len(bbox) == 4 and all(isinstance(v, float) for v in bbox):
                x, y, w, h = map(int, bbox)
                x1, y1 = x - w // 2, y - h // 2
                x2, y2 = x + w // 2, y + h // 2
            else:
                # Only the first four values are coordinates; any extra
                # trailing values are ignored.
                x1, y1, x2, y2 = map(int, bbox[:4])

            # Clamp every edge to [0, dimension]; a negative slice bound would
            # otherwise wrap around and select the wrong (or an empty) region.
            x1 = min(max(x1, 0), img_w)
            x2 = min(max(x2, 0), img_w)
            y1 = min(max(y1, 0), img_h)
            y2 = min(max(y2, 0), img_h)

            crop = image[y1:y2, x1:x2]
            if (
                crop.size == 0
                or crop.shape[0] < min_crop_size
                or crop.shape[1] < min_crop_size
            ):
                continue

            crop_path = os.path.join(output_dir, f"{stem}_obj_{i}.jpg")
            # imwrite returns False on failure; only report files that exist.
            if cv2.imwrite(crop_path, crop):
                crop_paths.append(crop_path)

    print(f"✅ Cropped {len(crop_paths)} valid objects")
    return crop_paths


In [7]:
# Batch driver: re-run this cell whenever crop_detected_objects is redefined.
base_frames_dir = "frames"
base_outputs_dir = "outputs"
base_crops_dir = "cropped_objects"

# Every sub-directory of the frames folder is treated as one video ID.
all_video_ids = list(
    filter(
        lambda name: os.path.isdir(os.path.join(base_frames_dir, name)),
        os.listdir(base_frames_dir),
    )
)

for video_id in sorted(all_video_ids):
    frames_dir = os.path.join(base_frames_dir, video_id)
    output_dir = os.path.join(base_crops_dir, video_id)
    detection_json = os.path.join(base_outputs_dir, f"{video_id}.json")

    if os.path.exists(detection_json):
        print(f"\n📂 Processing: {video_id}")
        os.makedirs(output_dir, exist_ok=True)
        crops = crop_detected_objects(detection_json, frames_dir, output_dir)
        print(f"🖼️ Crops saved for {video_id}: {len(crops)}")
    else:
        # No detector output for this video, so there is nothing to crop.
        print(f"⚠️ Skipping {video_id} (no detection JSON found)")



📂 Processing: 2025-05-22_08-25-12_UTC
✅ Cropped 0 valid objects
🖼️ Crops saved for 2025-05-22_08-25-12_UTC: 0

📂 Processing: 2025-05-27_13-46-16_UTC
✅ Cropped 39 valid objects
🖼️ Crops saved for 2025-05-27_13-46-16_UTC: 39

📂 Processing: 2025-05-28_13-40-09_UTC
✅ Cropped 55 valid objects
🖼️ Crops saved for 2025-05-28_13-40-09_UTC: 55

📂 Processing: 2025-05-28_13-42-32_UTC
✅ Cropped 62 valid objects
🖼️ Crops saved for 2025-05-28_13-42-32_UTC: 62

📂 Processing: 2025-05-31_14-01-37_UTC
✅ Cropped 23 valid objects
🖼️ Crops saved for 2025-05-31_14-01-37_UTC: 23

📂 Processing: 2025-06-02_11-31-19_UTC
✅ Cropped 194 valid objects
🖼️ Crops saved for 2025-06-02_11-31-19_UTC: 194
