In [4]:
!pip install transformers torchvision



Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ------------------------ --------------- 6.3/10.5 MB 32.2 MB/s eta 0:00:01
   ---------------------------------------- 10.5/10.5 MB 29.6 MB/s eta 0:00:00
Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 

In [21]:
import os
import json
import cv2
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# ===== Load CLIP model from Hugging Face =====
# The model weights and the preprocessor are both fetched from this one
# Hugging Face checkpoint identifier.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


# ===== Generate CLIP embedding for one image =====
def get_image_embedding(image_path_or_pil):
    """Compute the CLIP image embedding for a single image.

    Args:
        image_path_or_pil: Either a filesystem path to an image file (``str``
            or any ``os.PathLike`` such as ``pathlib.Path``), or an
            already-loaded image object, which is handed to ``clip_processor``
            unchanged.

    Returns:
        numpy.ndarray: The first row of ``clip_model.get_image_features`` for
        the image, moved to the CPU and converted to NumPy.
    """
    if isinstance(image_path_or_pil, (str, os.PathLike)):
        # Open via a context manager so the file handle is released as soon as
        # the pixels are decoded; convert() returns a separate in-memory image.
        with Image.open(image_path_or_pil) as opened:
            image = opened.convert("RGB")
    else:
        image = image_path_or_pil

    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
    # A single image was passed, so keep only the first (and only) row.
    return outputs[0].cpu().numpy()


# ===== Crop all detections for a given video =====
def crop_detected_objects(detection_json, frames_dir, output_dir):
    """Crop every detected object out of its frame and save it as a JPEG.

    The detection file is a JSON list of objects, each with a ``"frame"``
    (image file name inside ``frames_dir``) and a ``"detections"`` list whose
    items carry a ``"bbox"`` of four numbers. The box is interpreted as
    ``(center_x, center_y, width, height)`` in pixels and is clamped to the
    frame before cropping.

    Frames that cannot be read, boxes that fall entirely outside the frame
    (or have zero area), and crops that fail to be written are skipped with a
    printed warning instead of aborting the whole run.

    Args:
        detection_json: Path to the detection JSON file.
        frames_dir: Directory containing the frame images.
        output_dir: Directory the crops are written to (created if missing).

    Returns:
        list[str]: Paths of the crops that were actually written, named
        ``<frame stem>_obj_<detection index>.jpg``.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(detection_json, "r", encoding="utf-8") as f:
        detections = json.load(f)

    crop_paths = []

    for frame_obj in detections:
        frame_name = frame_obj["frame"]
        frame_path = os.path.join(frames_dir, frame_name)
        image = cv2.imread(frame_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising.
            print(f"⚠️ Skipping unreadable frame: {frame_path}")
            continue

        img_h, img_w = image.shape[:2]
        # splitext handles extensions of any length (".jpg", ".jpeg", ...).
        frame_stem = os.path.splitext(frame_name)[0]

        for i, det in enumerate(frame_obj["detections"]):
            x, y, w, h = map(int, det["bbox"])
            # Clamp both corners into [0, size]; a negative upper bound would
            # otherwise wrap around in the NumPy slice and yield a wrong crop.
            x1 = min(max(int(x - w / 2), 0), img_w)
            y1 = min(max(int(y - h / 2), 0), img_h)
            x2 = min(max(int(x + w / 2), 0), img_w)
            y2 = min(max(int(y + h / 2), 0), img_h)

            if x2 <= x1 or y2 <= y1:
                print(f"⚠️ Skipping empty box {i} in {frame_name}: {det['bbox']}")
                continue

            crop = image[y1:y2, x1:x2]
            # The detection index is kept in the name even when earlier boxes
            # were skipped, so file names stay aligned with the JSON.
            crop_path = os.path.join(output_dir, f"{frame_stem}_obj_{i}.jpg")
            if not cv2.imwrite(crop_path, crop):
                print(f"⚠️ Failed to write crop: {crop_path}")
                continue

            crop_paths.append(crop_path)

    print(f"✅ Cropped {len(crop_paths)} objects")
    return crop_paths


# ====== MAIN RUN SECTION ======
if __name__ == "__main__":
    # Identifier of the video to process -- edit this to match your own data.
    video_id = "2025-06-02_11-31-19_UTC"

    # Paths are relative to the working directory, which is expected to be
    # 'outputs/': the detection JSON sits next to the notebook, while frames
    # and cropped objects live one directory up.
    output_dir = f"../cropped_objects/{video_id}"
    frames_dir = f"../frames/{video_id}"
    detection_json = f"{video_id}.json"

    os.makedirs(output_dir, exist_ok=True)

    # Step 1: write one cropped image per detection.
    crops = crop_detected_objects(detection_json, frames_dir, output_dir)

    # Step 2: smoke-test the embedding pipeline on the first crop only.
    if len(crops) > 0:
        emb = get_image_embedding(crops[0])
        print("✅ Sample embedding shape:", emb.shape)
        print("🔢 Sample embedding (first 5 values):", emb[:5])


✅ Cropped 374 objects
✅ Sample embedding shape: (512,)
🔢 Sample embedding (first 5 values): [-0.00303599  0.16039643 -0.5945467   0.24727184 -0.15835276]
