Kaggle URL: [Siamese Distillation Dataset Creation](https://www.kaggle.com/code/phatle1578/siamese-distillation-dataset-creation)

In [None]:
# Install the pinned detector (ultralytics) and CLIP (open_clip_torch) packages.
# The two commented-out commands below are an alternative setup that first removes
# numpy / scipy / scikit-learn / tensorflow / keras and then re-installs pinned
# numpy / scipy / scikit-learn versions as well; they are currently disabled.
# !pip -q uninstall -y numpy scipy scikit-learn tensorflow keras
# !pip -q install ultralytics==8.3.27 numpy==1.26.4 scipy==1.11.4 scikit-learn==1.3.2 open_clip_torch==2.24.0
!pip -q install ultralytics==8.3.27 open_clip_torch==2.24.0

In [None]:
# Fetch the GroundingDINO sources into the current directory.
# NOTE(review): later cells `%cd` into /kaggle/working/GroundingDINO, so this presumably
# runs with /kaggle/working as the working directory — confirm. The clone is not pinned
# to a commit or tag.
!git clone https://github.com/IDEA-Research/GroundingDINO.git

In [None]:
# Install GroundingDINO in editable mode from the cloned checkout.
%cd /kaggle/working/GroundingDINO/
!pip install -e .

In [None]:
!mkdir weights
%cd weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

In [None]:
# Sanity check: report the NumPy and SciPy versions the kernel actually imports.
import numpy
import scipy

for _module in (numpy, scipy):
    print(_module.__version__)

In [None]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import os, json
import requests
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import re

import warnings
# Suppress all Python warnings from this point on to keep the cell outputs short.
warnings.filterwarnings("ignore")

In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

In [None]:
# The config and checkpoint paths passed to load_model below are relative,
# so switch to the GroundingDINO repo root first.
%cd /kaggle/working/GroundingDINO
from groundingdino.util.inference import load_model, load_image, predict, annotate
import warnings
warnings.filterwarnings("ignore")

# Teacher detector: build GroundingDINO from its Swin-T config and load the downloaded checkpoint.
dino_model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")

In [None]:
# NOTE(review): no visible cell imports a module named `clip`; the CLIP teacher is loaded
# through `open_clip`. This unpinned install is presumably unnecessary — confirm before removing.
!pip install clip

In [None]:
from transformers import CLIPProcessor, CLIPModel
import open_clip

In [None]:
# --- Teacher CLIP: the model whose image features are distilled ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

clip_model, _, clip_preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
clip_model = clip_model.to(device)
clip_model = clip_model.eval()

# 1. Path configuration: dataset root plus the output folders for the YOLO and Siamese data
DATA_ROOT = "/kaggle/input/train-zaic-dl"
DISTILL_DIR = "/kaggle/working/distill_data"
YOLO_IMG = f"{DISTILL_DIR}/yolo/images"
YOLO_LBL = f"{DISTILL_DIR}/yolo/labels"
SIAM_CROPS = f"{DISTILL_DIR}/siamese/crops"

# Create every output folder up front (no error if it already exists).
for _out_dir in (YOLO_IMG, YOLO_LBL, SIAM_CROPS):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
def get_reference_features(obj_images_dir):
    """Compute the mean, L2-normalised CLIP embedding of the reference images in a folder.

    Each readable image is encoded with the teacher ``clip_model``, normalised to unit
    length, averaged with the others, and the average is normalised again.

    Args:
        obj_images_dir: Directory containing the reference ("object_images") pictures.

    Returns:
        The averaged embedding tensor (one row, as produced by ``encode_image`` for a
        batch of one), or ``None`` when the directory does not exist or holds no
        readable image.
    """
    # A missing folder is treated like an empty one: there is nothing to average.
    if not os.path.isdir(obj_images_dir):
        return None

    feats = []
    # Sorted so the accumulation order is the same on every run / filesystem.
    for img_name in sorted(os.listdir(obj_images_dir)):
        img_p = os.path.join(obj_images_dir, img_name)
        if not os.path.isfile(img_p):
            continue  # skip sub-directories
        try:
            # The context manager closes the file handle once the pixels are decoded.
            with Image.open(img_p) as img_file:
                img_pil = img_file.convert("RGB")
        except OSError:
            # Covers undecodable / non-image files (PIL's UnidentifiedImageError is an OSError).
            print(f"[get_reference_features] skipping unreadable file: {img_p}")
            continue

        img_in = clip_preprocess(img_pil).unsqueeze(0).to(device)
        with torch.no_grad():
            f = clip_model.encode_image(img_in)
            f = f / f.norm(dim=-1, keepdim=True)
        feats.append(f)

    if not feats:
        return None
    combined = torch.mean(torch.stack(feats), dim=0)
    # The mean of unit vectors is generally not unit length, so normalise once more.
    return combined / combined.norm(dim=-1, keepdim=True)

In [None]:
def annotate_on_frame(image_path, text_prompt, reference_features=None, box_threshold=0.35, text_threshold=0.25, sim_threshold=0.25):
    """Detect boxes with GroundingDINO and optionally filter them by CLIP similarity.

    Args:
        image_path: Path of the frame to annotate.
        text_prompt: Caption passed to GroundingDINO.
        reference_features: Optional CLIP embedding of the reference images. When given,
            only crops whose cosine similarity to it exceeds ``sim_threshold`` are kept.
        box_threshold: GroundingDINO box confidence threshold.
        text_threshold: GroundingDINO text confidence threshold.
        sim_threshold: Minimum (exclusive) cosine similarity between a crop and
            ``reference_features`` for the crop to be kept. Defaults to 0.25.

    Returns:
        ``(boxes, crops, logits)`` as three parallel lists:
        ``boxes`` are ``[x1, y1, x2, y2]`` pixel coordinates clipped to the image,
        ``crops`` are the matching slices of the source image (RGB channel order, as
        returned by GroundingDINO's ``load_image``), and ``logits`` are the matching
        entries of the detector's logits.
    """
    # 1. Load the frame; `image_source` is the raw RGB array, `image` the model input tensor.
    image_source, image = load_image(image_path)

    # 2. Teacher detection with GroundingDINO.
    boxes, logits, _ = predict(
        model=dino_model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=device
    )

    # 3. Convert normalised [cx, cy, w, h] boxes to absolute, clipped [x1, y1, x2, y2] pixels.
    h, w, _ = image_source.shape
    scale = torch.Tensor([w, h, w, h])  # loop-invariant, built once
    abs_boxes = []
    for box in boxes:
        x_c, y_c, wb, hb = box * scale
        x1 = int(x_c - wb/2)
        y1 = int(y_c - hb/2)
        x2 = int(x_c + wb/2)
        y2 = int(y_c + hb/2)
        abs_boxes.append([max(0, x1), max(0, y1), min(w, x2), min(h, y2)])

    # 4. Cut out the crops, dropping degenerate (empty) boxes.
    crops = []
    valid_indices = []
    for i, box in enumerate(abs_boxes):
        crop = image_source[box[1]:box[3], box[0]:box[2]]
        if crop.size > 0:
            crops.append(crop)
            valid_indices.append(i)

    # 5. Few-shot filtering: keep only crops that look like the reference images.
    if reference_features is not None and len(crops) > 0:
        final_boxes, final_crops, final_logits = [], [], []

        for i, crop in enumerate(crops):
            # The crop is already RGB, so it goes to PIL/CLIP as-is. Converting it with
            # COLOR_BGR2RGB would swap the channels and compare a colour-swapped crop
            # against true-colour reference embeddings.
            crop_pil = Image.fromarray(crop)
            crop_input = clip_preprocess(crop_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                crop_feat = clip_model.encode_image(crop_input)
                crop_feat = crop_feat / crop_feat.norm(dim=-1, keepdim=True)

            # Cosine similarity against the averaged reference embedding.
            sim = torch.cosine_similarity(crop_feat, reference_features).item()

            if sim > sim_threshold:
                idx = valid_indices[i]
                final_boxes.append(abs_boxes[idx])
                final_crops.append(crop)
                final_logits.append(logits[idx])

        return final_boxes, final_crops, final_logits

    # No reference embedding: return every non-empty GroundingDINO detection.
    return [abs_boxes[i] for i in valid_indices], crops, [logits[i] for i in valid_indices]

In [None]:
def get_label_from_folder(folder_name):
    """Turn a dataset folder name into a text label.

    A trailing ``_<digits>`` suffix is removed and every remaining underscore is
    replaced by a space (e.g. ``Backpack_0`` -> ``Backpack``).
    """
    # Strip the numeric instance suffix at the end of the name.
    stem = re.sub(r'_\d+$', '', folder_name)
    # Splitting on "_" and re-joining with " " swaps each underscore for one space.
    return " ".join(stem.split("_"))

In [None]:
# Build the distillation dataset: YOLO images/labels plus Siamese crops with teacher embeddings.
distill_metadata = []

for folder_name in sorted(os.listdir(DATA_ROOT)):
    folder_path = os.path.join(DATA_ROOT, folder_name)
    if not os.path.isdir(folder_path): continue

    prompt = get_label_from_folder(folder_name)  # text prompt derived from the folder name
    ref_feat = get_reference_features(os.path.join(folder_path, "object_images"))  # teacher CLIP reference embedding

    frames_path = os.path.join(folder_path, "object_frames")
    if not os.path.isdir(frames_path):
        # One incomplete class folder should not abort the whole run.
        print(f"Skipping {folder_name}: no object_frames directory")
        continue

    # Sorted so the processing order (and the metadata order) is reproducible.
    for frame_name in tqdm(sorted(os.listdir(frames_path)), desc=f"Distilling {folder_name}"):
        img_p = os.path.join(frames_path, frame_name)

        # 1. Teacher produces boxes (filtered by CLIP when a reference embedding exists).
        #    Boxes are absolute [x1, y1, x2, y2]; crops are slices of the RGB source image.
        boxes, crops, _ = annotate_on_frame(img_p, prompt, reference_features=ref_feat)
        if not boxes: continue

        # 2. Save the frame and its labels for YOLO.
        img_cv2 = cv2.imread(img_p)
        if img_cv2 is None:
            # cv2.imread signals failure by returning None instead of raising.
            continue
        h, w, _ = img_cv2.shape
        new_name = f"{folder_name}_{frame_name}"
        # Strip only the real extension, so frames that are not ".jpg" (or names that
        # contain extra dots) still get a correct "<stem>.txt" label and crop names.
        stem = os.path.splitext(new_name)[0]
        cv2.imwrite(os.path.join(YOLO_IMG, new_name), img_cv2)

        with open(os.path.join(YOLO_LBL, stem + ".txt"), "w") as f:
            for i, box in enumerate(boxes):
                x1, y1, x2, y2 = box
                # YOLO format: class cx cy bw bh, all normalised to [0, 1].
                cx, cy = (x1 + x2)/2/w, (y1 + y2)/2/h
                bw, bh = (x2 - x1)/w, (y2 - y1)/h
                f.write(f"0 {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}\n")

                # 3. Save the crop and its teacher embedding for the Siamese student.
                # The crop comes from GroundingDINO's load_image and is already RGB:
                # feed it to CLIP unchanged and convert to BGR only for cv2.imwrite.
                crop_rgb = crops[i]
                crop_pil = Image.fromarray(crop_rgb)
                crop_input = clip_preprocess(crop_pil).unsqueeze(0).to(device)
                with torch.no_grad():
                    teacher_emb = clip_model.encode_image(crop_input)
                    teacher_emb = torch.nn.functional.normalize(teacher_emb, p=2, dim=1)

                crop_name = f"{stem}_{i}.jpg"
                cv2.imwrite(os.path.join(SIAM_CROPS, crop_name), cv2.cvtColor(crop_rgb, cv2.COLOR_RGB2BGR))
                distill_metadata.append({
                    "crop_name": crop_name,
                    "embedding": teacher_emb.cpu().numpy().tolist()[0] # 768 dims
                })

with open(f"{DISTILL_DIR}/siamese_metadata.json", "w") as f:
    json.dump(distill_metadata, f)