Kaggle URL: [Pipeline Inference Drone](https://www.kaggle.com/code/phatle1578/pipeline-inference-drone)

In [None]:
# Environment setup cell: every line below is an IPython shell escape (`!`).
# Install pinned versions of ultralytics and open_clip_torch (quiet mode).
!pip -q install ultralytics==8.3.27 open_clip_torch==2.24.0
# Uninstall ray and its extras if present; all output is discarded and a
# failure is ignored (`|| true`).
# NOTE(review): presumably removed to avoid a conflict with the packages above - confirm.
!pip uninstall -y ray ray[default] ray[tune] >/dev/null 2>&1 || true
# Keep numpy on the 1.x series.
!pip install -q "numpy<2.0"
# Pin protobuf and tensorboard to fixed versions.
!pip install -q protobuf==3.20.3 tensorboard==2.14.0
# NOTE(review): filterpy is not imported by any cell visible here - confirm it is still needed.
!pip install -q filterpy

In [None]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import os, json
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
import math
import torch
import numpy as np
import open_clip
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

In [None]:
# ============================================================
# PATH CONFIG
# ============================================================
# Root folder that contains one sub-folder per case (scanned by list_cases).
SAMPLES_DIR = "/kaggle/input/pt-zaic/private_test/samples"

# --- EDIT THESE SO THEY POINT AT YOUR OWN MODEL PATHS ---
YOLO_WORLD_WEIGHTS = "/kaggle/input/yolo-world-distillation/transformers/default/1/best.pt"
YOLO11S_AUG_WEIGHTS = "/kaggle/input/yolo11s-object/other/default/1/best.pt"
YOLOV8S_AUG_WEIGHTS = "/kaggle/input/yolov8s-augmentation/transformers/default/1/best.pt"
SIAMESE_WEIGHTS    = "/kaggle/input/clip-siamese-student-distillation/transformers/default/1/student_siamese_final.pt"

# Output locations: the submission JSON and the folder for annotated videos.
OUT_JSON = "/kaggle/working/submission_private.json"
VIS_DIR  = "/kaggle/working/vis_videos"
# Create the visualization folder up front (no error if it already exists).
os.makedirs(VIS_DIR, exist_ok=True)

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): FP16 is not read by any code visible in this file - confirm it is still needed.
FP16 = (DEVICE == "cuda")

# ============================================================
# KNOBS
# ============================================================
# Only frames whose index is a multiple of FRAME_STRIDE are run through the detectors.
FRAME_STRIDE = 1
# Passed to the detectors' predict() calls as imgsz / conf / iou respectively.
IMG_SIZE = 640
CONF_THRES = 0.2
IOU_THRES = 0.5
# NOTE(review): MAX_DETS is not read by any code visible in this file - confirm intent.
MAX_DETS = 50
# Number of highest-confidence pooled proposals per frame that get embedded and compared.
TOPK_PER_FRAME = 15
# Fraction of the box width/height added on each side before cropping a proposal.
CROP_PAD = 0.1

# Minimum similarity to the reference embedding for a proposal to be accepted.
SIM_THRES = 0.45
# When True, frames that are skipped or have no accepted box reuse the last accepted box.
FILL_SKIPPED_FRAMES = False

# ============================================================
# UTILS
# ============================================================
def cv2_to_pil(img_bgr):
    """Convert an OpenCV BGR image array into a PIL image in RGB channel order."""
    rgb_pixels = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_pixels)

def clamp_box(x1, y1, x2, y2, w, h):
    """Truncate box corners to ints and clamp them into a w-by-h image.

    Each coordinate is converted with int() and limited to [0, w-1] for x and
    [0, h-1] for y. If the box collapses (x2 <= x1 or y2 <= y1) the far edge
    is moved one pixel past the near edge, but never beyond the last
    row/column, so a box sitting on the last row/column can still come back
    with zero extent.

    Returns:
        Tuple (x1, y1, x2, y2) of ints.
    """
    last_col = w - 1
    last_row = h - 1

    def _limit(value, upper):
        # int() truncates toward zero before the range clamp is applied.
        return max(0, min(int(value), upper))

    left = _limit(x1, last_col)
    top = _limit(y1, last_row)
    right = _limit(x2, last_col)
    bottom = _limit(y2, last_row)

    if right <= left:
        right = min(last_col, left + 1)
    if bottom <= top:
        bottom = min(last_row, top + 1)

    return left, top, right, bottom

def pad_box(x1, y1, x2, y2, pad_frac, w, h):
    """Enlarge a box by pad_frac of its width/height on every side, then clamp.

    The horizontal margin is pad_frac * (x2 - x1) and the vertical margin is
    pad_frac * (y2 - y1); the grown box is handed to clamp_box so the result
    is an integer box inside a w-by-h image.
    """
    margin_x = (x2 - x1) * pad_frac
    margin_y = (y2 - y1) * pad_frac
    return clamp_box(x1 - margin_x, y1 - margin_y, x2 + margin_x, y2 + margin_y, w, h)

def cosine_sim(a, b):
    """Return the pairwise dot products between the rows of a and the rows of b.

    Computes a @ b^T (dims 0 and 1 of b swapped) and squeezes dim 0 of the
    result, so a single-row `a` yields a 1-D tensor with one score per row of
    b. No normalization happens here; the scores are cosine similarities only
    when the caller passes unit-length rows.
    """
    scores = a.matmul(b.transpose(0, 1))
    return scores.squeeze(0)

# ============================================================
# BLIP
# ============================================================
class BLIPPrompter:
    """Thin wrapper around a BLIP captioning model used to name a reference object."""

    def __init__(self, model_id="Salesforce/blip-image-captioning-base", device=DEVICE):
        """Load the BLIP processor and model for `model_id` and move the model to `device`."""
        self.processor = BlipProcessor.from_pretrained(model_id)
        self.model = BlipForConditionalGeneration.from_pretrained(model_id).to(device).eval()
        self.device = device

    @torch.inference_mode()
    def caption(self, pil_img, prompt="The single most important object in this image is a", max_new_tokens=6):
        """Generate a short noun phrase that completes `prompt` for `pil_img`.

        Args:
            pil_img: image passed to the BLIP processor.
            prompt: text prefix the model continues; it is removed from the output.
            max_new_tokens: generation budget passed to `generate`.

        Returns:
            The lower-cased continuation up to the first period, with the
            standalone articles "a" / "an" removed. May be an empty string.
        """
        inputs = self.processor(images=pil_img, text=prompt, return_tensors="pt").to(self.device)
        out = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        txt = self.processor.decode(out[0], skip_special_tokens=True)

        # Keep only the continuation: drop the echoed prompt, then cut at the first period.
        ref = txt.lower().replace(prompt.lower(), "").strip()
        ref = ref.split(".")[0].strip()

        # Remove articles as whole words only. A plain substring replace of
        # "a " / "an " also eats the tail of words such as "pizza box" or
        # "man holding", which corrupts the prompt handed to the detector.
        ref = " ".join(tok for tok in ref.split() if tok not in ("a", "an"))
        return ref

def build_prompts_from_ref_images(ref_paths, blip: BLIPPrompter, debug=False, case_id=""):
    """Derive text prompts from the reference images of one case.

    Every reference image is captioned with `blip`; captions are normalized
    and de-duplicated in first-seen order. Each caption becomes a prompt, and
    for captions of two or more words the last word is added as an extra
    prompt.

    Args:
        ref_paths: iterable of image paths.
        blip: captioner exposing `caption(img, prompt=..., max_new_tokens=...)`.
        debug: when True, print each caption and the final prompt list.
        case_id: label used only in the debug output.

    Returns:
        List of unique prompt strings, or ["object"] when nothing usable was produced.
    """
    base_prompt = "The single most important object in this image is a"

    def _normalize(text):
        # Lower-case, turn commas into spaces and collapse one level of double spaces.
        lowered = text.strip().lower()
        return lowered.replace(",", " ").replace("  ", " ")

    raw_captions = []
    for ref_path in ref_paths:
        ref_image = Image.open(ref_path).convert("RGB")
        text = blip.caption(ref_image, prompt=base_prompt, max_new_tokens=6)
        if text:
            raw_captions.append(text)
        if debug:
            print(f"[BLIP][{case_id}] {Path(ref_path).name} -> '{text}'")

    # A dict keeps insertion order, so it doubles as an ordered set here.
    unique_captions = {}
    for text in raw_captions:
        if len(text.strip()) >= 2:
            unique_captions.setdefault(_normalize(text), None)

    unique_prompts = {}
    for caption_text in unique_captions:
        unique_prompts.setdefault(caption_text, None)
        words = caption_text.split()
        if len(words) >= 2:
            unique_prompts.setdefault(words[-1], None)

    prompts = [candidate for candidate in unique_prompts if candidate]

    if debug:
        print(f"[PROMPTS][{case_id}] {prompts}")

    if prompts:
        return prompts
    return ["object"]

# ============================================================
# SIAMESE/CLIP embedder
# ============================================================
class ClipEmbedder:
    """Image embedder backed by the Siamese student weights, with a CLIP fallback.

    Loading is attempted in three steps: (1) the file as a TorchScript module,
    (2) the file as a state_dict for an open_clip ViT-B-32 model, (3) the
    plain pretrained open_clip ViT-B-32 model. `using_fallback` is True only
    when step 3 was used. `preprocess` is None for a TorchScript model, in
    which case `encode_image` applies its own resize + normalization.
    """

    def __init__(self, siamese_path=SIAMESE_WEIGHTS, device=DEVICE):
        """Load the embedding model from `siamese_path` onto `device` (see class docstring)."""
        self.device = device
        self.model = None
        self.preprocess = None
        self.using_fallback = False

        # open_clip architecture used for steps 2 and 3 (may need changing to match the checkpoint).
        arch = "ViT-B-32"
        pretrained = "laion2b_s34b_b79k"

        # 1) TorchScript
        try:
            m = torch.jit.load(siamese_path, map_location=device)
            m.eval()
            self.model = m
            self.preprocess = None
            print("[Siamese] Loaded as TorchScript:", siamese_path)
            return
        except Exception as e:  # best-effort: any failure moves on to the next loader
            print("[Siamese] TorchScript load failed:", f"{type(e).__name__}: {e}")

        # 2) state_dict -> open_clip
        try:
            model, _, preprocess = open_clip.create_model_and_transforms(arch, pretrained=pretrained)
            # NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
            ckpt = torch.load(siamese_path, map_location="cpu")
            sd = ckpt.get("state_dict", ckpt)
            new_sd = {k.replace("module.", ""): v for k, v in sd.items()}
            result = model.load_state_dict(new_sd, strict=False)

            # strict=False never complains about unknown keys, so a checkpoint
            # from an unrelated architecture would otherwise be reported as
            # loaded while the model silently keeps its pretrained weights.
            matched = len(new_sd) - len(result.unexpected_keys)
            if matched <= 0:
                raise RuntimeError("no checkpoint keys match the open_clip model")

            self.model = model.to(device).eval()
            self.preprocess = preprocess
            print("[Siamese] Loaded as open_clip state_dict:", siamese_path)
            print(
                "[Siamese] state_dict keys: matched=%d missing=%d unexpected=%d"
                % (matched, len(result.missing_keys), len(result.unexpected_keys))
            )
            return
        except Exception as e:  # best-effort: fall through to the pretrained model
            print("[Siamese] state_dict load failed:", f"{type(e).__name__}: {e}")

        # 3) fallback
        # Build a fresh model: a failed load above may have partially overwritten the previous one.
        model, _, preprocess = open_clip.create_model_and_transforms(arch, pretrained=pretrained)
        self.model = model.to(device).eval()
        self.preprocess = preprocess
        self.using_fallback = True
        print("[Siamese] Fallback to open_clip pretrained:", arch, pretrained)

    @torch.inference_mode()
    def encode_image(self, pil_img: Image.Image):
        """Embed one PIL image and return an L2-normalized float32 feature tensor.

        The input batch has a single leading element (`unsqueeze(0)`); the
        output is whatever the model returns for it, cast to float32 and
        divided by its norm along the last dimension.
        """
        if self.preprocess is not None:
            x = self.preprocess(pil_img).unsqueeze(0).to(self.device)
        else:
            # No transform available (TorchScript model): resize to 224x224 and
            # normalize with the per-channel mean/std constants below.
            img = pil_img.convert("RGB").resize((224, 224))
            arr = np.array(img).astype(np.float32) / 255.0
            mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
            std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
            arr = (arr - mean) / std
            x = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(self.device)

        # open_clip models expose encode_image(); other modules are called directly.
        forward = self.model.encode_image if hasattr(self.model, "encode_image") else self.model

        # str() so that both "cuda:0" strings and torch.device objects are accepted.
        if str(self.device).startswith("cuda"):
            # Mixed precision via autocast instead of casting the input with x.half().
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                feat = forward(x)
        else:
            feat = forward(x)

        feat = feat.float()
        # Epsilon guards against division by zero for an all-zero feature.
        feat = feat / (feat.norm(dim=-1, keepdim=True) + 1e-8)
        return feat

# ============================================================
# LIST CASES (names are duplicated across folders -> use folder name as video_id)
# ============================================================
def list_cases(samples_dir):
    """Enumerate the cases found directly under `samples_dir`.

    Each sub-directory is one case. Its video is `drone_video.mp4` when that
    file exists, otherwise the alphabetically first `*.mp4` in the folder;
    folders without any mp4 are skipped. Reference images are taken from the
    `object_images` sub-folder (jpg, jpeg, png, webp, in that extension order,
    sorted within each extension) and limited to the first three.

    Args:
        samples_dir: path (str or Path) of the folder holding the case folders.

    Returns:
        List of dicts with keys "case_id" (folder name), "video" (str path)
        and "refs" (list of up to three str paths), ordered by folder name.
    """
    samples_dir = Path(samples_dir)
    cases = []
    for d in sorted(samples_dir.iterdir()):
        if not d.is_dir():
            continue

        video = d / "drone_video.mp4"
        if not video.exists():
            # glob() yields entries in filesystem order; sort so the same
            # video is picked on every run and every machine.
            vids = sorted(d.glob("*.mp4"))
            if not vids:
                continue
            video = vids[0]

        obj_dir = d / "object_images"
        refs = []
        if obj_dir.exists():
            for ext in ["*.jpg", "*.jpeg", "*.png", "*.webp"]:
                refs += sorted(obj_dir.glob(ext))
        refs = [str(x) for x in refs][:3]

        cases.append({"case_id": d.name, "video": str(video), "refs": refs})
    return cases

# ============================================================
# INFER ONE CASE -> submission entry + frame_to_box for visualization
# ============================================================
def split_into_intervals(frame_to_box, gap_tolerance=1):
    """Group per-frame boxes into runs of (near-)consecutive frames.

    Frames are visited in ascending order; a frame joins the current run when
    its distance to the previous frame is at most `gap_tolerance`, otherwise
    it starts a new run. With gap_tolerance=1 a run contains only strictly
    consecutive frame indices.

    Args:
        frame_to_box: mapping frame index -> (x1, y1, x2, y2).
        gap_tolerance: largest frame-index difference that still continues a run.

    Returns:
        List of {"bboxes": [...]} dicts, one per run; every bbox is a dict
        with int values under "frame", "x1", "y1", "x2", "y2". Empty list for
        an empty mapping.
    """
    if not frame_to_box:
        return []

    def _pack(frame_ids):
        # Serialize one run of frame indices into the output record.
        entries = []
        for fid in frame_ids:
            left, top, right, bottom = frame_to_box[fid]
            entries.append({"frame": int(fid), "x1": int(left), "y1": int(top), "x2": int(right), "y2": int(bottom)})
        return {"bboxes": entries}

    runs = []
    for fid in sorted(frame_to_box.keys()):
        if runs and (fid - runs[-1][-1]) <= gap_tolerance:
            runs[-1].append(fid)
        else:
            runs.append([fid])

    return [_pack(run) for run in runs]
    
def infer_one_case(case, yolo_world_model, yolo11s_aug, yolov8s_aug, blip, embedder: ClipEmbedder):
    """Run detection + reference matching on one case.

    For every processed frame the box proposals of the three detectors are
    pooled, the TOPK_PER_FRAME most confident ones are cropped (padded by
    CROP_PAD) and embedded, and the crop most similar to the mean reference
    embedding is kept when its similarity reaches SIM_THRES.

    Args:
        case: dict with keys "case_id", "video" and "refs".
        yolo_world_model: detector that is given the BLIP-derived prompts.
        yolo11s_aug: second detector.
        yolov8s_aug: third detector.
        blip: captioner used to turn the reference images into prompts.
        embedder: image embedder exposing `encode_image(pil_img)`.

    Returns:
        (entry, frame_to_box): `entry` is {"video_id", "detections"} with
        detections grouped into consecutive-frame intervals; `frame_to_box`
        maps frame index -> [x1, y1, x2, y2]. Both are empty when no reference
        image could be read or the video cannot be opened.
    """
    case_id = case["case_id"]
    video_path = case["video"]
    ref_paths = case["refs"]

    video_id = case_id  # the folder name is the unique id

    # ---- BLIP prompts (logged to the console) ----
    DEBUG_BLIP = True  # set to False to silence the per-image caption log
    prompts = build_prompts_from_ref_images(ref_paths, blip, debug=DEBUG_BLIP, case_id=video_id)

    # ---- reference embedding (mean over the reference images) ----
    ref_feats = []
    for p in ref_paths:
        if os.path.exists(p):
            ref_img = Image.open(p).convert("RGB")
            ref_feats.append(embedder.encode_image(ref_img))

    if len(ref_feats) == 0:
        return {"video_id": video_id, "detections": []}, {}

    ref_feat = torch.cat(ref_feats, dim=0).mean(dim=0, keepdim=True)
    # Re-normalize: the mean of unit vectors is generally not unit length.
    ref_feat = ref_feat / (ref_feat.norm(dim=-1, keepdim=True) + 1e-8)

    # ---- video loop ----
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return {"video_id": video_id, "detections": []}, {}

    nframes = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_to_box = {}
    last_good = None

    frame_idx = 0
    # Some containers report no usable frame count; let tqdm run without a total then.
    pbar = tqdm(total=nframes if nframes > 0 else None, desc=f"[{video_id}]")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            h, w = frame.shape[:2]

            # 1. Stride: skip frames that are not a multiple of FRAME_STRIDE.
            if frame_idx % FRAME_STRIDE != 0:
                if FILL_SKIPPED_FRAMES and (last_good is not None):
                    frame_to_box[frame_idx] = last_good[1]
                frame_idx += 1
                pbar.update(1)
                continue

            # 2. Run the three detectors on the frame.
            # Use the model passed in as a parameter, not a module-level global.
            # NOTE(review): `prompts=` is forwarded as a predict() keyword; whether
            # this detector actually consumes it depends on the Ultralytics
            # predictor in use - confirm the prompts take effect.
            res_world = yolo_world_model.predict(frame, prompts=prompts, imgsz=IMG_SIZE, conf=CONF_THRES, iou=IOU_THRES, verbose=False)[0]
            res_yolo11 = yolo11s_aug.predict(frame, imgsz=IMG_SIZE, conf=CONF_THRES, iou=IOU_THRES, verbose=False)[0]
            res_yolov8 = yolov8s_aug.predict(frame, imgsz=IMG_SIZE, conf=CONF_THRES, iou=IOU_THRES, verbose=False)[0]

            # 3. Pool the proposals of all three detectors.
            all_boxes = []
            all_confs = []
            for res in (res_world, res_yolo11, res_yolov8):
                if res.boxes is not None and len(res.boxes) > 0:
                    all_boxes.append(res.boxes.xyxy.cpu().numpy())
                    all_confs.append(res.boxes.conf.cpu().numpy())

            best_box = None
            best_sim = -1.0

            if len(all_boxes) > 0:
                combined_boxes = np.concatenate(all_boxes)
                combined_confs = np.concatenate(all_confs)

                # Keep only the TOPK_PER_FRAME most confident proposals for the embedder.
                order = np.argsort(-combined_confs)[:TOPK_PER_FRAME]
                final_proposals = combined_boxes[order]

                crop_feats = []
                crop_boxes = []

                # 4. Verify each proposal against the reference embedding.
                for (x1, y1, x2, y2) in final_proposals:
                    # Pad the box to include some surrounding context.
                    x1, y1, x2, y2 = pad_box(x1, y1, x2, y2, CROP_PAD, w, h)
                    crop = frame[y1:y2, x1:x2]
                    if crop.size == 0:
                        continue

                    feat = embedder.encode_image(cv2_to_pil(crop))
                    crop_feats.append(feat)
                    crop_boxes.append([int(x1), int(y1), int(x2), int(y2)])

                if len(crop_feats) > 0:
                    crop_feats = torch.cat(crop_feats, dim=0)
                    sims = cosine_sim(ref_feat, crop_feats).detach().cpu().numpy()

                    # Pick the proposal most similar to the reference object.
                    bi = int(np.argmax(sims))
                    best_sim = float(sims[bi])
                    best_box = crop_boxes[bi]

            # 5. Accept or reject based on the similarity threshold.
            if (best_box is not None) and (best_sim >= SIM_THRES):
                frame_to_box[frame_idx] = best_box
                last_good = (frame_idx, best_box)
            else:
                # Nothing accepted: optionally reuse the last accepted box.
                if FILL_SKIPPED_FRAMES and (last_good is not None):
                    frame_to_box[frame_idx] = last_good[1]

            frame_idx += 1
            pbar.update(1)
    finally:
        # Release the progress bar and the capture even if a detector or the embedder raises.
        pbar.close()
        cap.release()

    # ---- build the output as intervals of consecutive frames ----
    detections = split_into_intervals(frame_to_box, gap_tolerance=1)
    entry = {"video_id": video_id, "detections": detections}

    return entry, frame_to_box

# ============================================================
# VISUALIZE VIDEO
# ============================================================
def write_visualized_video(video_path, frame_to_box, out_path):
    """Write a copy of the video at `video_path` with the selected boxes drawn on it.

    Frames whose index is a key of `frame_to_box` get a green rectangle and a
    text label (output file stem + frame index); all other frames are copied
    unchanged. Prints a message and returns without writing when the input or
    the output cannot be opened.

    Args:
        video_path: source video path.
        frame_to_box: mapping frame index -> (x1, y1, x2, y2) integer box.
        out_path: destination video path (encoded with the "mp4v" fourcc).
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print("Cannot open for vis:", video_path)
        return

    writer = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # The property can come back as 0 or NaN when the container has no
        # frame-rate information; fall back to a fixed rate so the writer opens.
        if not fps > 0:
            fps = 30.0
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
        if not writer.isOpened():
            # Without this check every write() would be a silent no-op.
            print("Cannot open writer for vis:", out_path)
            return

        idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if idx in frame_to_box:
                x1, y1, x2, y2 = frame_to_box[idx]
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"{Path(out_path).stem}  f={idx}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 0), 2)

            writer.write(frame)
            idx += 1
    finally:
        # Always release both handles, including when a frame operation raises.
        cap.release()
        if writer is not None:
            writer.release()

# ============================================================
# RUN ALL
# ============================================================
# Load the three detectors, the BLIP captioner and the image embedder once;
# they are reused for every case below.
yolo_world = YOLO(YOLO_WORLD_WEIGHTS)
yolo11s_aug = YOLO(YOLO11S_AUG_WEIGHTS)
yolov8s_aug = YOLO(YOLOV8S_AUG_WEIGHTS)
blip = BLIPPrompter(device=DEVICE)
embedder = ClipEmbedder(siamese_path=SIAMESE_WEIGHTS, device=DEVICE)

# Discover the cases under SAMPLES_DIR and print a quick sanity check.
cases = list_cases(SAMPLES_DIR)
print("Found cases:", len(cases))
print("First case:", cases[0] if cases else None)

# Run inference case by case, collecting one submission entry per case.
submission = []
for case in cases:
    entry, frame_to_box = infer_one_case(case, yolo_world, yolo11s_aug, yolov8s_aug, blip, embedder)
    submission.append(entry)

    # Save an annotated video per case; the file name is derived from the
    # case's video_id, so each case writes to its own file.
    out_video = os.path.join(VIS_DIR, f"{entry['video_id']}_vis.mp4")
    write_visualized_video(case["video"], frame_to_box, out_video)

# Write all entries as one JSON list (non-ASCII characters kept as-is, indented).
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(submission, f, ensure_ascii=False, indent=2)

print("Saved JSON:", OUT_JSON)
print("Saved VIS videos folder:", VIS_DIR)

In [None]:
# Shell escape: recursively zip the visualization folder (VIS_DIR) into
# videos.zip in the current working directory.
!zip -r videos.zip $VIS_DIR