Kaggle URL: [Siamese Train Distillation](https://www.kaggle.com/code/phatle1578/siamese-train-distillation)

In [None]:
!pip -q install ultralytics==8.3.27 open_clip_torch==2.24.0

In [None]:
!pip install clip

In [None]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import os, json
import requests
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import re

import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import CLIPProcessor, CLIPModel
import open_clip

In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

class DistillDataset(Dataset):
    """Pair each cropped image with its stored teacher embedding.

    The metadata file is parsed as JSON and indexed by position; every entry
    must provide ``crop_name`` (file name inside ``img_dir``) and
    ``embedding`` (the target vector the student is trained to reproduce).

    Args:
        metadata_path: Path to the JSON metadata file.
        img_dir: Directory that contains the crop images.
        transform: Callable applied to each RGB PIL image. When omitted, the
            notebook-level ``clip_preprocess`` is used so the student gets
            the same preprocessing as CLIP (a ``NameError`` is raised if that
            global has not been created yet).
    """

    def __init__(self, metadata_path, img_dir, transform=None):
        with open(metadata_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        self.img_dir = img_dir
        # Prefer an injected transform; only touch the global when the caller
        # did not supply one, so the class no longer requires that global.
        self.transform = clip_preprocess if transform is None else transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(transformed_image, embedding)`` for entry ``idx``."""
        item = self.data[idx]
        img_path = os.path.join(self.img_dir, item['crop_name'])
        # The context manager releases the file handle once the pixels have
        # been copied into the converted image.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        # Pin the dtype so integer-valued JSON cannot yield an int64 target;
        # squeeze() drops singleton axes such as a leading batch dimension.
        embedding = torch.tensor(item['embedding'], dtype=torch.float32).squeeze()
        return self.transform(img), embedding

# Student model (under 50M parameters)
class StudentSiamese(nn.Module):
    """ResNet18 trunk plus a linear projection producing L2-normalized embeddings."""

    def __init__(self, out_dim=768):
        super().__init__()
        # ResNet18 has roughly 11M parameters.
        resnet = models.resnet18(pretrained=True)
        # Keep every child module of the torchvision model except the last one.
        trunk_layers = list(resnet.children())[:-1]
        self.backbone = nn.Sequential(*trunk_layers)
        # Project the 512 flattened trunk features to the embedding size.
        self.fc = nn.Linear(512, out_dim)

    def forward(self, x):
        """Embed a batch of images and scale each row to unit L2 norm."""
        batch_size = x.size(0)
        features = self.backbone(x).view(batch_size, -1)
        projected = self.fc(features)
        return F.normalize(projected, p=2, dim=1)

# --- Training loop (mimic loss) ---
def train_student_siamese(metadata_json, crop_dir, *, epochs=10, batch_size=32,
                          lr=1e-4, out_dim=768, normalize_teacher=True):
    """Train a ``StudentSiamese`` to reproduce the stored teacher embeddings.

    Args:
        metadata_json: Path to the JSON metadata consumed by ``DistillDataset``.
        crop_dir: Directory holding the crop images named in the metadata.
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size for the ``DataLoader``.
        lr: Adam learning rate.
        out_dim: Size of the student embedding; must equal the teacher
            embedding size for the MSE loss to be defined.
        normalize_teacher: L2-normalize the teacher targets before computing
            the loss. The student output is always unit length, so targets
            that are not unit length could never be matched exactly; this is
            a no-op when the stored embeddings are already normalized.

    Returns:
        The trained ``StudentSiamese`` model (left on ``device``).

    Raises:
        ValueError: If the metadata file contains no samples.
    """
    # DistillDataset already installs the CLIP preprocessing transform, so
    # there is no need to assign it a second time here.
    dataset = DistillDataset(metadata_json, crop_dir)
    if len(dataset) == 0:
        # Fail early instead of dividing by zero batches when averaging.
        raise ValueError(f"No training samples found in {metadata_json!r}")

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = StudentSiamese(out_dim=out_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for imgs, teacher_embs in loader:
            imgs = imgs.to(device)
            teacher_embs = teacher_embs.to(device).float()
            if normalize_teacher:
                # Put the targets on the same unit sphere as the student output.
                teacher_embs = F.normalize(teacher_embs, p=2, dim=1)

            student_embs = model(imgs)
            loss = criterion(student_embs, teacher_embs)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch} - Avg Loss: {epoch_loss/len(loader):.6f}")
    return model

In [None]:
# 1. Locations of the metadata file and the previously extracted crop images.
DISTILL_WORKING_DIR = "/kaggle/input/groundingdino-distillation-dataset/distill_data"
CROP_IMAGES_DIR = f"{DISTILL_WORKING_DIR}/siamese/crops"
METADATA_FILE = f"{DISTILL_WORKING_DIR}/siamese_metadata.json"

# 2. Run the training function.
print("--- Đang bắt đầu huấn luyện Student (ResNet18) ---")
student_model = train_student_siamese(
    metadata_json=METADATA_FILE,
    crop_dir=CROP_IMAGES_DIR,
)

# 3. Save the trained weights so they can be reloaded for inference.
torch.save(
    student_model.state_dict(),
    "/kaggle/working/student_siamese_final.pt",
)
print("--- Đã huấn luyện xong và lưu model tại /kaggle/working/student_siamese_final.pt ---")

In [None]:
def check_similarity(img_path_1, img_path_2, model):
    """Return the cosine similarity between the model embeddings of two images.

    Both files are opened as RGB, run through the notebook-level
    ``clip_preprocess`` transform, and embedded by ``model`` with gradient
    tracking disabled. The model is switched to eval mode as a side effect.

    Args:
        img_path_1: Path to the first image.
        img_path_2: Path to the second image.
        model: Embedding model called on a single-image batch.

    Returns:
        The cosine similarity as a Python float.
    """
    model.eval()
    preprocess = clip_preprocess  # same transform as CLIP

    # Load and preprocess both images (in order) into single-item batches.
    batch_a, batch_b = (
        preprocess(Image.open(path).convert("RGB")).unsqueeze(0).to(device)
        for path in (img_path_1, img_path_2)
    )

    with torch.no_grad():
        emb_a = model(batch_a)
        emb_b = model(batch_b)

    # Cosine similarity of the two embeddings.
    return F.cosine_similarity(emb_a, emb_b).item()

# Trial run: compare one object image against one frame of the same sequence.
sim_score = check_similarity(
    "/kaggle/input/zaic-test-frames/CardboardBox_1/object_images/img_1.jpg",
    "/kaggle/input/zaic-test-frames/CardboardBox_1/object_frames/100.jpg",
    student_model,
)
print(f"Độ tương đồng: {sim_score:.4f}")