In [None]:
import gc
import datetime
import logging

import numpy as np
import pandas as pd
import tensorflow as tf
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from ultralytics import YOLO

from utils.models import Detector, EmbeddingModel
from utils import common, tal, losses
from utils.dataset import DecklistDataset
from utils import db

2025-02-11 20:30:27.036075: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-11 20:30:27.036221: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-11 20:30:27.048868: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-11 20:30:27.100189: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Training split: one image path per card id, plus a boolean flag that is True
# when the card's "type" column starts with "pendulum" (case-insensitive).
X_train = pd.read_csv("datasets/train.csv")
prefix = "datasets/card_images_small/"
id_list = [prefix + str(card_id) + ".jpg" for card_id in X_train["id"].tolist()]
card_type = [kind.lower().startswith("pendulum") for kind in X_train["type"].tolist()]
train_dataset = DecklistDataset(id_list, card_type)
# Drop the raw frame and the intermediate lists; only the dataset is kept.
del X_train, card_type, id_list

batch_size = 8
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)

In [3]:
# Validation split, assembled the same way as the training split: image paths
# derived from the "id" column and a pendulum flag derived from the "type" column.
X_valid = pd.read_csv("datasets/valid.csv")
prefix = "datasets/card_images_small/"
id_list = [prefix + str(card_id) + ".jpg" for card_id in X_valid["id"].tolist()]
card_type = [kind.lower().startswith("pendulum") for kind in X_valid["type"].tolist()]
valid_dataset = DecklistDataset(id_list, card_type)
# The frame and helper lists are released once the dataset object holds the data.
del X_valid, card_type, id_list

batch_size = 8
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=valid_dataset.collate_fn,
)

In [39]:
# Initialise the student detector from the pretrained YOLO checkpoint.
pre_model = YOLO("weights/yolov8n_detector.pt")
pre_model_dict = pre_model.model.model.state_dict()

student_model = Detector(1000)
model_dict = student_model.state_dict()

# Every pretrained entry `key` is written into the student's state dict under
# the name f"layer{key}"; tensors are copied so the two models share no storage.
model_dict.update(
    (f"layer{key}", tensor.clone().detach()) for key, tensor in pre_model_dict.items()
)

student_model.load_state_dict(model_dict)
student_model = student_model.cuda()

# Freeze every parameter whose name contains "dfl"; everything else is trainable.
for name, param in student_model.named_parameters():
    param.requires_grad_("dfl" not in name)

del pre_model, pre_model_dict

In [34]:
# Restore the student weights from a saved checkpoint.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load("best_134 copy.pt")
student_model.load_state_dict(state_dict)
student_model = student_model.cuda()

# Re-apply the freezing rule: parameters with "dfl" in their name stay fixed,
# all others receive gradients.
for name, param in student_model.named_parameters():
    param.requires_grad_("dfl" not in name)

In [6]:
# Prevent TensorFlow from reserving all GPU memory up front; the PyTorch
# student model is placed on the GPU as well and needs room.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so apply it to all of them rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Reported (not raised) so the notebook can continue with TF defaults.
        print(e)


# Teacher side of the distillation: crop preprocessor plus the embedding model
# restored from its saved weights.
teach_preprocess = common.EmbeddingPreprocessor()
teacher_model = EmbeddingModel()
teacher_model.load("weights/embedding.h5")

2025-02-11 20:30:36.642812: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-11 20:30:36.647123: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-11 20:30:36.647252: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-11 20:30:36.669538: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-02-11 20:30:36.669831: I external/local_xla/xla/stream_executor



In [7]:
# Exclusive upper bound of the epoch range iterated by the training loop.
epoches = 400
# Gradient scaler passed to run_one_epoch, which uses it to scale the loss
# before backward() and to drive the optimizer step.
scaler = torch.cuda.amp.GradScaler(enabled=True)
# NOTE(review): no other visible cell references this assigner — confirm it is still needed.
task_aligned_assigner = tal.TaskAlignedAssigner(num_classes=1, alpha=0.5, beta=6.0)

In [8]:
def make_optimizer(model, lr=0.002, momentum=0.9, decay=1e-5):
    """Build an AdamW optimizer with separate weight-decay groups.

    The returned optimizer always has three parameter groups, in this order:

    0. every parameter whose qualified name contains ``"bias"`` (no decay)
    1. all remaining non-normalization weights (weight decay ``decay``)
    2. weights owned by normalization layers (no decay)

    Args:
        model: ``nn.Module`` whose parameters are optimized. Parameters with
            ``requires_grad=False`` are still registered in a group.
        lr: Learning rate shared by all groups.
        momentum: First AdamW beta; the second beta is fixed at 0.999.
        decay: Weight decay applied to group 1 only.

    Returns:
        A ``torch.optim.AdamW`` instance.
    """
    # Every class exported by torch.nn whose name mentions "Norm".
    norm_types = tuple(
        v for k, v in nn.__dict__.items() if "Norm" in k and isinstance(v, type)
    )

    biases, decayed, norm_weights = [], [], []
    seen = set()  # shared/tied parameters must be registered only once

    # Whether a weight belongs to a normalization layer is a property of the
    # owning module, not of the Parameter itself, so walk modules and look at
    # each module's direct parameters.
    for module_name, module in model.named_modules():
        for param_name, param in module.named_parameters(recurse=False):
            if id(param) in seen:
                continue
            seen.add(id(param))

            full_name = f"{module_name}.{param_name}" if module_name else param_name
            if "bias" in full_name:  # bias (no decay)
                biases.append(param)
            elif isinstance(module, norm_types):  # normalization weight (no decay)
                norm_weights.append(param)
            else:  # weight (with decay)
                decayed.append(param)

    # Passing explicit groups keeps the group order stable and also works when
    # one of the lists is empty (e.g. a model without bias terms).
    optimizer = torch.optim.AdamW(
        [
            {"params": biases, "weight_decay": 0.0},
            {"params": decayed, "weight_decay": decay},
            {"params": norm_weights, "weight_decay": 0.0},
        ],
        lr=lr,
        betas=(momentum, 0.999),
        weight_decay=0.0,
    )

    return optimizer


def optimizer_step(model, optimizer, scaler, max_norm=10.0):
    """Apply one clipped optimizer update and reset the gradients.

    Args:
        model: Module whose parameter gradients are clipped.
        optimizer: Optimizer to step.
        scaler: Gradient scaler that was used to scale the loss before
            ``backward()``.
        max_norm: Maximum total gradient norm allowed by clipping.
    """
    # Gradients must be unscaled first so the clipping threshold is applied to
    # their true magnitudes.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()


# Build the student's optimizer using make_optimizer's default hyperparameters.
optimizer = make_optimizer(student_model)

In [9]:
# Device of the student's first parameter; reused wherever tensors must live
# alongside the model.
device = next(student_model.parameters()).device
# Loss object called by run_one_epoch; the student's `layer22` module is
# handed over as the detection head.
detection_loss = losses.v8DetectionLoss(head=student_model.layer22, device=device, tal_topk=10)

In [10]:
# Preprocessor applied to every ground-truth crop before it is fed to the
# teacher model; run_one_epoch reads this name from the notebook namespace.
teacher_preprocessor = common.EmbeddingPreprocessor()

In [11]:
# NOTE(review): no visible cell reads this name — confirm whether it is meant
# to supply the batch size passed to teacher_model.pred.
teacher_batch_size = 8

In [43]:
# Scratch check of device-string validation: "cpus" is not an accepted device
# type, so this call raises the RuntimeError shown in the cell output.
torch.device("cpus")

RuntimeError: Expected one of cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia, privateuseone device type at start of device string: cpus

In [42]:
# Display the device taken from the student model's first parameter.
device

device(type='cuda', index=0)

In [12]:
def run_one_epoch(epoch, dataloader, student_model, teacher_model, is_train, optimizer=None, scaler=None):
    """Run one pass over ``dataloader`` and return the mean loss items.

    For each batch, every ground-truth box is cropped from its image and
    embedded by the teacher; the student predicts on the preprocessed images
    and ``detection_loss`` scores the predictions against the boxes and the
    teacher embeddings. In training mode the loss is back-propagated and
    ``optimizer_step`` is applied.

    Relies on the notebook-level names ``teacher_preprocessor``,
    ``detection_loss`` and ``optimizer_step``.

    Args:
        epoch: Current epoch index. The cosine flag passed to the loss is
            enabled while ``epoch < 100``.
        dataloader: Iterable of batch dicts providing the keys ``"image"``,
            ``"xywh"``, ``"xyxy"`` and ``"batch_idx"``. The keys ``"bboxes"``,
            ``"cls"`` and ``"embedding"`` are added to each batch here.
        student_model: Torch model being trained or evaluated.
        teacher_model: Model exposing ``pred(inputs, batch_size)`` that
            returns a NumPy array of embeddings.
        is_train: True to train (gradients + optimizer step), False to only
            evaluate.
        optimizer: Optimizer to step; required when ``is_train`` is True.
        scaler: Gradient scaler; required when ``is_train`` is True.

    Returns:
        A tensor with 4 loss items, each accumulated over the epoch and
        divided by the number of images seen.

    Raises:
        ValueError: If ``is_train`` is True but ``optimizer`` or ``scaler``
            is missing.
    """
    if is_train and (optimizer is None or scaler is None):
        raise ValueError("optimizer and scaler are required when is_train is True")

    first_param = next(student_model.parameters())
    device, dtype = first_param.device, first_param.dtype

    # The mode does not change within an epoch, so set it once up front.
    student_model.train(is_train)

    total_loss = torch.zeros(4)
    sample_count = 0
    for batch in dataloader:
        images = batch["image"]

        # preprocessing
        student_inputs = common.detector_preprocessing(images).to(device=device)
        batch["bboxes"] = torch.from_numpy(batch["xywh"])
        batch["batch_idx"] = torch.from_numpy(batch["batch_idx"])
        # ground-truth classes: a single class, so every box gets label 0
        batch["cls"] = torch.zeros(
            size=[batch["bboxes"].shape[0], 1],
            dtype=dtype
        )

        # make embeddings with the teacher model, one image at a time
        gt_embedding = []
        for i, img in enumerate(images):
            mask = batch["batch_idx"] == i
            # The crop below indexes img[y, x, channel], so axis 0 is the
            # height and axis 1 is the width.
            img_height, img_width, _ = img.shape
            # assumes batch["xyxy"] holds coordinates relative to the image
            # size (they are scaled to pixels here) — TODO confirm
            xyxy_list = batch["xyxy"][mask]
            xyxy_list[:, [0, 2]] *= img_width
            xyxy_list[:, [1, 3]] *= img_height
            xyxy_list = np.round(xyxy_list).astype(np.int32)

            teacher_inputs = [
                teacher_preprocessor(img[y1:y2, x1:x2, :])
                for x1, y1, x2, y2 in xyxy_list
            ]
            if not teacher_inputs:
                # Image without boxes: nothing to embed, and it contributes no
                # rows to the ground-truth embeddings.
                continue

            teacher_inputs = tf.stack(teacher_inputs)
            teacher_embedding = teacher_model.pred(teacher_inputs, 8)
            gt_embedding.append(torch.from_numpy(teacher_embedding))
        gt_embedding = torch.concatenate(gt_embedding, dim=0)
        batch["embedding"] = gt_embedding

        # inference + loss; autograd bookkeeping is skipped during validation
        with torch.set_grad_enabled(is_train):
            preds = student_model(student_inputs)
            use_cosine = epoch < 100
            loss, loss_item = detection_loss(preds, batch, use_cosine)

        total_loss += loss_item.detach().cpu()
        sample_count += batch["image"].shape[0]

        # back propagation
        if is_train:
            scaler.scale(loss).backward()
            optimizer_step(student_model, optimizer, scaler)

    return total_loss / sample_count

In [None]:
# The root logger defaults to WARNING, which would silently drop every
# logging.info call below. basicConfig installs a handler when none exists;
# the explicit setLevel covers the case where a handler was already installed.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# First epoch index of this run; the loop stops before `epoches`.
# NOTE(review): presumably continues from the previously loaded checkpoint — confirm.
start_epoch = 135

best_loss = torch.inf
for epoch in range(start_epoch, epoches):
    train_dataset.shuffle()
    train_loss = run_one_epoch(epoch, train_loader, student_model, teacher_model, True, optimizer, scaler)
    logging.info(f"epoch: {epoch}")
    logging.info("\ttrain loss")
    logging.info(f"\t\tdet_loss: {train_loss[0]} cls_loss: {train_loss[1]} dfl_loss: {train_loss[2]} embed_loss: {train_loss[3]}")
    # Release cached memory between the training and validation passes.
    gc.collect()
    torch.cuda.empty_cache()

    valid_loss = run_one_epoch(epoch, valid_loader, student_model, teacher_model, False)
    logging.info("\tvalid loss")
    logging.info(f"\t\tdet_loss: {valid_loss[0]} cls_loss: {valid_loss[1]} dfl_loss: {valid_loss[2]} embed_loss: {valid_loss[3]}")
    gc.collect()
    torch.cuda.empty_cache()

    # NOTE(review): the checkpoint is selected on the summed *training* loss;
    # valid_loss is only logged. Confirm this is intended.
    loss = train_loss.sum().item()
    if loss < best_loss:
        torch.save(student_model.state_dict(), f"best_{epoch}.pt")
        best_loss = loss
        logging.info(f"model saved: best_{epoch}.pt")

In [13]:
# Reshuffle the validation dataset and pull a single batch from its loader for
# manual inspection in the following cells.
valid_dataset.shuffle()
valid_data = next(iter(valid_loader))

In [14]:
# Teacher embeddings for every ground-truth box of the sampled validation
# batch: one list of embeddings per image.
images = valid_data["image"]
gt_embedding = []
for i, img in enumerate(images):
    mask = valid_data["batch_idx"] == i
    teacher_inputs = []
    # The crop below indexes img[y, x, channel], so axis 0 is the height and
    # axis 1 is the width.
    img_height, img_width, _ = img.shape
    # assumes valid_data["xyxy"] holds coordinates relative to the image size
    # (they are scaled to pixels here) — TODO confirm
    xyxy_list = valid_data["xyxy"][mask]
    xyxy_list[:, [0, 2]] *= img_width
    xyxy_list[:, [1, 3]] *= img_height
    xyxy_list = np.round(xyxy_list).astype(np.int32)
    for xyxy in xyxy_list:
        x1, y1, x2, y2 = xyxy
        crop = img[y1:y2, x1:x2, :]
        crop = teacher_preprocessor(crop)
        teacher_inputs.append(crop)
    if not teacher_inputs:
        # Image without boxes: keep the per-image alignment with an empty entry.
        gt_embedding.append([])
        continue
    teacher_inputs = tf.stack(teacher_inputs)
    teacher_embedding = teacher_model.pred(teacher_inputs, 8)
    gt_embedding.append(teacher_embedding.tolist())

2025-02-11 19:35:52.927382: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907


In [40]:
# Run the student on the sampled validation batch and post-process its output.
student_inputs = common.detector_preprocessing(valid_data["image"]).to(device)
student_model.eval()
# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    pred_det, pred_embed = student_model(student_inputs)
result = student_model.postprocess(pred_det, pred_embed, (640, 640), valid_data["ori_image"])

In [24]:
def get_card_info(result):
    """Attach a card name to every embedding stored on ``result``.

    Each entry of ``result.embeds`` is converted to a plain list and looked up
    through ``db.ChromaDB().search_by_embed``; the ``"name"`` field of the
    first hit is collected. The names are written to ``result.names`` in the
    same order as the embeddings. Nothing is returned.
    """
    client = db.ChromaDB()
    result.names = [
        client.search_by_embed(vector.tolist())[0][0]["name"]
        for vector in result.embeds
    ]

In [41]:
# Resolve card names for the first post-processed result, then write it to an
# image file via the result's own save method.
get_card_info(result[0])
result[0].save("test.png")