In [1]:
import boxmot
from transformers import pipeline, DetrConfig, DetrForObjectDetection, DetrImageProcessor
from torchvision.transforms import ToTensor
from PIL import Image
import cv2
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Compute device handed to the tracker below. Note that the DETR model itself
# is not moved here; it stays wherever `from_pretrained` places it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# DETR detector and its matching pre/post-processor, both loaded from the
# same Hugging Face checkpoint.
model_name = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(model_name)
model = DetrForObjectDetection.from_pretrained(model_name)

# Multi-object tracker; keeps state across calls to `tracker.update`.
tracker = boxmot.BoostTrack(
    reid_weights="mars-small128.pb",
    device=device,
    half=False,
)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


self.max_obs 65


[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m56[0m | __init__ - [1mBaseTracker initialization parameters:[0m
[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m57[0m | __init__ - [1mdet_thresh: 0.6[0m
[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m58[0m | __init__ - [1mmax_age: 60[0m
[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m59[0m | __init__ - [1mmax_obs: 50[0m
[32m2025-11-10 12:16:23.467[0m | MainProcess/MainTh

[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m64[0m | __init__ - [1masso_func: iou[0m
[32m2025-11-10 12:16:23.467[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m65[0m | __init__ - [1mis_obb: False[0m
[32m2025-11-10 12:16:23.468[0m | MainProcess/MainThread | [32m[1mSUCCESS [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/boosttrack/boosttrack.py[0m:[36m253[0m | __init__ - [32m[1mInitialized BoostTrack[0m


In [None]:
def _track_ids_by_detection(tracks):
    """Map detection row index -> track ID for one tracker output array.

    Each tracker row is read as (x1, y1, x2, y2, id, conf, cls, det_ind):
    column 4 is the track ID and column 7 the index of the detection row the
    track was matched to. An empty tracker result yields an empty mapping.
    """
    tracks = np.asarray(tracks)
    if tracks.size == 0:
        return {}
    tracks = tracks.reshape(-1, tracks.shape[-1])
    return {int(row[7]): int(row[4]) for row in tracks}


def track_image_from_folder(folder_path, output_folder, score_threshold=0.7):
    """Detect and track objects in every image of a folder, saving annotated copies.

    Images are processed in sorted filename order so the tracker sees the
    frames as a consistent sequence. For each frame the DETR `model` produces
    detections, the global `tracker` is updated with them, and every detection
    scoring at least `score_threshold` is drawn with its class label, score
    and track ID ("n/a" when the tracker reported no track for it).

    Args:
        folder_path: Directory containing the input frames (.png/.jpg/.jpeg,
            matched case-insensitively).
        output_folder: Directory for the annotated frames; created if missing.
            Each output keeps the input's filename.
        score_threshold: Minimum detection score for a box to be drawn.

    Raises:
        OSError: If an annotated frame cannot be written.
    """
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; tracking needs a stable
    # frame sequence.
    frame_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    for filename in frame_names:
        image_path = os.path.join(folder_path, filename)
        # convert() returns an independent image, so the file handle can be
        # released right away.
        with Image.open(image_path) as raw_image:
            pil_image = raw_image.convert("RGB")

        # Keep the inputs on whatever device the model lives on.
        inputs = processor(pil_image, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model(**inputs)

        # PIL reports (width, height); post-processing expects (height, width).
        target_sizes = torch.tensor([pil_image.size[::-1]])

        detected = processor.post_process_object_detection(
            outputs,
            target_sizes=target_sizes
        )[0]

        boxes = detected["boxes"].cpu().numpy()
        scores = detected["scores"].cpu().numpy()
        labels = detected["labels"].cpu().numpy()

        # One row per detection: x0, y0, x1, y1, score, class.
        detections = np.concatenate([
            boxes,
            scores.reshape(-1, 1),
            labels.reshape(-1, 1)
        ], axis=1)

        # NOTE(review): the tracker is fed the BGR frame on the assumption that
        # boxmot follows OpenCV's channel order -- confirm against boxmot docs.
        cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        tracks = tracker.update(detections, cv_image)
        track_ids = _track_ids_by_detection(tracks)

        # Draw on a copy so the frame handed to the tracker is never modified.
        annotated = cv_image.copy()
        for det_index, (box, score, label) in enumerate(zip(boxes, scores, labels)):
            if score < score_threshold:
                continue
            x0, y0, x1, y1 = (int(v) for v in box)
            track_id = track_ids.get(det_index, "n/a")
            cv2.rectangle(annotated, (x0, y0), (x1, y1), (0, 255, 0), 2)
            cv2.putText(
                annotated,
                f"{model.config.id2label[int(label)]}: {score:.2f}, ID: {track_id}",
                (x0, y0 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1,
                cv2.LINE_AA,
            )

        output_path = os.path.join(output_folder, filename)
        # cv2.imwrite signals failure through its return value only.
        if not cv2.imwrite(output_path, annotated):
            raise OSError(f"Failed to write annotated frame to {output_path}")

In [13]:
# Run detection + tracking over the frames in ../data/pohang/ and write the
# annotated images to experiments/detr/tracking/ (paths relative to the
# notebook's working directory).
track_image_from_folder("../data/pohang/", "experiments/detr/tracking/")

[[7.47834595e+02 5.76333374e+02 8.18141235e+02 6.21803101e+02
  7.62000000e+02 9.10518050e-01 9.00000000e+00 1.40000000e+01]
 [1.14314978e+03 3.86915924e+02 1.23458801e+03 6.35661316e+02
  7.64000000e+02 9.35649753e-01 9.00000000e+00 1.60000000e+01]
 [9.61434876e+02 3.41630951e+02 1.07894116e+03 6.29571777e+02
  7.65000000e+02 8.82851422e-01 9.00000000e+00 1.70000000e+01]
 [1.41797827e+03 2.87304260e+02 1.52614648e+03 6.56010864e+02
  7.74000000e+02 8.56563985e-01 9.00000000e+00 3.00000000e+01]
 [1.28124341e+03 4.60541809e+02 1.35968286e+03 6.39151306e+02
  7.75000000e+02 7.27814853e-01 9.00000000e+00 3.10000000e+01]]
[[8.13939164e+02 5.69750438e+02 8.90250678e+02 6.23897142e+02
  7.62000000e+02 7.61576355e-01 9.00000000e+00 2.70000000e+01]
 [1.20734532e+03 3.82799103e+02 1.30238135e+03 6.37724900e+02
  7.64000000e+02 9.29907978e-01 9.00000000e+00 1.60000000e+01]
 [1.02880454e+03 3.42532288e+02 1.15656871e+03 6.35955719e+02
  7.65000000e+02 6.18141711e-01 9.00000000e+00 2.60000000e+01]

KeyboardInterrupt: 