In [32]:
import boxmot
from transformers import pipeline, DetrConfig, DetrForObjectDetection, DetrImageProcessor
from torchvision.transforms import ToTensor
from PIL import Image
import cv2
import numpy as np
import torch
import os

In [33]:
model_name = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(model_name)
model = DetrForObjectDetection.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
tracker = boxmot.BoostTrack(reid_weights="mars-small128.pb", device=device, half=False)
def track_image_from_folder(folder_path, output_folder):
    for filename in os.listdir(folder_path):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(folder_path, filename)
            pil_image = Image.open(image_path).convert("RGB")
            inputs = processor(pil_image, return_tensors="pt")

            with torch.no_grad():
                outputs = model(**inputs)

            target_sizes = torch.tensor([pil_image.size[::-1]])

            outputs = processor.post_process_object_detection(
                outputs,
                target_sizes=target_sizes
            )[0]

            detections = np.concatenate([
                outputs["boxes"].cpu().numpy(),
                outputs["scores"].cpu().numpy().reshape(-1, 1),
                outputs["labels"].cpu().numpy().reshape(-1, 1)
            ], axis=1)


            cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
            frame_ndarray = np.array(pil_image)
            result = tracker.update(detections, frame_ndarray)
            tracker.plot_results(frame_ndarray, show_trajectories=False)

            for score, label, box, id in zip(outputs["scores"], outputs["labels"], outputs["boxes"], result[:, 4].tolist()):
                if score < 0.7:
                    continue
                box = [int(i) for i in box.tolist()]
                x0, y0, x1, y1 = box
                cv2.rectangle(cv_image, (x0, y0), (x1, y1), (0, 255, 0), 2)
                cv2.putText(
                    cv_image,
                    f"{model.config.id2label[label.item()]}: {score:.2f}, ID: {id}",
                    (x0, y0 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 255, 0),
                    1,
                    cv2.LINE_AA,
                )

            output_path = os.path.join(output_folder, filename)
            cv2.imwrite(output_path, cv_image)
            

self.max_obs 65


[32m2025-11-10 15:02:34.916[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m56[0m | __init__ - [1mBaseTracker initialization parameters:[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m57[0m | __init__ - [1mdet_thresh: 0.6[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m58[0m | __init__ - [1mmax_age: 60[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m59[0m | __init__ - [1mmax_obs: 50[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainTh

[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m61[0m | __init__ - [1miou_threshold: 0.3[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m62[0m | __init__ - [1mper_class: False[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m63[0m | __init__ - [1mnr_classes: 80[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m64[0m | __init__ - [1masso_func: iou[0m
[32m2025-11-10 15:02:34.917[0m | MainProcess/MainThread | [1mIN

In [41]:
track_image_from_folder("../data/pohang/", "experiments/detr/tracking/")

In [26]:
image_path = "../data/pohang/008824.png"
pil_image = Image.open(image_path).convert("RGB")
inputs = processor(pil_image, return_tensors="pt")
tracker = boxmot.BoostTrack(reid_weights="mars-small128.pb", device=device, half=False)


with torch.no_grad():
    outputs = model(**inputs)

target_sizes = torch.tensor([pil_image.size[::-1]])

outputs = processor.post_process_object_detection(
    outputs,
    target_sizes=target_sizes
)[0]

detections = np.concatenate([
    outputs["boxes"].cpu().numpy(),
    outputs["scores"].cpu().numpy().reshape(-1, 1),
    outputs["labels"].cpu().numpy().reshape(-1, 1)
], axis=1)


cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
frame_ndarray = np.array(pil_image)
result = tracker.update(detections, frame_ndarray)
active_tracks = tracker.active_tracks
tracker.plot_results(frame_ndarray, show_trajectories=False)

for score, label, box, id in zip(outputs["scores"], outputs["labels"], outputs["boxes"], result[:, 4].tolist()):
    if score < 0.7:
        continue
    box = [int(i) for i in box.tolist()]
    x0, y0, x1, y1 = box
    cv2.rectangle(cv_image, (x0, y0), (x1, y1), (0, 255, 0), 2)
    cv2.putText(
        cv_image,
        f"{model.config.id2label[label.item()]}: {score:.2f}, ID: {id}",
        (x0, y0 - 10),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        (0, 255, 0),
        1,
        cv2.LINE_AA,
    )
cv2.imwrite("tracked.png", cv_image)


[32m2025-11-10 14:28:17.598[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m56[0m | __init__ - [1mBaseTracker initialization parameters:[0m
[32m2025-11-10 14:28:17.598[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m57[0m | __init__ - [1mdet_thresh: 0.6[0m
[32m2025-11-10 14:28:17.598[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m58[0m | __init__ - [1mmax_age: 60[0m
[32m2025-11-10 14:28:17.598[0m | MainProcess/MainThread | [1mINFO    [0m | [36m/Users/krisnol/.pyenv/versions/detr/lib/python3.11/site-packages/boxmot/trackers/basetracker.py[0m:[36m59[0m | __init__ - [1mmax_obs: 50[0m
[32m2025-11-10 14:28:17.598[0m | MainProcess/MainTh

self.max_obs 65


True

In [None]:
active_tracks[0].get_state()

In [None]:
outputs["boxes"][0]

In [29]:
result

array([[9.29825196e+02, 4.09415192e+02, 1.01326477e+03, 6.28612549e+02,
        7.53000000e+02, 8.69943440e-01, 9.00000000e+00, 0.00000000e+00],
       [1.62840320e+03, 2.22542511e+02, 1.87926965e+03, 6.89017334e+02,
        7.57000000e+02, 7.07655907e-01, 9.00000000e+00, 4.00000000e+00],
       [6.05536377e+02, 3.90488251e+02, 7.49567260e+02, 6.20095093e+02,
        7.65000000e+02, 7.20430493e-01, 9.00000000e+00, 1.30000000e+01],
       [1.15418103e+03, 3.48918884e+02, 1.23517078e+03, 6.41509827e+02,
        7.69000000e+02, 6.99322820e-01, 9.00000000e+00, 1.70000000e+01],
       [7.16180969e+02, 3.70115051e+02, 8.49787048e+02, 6.28934692e+02,
        7.72000000e+02, 7.06675231e-01, 9.00000000e+00, 2.20000000e+01],
       [1.92861414e+03, 5.95285400e+02, 2.04791736e+03, 6.74803955e+02,
        7.73000000e+02, 8.69668543e-01, 9.00000000e+00, 2.40000000e+01],
       [1.26289050e+03, 3.79475525e+02, 1.36799280e+03, 6.39499695e+02,
        7.78000000e+02, 9.03512239e-01, 9.00000000e+00, 2.

In [30]:
print(f"Length of result: {len(active_tracks)}, len detections: {len(detections)}")

Length of result: 32, len detections: 36


In [44]:
import cv2
import torch
import numpy as np
from pathlib import Path
from boxmot import BoostTrack, BotSort
   
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load detector with pretrained weights and preprocessing transforms
model_name = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(model_name)
detector = DetrForObjectDetection.from_pretrained(model_name)

# Initialize tracker
# tracker = BoostTrack(reid_weights=Path('osnet_x0_25_msmt17.pt'), device=device, half=False)
tracker = BoostTrack(reid_weights=Path('osnet_x0_25_msmt17.pt'), device=device, half=False)

# Start video capture
video_path = "../data/pohang/sequence.mp4"
cap = cv2.VideoCapture(video_path)
frame_number = 0
with torch.inference_mode():
    while True:
        success, frame = cap.read()
        if not success:
            break

       
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = processor(pil_image, return_tensors="pt")

        # Run detection
        output = detector(**inputs)
        outputs = processor.post_process_object_detection(
            output,
            target_sizes=[pil_image.size[::-1]]
        )[0]
        scores = outputs['scores'].cpu().numpy()
        keep = scores >= 0.5

        # Prepare detections for tracking
        boxes = outputs['boxes'][keep].cpu().numpy()
        labels = outputs['labels'][keep].cpu().numpy()
        filtered_scores = scores[keep]
        detections = np.concatenate([boxes, filtered_scores[:, None], labels[:, None]], axis=1)

        # Update tracker and draw results
        #   INPUT:  M X (x, y, x, y, conf, cls)
        #   OUTPUT: M X (x, y, x, y, id, conf, cls, ind)
        res = tracker.update(detections, frame)
        active_tracks = tracker.active_tracks
        tracker.plot_results(frame, show_trajectories=False)
        print(f"Length of result: {len(res)}, len detections: {len(detections)}, active tracks: {len(active_tracks)}")

        # Show output
        cv2.imshow('BoXMOT + Torchvision', frame)
        cv2.imwrite(f"experiments/detr/predictions/tracking/frame_{frame_number:05d}.png", frame)
        frame_number += 1
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Clean up
cap.release()
cv2.destroyAllWindows()

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[32m2025-11-10 15:26:46.374[0m | MainProcess/MainThread | [1mINFO    [0m | [36m

self.max_obs 65
Length of result: 10, len detections: 31, active tracks: 30
Length of result: 12, len detections: 35, active tracks: 32
Length of result: 10, len detections: 32, active tracks: 28
Length of result: 6, len detections: 36, active tracks: 22
Length of result: 7, len detections: 35, active tracks: 22
Length of result: 6, len detections: 32, active tracks: 21
Length of result: 8, len detections: 34, active tracks: 23
Length of result: 9, len detections: 35, active tracks: 23
Length of result: 8, len detections: 32, active tracks: 21
Length of result: 5, len detections: 33, active tracks: 20
Length of result: 6, len detections: 38, active tracks: 21
Length of result: 8, len detections: 41, active tracks: 21
Length of result: 10, len detections: 40, active tracks: 21
Length of result: 9, len detections: 41, active tracks: 22
Length of result: 6, len detections: 37, active tracks: 19
Length of result: 6, len detections: 38, active tracks: 21
Length of result: 7, len detections:

In [None]:
from transformers import RTD