In [1]:
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
from ultralytics import YOLO

In [3]:
# Input/output locations, relative to the notebook's working directory.
# Model file handed to YOLO(); the load log reports it as a TensorRT engine.
ENGINE_PATH = "../data/models/yolo11m-pose.engine"
# Source video that is read frame by frame and tracked.
VIDEO_INPUT = "../data/3196221-uhd_3840_2160_25fps.mp4"
# Destination of the annotated video (skeleton overlay + track IDs).
VIDEO_OUTPUT = "../data/yolo11m_pose_tracked.mp4"
# Destination of the per-detection keypoint table.
CSV_OUTPUT = "../data/yolo11m_pose_tracked.csv"

In [4]:
# Skeleton edges as (start_index, end_index) pairs into the per-person keypoint
# array. Each chain below lists keypoint indices along one limb; consecutive
# indices in a chain are joined by one edge.
# NOTE(review): indices presumably follow the COCO 17-keypoint layout
# (5/6 shoulders, 11/12 hips, ...) - confirm against the model's kpt_shape.
POSE_CONNECTIONS = [
    (start, end)
    for chain in (
        (5, 7, 9),       # left arm
        (6, 8, 10),      # right arm
        (5, 6),          # shoulders
        (5, 11),         # torso, left side
        (6, 12),         # torso, right side
        (11, 13, 15),    # left leg
        (12, 14, 16),    # right leg
    )
    for start, end in zip(chain, chain[1:])
]

In [8]:
def draw_pose(frame, keypoints, track_id=None, conf_threshold=0.4):
    """Draw one person's pose (joints, skeleton, optional track label) onto ``frame``.

    The frame is modified in place; nothing is returned.

    Args:
        frame: Image passed straight to the OpenCV drawing calls.
        keypoints: Sequence of ``(x, y, confidence)`` rows, indexable by the
            keypoint indices used in ``POSE_CONNECTIONS``.
        track_id: Optional tracker ID. When given, ``"ID <track_id>"`` is drawn
            just above the first keypoint that passes the confidence threshold.
        conf_threshold: A keypoint is drawn only if its confidence is strictly
            greater than this value. Defaults to 0.4.
    """
    # Integer pixel position for every confident keypoint, None otherwise.
    # Coordinates of rejected keypoints are never converted or used.
    points = [
        (int(x), int(y)) if conf > conf_threshold else None
        for x, y, conf in keypoints
    ]

    # draw keypoints
    for point in points:
        if point is not None:
            cv2.circle(frame, point, 3, (0, 255, 0), -1)

    # draw skeleton: an edge is drawn only when both of its endpoints are confident
    for i, j in POSE_CONNECTIONS:
        if points[i] is not None and points[j] is not None:
            cv2.line(frame, points[i], points[j], (255, 0, 0), 2)

    # draw track id
    if track_id is not None:
        # Anchor the label on a keypoint that was actually drawn. Keypoint 0 may
        # be below the threshold, in which case its coordinates are unreliable.
        anchor = next((point for point in points if point is not None), None)
        if anchor is not None:
            cv2.putText(
                frame,
                f"ID {track_id}",
                (anchor[0], anchor[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 0, 255),
                2
            )

In [5]:
# Build the pose model from the engine file at ENGINE_PATH; this `model` object
# is the one used by every tracking call later in the notebook.
model = YOLO(ENGINE_PATH)

In [6]:
# Streaming tracking call over the whole input video using the ByteTrack config.
# NOTE(review): with stream=True this returns a generator (see its repr when
# `results` is displayed); it is not iterated in the visible cells, and
# `results` is reassigned by the per-frame tracking loop later on. This looks
# like a leftover exploratory cell - confirm whether it is still needed.
# NOTE(review): show=True presumably opens a preview window once iterated, which
# would not work on a headless machine - confirm.
results = model.track(VIDEO_INPUT, stream=True, show=True, tracker="bytetrack.yaml")

Loading ../data/models/yolo11m-pose.engine for TensorRT inference...


In [7]:
# Display what the streaming track call returned (a generator object, per the
# output of this cell); no frames are processed by merely displaying it.
results

<generator object BasePredictor.stream_inference at 0x000001FE8E244EE0>

In [10]:
# Track poses frame by frame, draw the overlay, and export video + CSV.
#
# Each frame is read with OpenCV, passed to model.track(), annotated in place by
# draw_pose(), and written to VIDEO_OUTPUT. One record per detected person per
# frame is collected and saved to CSV_OUTPUT at the end.

cap = cv2.VideoCapture(VIDEO_INPUT)
if not cap.isOpened():
    # Fail loudly instead of silently producing an empty video and CSV.
    raise IOError(f"Could not open input video: {VIDEO_INPUT}")

fps = cap.get(cv2.CAP_PROP_FPS)
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(VIDEO_OUTPUT, fourcc, fps, (w, h))

records = []
frame_idx = 0  # zero-based index of the frame currently being processed

try:
    if not writer.isOpened():
        raise IOError(f"Could not open output video for writing: {VIDEO_OUTPUT}")

    # Some containers report no frame count; tqdm then runs without a total.
    progress_total = frame_count if frame_count > 0 else None

    with tqdm(total=progress_total, desc="Tracking") as pbar:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # persist=True is required when feeding single frames so that track
            # IDs carry over from one call to the next.
            results = model.track(
                frame,
                persist=True,
                conf=0.4,
                iou=0.5,
                tracker="bytetrack.yaml",
                verbose=False,
                show=False
            )

            keypoints = results[0].keypoints if results else None

            # NOTE(review): `conf` can presumably be None when the result has no
            # per-keypoint confidence channel (e.g. no detections on some
            # library versions) - guarded so such frames are skipped, not fatal.
            if keypoints is not None and keypoints.conf is not None:
                kpts = keypoints.xy.cpu().numpy()
                confs = keypoints.conf.cpu().numpy()

                boxes = results[0].boxes
                # No IDs are available when the tracker has not assigned any;
                # keep one placeholder per person so zip() stays aligned.
                track_ids = (
                    boxes.id.cpu().numpy().astype(int).tolist()
                    if boxes.id is not None else [None] * len(kpts)
                )

                for kp, kp_conf, tid in zip(kpts, confs, track_ids):
                    # Rows of (x, y, confidence), one per keypoint.
                    keypoints_full = np.hstack([kp, kp_conf[:, None]])
                    draw_pose(frame, keypoints_full, tid)

                    records.append({
                        "track_id": tid,
                        "keypoints": keypoints_full.tolist(),
                        # Appended after the existing columns so each row can
                        # be mapped back to its source frame.
                        "frame": frame_idx
                    })

            # Every frame is written, with or without detections, so the output
            # video keeps the input's length.
            writer.write(frame)
            frame_idx += 1
            pbar.update(1)
finally:
    # Release the capture and writer even if tracking fails midway; the writer
    # must be released for the output file to be finalized.
    cap.release()
    writer.release()

pd.DataFrame(records).to_csv(CSV_OUTPUT, index=False)

print(f"Saved video to: {VIDEO_OUTPUT}")
print(f"Saved CSV to: {CSV_OUTPUT}")

Tracking: 100%|██████████| 385/385 [00:36<00:00, 10.50it/s]


Saved video to: ../data/yolo11m_pose_tracked.mp4
Saved CSV to: ../data/yolo11m_pose_tracked.csv
