In [1]:
from ultralytics import YOLO
import cv2
import torch

# Load a model
pose_model = YOLO("models/yolo11n-pose.pt")
detect_model = YOLO("models/detect.pt")

In [2]:
# Open the video file
video_path = "sample_front.mp4"
cap = cv2.VideoCapture(video_path)

# Loop through the video frames
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()

    if success:
        # Run YOLO inference on the frame
        pose_results = pose_model(frame)
        detect_results = detect_model(frame)

        # Visualize the results on the frame
        pose_annotated = pose_results[0].plot()
        detect_annotated = detect_results[0].plot()

        # Overlay annotated frames
        annotated_frame = cv2.addWeighted(pose_annotated, 0.5, detect_annotated, 0.5, 0)

        # Display the annotated frame
        cv2.imshow("YOLO Inference", annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
cv2.destroyAllWindows()


0: 640x384 1 person, 69.8ms
Speed: 3.0ms preprocess, 69.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 44.7ms
Speed: 2.3ms preprocess, 44.7ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 41.1ms
Speed: 2.6ms preprocess, 41.1ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 39.2ms
Speed: 1.6ms preprocess, 39.2ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 43.1ms
Speed: 1.5ms preprocess, 43.1ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 37.9ms
Speed: 1.8ms preprocess, 37.9ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 42.4ms
Speed: 1.4ms preprocess, 42.4ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 40.9ms
Speed: 1.3ms preprocess, 40.9ms inference, 0.6ms postprocess per image at shape (1, 3, 64

In [5]:
pose_results

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: ultralytics.engine.results.Keypoints object
 masks: None
 names: {0: 'person'}
 obb: None
 orig_img: array([[[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        [[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        [[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        ...,
 
        [[ 12,  13,   8],
         [ 15,  16,  11],
         [ 20,  21,  16],
         ...,
         [149, 149, 149],
         [149, 149, 149],
         [149, 149, 149]],
 
        [[ 12,  13,   8],
         [ 15,  16,  11],
         [ 20,  21,

In [6]:
pose_results[0].keypoints.data

tensor([[[3.4592e+02, 4.7591e+02, 9.8805e-01],
         [3.6553e+02, 4.5869e+02, 9.8924e-01],
         [3.3415e+02, 4.5857e+02, 8.8440e-01],
         [4.0683e+02, 4.6538e+02, 9.7442e-01],
         [0.0000e+00, 0.0000e+00, 2.4847e-01],
         [4.5739e+02, 5.7038e+02, 9.9882e-01],
         [3.1205e+02, 5.6708e+02, 9.9058e-01],
         [4.9573e+02, 7.0065e+02, 9.9370e-01],
         [2.9750e+02, 6.8727e+02, 8.9018e-01],
         [5.1039e+02, 8.0501e+02, 9.8697e-01],
         [2.9840e+02, 7.9071e+02, 8.7687e-01],
         [4.2727e+02, 8.2340e+02, 9.9978e-01],
         [3.3304e+02, 8.1967e+02, 9.9932e-01],
         [4.1431e+02, 1.0005e+03, 9.9949e-01],
         [3.1224e+02, 9.9912e+02, 9.9853e-01],
         [4.0342e+02, 1.1516e+03, 9.9587e-01],
         [3.2327e+02, 1.1519e+03, 9.9239e-01]]])

In [7]:
keypoints = torch.tensor(pose_results[0].keypoints.xy)
keypoints[0]

  keypoints = torch.tensor(pose_results[0].keypoints.xy)


tensor([[ 345.9173,  475.9119],
        [ 365.5261,  458.6903],
        [ 334.1494,  458.5687],
        [ 406.8260,  465.3811],
        [   0.0000,    0.0000],
        [ 457.3876,  570.3845],
        [ 312.0522,  567.0842],
        [ 495.7341,  700.6537],
        [ 297.4963,  687.2670],
        [ 510.3861,  805.0095],
        [ 298.4033,  790.7139],
        [ 427.2724,  823.3993],
        [ 333.0393,  819.6724],
        [ 414.3120, 1000.4695],
        [ 312.2359,  999.1199],
        [ 403.4224, 1151.5669],
        [ 323.2685, 1151.9468]])

In [8]:
left_shoulder = keypoints[0,5]
right_shoulder = keypoints[0,6]
left_ankle = keypoints[0,15]
right_ankle = keypoints[0,16]

In [9]:
right_ankle

tensor([ 323.2685, 1151.9468])