In [2]:
from ultralytics import YOLO
import cv2
import torch

# Load a model
pose_model = YOLO("models/yolo11n-pose.pt")
detect_model = YOLO("models/detect.pt")

In [47]:
# Open the video file
video_path = "sample_front.mp4"
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width  = cap.get(cv2.CAP_PROP_FRAME_WIDTH)   # float
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output = cv2.VideoWriter("output.mp4", fourcc, fps,(int(width),int(height)))

# Loop through the video frames
init = False
counted = False
prev_below = False
count = 0
frame_count = 0
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if not success:
        break  # Exit if the video ends

    frame_count += 1
    if frame_count % 2 != 0:
        continue

    if success:
        # Run YOLO inference on the frame
        pose_results = pose_model(frame)
        pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)

        # Run detection till initialisation (assumes fixed camera)
        if init == False:
            detect_results = detect_model(frame)
            left_wrist = pose_keypoints[0,9]
            right_wrist = pose_keypoints[0,10]

            # Extract 'bar' bounding boxes
            bar_boxes = []
            for box, cls in zip(detect_results[0].boxes.xyxy, detect_results[0].boxes.cls):
                if int(cls) == 0:  # class index 0 corresponds to 'bar'
                    bar_boxes.append(box)

            # Visualize the results on the frame
            pose_annotated = pose_results[0].plot()
            detect_annotated = detect_results[0].plot()

            # Overlay annotated frames
            annotated_frame = cv2.addWeighted(pose_annotated, 0.5, detect_annotated, 0.5, 0)

            # Function to check if a point is inside a bounding box
            def is_inside_box(point, box):
                x, y = point
                x1, y1, x2, y2 = box
                return x1 <= x <= x2 and y1 <= y <= y2

            # Check if both wrists are inside any bar bounding box
            for bar_box in bar_boxes:
                if is_inside_box(left_wrist, bar_box) and is_inside_box(right_wrist, bar_box):
                    x1, y1, x2, y2 = map(int, bar_box)
                    mid_y = (y1 + y2) // 2  # Midpoint of the bounding box height
                    init = True
                    break

        else:
            cv2.line(frame, (x1, mid_y), (x2, mid_y), (0, 255, 0), 3)  # Green line
            annotated_frame = pose_results[0].plot()

            left_ear = pose_keypoints[0, 3]
            right_ear = pose_keypoints[0, 4]
            if left_ear[1].item() < mid_y and right_ear[1].item() < mid_y and not counted:
                counted = True
                count += 1

            # Reset counted only if the head stays below mid_y for multiple frames
            elif left_ear[1].item() > mid_y and right_ear[1].item() > mid_y:
                if prev_below:  # Only reset if it was below previously
                    counted = False  
                prev_below = True  # Mark that it was below mid_y
            else:
                prev_below = False  # Reset tracking if it's in an in-between state
  
        # inserting text on video 
        font = cv2.FONT_HERSHEY_SIMPLEX 
        cv2.putText(annotated_frame, f'Bar initialised: {init}', (50, 50), font, 1, (0, 255, 255), 1, cv2.LINE_4) 
        cv2.putText(annotated_frame, f'Reps: {count}', (50, 90), font, 1, (0, 255, 255), 2, cv2.LINE_4) 


        # Display the annotated frame
        cv2.imshow("YOLO Inference", annotated_frame)
        output.write(annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
output.release()
cv2.destroyAllWindows()




0: 640x384 1 person, 156.0ms
Speed: 5.3ms preprocess, 156.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 1 person, 95.3ms
Speed: 2.4ms preprocess, 95.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)


  pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)



0: 640x384 1 person, 104.9ms
Speed: 5.3ms preprocess, 104.9ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 97.1ms
Speed: 2.4ms preprocess, 97.1ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 106.5ms
Speed: 3.1ms preprocess, 106.5ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 87.1ms
Speed: 2.6ms preprocess, 87.1ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 90.0ms
Speed: 2.3ms preprocess, 90.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 81.6ms
Speed: 1.8ms preprocess, 81.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 91.8ms
Speed: 2.6ms preprocess, 91.8ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 80.4ms
Speed: 2.6ms preprocess, 80.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

In [43]:
count

3

In [5]:
pose_results

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: ultralytics.engine.results.Keypoints object
 masks: None
 names: {0: 'person'}
 obb: None
 orig_img: array([[[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        [[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        [[229, 209, 199],
         [230, 210, 200],
         [231, 213, 205],
         ...,
         [116, 113, 114],
         [114, 111, 112],
         [114, 111, 112]],
 
        ...,
 
        [[ 12,  13,   8],
         [ 15,  16,  11],
         [ 20,  21,  16],
         ...,
         [149, 149, 149],
         [149, 149, 149],
         [149, 149, 149]],
 
        [[ 12,  13,   8],
         [ 15,  16,  11],
         [ 20,  21,

In [6]:
pose_results[0].keypoints.data

tensor([[[3.4592e+02, 4.7591e+02, 9.8805e-01],
         [3.6553e+02, 4.5869e+02, 9.8924e-01],
         [3.3415e+02, 4.5857e+02, 8.8440e-01],
         [4.0683e+02, 4.6538e+02, 9.7442e-01],
         [0.0000e+00, 0.0000e+00, 2.4847e-01],
         [4.5739e+02, 5.7038e+02, 9.9882e-01],
         [3.1205e+02, 5.6708e+02, 9.9058e-01],
         [4.9573e+02, 7.0065e+02, 9.9370e-01],
         [2.9750e+02, 6.8727e+02, 8.9018e-01],
         [5.1039e+02, 8.0501e+02, 9.8697e-01],
         [2.9840e+02, 7.9071e+02, 8.7687e-01],
         [4.2727e+02, 8.2340e+02, 9.9978e-01],
         [3.3304e+02, 8.1967e+02, 9.9932e-01],
         [4.1431e+02, 1.0005e+03, 9.9949e-01],
         [3.1224e+02, 9.9912e+02, 9.9853e-01],
         [4.0342e+02, 1.1516e+03, 9.9587e-01],
         [3.2327e+02, 1.1519e+03, 9.9239e-01]]])

In [19]:
keypoints = torch.tensor(pose_results[0].keypoints.xy)
keypoints[0]

  keypoints = torch.tensor(pose_results[0].keypoints.xy)


tensor([[ 348.8192,  477.1858],
        [ 368.5421,  460.0144],
        [ 336.8237,  459.9761],
        [ 409.4200,  467.1314],
        [   0.0000,    0.0000],
        [ 459.3442,  572.5975],
        [ 314.8810,  570.4692],
        [ 500.3615,  701.1915],
        [ 299.0328,  689.4351],
        [ 520.6007,  804.6921],
        [ 292.0893,  789.7985],
        [ 429.0633,  822.0613],
        [ 336.0569,  818.5392],
        [ 416.3361,  999.4310],
        [ 316.2916,  998.3321],
        [ 403.8281, 1147.8667],
        [ 327.6841, 1148.2466]])

In [20]:
left_shoulder = keypoints[0,5]
right_shoulder = keypoints[0,6]
left_ankle = keypoints[0,15]
right_ankle = keypoints[0,16]

In [23]:
right_ankle[1].item()

1148.24658203125