In [1]:
!unzip models.zip
!pip install ultralytics --quiet

Archive:  models.zip
replace models/detect.pt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: models/detect.pt        
  inflating: models/yolo11n-pose.pt  


In [2]:
from ultralytics import YOLO
import cv2
import torch

import numpy as np

# Load a model
pose_model = YOLO("models/yolo11n-pose.pt")
detect_model = YOLO("models/detect.pt")

In [3]:
def calculate_angle(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
  '''
  Calculate angle between 3 points a, b, c.
  Computes vectors, then calculates angle between the 2 vectors.

  Args:
    a (torch.Tensor): first point
    b (torch.Tensor): second point
    c (torch.Tensor): third point
  Returns:
    angle (float): angle in degrees
  '''
  # compute vectors
  ba = a - b
  bc = c - b

  # compute angle
  cosine_angle = torch.dot(ba, bc) / (torch.norm(ba) * torch.norm(bc))
  angle = torch.arccos(cosine_angle)

  return torch.rad2deg(angle)

In [4]:
# Open the video file
video_path = "sample_front.mp4"
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width  = cap.get(cv2.CAP_PROP_FRAME_WIDTH)   # float
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output = cv2.VideoWriter("output.mp4", fourcc, fps,(int(width),int(height)))

# use previous shoulder position to track going up or down
# if going down, then need to check arms are straight
# if going up no need to check
prev_shoulder_y = None
prev_state = None
max_avg_angle = None

# Loop through the video frames
init = False
counted = False
prev_below = False

count = 0
frame_count = 0
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if not success:
        break  # Exit if the video ends

    frame_count += 1
    if frame_count % 2 != 0:
        continue

    if success:
        # Run YOLO inference on the frame
        pose_results = pose_model(frame)
        pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)

        # Run detection till initialisation (assumes fixed camera)
        if init == False:
            detect_results = detect_model(frame)
            left_wrist = pose_keypoints[0,9]
            right_wrist = pose_keypoints[0,10]

            # Extract 'bar' bounding boxes
            bar_boxes = []
            for box, cls in zip(detect_results[0].boxes.xyxy, detect_results[0].boxes.cls):
                if int(cls) == 0:  # class index 0 corresponds to 'bar'
                    bar_boxes.append(box)

            # Visualize the results on the frame
            pose_annotated = pose_results[0].plot()
            detect_annotated = detect_results[0].plot()

            # Overlay annotated frames
            annotated_frame = cv2.addWeighted(pose_annotated, 0.5, detect_annotated, 0.5, 0)

            # Function to check if a point is inside a bounding box
            def is_inside_box(point, box):
                x, y = point
                x1, y1, x2, y2 = box
                return x1 <= x <= x2 and y1 <= y <= y2

            # Check if both wrists are inside any bar bounding box
            for bar_box in bar_boxes:
                if is_inside_box(left_wrist, bar_box) and is_inside_box(right_wrist, bar_box):
                    x1, y1, x2, y2 = map(int, bar_box)
                    mid_y = (y1 + y2) // 2  # Midpoint of the bounding box height
                    init = True
                    break

        else:
            cv2.line(frame, (x1, mid_y), (x2, mid_y), (0, 255, 0), 3)  # Green line
            annotated_frame = pose_results[0].plot()

            left_ear = pose_keypoints[0, 3]
            right_ear = pose_keypoints[0, 4]

            if left_ear[1].item() < mid_y and right_ear[1].item() < mid_y and not counted and max_avg_angle >= 130:
                counted = True
                count += 1
            # Reset counted only if the head stays below mid_y for multiple frames
            elif left_ear[1].item() > mid_y and right_ear[1].item() > mid_y:
                if prev_below:  # Only reset if it was below previously
                    counted = False
                prev_below = True  # Mark that it was below mid_y
            else:
                prev_below = False  # Reset tracking if it's in an in-between state

            # get keypoints for shoulder, elbow, wrist
            left_shoulder = pose_keypoints[0, 5]
            left_elbow = pose_keypoints[0, 7]
            left_wrist = pose_keypoints[0, 9]

            right_shoulder = pose_keypoints[0, 6]
            right_elbow = pose_keypoints[0, 8]
            right_wrist = pose_keypoints[0, 10]

            # compute angle between arms
            left_angle = calculate_angle(left_shoulder, left_elbow, left_wrist)
            right_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)
            avg_angle = (left_angle + right_angle) / 2

            # Calculate average shoulder position to track going up or down
            current_shoulder_y = (left_shoulder[1] + right_shoulder[1]) / 2

            if prev_shoulder_y is not None:
                if current_shoulder_y <= prev_shoulder_y:
                    # means person is going up, no need to ask person to straighten arms
                    state = 'UP'
                else:
                    # means person is going down, need to ask person to straighten arms
                    state = 'DOWN'

                    # reset the max angle when going down for the first time
                    if prev_state != state:
                      max_avg_angle = 0

                # if state down, need to ask person to straighten arms by finding min angle
                # this min angle needs to reset when the person goes up though

                # update previous state
                prev_state = state

                if max_avg_angle is None:
                  max_avg_angle = avg_angle
                else:
                  max_avg_angle = max(max_avg_angle, avg_angle)

                if state == 'DOWN' and max_avg_angle < 130:
                  cv2.putText(annotated_frame, "STRAIGHTEN YOUR ARMS!", (50, 250), font, 1, (0, 0, 255), 2, cv2.LINE_4)

                # write on the frames
                cv2.putText(annotated_frame, f'Left angle: {left_angle: .2f}', (50, 130), font, 1, (0, 255, 255), 2, cv2.LINE_4)
                cv2.putText(annotated_frame, f'Right angle: {right_angle: .2f}', (50, 170), font, 1, (0, 255, 255), 2, cv2.LINE_4)
                cv2.putText(annotated_frame, f"Max Angle: {max_avg_angle: .2f}", (50, 210), font, 1, (0, 255, 255), 2, cv2.LINE_4)

            prev_shoulder_y = current_shoulder_y

        # inserting text on video
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(annotated_frame, f'Bar initialised: {init}', (50, 50), font, 1, (0, 255, 255), 1, cv2.LINE_4)
        cv2.putText(annotated_frame, f'Reps: {count}', (50, 90), font, 1, (0, 255, 255), 2, cv2.LINE_4)

        # Display the annotated frame
        # cv2.imshow("YOLO Inference", annotated_frame)
        output.write(annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
output.release()
cv2.destroyAllWindows()


0: 640x384 1 person, 64.4ms
Speed: 4.5ms preprocess, 64.4ms inference, 305.7ms postprocess per image at shape (1, 3, 640, 384)


  pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)



0: 640x384 2 bars, 1 person, 45.8ms
Speed: 3.3ms preprocess, 45.8ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 40.2ms
Speed: 18.3ms preprocess, 40.2ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 34.6ms
Speed: 7.4ms preprocess, 34.6ms inference, 17.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 66.2ms
Speed: 8.1ms preprocess, 66.2ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 29.8ms
Speed: 2.6ms preprocess, 29.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 28.6ms
Speed: 9.7ms preprocess, 28.6ms inference, 10.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 27.2ms
Speed: 2.5ms preprocess, 27.2ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 37.2ms
Speed: 2.7ms preprocess, 37.2ms inference, 2.4ms postprocess per image at shape (1, 3, 