In [1]:
!unzip models.zip
!pip install ultralytics --quiet

Archive:  models.zip
replace models/detect.pt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: models/detect.pt        
  inflating: models/yolo11n-pose.pt  


In [2]:
from ultralytics import YOLO
import cv2
import torch

import numpy as np

# Load a model
pose_model = YOLO("models/yolo11n-pose.pt")
detect_model = YOLO("models/detect.pt")

In [14]:
def calculate_angle(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
  '''
  Calculate angle between 3 points a, b, c.
  Computes vectors, then calculates angle between the 2 vectors.

  Args:
    a (torch.Tensor): first point
    b (torch.Tensor): second point
    c (torch.Tensor): third point
  Returns:
    angle (float): angle in degrees
  '''
  # compute vectors
  ba = a - b
  bc = c - b

  # compute angle
  cosine_angle = torch.dot(ba, bc) / (torch.norm(ba) * torch.norm(bc))
  angle = torch.arccos(cosine_angle)

  return torch.rad2deg(angle)

# TODO: add logic to prompt person to straighten arms when going down (but not show this when going up)
# probably need to check previous frame person's position to check person is going up or down
# if going up we can suppress warning
# if going down or stay the same need to warn to straighten arms

def previously_straight_arms(left_angle, right_angle):
  '''Function to check the person has straight arms previously before doing another rep'''
  mean_angle = (left_angle + right_angle) / 2
  # either straightened or in the process of going up
  if mean_angle >= 140 or 40 <= mean_angle <= 100:
    return True
  else:
    return False

In [17]:
# Open the video file
video_path = "sample_front.mp4"
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
width  = cap.get(cv2.CAP_PROP_FRAME_WIDTH)   # float
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output = cv2.VideoWriter("output.mp4", fourcc, fps,(int(width),int(height)))

# Loop through the video frames
init = False
counted = False
prev_below = False

count = 0
frame_count = 0
while cap.isOpened():
    # Read a frame from the video
    success, frame = cap.read()
    if not success:
        break  # Exit if the video ends

    frame_count += 1
    if frame_count % 2 != 0:
        continue

    if success:
        # Run YOLO inference on the frame
        pose_results = pose_model(frame)
        pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)

        # Run detection till initialisation (assumes fixed camera)
        if init == False:
            detect_results = detect_model(frame)
            left_wrist = pose_keypoints[0,9]
            right_wrist = pose_keypoints[0,10]

            # Extract 'bar' bounding boxes
            bar_boxes = []
            for box, cls in zip(detect_results[0].boxes.xyxy, detect_results[0].boxes.cls):
                if int(cls) == 0:  # class index 0 corresponds to 'bar'
                    bar_boxes.append(box)

            # Visualize the results on the frame
            pose_annotated = pose_results[0].plot()
            detect_annotated = detect_results[0].plot()

            # Overlay annotated frames
            annotated_frame = cv2.addWeighted(pose_annotated, 0.5, detect_annotated, 0.5, 0)

            # Function to check if a point is inside a bounding box
            def is_inside_box(point, box):
                x, y = point
                x1, y1, x2, y2 = box
                return x1 <= x <= x2 and y1 <= y <= y2

            # Check if both wrists are inside any bar bounding box
            for bar_box in bar_boxes:
                if is_inside_box(left_wrist, bar_box) and is_inside_box(right_wrist, bar_box):
                    x1, y1, x2, y2 = map(int, bar_box)
                    mid_y = (y1 + y2) // 2  # Midpoint of the bounding box height
                    init = True
                    break

        else:
            cv2.line(frame, (x1, mid_y), (x2, mid_y), (0, 255, 0), 3)  # Green line
            annotated_frame = pose_results[0].plot()

            left_ear = pose_keypoints[0, 3]
            right_ear = pose_keypoints[0, 4]

            # get keypoints for shoulder, elbow, wrist
            left_shoulder = pose_keypoints[0, 5]
            left_elbow = pose_keypoints[0, 7]
            left_wrist = pose_keypoints[0, 9]

            right_shoulder = pose_keypoints[0, 6]
            right_elbow = pose_keypoints[0, 8]
            right_wrist = pose_keypoints[0, 10]

            # compute angle between arms
            left_angle = calculate_angle(left_shoulder, left_elbow, left_wrist)
            right_angle = calculate_angle(right_shoulder, right_elbow, right_wrist)

            if left_ear[1].item() < mid_y and right_ear[1].item() < mid_y and not counted:
                counted = True
                count += 1

            # Reset counted only if the head stays below mid_y for multiple frames
            elif left_ear[1].item() > mid_y and right_ear[1].item() > mid_y:
                if prev_below:  # Only reset if it was below previously
                    counted = False
                prev_below = True  # Mark that it was below mid_y
            else:
                prev_below = False  # Reset tracking if it's in an in-between state

        # inserting text on video
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(annotated_frame, f'Bar initialised: {init}', (50, 50), font, 1, (0, 255, 255), 1, cv2.LINE_4)
        cv2.putText(annotated_frame, f'Reps: {count}', (50, 90), font, 1, (0, 255, 255), 2, cv2.LINE_4)
        try:
            cv2.putText(annotated_frame, f'Left angle: {left_angle: .2f}', (50, 130), font, 1, (0, 255, 255), 2, cv2.LINE_4)
            cv2.putText(annotated_frame, f'Right angle: {right_angle: .2f}', (50, 170), font, 1, (0, 255, 255), 2, cv2.LINE_4)
        # catch case when left_angle and right_angle not yet initialized
        except NameError:
            pass


        # Display the annotated frame
        cv2.imshow("YOLO Inference", annotated_frame)
        output.write(annotated_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    else:
        # Break the loop if the end of the video is reached
        break

# Release the video capture object and close the display window
cap.release()
output.release()
cv2.destroyAllWindows()


0: 640x384 1 person, 13.7ms
Speed: 3.5ms preprocess, 13.7ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 1 person, 10.1ms
Speed: 2.2ms preprocess, 10.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.7ms
Speed: 2.7ms preprocess, 10.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 8.7ms
Speed: 2.7ms preprocess, 8.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.1ms
Speed: 2.7ms preprocess, 10.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 8.8ms
Speed: 4.2ms preprocess, 8.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.1ms
Speed: 3.0ms preprocess, 10.1ms inference, 2.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 9.2ms
Speed: 2.1ms preprocess, 9.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

  pose_keypoints = torch.tensor(pose_results[0].keypoints.xy)



0: 640x384 1 person, 10.3ms
Speed: 2.6ms preprocess, 10.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 10.4ms
Speed: 2.6ms preprocess, 10.4ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.5ms
Speed: 2.5ms preprocess, 10.5ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 bar, 9.3ms
Speed: 3.3ms preprocess, 9.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.8ms
Speed: 2.5ms preprocess, 10.8ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 11.3ms
Speed: 3.2ms preprocess, 11.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 10.3ms
Speed: 2.5ms preprocess, 10.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 bars, 1 person, 10.4ms
Speed: 3.0ms preprocess, 10.4ms inference, 1.1ms postprocess per image at shape (1, 3, 640

In [None]:
# need keypoints for: wrist, elbow, shoulder
# approach: calculate angle for left and right using the keypoints -> take average

# if average of the angle < some threshold we set, then set some boolean to false and no count or something