In [1]:
# Install dependencies used by the pose-estimation cells below.
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones recorded in this cell's output — consider pinning.
!pip install ultralytics opencv-python numpy ffmpeg-python


Collecting ultralytics
  Downloading ultralytics-8.3.107-py3-none-any.whl.metadata (37 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.

In [4]:
import cv2
import numpy as np
import ffmpeg
from ultralytics import YOLO
from google.colab import files

# Define input video (must be manually uploaded to Colab first)
input_video = "all.mp4"  # Change this to your uploaded file
output_video = "action_output.mp4"

# Load YOLOv8 Pose Model
model = YOLO("yolov8n-pose.pt")

# Open the input video
cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    # Raise instead of calling exit(): inside a notebook exit() tears down the
    # kernel, whereas an exception just stops this cell with a clear traceback.
    raise RuntimeError(f"❌ ERROR: Cannot open video: {input_video}")

# Get video properties.
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would make the
# written video play at a slightly different speed than the source.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:  # also true for NaN, which some containers report
    fps = 20.0  # Fallback when the container does not expose a frame rate
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define output video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))
if not out.isOpened():
    # Without this check every out.write() would silently drop frames.
    cap.release()
    raise RuntimeError(f"❌ ERROR: Cannot open video writer for: {output_video}")

# Ground level for detecting jumps (initialized lazily from the first usable pose)
ground_level = None
jump_threshold = 20  # Pixels above ground level to detect jump
sit_threshold = 70  # Max vertical hip-to-knee gap (pixels) still treated as sitting
jump_status = "Standing"

# Rows of the keypoint array read by detect_action, in this order:
# left hip, right hip, left knee, right knee, left ankle, right ankle.
_LOWER_BODY_ROWS = [11, 12, 13, 14, 15, 16]


# Function to classify Jumping, Standing, or Sitting
def detect_action(keypoints):
    """Classify one pose as "Jumping", "Sitting" or "Standing".

    The decision uses only the vertical (y) pixel coordinates of the hips,
    knees and ankles. Smaller y means higher in the image.

    * "Jumping": the mean ankle height is more than ``jump_threshold`` pixels
      above ``ground_level``.
    * "Sitting": the mean hip and mean knee heights differ by less than
      ``sit_threshold`` pixels.
    * "Standing": everything else.

    Args:
        keypoints: array-like of per-joint ``(x, y)`` rows; rows 11..16 must
            be the hips, knees and ankles.

    Returns:
        str: the action label. If the pose is unusable (fewer than 17 rows, or
        any lower-body joint missing) the previous label is returned unchanged.

    Side effects:
        Sets the module-level ``ground_level`` from the first usable pose and
        updates the module-level ``jump_status`` on every usable pose. The
        state is shared by every person passed in.
    """
    global ground_level, jump_status

    pose = np.asarray(keypoints)

    # An empty or truncated keypoint array (e.g. a frame with no detection)
    # cannot be indexed at rows 11..16; keep the last known status instead of
    # raising IndexError.
    if pose.ndim != 2 or pose.shape[0] < 17 or pose.shape[1] < 2:
        return jump_status

    lower_body = pose[_LOWER_BODY_ROWS, :2]

    # Ultralytics reports joints it could not localise confidently as (0, 0).
    # Treating those as real coordinates would look like ankles at the top of
    # the frame (a false "Jumping") and could fix ground_level at 0.
    if np.any(np.all(lower_body == 0, axis=1)):
        return jump_status

    # Compute average vertical positions of each joint pair
    avg_hip_y = (lower_body[0, 1] + lower_body[1, 1]) / 2
    avg_knee_y = (lower_body[2, 1] + lower_body[3, 1]) / 2
    avg_ankle_y = (lower_body[4, 1] + lower_body[5, 1]) / 2

    # Initialize ground level from the first usable pose
    if ground_level is None:
        ground_level = avg_ankle_y

    # Jumping: ankles clearly above the recorded ground line
    if avg_ankle_y < ground_level - jump_threshold:
        jump_status = "Jumping"
    # Sitting: hips have dropped to roughly knee height
    elif abs(avg_hip_y - avg_knee_y) < sit_threshold:
        jump_status = "Sitting"
    else:
        jump_status = "Standing"

    return jump_status

# Process frames
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break  # End of video

    # Run YOLO Pose Estimation
    results = model(frame)

    # Draw keypoints and classify action
    for r in results:
        for kp in r.keypoints.xy:
            keypoints = kp.cpu().numpy()
            action = detect_action(keypoints)

            # Get bounding box (person detection)
            x1, y1, x2, y2 = map(int, r.boxes.xyxy[0])

            # Define color based on action
            if action == "Jumping":
                color = (0, 0, 255)  # Red
            elif action == "Sitting":
                color = (255, 0, 0)  # Blue
            else:
                color = (0, 255, 0)  # Green

            # Draw bounding box & label
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, action, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

            # Draw keypoints
            for (x, y) in keypoints:
                cv2.circle(frame, (int(x), int(y)), 3, (0, 255, 255), -1)

    out.write(frame)

# Cleanup
cap.release()
out.release()
print("✅ Processing complete. Video saved as", output_video)

# Compress video using FFmpeg
compressed_video = "final_action_output.mp4"
!ffmpeg -i action_output.mp4 -c:v libx264 -preset slow -crf 23 -pix_fmt yuv420p {compressed_video}



0: 640x384 1 person, 310.1ms
Speed: 14.5ms preprocess, 310.1ms inference, 3.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 271.8ms
Speed: 6.6ms preprocess, 271.8ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 260.4ms
Speed: 6.4ms preprocess, 260.4ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 277.1ms
Speed: 10.3ms preprocess, 277.1ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 273.7ms
Speed: 8.2ms preprocess, 273.7ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 303.9ms
Speed: 4.1ms preprocess, 303.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 294.5ms
Speed: 7.6ms preprocess, 294.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 193.9ms
Speed: 6.7ms preprocess, 193.9ms inference, 1.4ms postprocess per image 

In [6]:
# Download the compressed output video.
# Depends on state from the processing cell: `files` (imported from
# google.colab) and `compressed_video` (the ffmpeg output path), so that cell
# must have been run in this kernel session first.
files.download(compressed_video)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>