In [None]:
!pip install tensorflow opencv-python ultralytics mediapipe==0.10.5

In [None]:

from google.colab import drive
# Mount Google Drive at /content/drive; the video, ground-truth and output paths
# used by the later cells all live under /content/drive/MyDrive/.
drive.mount('/content/drive')


# MediaPipe

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def plot_3d_landmarks(landmarks, connections=None):
    """Scatter-plot pose landmarks in 3D and draw the skeleton between them.

    Args:
        landmarks: Sequence of objects exposing ``x``, ``y`` and ``z`` attributes,
            indexable by landmark index.
        connections: Optional iterable of ``(start, end)`` landmark-index pairs.
            When omitted, ``mp_pose.POSE_CONNECTIONS`` is looked up at call time,
            so the cell that defines ``mp_pose`` must have been run first.
    """
    if connections is None:
        connections = mp_pose.POSE_CONNECTIONS

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x_vals = [l.x for l in landmarks]
    y_vals = [l.y for l in landmarks]
    z_vals = [l.z for l in landmarks]
    ax.scatter(x_vals, y_vals, z_vals, c='r')

    def _as_index(endpoint):
        # Connections may hold plain ints or enum members depending on the
        # MediaPipe release; accept both instead of assuming `.value` exists.
        return int(getattr(endpoint, "value", endpoint))

    for start, end in connections:
        start_idx = _as_index(start)
        end_idx = _as_index(end)
        ax.plot([x_vals[start_idx], x_vals[end_idx]],
                [y_vals[start_idx], y_vals[end_idx]],
                [z_vals[start_idx], z_vals[end_idx]], c='b')

    ax.view_init(elev=10, azim=10)  # Customize angle
    plt.show()


In [None]:
import cv2
import mediapipe as mp

# MediaPipe handles: pose solution module, drawing helpers and a Pose estimator.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
pose = mp_pose.Pose()

# Input video and an output writer that mirrors its frame rate and frame size.
cap = cv2.VideoCapture("/content/drive/MyDrive/smai_project/data_set/1.mp4")
out = cv2.VideoWriter("/content/drive/MyDrive/smai_project/output/1_mp.mp4",
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      cap.get(cv2.CAP_PROP_FPS),
                      (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                       int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV decodes frames as BGR; the pose estimator is fed RGB.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = pose.process(rgb)

        # Overlay the detected skeleton in place; frames without a pose pass through.
        if res.pose_landmarks:
            mp_drawing.draw_landmarks(frame, res.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        out.write(frame)
finally:
    # Always release, even on an error or interrupt, so the output file is
    # finalised and the capture/estimator resources are freed.
    cap.release()
    out.release()
    pose.close()


# MoveNet

In [None]:
import cv2
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# MoveNet MultiPose (Lightning) fetched from TensorFlow Hub.
model = hub.load("https://tfhub.dev/google/movenet/multipose/lightning/1")

# Source video plus a writer that copies its frame rate and frame size.
cap = cv2.VideoCapture("/content/drive/MyDrive/smai_project/data_set/1.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
width, height = (
    int(cap.get(prop))
    for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
)
out = cv2.VideoWriter(
    "/content/drive/MyDrive/smai_project/output/1_mn.mp4",
    cv2.VideoWriter_fourcc('m', 'p', '4', 'v'),
    fps,
    (width, height),
)

# Edges of the 17-keypoint skeleton as (keypoint index, keypoint index) pairs.
SKELETON = [
    (0, 1), (0, 2), (1, 3), (2, 4),        # head
    (0, 5), (0, 6),                        # nose to shoulders
    (5, 7), (7, 9), (6, 8), (8, 10),       # arms
    (5, 6), (5, 11), (6, 12), (11, 12),    # torso
    (11, 13), (13, 15), (12, 14), (14, 16) # legs
]

def draw_keypoints_and_skeleton(frame, keypoints, conf_thresh=0.3):
    """
    Overlay keypoints (green dots) and skeleton edges (yellow lines) on `frame` in place.

    keypoints: iterable of per-person arrays of shape (17, 3) holding
               (x, y, score) rows, with x and y normalized to [0, 1].
    conf_thresh: a keypoint is drawn only when its score exceeds this value;
                 an edge is drawn only when both of its endpoints do.
    """
    frame_h, frame_w, _ = frame.shape

    def to_pixel(norm_x, norm_y):
        # Scale normalized coordinates to integer pixel positions.
        return (int(norm_x * frame_w), int(norm_y * frame_h))

    for person in keypoints:
        # Joints
        for norm_x, norm_y, score in person:
            if score > conf_thresh:
                cv2.circle(frame, to_pixel(norm_x, norm_y), 4, (0, 255, 0), -1)

        # Limbs
        for start, end in SKELETON:
            sx, sy, s_score = person[start]
            ex, ey, e_score = person[end]
            if not (s_score > conf_thresh and e_score > conf_thresh):
                continue
            cv2.line(frame, to_pixel(sx, sy), to_pixel(ex, ey), (0, 255, 255), 2)

frame_idx = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Only run pose estimation & drawing every 5th frame
        if frame_idx % 5 == 0:
            # MoveNet expects RGB input, while OpenCV decodes frames as BGR.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = tf.image.resize_with_pad(tf.expand_dims(rgb, 0), 256, 256)
            inp = tf.cast(img, tf.int32)

            # Inference
            outputs = model.signatures['serving_default'](inp)
            raw = outputs['output_0'].numpy()[0]  # shape: (6, 56) for up to 6 persons

            # resize_with_pad letterboxes the frame into a 256x256 square, and the
            # model's coordinates are normalized to that padded square. Work out
            # where the real image sits inside it so the padding can be undone.
            frame_h, frame_w = frame.shape[:2]
            scale = 256.0 / max(frame_h, frame_w)
            content_h = frame_h * scale
            content_w = frame_w * scale
            pad_y = (256.0 - content_h) / 2.0
            pad_x = (256.0 - content_w) / 2.0

            # Parse the detections into a list of (17,3) arrays
            persons = []
            for det in raw:
                kpts = det[:51].reshape(17, 3)
                # MoveNet returns (y, x, score); convert to frame-normalized (x, y, score).
                x_norm = (kpts[:, 1] * 256.0 - pad_x) / content_w
                y_norm = (kpts[:, 0] * 256.0 - pad_y) / content_h
                pts = np.stack([x_norm, y_norm, kpts[:, 2]], axis=-1)
                persons.append(pts)

            # Draw on the frame
            draw_keypoints_and_skeleton(frame, persons, conf_thresh=0.3)

        # Write the (possibly annotated) frame
        out.write(frame)
        frame_idx += 1
finally:
    # Release even on error/interrupt so the output video is finalised.
    cap.release()
    out.release()

# YOLO POSE

In [None]:
import cv2
import numpy as np
from ultralytics import YOLO

# Paths
INPUT_VIDEO  = "/content/drive/MyDrive/smai_project/data_set/1.mp4"
OUTPUT_VIDEO = "/content/drive/MyDrive/smai_project/output/1_yolo_pose.mp4"
YOLO_MODEL   = "yolov8n-pose.pt"  # or your custom .pt file

# COCO-style skeleton connections as (keypoint index, keypoint index) pairs.
# Includes the shoulder-hip edges (5, 11) and (6, 12) so the torso is closed,
# matching the skeleton used in the MoveNet cell.
SKELETON = [
    (0, 1), (0, 2), (1, 3), (2, 4),
    (0, 5), (0, 6), (5, 7), (7, 9),
    (6, 8), (8, 10), (5, 6), (5, 11),
    (6, 12), (11, 12),
    (11, 13), (13, 15), (12, 14), (14, 16)
]

# Initialize model
model = YOLO(YOLO_MODEL)

# Open input video and read the properties the writer must mirror
cap    = cv2.VideoCapture(INPUT_VIDEO)
fps    = cap.get(cv2.CAP_PROP_FPS)
width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Prepare output writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out    = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))

frame_idx = 0

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Pose estimation runs on every 5th frame; other frames are copied as-is.
        if frame_idx % 5 == 0:
            # verbose=False keeps per-frame log lines out of the notebook output.
            results = model(frame, verbose=False)[0]

            if results.keypoints is not None and results.keypoints.data.numel() > 0:
                # Raw keypoints as (n_people, 17, 3) rows of (x, y, conf) in pixels.
                kp_tensor = results.keypoints.data.cpu()

                # Defensive: promote a lone (17, 3) result to (1, 17, 3).
                if kp_tensor.ndim == 2 and kp_tensor.shape[1] == 3:
                    kp_tensor = kp_tensor.unsqueeze(0)

                kpts = kp_tensor.numpy()

                for person in kpts:
                    # Joints above the confidence threshold
                    for x, y, conf in person:
                        if conf > 0.3:
                            cv2.circle(frame, (int(x), int(y)), 4, (0, 255, 0), -1)

                    # Limbs whose two endpoints are both confident
                    for i, j in SKELETON:
                        if i < person.shape[0] and j < person.shape[0]:
                            if person[i, 2] > 0.3 and person[j, 2] > 0.3:
                                pt1 = (int(person[i, 0]), int(person[i, 1]))
                                pt2 = (int(person[j, 0]), int(person[j, 1]))
                                cv2.line(frame, pt1, pt2, (0, 255, 255), 2)

        # Single write path: every frame, annotated or not, is written exactly once.
        out.write(frame)
        frame_idx += 1
finally:
    # Clean up even on error/interrupt so the output video is finalised.
    cap.release()
    out.release()

print(f"Output saved to {OUTPUT_VIDEO}")

# Comparing pose estimation models

In [None]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from ultralytics import YOLO
import matplotlib.pyplot as plt
from scipy.spatial import procrustes # Import procrustes for PA

# Process every Nth frame for speed (1 = every frame)
SAMPLE_EVERY = 1
CONF_THRESH = 0.3 # General confidence threshold, used explicitly in some areas.

# Paths (edit these to your files)
VIDEO_PATH    = "/content/drive/MyDrive/smai_project/data_set/1.mp4"
GROUND_TRUTH  = "/content/drive/MyDrive/smai_project/data_set/1.npz"  # your uploaded .npz
YOLO_MODEL    = "yolov8n-pose.pt"

# COCO-style skeleton as (keypoint index, keypoint index) pairs. Kept for
# visualization; includes the shoulder-hip edges (5, 11) and (6, 12) so it
# agrees with the skeleton used in the MoveNet cell.
SKELETON = [
    (0, 1), (0, 2), (1, 3), (2, 4),
    (0, 5), (0, 6), (5, 7), (7, 9),
    (6, 8), (8, 10), (5, 6), (5, 11),
    (6, 12), (11, 12),
    (11, 13), (13, 15), (12, 14), (14, 16)
]

# MediaPipe Pose landmark indices (33 landmarks), annotated with whether each
# landmark has a direct COCO-17 counterpart.
# NOTE(review): despite its name, this list is just the identity sequence 0..32;
# it performs no index translation by itself. The actual MediaPipe -> COCO index
# translation is held in `mp_to_common_indices`, defined right after this list.
# MediaPipe indices: https://developers.google.com/mediapipe/solutions/vision/pose_landmarker
# COCO indices: http://cocodataset.org/#keypoints-2017
MEDIAPIPE_TO_COCO = [
    0,  # Nose
    1,  # Left eye (inner) - no direct COCO equivalent
    2,  # Left eye - COCO left eye
    3,  # Left eye (outer) - no direct COCO equivalent
    4,  # Right eye (inner) - no direct COCO equivalent
    5,  # Right eye - COCO right eye
    6,  # Right eye (outer) - no direct COCO equivalent
    7,  # Left ear - COCO left ear
    8,  # Right ear - COCO right ear
    9,  # Mouth (left) - no direct COCO equivalent
    10, # Mouth (right) - no direct COCO equivalent
    11, # Left shoulder - COCO left shoulder
    12, # Right shoulder - COCO right shoulder
    13, # Left elbow - COCO left elbow
    14, # Right elbow - COCO right elbow
    15, # Left wrist - COCO left wrist
    16, # Right wrist - COCO right wrist
    17, # Left pinky - no direct COCO equivalent
    18, # Right pinky - no direct COCO equivalent
    19, # Left index - no direct COCO equivalent
    20, # Right index - no direct COCO equivalent
    21, # Left thumb - no direct COCO equivalent
    22, # Right thumb - no direct COCO equivalent
    23, # Left hip - COCO left hip
    24, # Right hip - COCO right hip
    25, # Left knee - COCO left knee
    26, # Right knee - COCO right knee
    27, # Left ankle - COCO left ankle
    28, # Right ankle - COCO right ankle
    29, # Left heel - no direct COCO equivalent
    30, # Right heel - no direct COCO equivalent
    31, # Left foot index - no direct COCO equivalent
    32  # Right foot index - no direct COCO equivalent
]

# Names of the 17 COCO keypoints; a name's list position is its common index (0-16).
coco_keypoint_names = [
    "nose",
    "left_eye", "right_eye",
    "left_ear", "right_ear",
    "left_shoulder", "right_shoulder",
    "left_elbow", "right_elbow",
    "left_wrist", "right_wrist",
    "left_hip", "right_hip",
    "left_knee", "right_knee",
    "left_ankle", "right_ankle",
]

# mp_to_common_indices[i] is the common (COCO) index of MediaPipe landmark i,
# or -1 when that landmark has no counterpart in the common set.
mp_to_common_indices = [
    # MediaPipe 0-10: nose, eyes (inner / centre / outer), ears, mouth corners
    0, -1, 1, -1, -1, 2, -1, 3, 4, -1, -1,
    # MediaPipe 11-16: shoulders, elbows, wrists
    5, 6, 7, 8, 9, 10,
    # MediaPipe 17-22: pinky, index and thumb landmarks (not in the common set)
    -1, -1, -1, -1, -1, -1,
    # MediaPipe 23-28: hips, knees, ankles
    11, 12, 13, 14, 15, 16,
    # MediaPipe 29-32: heels and foot indices (not in the common set)
    -1, -1, -1, -1,
]

# MoveNet and YOLO use the COCO 17 keypoints directly.
# YOLO keypoint order: https://docs.ultralytics.com/tasks/pose/
# MoveNet keypoint order: https://www.tensorflow.org/hub/tutorials/movenet
# They match the COCO order.

def extract_mediapipe(video_path):
    """
    Extracts pose keypoints from a video using MediaPipe Pose.

    Every SAMPLE_EVERY-th frame is processed. Frames in which no pose is found
    contribute a (33, 3) block of NaNs so that row indices stay aligned with
    the sampled frames.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Array of shape (num_sampled_frames, 33, 3) holding (x, y, visibility)
        with x and y as reported by MediaPipe (normalized image coordinates),
        or an empty array when the video cannot be opened or yields no frames.
    """
    # Open the video before creating the estimator so a bad path cannot leave
    # an unclosed Pose object behind.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        cap.release()
        return np.array([])

    # static_image_mode=False: treat the input as a video stream.
    pose = mp.solutions.pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

    frame_idx = 0
    keypoints_list = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % SAMPLE_EVERY == 0:
                # OpenCV decodes frames as BGR; MediaPipe is fed RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(rgb_frame)

                if results.pose_landmarks:
                    pts = [(lm.x, lm.y, lm.visibility) for lm in results.pose_landmarks.landmark]
                    keypoints_list.append(np.array(pts))
                else:
                    # No pose detected: keep the frame slot, filled with NaNs.
                    keypoints_list.append(np.full((33, 3), np.nan))
            frame_idx += 1
    finally:
        # Release the capture and MediaPipe resources even if processing fails.
        cap.release()
        pose.close()

    if not keypoints_list:
        return np.array([])
    return np.stack(keypoints_list)


def _movenet_yx_to_frame_xy(kpts_yx, frame_height, frame_width, input_size):
    """Convert MoveNet (y, x, score) rows from padded-square to frame-normalized (x, y, score).

    MoveNet's coordinates are normalized to the letterboxed ``input_size`` square
    produced by ``tf.image.resize_with_pad``. This removes the padding offset and
    rescales by the size of the real image content inside that square.
    """
    scale = input_size / max(frame_height, frame_width)
    content_h = frame_height * scale
    content_w = frame_width * scale
    pad_y = (input_size - content_h) / 2.0
    pad_x = (input_size - content_w) / 2.0
    x_norm = (kpts_yx[:, 1] * input_size - pad_x) / content_w
    y_norm = (kpts_yx[:, 0] * input_size - pad_y) / content_h
    return np.stack([x_norm, y_norm, kpts_yx[:, 2]], axis=-1)


def extract_movenet(video_path):
    """
    Extracts pose keypoints from a video using MoveNet SinglePose Lightning.

    Every SAMPLE_EVERY-th frame is processed. A frame whose inference fails
    contributes a (17, 3) block of NaNs so row indices stay aligned with the
    sampled frames.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Array of shape (num_sampled_frames, 17, 3) holding (x, y, score), with
        x and y normalized to the ORIGINAL frame (0.0 to 1.0 inside the image),
        or an empty array when the model or video cannot be loaded.
    """
    try:
        # Load the MoveNet model from TensorFlow Hub
        model = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")
        input_signature = model.signatures['serving_default']
        input_size = 192 # Lightning model input size
    except Exception as e:
        print(f"Error loading MoveNet model: {e}")
        return np.array([])

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        cap.release()
        return np.array([])

    frame_idx, keypoints_list = 0, []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % SAMPLE_EVERY == 0:
                # OpenCV decodes frames as BGR; the model is fed RGB.
                rgb_cv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img_tensor_uint8 = tf.convert_to_tensor(rgb_cv_frame, dtype=tf.uint8)

                # Letterbox to the square model input, keeping the aspect ratio.
                input_image_resized = tf.image.resize_with_pad(img_tensor_uint8, input_size, input_size)

                # The model takes an int32 batch of shape (1, H, W, 3).
                input_batch = tf.expand_dims(tf.cast(input_image_resized, dtype=tf.int32), axis=0)

                try:
                    outputs = input_signature(input=input_batch)
                    # output_0 shape: (1, 1, 17, 3) -> rows of (y, x, score)
                    kpts = outputs['output_0'].numpy()[0, 0, :, :]

                    # Undo the letterbox padding so coordinates are normalized to
                    # the original frame rather than to the padded square.
                    frame_height, frame_width, _ = frame.shape
                    pts = _movenet_yx_to_frame_xy(kpts, frame_height, frame_width, input_size)

                    keypoints_list.append(pts)
                except Exception as e:
                    print(f"Error during MoveNet inference on frame {frame_idx}: {e}")
                    # Keep the frame slot, filled with NaNs, if inference fails
                    keypoints_list.append(np.full((17, 3), np.nan))
            frame_idx += 1
    finally:
        # Release the capture even if processing is interrupted.
        cap.release()

    if not keypoints_list:
        return np.array([])
    return np.stack(keypoints_list)


def extract_yolo(video_path, model_path):
    """
    Run YOLOv8-Pose over a video and collect one person's keypoints per sampled frame.

    Every SAMPLE_EVERY-th frame is processed. With several detections, the person
    whose above-threshold keypoints have the highest mean confidence is kept.
    The (x, y) of keypoints with confidence below CONF_THRESH is set to NaN, and
    frames without a usable detection contribute a (17, 3) block of NaNs.

    Returns an array of shape (num_sampled_frames, 17, 3) of (x, y, confidence)
    with x, y in PIXEL units, or an empty array if the model or video fails to load.
    """
    try:
        model = YOLO(model_path)
    except Exception as e:
        print(f"Error loading YOLO model from {model_path}: {e}")
        return np.array([])

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return np.array([])

    collected = []
    frame_no = 0
    while cap.isOpened():
        grabbed, frame = cap.read()
        if not grabbed:
            break

        if frame_no % SAMPLE_EVERY == 0:
            detections = model(frame, verbose=False, conf=CONF_THRESH)
            chosen = None  # stays None unless a well-formed detection is found

            if detections and detections[0].keypoints is not None and detections[0].keypoints.data.numel() > 0:
                # people: [num_persons, num_keypoints, (x, y, conf)]
                people = detections[0].keypoints.data.cpu().numpy()

                if people.ndim == 3 and people.shape[0] > 0:
                    if people.shape[0] == 1:
                        chosen = people[0]
                    else:
                        # Rank persons by the mean of their above-threshold confidences.
                        scores = people[:, :, 2]
                        mean_conf = np.nanmean(np.where(scores > CONF_THRESH, scores, np.nan), axis=1)
                        if np.all(np.isnan(mean_conf)):
                            chosen = np.full((17, 3), np.nan)
                        else:
                            chosen = people[np.nanargmax(mean_conf)]

                    # Blank out the position of low-confidence keypoints.
                    low_conf_rows = chosen[:, 2] < CONF_THRESH
                    chosen[low_conf_rows, :2] = np.nan

            if chosen is None:
                # No detection, or an unexpected output shape.
                chosen = np.full((17, 3), np.nan)
            collected.append(chosen)

        frame_no += 1

    cap.release()
    if not collected:
        return np.array([])
    return np.stack(collected)


def map_mediapipe_to_common(mp_kpts, common_indices_map, num_common_kpts):
    """
    Re-index MediaPipe keypoints into a common keypoint layout (e.g. COCO 17).

    mp_kpts: array of shape (num_frames, num_mediapipe_kpts, 3).
    common_indices_map: sequence where entry i is the common index of MediaPipe
        landmark i; when a common index appears more than once the first
        occurrence is used.
    num_common_kpts: number of keypoints in the common layout.

    Returns an array of shape (num_frames, num_common_kpts, 3); common keypoints
    with no MediaPipe source are left as NaN.
    """
    if mp_kpts.size == 0:
        return np.full((0, num_common_kpts, 3), np.nan)

    # First MediaPipe index seen for each common index.
    source_of = {}
    for mp_idx, common_idx in enumerate(common_indices_map):
        source_of.setdefault(common_idx, mp_idx)

    result = np.full((mp_kpts.shape[0], num_common_kpts, 3), np.nan)
    for target in range(num_common_kpts):
        if target in source_of:
            result[:, target, :] = mp_kpts[:, source_of[target], :]
    return result


def calculate_mpjpe_2d(gt_kpts, pred_kpts):
    """
    2D Mean Per Joint Position Error between ground-truth and predicted keypoints.

    Both inputs are [num_frames, num_keypoints, >=2]; only the first two
    components (x, y) are used. The arrays are truncated to their common number
    of frames and keypoints before comparison.

    Returns the mean Euclidean distance over all joints and frames, ignoring
    NaN distances, or NaN when there is nothing to compare.
    """
    if gt_kpts.size == 0 or pred_kpts.size == 0:
        print("Warning: Ground truth or predicted keypoints are empty for MPJPE calculation.")
        return np.nan

    # Drop any trailing components (score / visibility) beyond x, y.
    gt_xy = gt_kpts[..., :2] if gt_kpts.shape[-1] > 2 else gt_kpts
    pred_xy = pred_kpts[..., :2] if pred_kpts.shape[-1] > 2 else pred_kpts

    n_frames = min(gt_xy.shape[0], pred_xy.shape[0])
    n_joints = min(gt_xy.shape[1], pred_xy.shape[1])
    if n_frames == 0 or n_joints == 0:
        print("Warning: No common frames or keypoints for MPJPE calculation.")
        return np.nan

    # Per-joint, per-frame Euclidean error -> [n_frames, n_joints]
    offsets = pred_xy[:n_frames, :n_joints, :] - gt_xy[:n_frames, :n_joints, :]
    per_joint_error = np.linalg.norm(offsets, axis=-1)

    return np.nanmean(per_joint_error)


def calculate_pa_mpjpe_2d(gt_kpts, pred_kpts):
    """
    Calculates the 2D Procrustes-Aligned Mean Per Joint Position Error (PA-MPJPE).

    For each frame, the predicted pose is aligned to the ground-truth pose with
    scipy's Procrustes analysis (translation, uniform scale and orthogonal
    transform) using only joints that are finite in both poses. Per-joint
    distances are then measured.

    scipy.spatial.procrustes standardises both point sets to unit Frobenius
    norm, so the raw residuals are dimensionless. They are multiplied by the
    norm of the centred ground-truth points so the result is expressed in the
    ground truth's coordinate units and is comparable with plain MPJPE.

    Args:
        gt_kpts: Ground truth, [num_frames, num_keypoints, >=2]; only (x, y) is used.
        pred_kpts: Predictions, [num_frames, num_keypoints, >=2]; only (x, y) is used.

    Returns:
        Mean aligned error over all valid joints and frames (NaNs ignored), or
        NaN when there is nothing to compare. Frames with fewer than two valid
        joints, or on which Procrustes fails, contribute no values.
    """
    if gt_kpts.size == 0 or pred_kpts.size == 0:
        print("Warning: Ground truth or predicted keypoints are empty for PA-MPJPE calculation.")
        return np.nan

    # Ensure inputs have only x, y coordinates
    if gt_kpts.shape[-1] > 2:
        gt_kpts = gt_kpts[..., :2]
    if pred_kpts.shape[-1] > 2:
        pred_kpts = pred_kpts[..., :2]

    min_frames = min(gt_kpts.shape[0], pred_kpts.shape[0])
    min_keypoints = min(gt_kpts.shape[1], pred_kpts.shape[1])

    if min_frames == 0 or min_keypoints == 0:
        print("Warning: No common frames or keypoints for PA-MPJPE calculation.")
        return np.nan

    gt_subset = gt_kpts[:min_frames, :min_keypoints, :]
    pred_subset = pred_kpts[:min_frames, :min_keypoints, :]

    # One row per frame; joints that cannot be evaluated stay NaN.
    aligned_dists = np.full((min_frames, min_keypoints), np.nan)

    for i in range(min_frames):
        gt_frame = gt_subset[i]
        pred_frame = pred_subset[i]

        # A joint is usable only if both its GT and predicted (x, y) are finite.
        valid = ~(np.isnan(gt_frame).any(axis=1) | np.isnan(pred_frame).any(axis=1))
        if np.sum(valid) < 2:
            continue

        gt_valid = gt_frame[valid]
        pred_valid = pred_frame[valid]

        try:
            # mtx_gt is the standardised GT; mtx_pred is the standardised
            # prediction after the optimal scale / orthogonal transform onto it.
            mtx_gt, mtx_pred, _ = procrustes(gt_valid, pred_valid)
        except ValueError as e:
            # Raised e.g. when all points of one set coincide (zero norm).
            print(f"Warning: Procrustes analysis failed for frame {i}. Error: {e}. Skipping frame.")
            continue

        # Undo the unit-norm standardisation to return to GT coordinate units.
        gt_scale = np.linalg.norm(gt_valid - gt_valid.mean(axis=0))
        aligned_dists[i, valid] = np.linalg.norm(mtx_pred - mtx_gt, axis=-1) * gt_scale

    # Mean over all valid aligned distances
    return np.nanmean(aligned_dists)


def _read_video_dimensions(video_path):
    """Return ``(width, height)`` of the video at ``video_path``, or ``None`` if it cannot be opened.

    The capture handle is released on every path, including when opening fails.
    """
    capture = cv2.VideoCapture(video_path)
    try:
        if not capture.isOpened():
            return None
        width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        return width, height
    finally:
        capture.release()


def _common_keypoints_or_nan(keypoints, num_frames, num_keypoints, num_components):
    """Slice ``keypoints`` to its first ``num_keypoints`` joints, or build an all-NaN stand-in.

    An empty ``keypoints`` array (``size == 0``) cannot be indexed with three
    indices, so it is replaced by a ``(num_frames, num_keypoints, num_components)``
    array of NaN that downstream metrics can treat as "no prediction".
    """
    if keypoints.size > 0:
        return keypoints[:, :num_keypoints, :]
    return np.full((num_frames, num_keypoints, num_components), np.nan)


def _save_per_joint_bar_chart(joint_positions, joint_labels, series, ylabel, title, filename):
    """Save a grouped bar chart with one bar per method at every joint.

    ``series`` is a sequence of ``(label, values)`` pairs, one per method.
    Nothing is drawn or saved when every value of every series is NaN.
    """
    if all(np.all(np.isnan(values)) for _, values in series):
        return

    bar_width = 0.25
    # Centre each group of bars on its joint position (-w, 0, +w for three series).
    offsets = (np.arange(len(series)) - (len(series) - 1) / 2) * bar_width

    fig, ax = plt.subplots(figsize=(14, 7))
    for offset, (label, values) in zip(offsets, series):
        ax.bar(joint_positions + offset, values, bar_width, label=label, alpha=0.8)
    ax.set_xlabel('Joint Index', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(joint_positions)
    ax.set_xticklabels(joint_labels, fontsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.legend(fontsize=10)
    fig.tight_layout()
    fig.savefig(filename)
    print(f"Saved {filename}")
    plt.close(fig)


def _save_average_metrics_chart(methods, avg_dist, avg_sim, filename='average_original_metrics.png'):
    """Save a twin-axis bar chart of the per-method average distance and similarity.

    Nothing is drawn or saved when both ``avg_dist`` and ``avg_sim`` are entirely NaN.
    """
    if np.all(np.isnan(avg_dist)) and np.all(np.isnan(avg_sim)):
        return

    idx = np.arange(len(methods))
    bar_width = 0.35
    fig, ax1 = plt.subplots(figsize=(10, 7))

    color_dist = 'skyblue'
    ax1.set_xlabel('Method', fontsize=12)
    ax1.set_ylabel('Average Distance (Original Metric)', color=color_dist, fontsize=12)
    ax1.bar(idx - bar_width / 2, avg_dist, bar_width, label='Avg Distance', alpha=0.8, color=color_dist)
    ax1.tick_params(axis='y', labelcolor=color_dist)
    ax1.set_xticks(idx)
    ax1.set_xticklabels(methods, fontsize=10)

    # Second y-axis sharing the same x-axis, because the two metrics have different scales.
    ax2 = ax1.twinx()
    color_sim = 'salmon'
    ax2.set_ylabel('Average Similarity (Original Metric)', color=color_sim, fontsize=12)
    ax2.bar(idx + bar_width / 2, avg_sim, bar_width, label='Avg Similarity', alpha=0.8, color=color_sim)
    ax2.tick_params(axis='y', labelcolor=color_sim)

    # Merge the legend entries of both axes into one legend placed below the plot.
    handles1, labels1 = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper center',
               bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=2, fontsize=10)

    ax1.set_title('Overall Average Original Metrics per Method', fontsize=14)
    fig.tight_layout(rect=[0, 0.1, 1, 1])  # leave room for the legend below the axes

    fig.savefig(filename, bbox_inches='tight')
    print(f"Saved {filename}")
    plt.close(fig)


def run_all():
    """Compare MediaPipe, MoveNet and YOLO-Pose keypoints against the ground truth.

    Steps:
      1. Load the ground-truth keypoints from ``GROUND_TRUTH`` (first array of the
         ``.npz``) and keep every ``SAMPLE_EVERY``-th frame.
      2. Extract keypoints from ``VIDEO_PATH`` with each model, map MediaPipe to the
         common joint set, NaN-out low-confidence MoveNet joints (``CONF_THRESH``)
         and normalise YOLO pixel coordinates by the video width/height.
      3. Print 2D MPJPE, 2D PA-MPJPE and the per-joint distance/similarity averages.
      4. Save the intermediate keypoints as ``.npz`` files and the plots as PNG files
         in the current working directory.

    A model whose extraction yields an empty array is represented by all-NaN
    keypoints in every metric, so one failed extractor does not abort the run.

    Returns:
        None. The function returns early (after printing an error) when the
        ground truth cannot be loaded or does not have shape
        ``(frames, keypoints, >= 2)``.
    """
    # Number of common keypoints for comparison (COCO 17)
    NUM_COMMON_KEYPOINTS = 17

    # Load ground truth
    try:
        # The context manager closes the underlying .npz archive once the array is read.
        with np.load(GROUND_TRUTH) as data:
            # Assuming the first array in the npz is the keypoints
            gt_keypoints_all_frames = data[data.files[0]]
        gt_keypoints = gt_keypoints_all_frames[::SAMPLE_EVERY, :, :]
        print(f"Ground truth keypoints loaded and sampled. Shape: {gt_keypoints.shape}")

        # Ensure ground truth has at least 2 components (x, y)
        if gt_keypoints.ndim < 3 or gt_keypoints.shape[2] < 2:
            print(f"Error: Ground truth keypoints do not have expected shape (frames, keypoints, >= 2 components). Actual shape: {gt_keypoints.shape}")
            return

        # Never compare more joints than the ground truth actually provides.
        actual_gt_keypoints = gt_keypoints.shape[1]
        if actual_gt_keypoints < NUM_COMMON_KEYPOINTS:
            print(f"Warning: Ground truth has {actual_gt_keypoints} keypoints, less than the target {NUM_COMMON_KEYPOINTS}. Metrics will be based on {actual_gt_keypoints} keypoints.")
        num_metric_keypoints = min(actual_gt_keypoints, NUM_COMMON_KEYPOINTS)

    except FileNotFoundError:
        print(f"Error: Ground truth file not found at {GROUND_TRUTH}")
        return
    except Exception as e:
        print(f"Error loading or sampling ground truth from {GROUND_TRUTH}: {e}")
        return

    print("Extracting MediaPipe keypoints...")
    kp_mp_full = extract_mediapipe(VIDEO_PATH)
    # Map MediaPipe keypoints to the common set (e.g., COCO 17)
    kp_mp = map_mediapipe_to_common(kp_mp_full, mp_to_common_indices, NUM_COMMON_KEYPOINTS)

    if kp_mp.size > 0:
        np.savez("mediapipe_keypoints_mapped.npz", kp_mp)
        print(f"MediaPipe keypoints extracted and mapped. Shape: {kp_mp.shape}")
    else:
        print("MediaPipe keypoint extraction resulted in an empty array.")

    print("Extracting MoveNet keypoints...")
    kp_mn = extract_movenet(VIDEO_PATH)
    if kp_mn.size > 0:
        # Ensure MoveNet also has confidence/score for filtering
        if kp_mn.shape[-1] < 3:
            print("Warning: MoveNet keypoints do not have confidence scores. Assuming confidence 1.0 for all.")
            kp_mn_with_conf = np.ones((kp_mn.shape[0], kp_mn.shape[1], 3))
            kp_mn_with_conf[:, :, :2] = kp_mn[:, :, :2]
            kp_mn = kp_mn_with_conf

        # Set keypoints with confidence below threshold to NaN for metric calculation
        kp_mn[kp_mn[:, :, 2] < CONF_THRESH, :2] = np.nan

        np.savez("movenet_keypoints.npz", kp_mn)
        print(f"MoveNet keypoints extracted. Shape: {kp_mn.shape}")
    else:
        print("MoveNet keypoint extraction resulted in an empty array.")

    print("Extracting YOLO-Pose keypoints (pixel coordinates initially)...")
    kp_yo_pixels = extract_yolo(VIDEO_PATH, YOLO_MODEL)
    kp_yo_normalized = np.array([])  # stays empty when extraction produced nothing

    if kp_yo_pixels.size > 0:
        dimensions = _read_video_dimensions(VIDEO_PATH)
        if dimensions is None:
            print("Error: Could not open video to get dimensions for YOLO normalization.")
            # Without dimensions normalization is impossible; mark every value as missing.
            kp_yo_normalized = np.full_like(kp_yo_pixels, np.nan)
        elif dimensions[0] > 0 and dimensions[1] > 0:
            width, height = dimensions
            kp_yo_normalized = kp_yo_pixels.copy()
            # Normalize x and y to [0, 1]; the confidence component is left unchanged.
            kp_yo_normalized[..., 0] /= width
            kp_yo_normalized[..., 1] /= height
            np.savez("yolo_keypoints_normalized.npz", kp_yo_normalized)
            print(f"YOLO-Pose keypoints extracted and normalized. Shape: {kp_yo_normalized.shape}")
        else:
            print("Error: Could not get valid video dimensions for YOLO normalization.")
            kp_yo_normalized = np.full_like(kp_yo_pixels, np.nan)
    else:
        print("YOLO-Pose keypoint extraction resulted in an empty array.")

    # Slice everything to the common joints once. Empty predictions become all-NaN
    # arrays shaped like the ground truth so that BOTH the x/y metrics and
    # compute_metrics below can consume them (previously only the x/y path was guarded
    # and compute_metrics crashed with an IndexError on an empty array).
    num_gt_frames = gt_keypoints.shape[0]
    num_gt_components = gt_keypoints.shape[2]
    gt_common = gt_keypoints[:, :num_metric_keypoints, :]
    kp_mp_common = _common_keypoints_or_nan(kp_mp, num_gt_frames, num_metric_keypoints, num_gt_components)
    kp_mn_common = _common_keypoints_or_nan(kp_mn, num_gt_frames, num_metric_keypoints, num_gt_components)
    kp_yo_common = _common_keypoints_or_nan(kp_yo_normalized, num_gt_frames, num_metric_keypoints, num_gt_components)

    # x, y only for the MPJPE-style metrics
    gt_xy_common = gt_common[:, :, :2]
    kp_mp_xy_common = kp_mp_common[:, :, :2]
    kp_mn_xy_common = kp_mn_common[:, :, :2]
    kp_yo_xy_common = kp_yo_common[:, :, :2]

    # Compute MPJPE (2D adaptation)
    print("\nComputing 2D MPJPE...")
    mpjpe_mp = calculate_mpjpe_2d(gt_xy_common, kp_mp_xy_common)
    mpjpe_mn = calculate_mpjpe_2d(gt_xy_common, kp_mn_xy_common)
    mpjpe_yo = calculate_mpjpe_2d(gt_xy_common, kp_yo_xy_common)

    print("2D MPJPE Results:")
    print(f"  MediaPipe: {mpjpe_mp:.4f}")
    print(f"  MoveNet:   {mpjpe_mn:.4f}")
    print(f"  YOLO-Pose: {mpjpe_yo:.4f}")

    # Compute PA-MPJPE (2D adaptation)
    print("\nComputing 2D PA-MPJPE...")
    pa_mpjpe_mp = calculate_pa_mpjpe_2d(gt_xy_common, kp_mp_xy_common)
    pa_mpjpe_mn = calculate_pa_mpjpe_2d(gt_xy_common, kp_mn_xy_common)
    pa_mpjpe_yo = calculate_pa_mpjpe_2d(gt_xy_common, kp_yo_xy_common)

    print("2D PA-MPJPE Results:")
    print(f"  MediaPipe: {pa_mpjpe_mp:.4f}")
    print(f"  MoveNet:   {pa_mpjpe_mn:.4f}")
    print(f"  YOLO-Pose: {pa_mpjpe_yo:.4f}")

    # Per-joint distance and similarity (original metrics). The full component axis
    # is passed through, as before; presumably compute_metrics only reads x and y.
    print("\nComputing per-joint distance and similarity (original metrics)...")
    dist_mp, sim_mp = compute_metrics(gt_common, kp_mp_common, num_metric_keypoints)
    dist_mn, sim_mn = compute_metrics(gt_common, kp_mn_common, num_metric_keypoints)
    dist_yo, sim_yo = compute_metrics(gt_common, kp_yo_common, num_metric_keypoints)

    # Calculate overall average of the original metrics
    avg_dist = [np.nanmean(dist_mp), np.nanmean(dist_mn), np.nanmean(dist_yo)]
    avg_sim = [np.nanmean(sim_mp), np.nanmean(sim_mn), np.nanmean(sim_yo)]
    methods = ['MediaPipe', 'MoveNet', 'YOLO-Pose']

    print("\nOverall Average Original Metrics:")
    print(f"  Average Distances: MediaPipe={avg_dist[0]:.4f}, MoveNet={avg_dist[1]:.4f}, YOLO={avg_dist[2]:.4f}")
    print(f"  Average Similarities: MediaPipe={avg_sim[0]:.4f}, MoveNet={avg_sim[1]:.4f}, YOLO={avg_sim[2]:.4f}\n")

    # Plotting: one bar group per joint, labelled by joint index.
    joints_for_plot = np.arange(num_metric_keypoints)
    joint_labels = [str(j) for j in joints_for_plot]

    _save_per_joint_bar_chart(
        joints_for_plot, joint_labels,
        [('MediaPipe', sim_mp), ('MoveNet', sim_mn), ('YOLO-Pose', sim_yo)],
        ylabel='Similarity Score',
        title='Pose Estimation Similarity per Joint',
        filename='similarity_per_joint.png',
    )
    _save_per_joint_bar_chart(
        joints_for_plot, joint_labels,
        [('MediaPipe', dist_mp), ('MoveNet', dist_mn), ('YOLO-Pose', dist_yo)],
        ylabel='Average Euclidean Distance',
        title='Pose Estimation Average Distance per Joint',
        filename='distance_per_joint.png',
    )
    _save_average_metrics_chart(methods, avg_dist, avg_sim)

    print("\nDone! Plots saved to disk (if metrics were valid).")

# Kick off the full comparison pipeline only when this code is executed directly.
if __name__ == "__main__":
    run_all()

Ground truth keypoints loaded and sampled. Shape: (4450, 17, 3)
Extracting MediaPipe keypoints...
MediaPipe keypoints extracted and mapped. Shape: (4450, 17, 3)
Extracting MoveNet keypoints...
MoveNet keypoints extracted. Shape: (4450, 17, 3)
Extracting YOLO-Pose keypoints (pixel coordinates initially)...
YOLO-Pose keypoints extracted and normalized. Shape: (4450, 17, 3)

Computing 2D MPJPE...
2D MPJPE Results:
  MediaPipe: 0.7717
  MoveNet:   5.6846
  YOLO-Pose: 0.7225

Computing 2D PA-MPJPE...
2D PA-MPJPE Results:
  MediaPipe: 0.1909
  MoveNet:   0.2099
  YOLO-Pose: 0.2010

Computing per-joint distance and similarity (original metrics)...

Overall Average Original Metrics:
  Average Distances: MediaPipe=0.7717, MoveNet=5.6888, YOLO=0.7107
  Average Similarities: MediaPipe=0.5699, MoveNet=0.1497, YOLO=0.5906

Saved similarity_per_joint.png
Saved distance_per_joint.png
Saved average_original_metrics.png

Done! Plots saved to disk (if metrics were valid).
