## Understanding the pose estimation

In [2]:
import os
import cv2
from ultralytics import YOLO

# Load the YOLOv8 pose-estimation model.
model = YOLO("yolov8x-pose.pt")

# Directory that receives the annotated frames.
output_dir = "output_pose"
os.makedirs(output_dir, exist_ok=True)

# Open the tacticam video. Fail loudly when it cannot be opened instead of
# falling through to the misleading "End of video" message below.
video_path = "dataset/tacticam.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise OSError(f"Could not open video: {video_path}")

frame_idx = 0
max_frames = 5  # number of frames to process before stopping

try:
    while frame_idx < max_frames:
        ret, frame = cap.read()
        if not ret:
            print("End of video or error reading frame.")
            break

        # Run pose detection on the full frame; conf=0.1 keeps detections the
        # default threshold would discard.
        results = model.predict(source=frame, save=False, conf=0.1, verbose=False)[0]

        # Render the predictions onto a copy of the frame.
        annotated_frame = results.plot()

        # Save the annotated frame; imwrite reports failure through its return
        # value, so only claim success when it actually succeeded.
        save_path = os.path.join(output_dir, f"pose_frame_{frame_idx}.jpg")
        if cv2.imwrite(save_path, annotated_frame):
            print(f"Saved pose frame: {save_path}")
        else:
            print(f"Failed to write pose frame: {save_path}")

        frame_idx += 1
finally:
    # Release the capture even if prediction or writing raised.
    cap.release()

Saved pose frame: output_pose/pose_frame_0.jpg
Saved pose frame: output_pose/pose_frame_1.jpg
Saved pose frame: output_pose/pose_frame_2.jpg
Saved pose frame: output_pose/pose_frame_3.jpg
Saved pose frame: output_pose/pose_frame_4.jpg


## The code above does not produce pose estimates because the model is unable to detect small-scale humans

## Use the fine-tuned YOLO11 model to detect players, then run pose estimation

In [1]:
import os
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchreid.utils import FeatureExtractor
import torch.nn.functional as F

# Load fine-tuned player detector
detector = YOLO("model/best.pt")

# Load pose estimation model
pose_model = YOLO("yolov8x-pose.pt")

# Load ReID model used to embed player crops
extractor = FeatureExtractor(
    model_name='osnet_x1_0',
    model_path='osnet_x1_0_market1501.pth',
    device='cpu'
)

# Video input/output
# NOTE: this uses the same directory and file names as the tacticam cell
# earlier in the notebook, so running this cell overwrites those images.
video_path = "dataset/broadcast.mp4"
output_dir = "output_pose"
os.makedirs(output_dir, exist_ok=True)
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise OSError(f"Could not open video: {video_path}")

# Parameters
frame_count = 0
max_frames = 5
font = cv2.FONT_HERSHEY_SIMPLEX
global_player_id = 0
# Gallery of known players: (embedding, player_id, (center_x, center_y)).
embed_array = []


def _match_player(emb, center, box_diag, gallery, taken_ids, threshold=0.7):
    """Return the ID of the gallery entry that best matches a detection.

    The score is a 6:1 weighted mean of appearance similarity (dot product of
    the L2-normalized embeddings) and spatial proximity (1 minus the distance
    between box centers, normalized by the detection's box diagonal).

    Args:
        emb: L2-normalized embedding of the detection (2-D tensor; presumably
            shape (1, D) -- confirm against the extractor output).
        center: (cx, cy) center of the detection box.
        box_diag: diagonal length of the detection box in pixels.
        gallery: list of (embedding, player_id, (cx, cy)) tuples.
        taken_ids: IDs already assigned in the current frame; these are
            skipped so one identity is never given to two boxes of a frame.
        threshold: minimum score required to accept a match.

    Returns:
        The matched player ID, or None when no entry scores above `threshold`.
    """
    cx, cy = center
    matched_id = None
    best_score = 0
    for saved_emb, pid, (prev_cx, prev_cy) in gallery:
        if pid in taken_ids:
            continue
        sim = torch.mm(emb, saved_emb.t()).item()
        spatial_dist = np.sqrt((cx - prev_cx) ** 2 + (cy - prev_cy) ** 2)
        norm_dist = spatial_dist / (box_diag + 1e-6)
        final_score = (sim * 6 + (1 - norm_dist)) / 7
        if final_score > best_score and final_score > threshold:
            best_score = final_score
            matched_id = pid
    return matched_id


try:
    while frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:
            print("End of video or frames.")
            break

        # Step 1: Detect players using fine-tuned YOLO
        detection_result = detector.predict(frame, conf=0.4, save=False, verbose=False)[0]
        boxes = detection_result.boxes
        annotated_frame = frame.copy()
        ids_in_frame = set()   # identities already used in this frame
        player_found = False

        if boxes is not None:
            for box in boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                cls_id = int(box.cls[0].item())
                class_name = detector.names[cls_id]

                if class_name.lower() != "player":
                    continue

                crop = frame[y1:y2, x1:x2]
                if crop.size == 0:
                    # Degenerate box after integer truncation; cvtColor would
                    # raise on an empty image, so skip it.
                    continue
                player_found = True

                # Draw detection
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

                # Step 2: Extract embedding and L2-normalize it so that the
                # dot product used for matching is a cosine similarity.
                emb = extractor(crop_rgb)
                emb = F.normalize(emb, p=2, dim=1)

                cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
                box_diag = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                matched_id = _match_player(emb, (cx, cy), box_diag, embed_array, ids_in_frame)

                if matched_id is not None:
                    # Blend the stored embedding with the new one and refresh
                    # the last known position of the matched player.
                    for idx, (e, pid, pos) in enumerate(embed_array):
                        if pid == matched_id:
                            new_emb = F.normalize(e * 0.6 + emb * 0.4, p=2, dim=1)
                            embed_array[idx] = (new_emb, matched_id, (cx, cy))
                            break
                    player_id = matched_id
                else:
                    # No acceptable match: register a new identity.
                    player_id = global_player_id
                    embed_array.append((emb, player_id, (cx, cy)))
                    global_player_id += 1
                ids_in_frame.add(player_id)

                label_text = f"ID: {player_id}, Conf: {box.conf[0]:.2f}"
                cv2.putText(annotated_frame, label_text, (x1, y1 - 10), font, 0.6, (0, 255, 0), 2)

        # Step 3: Pose estimation on full frame. The input is the whole frame,
        # so it is run once per frame rather than once per detected player.
        # As before, it is only run when at least one player was detected.
        if player_found:
            pose_results = pose_model.predict(frame, conf=0.1, save=False, verbose=False)[0]

            if pose_results.keypoints is not None:
                # Draw pose on the annotated frame
                annotated_frame = pose_results.plot(img=annotated_frame)

        # Save the annotated frame; imwrite signals failure via its return value.
        out_path = os.path.join(output_dir, f"pose_frame_{frame_count}.jpg")
        if cv2.imwrite(out_path, annotated_frame):
            print(f"Saved: {out_path}")
        else:
            print(f"Failed to write: {out_path}")

        frame_count += 1
finally:
    # Release the capture even if detection, embedding or writing raised.
    cap.release()

: 