In [1]:
import cv2
import mediapipe as mp
from ultralytics import YOLO
import time
import numpy as np
from collections import deque

class PoseDetector:
    def __init__(self):
        # Initialize MediaPipe with higher confidence thresholds
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            min_detection_confidence=0.7,  # Increased from 0.7
            min_tracking_confidence=0.7,   # Increased from 0.7
            model_complexity=2  # Increased from 1 for better accuracy
        )
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        
        # Initialize YOLO with a more accurate model
        self.yolo_model = YOLO('yolov8n-pose.pt')  # Using x model instead of n for better accuracy
        
        # Performance metrics
        self.fps_history = deque(maxlen=30)
        self.detection_history = deque(maxlen=30)
        self.joint_consistency = deque(maxlen=30)
        
        # Model switching control
        self.current_model = 'mediapipe'
        self.last_switch_time = time.time()
        self.switch_cooldown = 10  # 10 second cooldown
        self.switch_pending = False
        
        # Enhanced detection parameters
        self.key_joints = {
            'shoulders': ([11, 12], [5, 2]),
            'hips': ([23, 24], [11, 8]),
            'knees': ([25, 26], [12, 9])
        }
        
        # Motion tracking with enhanced sensitivity
        self.previous_positions = None
        self.movement_threshold = 10  # Reduced from 15 for more sensitive detection
        self.shoulder_movement_threshold = 5  # Reduced from 15 for more sensitive detection
        self.squat_threshold = 0.25  # Reduced from 0.3 for more accurate squat detection
        
        # Detection stability
        self.posture_history = deque(maxlen=5)  # Store last 5 posture detections
        self.blocked_counter = 0
        self.blocked_threshold = 3  # Number of consecutive "blocked" detections needed
        
    def is_blocked(self, landmarks, model_type):
        """Check if the pose is blocked based on visibility and confidence"""
        if model_type == 'mediapipe':
            if landmarks is None:
                return True
            # Check visibility of key points
            key_points = [0, 11, 12, 23, 24, 25, 26]  # Important landmarks
            visibilities = [landmarks.landmark[i].visibility for i in key_points]
            return np.mean(visibilities) < 0.6  # Increased threshold
        else:  # YOLO
            if landmarks is None or len(landmarks) == 0:
                return True
            # Check confidence of key points
            confidences = [kpt[2] for kpt in landmarks[0] if len(kpt) > 2]
            return np.mean(confidences) < 0.5  # Increased threshold

    def detect_posture(self, landmarks, model_type='mediapipe'):
        # Check for blocked view first
        if self.is_blocked(landmarks, model_type):
            self.blocked_counter += 1
            if self.blocked_counter >= self.blocked_threshold:
                return "blocked"
        else:
            self.blocked_counter = 0

        # Rest of the posture detection logic...
        if model_type == 'mediapipe' and landmarks is not None:
            # Get key points with visibility check
            def get_valid_landmark(idx):
                lm = landmarks.landmark[idx]
                return lm if lm.visibility > 0.7 else None
            
            hip_left = get_valid_landmark(23)
            hip_right = get_valid_landmark(24)
            knee_left = get_valid_landmark(25)
            knee_right = get_valid_landmark(26)
            ankle_left = get_valid_landmark(27)
            ankle_right = get_valid_landmark(28)
            shoulder_left = get_valid_landmark(11)
            shoulder_right = get_valid_landmark(12)
            
            # Check if enough valid points for detection
            key_points = [hip_left, hip_right, knee_left, knee_right, 
                         ankle_left, ankle_right, shoulder_left, shoulder_right]
            if None in key_points:
                return "Insufficient visibility"
            
            # Calculate average heights with confidence weighting
            hip_y = (hip_left.y + hip_right.y) / 2
            knee_y = (knee_left.y + knee_right.y) / 2
            ankle_y = (ankle_left.y + ankle_right.y) / 2
            shoulder_y = (shoulder_left.y + shoulder_right.y) / 2
            
            current_positions = {
                'shoulders': (shoulder_left.x, shoulder_left.y, shoulder_right.x, shoulder_right.y),
                'hips': (hip_left.x, hip_left.y, hip_right.x, hip_right.y),
                'knees': (knee_left.x, knee_left.y, knee_right.x, knee_right.y)
            }
            
        elif model_type == 'yolo' and landmarks is not None:
            frame_height = self.current_frame_shape[0]
            frame_width = self.current_frame_shape[1]
            
            try:
                kpts = landmarks[0]
                keypoint_mapping = {
                    'hip_left': 11,
                    'hip_right': 8,
                    'knee_left': 12,
                    'knee_right': 9,
                    'ankle_left': 13,
                    'ankle_right': 10,
                    'shoulder_left': 5,
                    'shoulder_right': 2
                }
                
                points = {}
                for part, idx in keypoint_mapping.items():
                    if idx < len(kpts) and kpts[idx][2] > 0.5:  # Added confidence threshold
                        x = float(kpts[idx][0]) / frame_width
                        y = float(kpts[idx][1]) / frame_height
                        points[part] = np.array([x, y])
                    else:
                        return "Low confidence detection"
                
                hip_y = (points['hip_left'][1] + points['hip_right'][1]) / 2
                knee_y = (points['knee_left'][1] + points['knee_right'][1]) / 2
                ankle_y = (points['ankle_left'][1] + points['ankle_right'][1]) / 2
                shoulder_y = (points['shoulder_left'][1] + points['shoulder_right'][1]) / 2
                
                current_positions = {
                    'shoulders': (*points['shoulder_left'], *points['shoulder_right']),
                    'hips': (*points['hip_left'], *points['hip_right']),
                    'knees': (*points['knee_left'], *points['knee_right'])
                }
                
            except (IndexError, KeyError) as e:
                return "Detection error"
        else:
            return "No pose detected"

        # Enhanced movement detection with smoothing
        movements = []
        hip_knee_ratio = abs(hip_y - knee_y) / (abs(knee_y - ankle_y) + 1e-6)
        
        # Smooth posture detection using history
        if hip_knee_ratio < self.squat_threshold:
            movements.append("Squatting")
        else:
            movements.append("Standing")

        if self.previous_positions is not None:
            # Enhanced movement detection with weighted average
            shoulder_movement = np.mean([abs(current_positions['shoulders'][i] - 
                                          self.previous_positions['shoulders'][i]) 
                                      for i in range(4)])
            leg_movement = np.mean([abs(current_positions['knees'][i] - 
                                      self.previous_positions['knees'][i]) 
                                  for i in range(4)])
            
            # Apply weighted threshold
            if shoulder_movement > self.shoulder_movement_threshold/1000:
                movements.append("Moving Shoulders")
            if leg_movement > self.movement_threshold/1000:
                movements.append("Moving Legs")

        self.previous_positions = current_positions
        
        # Store detection in history for smoothing
        current_posture = " & ".join(movements) if movements else "Standing Still"
        self.posture_history.append(current_posture)
        
        # Return most common posture from recent history
        if len(self.posture_history) >= 3:
            return max(set(self.posture_history), key=self.posture_history.count)
        return current_posture

    def process_frame(self, frame):
        self.current_frame_shape = frame.shape
        start_time = time.time()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Process with both models
        mp_results = self.pose.process(frame_rgb)
        yolo_results = self.yolo_model(frame)
        
        # Calculate FPS
        fps = 1.0 / (time.time() - start_time)
        self.fps_history.append(fps)
        
        # Get confidence scores with enhanced calculation
        mp_confidence = 0
        yolo_confidence = 0
        posture = "No pose detected"
        
        if mp_results.pose_landmarks:
            visibilities = [lm.visibility for lm in mp_results.pose_landmarks.landmark]
            mp_confidence = np.mean(visibilities) * (1 + np.std(visibilities))
            
        if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
            yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
            confidences = [kpt[2] for kpt in yolo_kpts[0] if len(kpt) > 2]
            yolo_confidence = np.mean(confidences) * (1 + np.std(confidences))
        
        # Model switching logic with cooldown
        current_time = time.time()
        if current_time - self.last_switch_time >= self.switch_cooldown:
            if mp_confidence > yolo_confidence and self.current_model != 'mediapipe':
                self.current_model = 'mediapipe'
                self.last_switch_time = current_time
            elif yolo_confidence > mp_confidence and self.current_model != 'yolo':
                self.current_model = 'yolo'
                self.last_switch_time = current_time
        
        # Use current model for detection
        if self.current_model == 'mediapipe':
            if mp_results.pose_landmarks:
                posture = self.detect_posture(mp_results.pose_landmarks, 'mediapipe')
                self.mp_drawing.draw_landmarks(
                    frame, 
                    mp_results.pose_landmarks,
                    self.mp_pose.POSE_CONNECTIONS,
                    self.mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=2),
                    self.mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                )
        else:
            if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
                yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
                posture = self.detect_posture(yolo_kpts, 'yolo')
                for kpt in yolo_kpts[0]:
                    x, y = int(kpt[0]), int(kpt[1])
                    conf = float(kpt[2])
                    if conf > 0.5:
                        cv2.circle(frame, (x, y), 4, (0, 255, 0), -1)
        
        # Check for blocked state
        if posture == "blocked":
            cv2.putText(frame, "BLOCKED", (frame.shape[1]//2 - 60, frame.shape[0]//2),
                       cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 255), 3)
        
        # Calculate cooldown remaining
        cooldown_remaining = max(0, self.switch_cooldown - (current_time - self.last_switch_time))
        
        # Display metrics
        metrics_text = [
            f"FPS: {np.mean(self.fps_history):.1f}",
            f"MediaPipe Conf: {mp_confidence:.2f}",
            f"YOLO Conf: {yolo_confidence:.2f}",
            f"Current Model: {self.current_model}",
            f"Switch Cooldown: {cooldown_remaining:.1f}s",
            f"Posture: {posture}"
        ]
        
        for i, text in enumerate(metrics_text):
            cv2.putText(frame, text, (10, 30 + i * 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        
        return frame

def main():
    cap = cv2.VideoCapture(0)
    cv2.namedWindow('Enhanced Pose Detection', cv2.WINDOW_NORMAL)
    cv2.setWindowProperty('Enhanced Pose Detection', 
                         cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
    
    detector = PoseDetector()
    
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break
            
        processed_frame = detector.process_frame(frame)
        cv2.imshow('Enhanced Pose Detection', processed_frame)
        
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()




  _torch_pytree._register_pytree_node(


0: 480x640 1 person, 163.0ms
Speed: 4.6ms preprocess, 163.0ms inference, 14.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 2.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.0ms
Speed: 1.0ms preprocess, 69.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.0ms
Speed: 1.0ms preprocess, 72.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 



0: 480x640 1 person, 61.0ms
Speed: 2.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 2.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 2.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480

Speed: 1.0ms preprocess, 66.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 55.0ms
Speed: 2.0ms preprocess, 55.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.1ms
Speed: 1.0ms preprocess, 61.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 2.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.6ms
Speed: 1.0ms preprocess, 59.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.6ms
Speed: 1.0ms preprocess, 61.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.0ms
Speed: 1.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.6ms
Speed: 1.0ms preprocess, 57.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.9ms
Speed: 1.0ms preprocess, 59.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.0ms
Speed: 1.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.0ms
Speed: 2.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.8ms
Speed: 2.0ms preprocess, 58.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 0.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.6ms
Speed: 1.0ms preprocess, 59.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 2.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.0ms
Speed: 1.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.1ms
Speed: 1.0ms preprocess, 63.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 2.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 57.6ms
Speed: 1.0ms preprocess, 57.6ms inference, 1.0ms postprocess per image at shape (1, 3, 48


0: 480x640 2 persons, 56.0ms
Speed: 1.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.8ms
Speed: 1.0ms preprocess, 59.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 2.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 4


0: 480x640 1 person, 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 0.9ms preprocess, 61.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 0.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 59.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 56.0ms
Speed: 1.0ms preprocess, 56.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 62.0ms
Speed: 1.0ms preprocess, 62.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 60.0ms
Speed: 0.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0: 480x640 (no detections), 59.6ms
Speed: 1.0ms preprocess, 59.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.9ms
Speed: 1.0ms preprocess, 58.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 56.6ms
Speed: 1.0ms preprocess, 56.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.0ms
Speed: 1.0ms preprocess, 57.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 57.8ms
Speed: 1.0ms preprocess, 57.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.6ms
Speed: 1.0ms preprocess, 59.6ms in


0: 480x640 (no detections), 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.0ms
Speed: 1.0ms preprocess, 61.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.0ms
Speed: 1.0ms preprocess, 59.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.7ms
Speed: 1.0ms preprocess, 61.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 63.8ms
Speed: 1.0ms preprocess, 63.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 61.0ms
Speed: 1.0ms preprocess, 61.0ms i


0: 480x640 (no detections), 58.0ms
Speed: 1.6ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.4ms
Speed: 1.0ms preprocess, 58.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 58.0ms
Speed: 1.0ms preprocess, 58.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.6ms
Speed: 2.0ms preprocess, 59.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 60.1ms
Speed: 1.0ms preprocess, 60.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.9ms
Speed: 1.1ms preprocess, 59.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 59.0ms
Speed: 1.0ms preprocess, 59.0ms i

In [1]:
import cv2
import mediapipe as mp
from ultralytics import YOLO
import time
import numpy as np
from collections import deque

class PoseDetector:
    def __init__(self):
        # MediaPipe and YOLO initialization
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            min_detection_confidence=0.7,
            min_tracking_confidence=0.7,
            model_complexity=2
        )
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.yolo_model = YOLO('yolov8n-pose.pt')
        
        # Performance monitoring
        self.fps_history = deque(maxlen=30)
        self.detection_history = deque(maxlen=30)
        self.joint_consistency = deque(maxlen=30)
        
        # Model switching parameters
        self.current_model = 'mediapipe'
        self.last_switch_time = time.time()
        self.switch_cooldown = 10
        self.switch_pending = False
        
        # Joint tracking
        self.key_joints = {
            'shoulders': ([11, 12], [5, 2]),
            'hips': ([23, 24], [11, 8]),
            'knees': ([25, 26], [12, 9])
        }
        self.previous_positions = None
        
        # Threshold values
        self.movement_threshold = 10
        self.shoulder_movement_threshold = 5
        self.squat_threshold = 0.25
        self.standing_hip_knee_ratio = 0.9
        self.sitting_hip_knee_ratio = 0.4
        self.arm_raised_threshold = 0.2
        
        # State tracking
        self.posture_history = deque(maxlen=5)
        self.blocked_counter = 0
        self.blocked_threshold = 3
        
        # Visualization parameters
        self.box_color = (0, 255, 0)  # Green
        self.box_thickness = 2
        self.text_color = (255, 255, 255)  # White
        self.text_bg_color = (0, 0, 0)  # Black
        self.font_scale = 0.8
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        
    def detect_posture(self, landmarks, model_type='mediapipe'):
        """
        Detect the current posture based on pose landmarks
        Returns: string indicating the detected posture
        """
        # Get frame dimensions
        h, w = self.current_frame_shape[:2]
        
        # Extract keypoints based on model type
        if model_type == 'mediapipe':
            keypoints = np.array([[lm.x * w, lm.y * h, lm.visibility] 
                                for lm in landmarks.landmark])
            # Check visibility for MediaPipe
            visible_points = [kp[2] > 0.5 for kp in keypoints]
        else:  # YOLO
            # YOLO keypoints are already in pixel coordinates
            keypoints = landmarks  # landmarks is already the numpy array
            # Check confidence for YOLO
            visible_points = [kp[2] > 0.5 for kp in keypoints]
            
        if sum(visible_points) < 15:  # Require at least 15 visible keypoints
            self.blocked_counter += 1
            if self.blocked_counter > self.blocked_threshold:
                return "blocked"
            return "uncertain"
            
        self.blocked_counter = 0
        
        # Get relevant joint positions
        if model_type == 'mediapipe':
            # MediaPipe indices
            left_shoulder = keypoints[11]
            right_shoulder = keypoints[12]
            left_hip = keypoints[23]
            right_hip = keypoints[24]
            left_knee = keypoints[25]
            right_knee = keypoints[26]
            left_ankle = keypoints[27]
            right_ankle = keypoints[28]
            left_wrist = keypoints[15]
            right_wrist = keypoints[16]
        else:
            # YOLO indices
            left_shoulder = keypoints[5]
            right_shoulder = keypoints[6]
            left_hip = keypoints[11]
            right_hip = keypoints[12]
            left_knee = keypoints[13]
            right_knee = keypoints[14]
            left_ankle = keypoints[15]
            right_ankle = keypoints[16]
            left_wrist = keypoints[9]
            right_wrist = keypoints[10]
        
        # Calculate average heights
        shoulder_height = (left_shoulder[1] + right_shoulder[1]) / 2
        hip_height = (left_hip[1] + right_hip[1]) / 2
        knee_height = (left_knee[1] + right_knee[1]) / 2
        ankle_height = (left_ankle[1] + right_ankle[1]) / 2
        
        # Calculate relative heights
        hip_knee_distance = abs(hip_height - knee_height)
        knee_ankle_distance = abs(knee_height - ankle_height)
        
        # Check if arms are raised
        wrist_shoulder_threshold = shoulder_height + (self.arm_raised_threshold * h)
        arms_raised = (left_wrist[1] < wrist_shoulder_threshold or 
                      right_wrist[1] < wrist_shoulder_threshold)
        
        # Calculate hip-knee ratio for posture detection
        hip_knee_ratio = hip_knee_distance / knee_ankle_distance if knee_ankle_distance > 0 else 0
        
        # Store current pose in history
        current_pose = {
            'shoulder_height': shoulder_height,
            'hip_height': hip_height,
            'knee_height': knee_height,
            'hip_knee_ratio': hip_knee_ratio,
            'arms_raised': arms_raised
        }
        self.posture_history.append(current_pose)
        
        # Determine posture based on measurements and history
        if len(self.posture_history) >= 3:
            recent_ratios = [pose['hip_knee_ratio'] for pose in self.posture_history]
            avg_ratio = sum(recent_ratios) / len(recent_ratios)
            
            if avg_ratio > self.standing_hip_knee_ratio:
                if arms_raised:
                    return "standing with raised arms"
                return "standing"
            elif avg_ratio < self.sitting_hip_knee_ratio:
                return "sitting"
            elif self.sitting_hip_knee_ratio <= avg_ratio <= self.standing_hip_knee_ratio:
                if hip_height < knee_height:
                    return "squatting"
                else:
                    return "bending"
        
        return "transitioning"
    
    def calculate_bounding_box(self, landmarks, model_type='mediapipe'):
        """Calculate bounding box coordinates from landmarks"""
        if model_type == 'mediapipe':
            points = []
            for landmark in landmarks.landmark:
                points.append([landmark.x, landmark.y])
            points = np.array(points)
            
            # Convert normalized coordinates to pixel coordinates
            h, w = self.current_frame_shape[:2]
            points[:, 0] *= w
            points[:, 1] *= h
        else:  # YOLO
            # For YOLO, landmarks is already a numpy array of shape (17, 3)
            points = landmarks[:, :2]  # Take only x, y coordinates

        # Add padding to bounding box
        padding = 30
        x_min = max(0, int(np.min(points[:, 0])) - padding)
        x_max = min(self.current_frame_shape[1], int(np.max(points[:, 0])) + padding)
        y_min = max(0, int(np.min(points[:, 1])) - padding)
        y_max = min(self.current_frame_shape[0], int(np.max(points[:, 1])) + padding)

        return x_min, y_min, x_max, y_max

    def draw_action_box(self, frame, bbox, action):
        """Draw bounding box and action text"""
        x_min, y_min, x_max, y_max = bbox
        
        # Draw the bounding box
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), 
                     self.box_color, self.box_thickness)

        # Prepare action text
        text = f"Action: {action}"
        text_size = cv2.getTextSize(text, self.font, self.font_scale, 2)[0]

        # Draw text background
        text_x = x_min
        text_y = y_min - 10 if y_min - 10 > text_size[1] else y_max + 30
        cv2.rectangle(frame, 
                     (text_x, text_y - text_size[1] - 5),
                     (text_x + text_size[0] + 10, text_y + 5),
                     self.text_bg_color, -1)

        # Draw text
        cv2.putText(frame, text, (text_x + 5, text_y),
                    self.font, self.font_scale, self.text_color, 2)

    def process_frame(self, frame):
        """Process a single frame and return the annotated frame"""
        self.current_frame_shape = frame.shape
        start_time = time.time()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Process with both models
        mp_results = self.pose.process(frame_rgb)
        yolo_results = self.yolo_model(frame)
        
        # Calculate FPS
        fps = 1.0 / (time.time() - start_time)
        self.fps_history.append(fps)
        
        # Get confidence scores
        mp_confidence = 0
        yolo_confidence = 0
        posture = "No pose detected"
        
        if mp_results.pose_landmarks:
            visibilities = [lm.visibility for lm in mp_results.pose_landmarks.landmark]
            mp_confidence = np.mean(visibilities) * (1 + np.std(visibilities))
            
        if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
            yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
            if len(yolo_kpts) > 0:  # Check if any keypoints were detected
                yolo_kpts = yolo_kpts[0]  # Shape: (17, 3) for 17 keypoints with x, y, conf
                confidences = yolo_kpts[:, 2]  # Get confidence values
                yolo_confidence = np.mean(confidences) * (1 + np.std(confidences))
        
        # Model switching logic
        current_time = time.time()
        if current_time - self.last_switch_time >= self.switch_cooldown:
            if mp_confidence > yolo_confidence and self.current_model != 'mediapipe':
                self.current_model = 'mediapipe'
                self.last_switch_time = current_time
            elif yolo_confidence > mp_confidence and self.current_model != 'yolo':
                self.current_model = 'yolo'
                self.last_switch_time = current_time

        # Draw landmarks and calculate bounding box
        bbox = None
        if self.current_model == 'mediapipe':
            if mp_results.pose_landmarks:
                posture = self.detect_posture(mp_results.pose_landmarks, 'mediapipe')
                self.mp_drawing.draw_landmarks(
                    frame,
                    mp_results.pose_landmarks,
                    self.mp_pose.POSE_CONNECTIONS,
                    self.mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=2),
                    self.mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                )
                bbox = self.calculate_bounding_box(mp_results.pose_landmarks, 'mediapipe')
        else:  # YOLO
            if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
                yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
                if len(yolo_kpts) > 0:
                    yolo_kpts = yolo_kpts[0]  # Get first person's keypoints
                    posture = self.detect_posture(yolo_kpts, 'yolo')
                    # Draw keypoints
                    for x, y, conf in yolo_kpts:
                        if conf > 0.5:
                            cv2.circle(frame, (int(x), int(y)), 4, (0, 255, 0), -1)
                    bbox = self.calculate_bounding_box(yolo_kpts, 'yolo')
                
        # Draw bounding box and action state if person is detected
        if bbox and posture != "blocked" and posture != "No pose detected":
            self.draw_action_box(frame, bbox, posture)
        
        # Check for blocked state
        if posture == "blocked":
            cv2.putText(frame, "BLOCKED", (frame.shape[1]//2 - 60, frame.shape[0]//2),
                       self.font, 1.5, (0, 0, 255), 3)
        
        # Calculate cooldown remaining
        cooldown_remaining = max(0, self.switch_cooldown - (current_time - self.last_switch_time))
        
        # Display metrics
        metrics_text = [
            f"FPS: {np.mean(self.fps_history):.1f}",
            f"MediaPipe Conf: {mp_confidence:.2f}",
            f"YOLO Conf: {yolo_confidence:.2f}",
            f"Current Model: {self.current_model}",
            f"Switch Cooldown: {cooldown_remaining:.1f}s",
            f"Posture: {posture}"
        ]
        
        for i, text in enumerate(metrics_text):
            cv2.putText(frame, text, (10, 30 + i * 30),
                       self.font, 0.7, (0, 255, 0), 2)
        
        return frame

def main():
    """Main function to run the pose detection system"""
    cap = cv2.VideoCapture(0)
    cv2.namedWindow('Enhanced Pose Detection', cv2.WINDOW_NORMAL)
    cv2.setWindowProperty('Enhanced Pose Detection', 
                         cv2.WND_PROP_FULLSCREEN, cv2.WINDOW_FULLSCREEN)
    
    detector = PoseDetector()
    
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Failed to read frame from camera")
            break
            
        processed_frame = detector.process_frame(frame)
        cv2.imshow('Enhanced Pose Detection', processed_frame)
        
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()




  _torch_pytree._register_pytree_node(


0: 480x640 1 person, 94.3ms
Speed: 3.0ms preprocess, 94.3ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.1ms
Speed: 1.0ms preprocess, 68.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)





0: 480x640 1 person, 71.3ms
Speed: 1.4ms preprocess, 71.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 71.5ms
Speed: 2.5ms preprocess, 71.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 74.0ms
Speed: 1.0ms preprocess, 74.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.2ms
Speed: 2.0ms preprocess, 70.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.2ms
Speed: 1.0ms preprocess, 68.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.7ms
Speed: 1.0ms preprocess, 68.7ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 73.0ms
Speed: 2.1ms preprocess, 73.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 74.4ms
Speed: 2.0ms preprocess, 74.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480

Speed: 1.0ms preprocess, 73.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 71.7ms
Speed: 1.0ms preprocess, 71.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.1ms
Speed: 2.0ms preprocess, 68.1ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.9ms
Speed: 2.0ms preprocess, 67.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 76.1ms
Speed: 2.0ms preprocess, 76.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.3ms
Speed: 0.2ms preprocess, 69.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.4ms
Speed: 1.0ms preprocess, 72.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.6ms
Speed: 2.3ms preprocess, 69.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 74.1ms
Speed: 1.6ms preprocess, 74.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 77.2ms
Speed: 1.1ms preprocess, 77.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 74.5ms
Speed: 1.5ms preprocess, 74.5ms inference, 2.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 76.0ms
Speed: 2.0ms preprocess, 76.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 76.3ms
Speed: 1.3ms preprocess, 76.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 77.9ms
Speed: 1.0ms preprocess, 77.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 71.1ms
Speed: 1.0ms preprocess, 71.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.2ms
Speed: 1.0ms preprocess, 70.2ms inference, 1.0ms postprocess per image at shape (1, 3, 4

Speed: 0.6ms preprocess, 71.7ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.4ms
Speed: 1.0ms preprocess, 68.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.3ms
Speed: 1.0ms preprocess, 67.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.8ms
Speed: 1.5ms preprocess, 67.8ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.0ms
Speed: 1.0ms preprocess, 72.0ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 72.5ms
Speed: 1.0ms preprocess, 72.5ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.1ms
Speed: 1.5ms preprocess, 68.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 71.7ms
Speed: 1.0ms preprocess, 71.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 71.6ms
Speed: 2.0ms preprocess, 71.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.9ms
Speed: 1.1ms preprocess, 68.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 71.3ms
Speed: 1.6ms preprocess, 71.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.7ms
Speed: 2.0ms preprocess, 70.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


In [13]:
import cv2
import mediapipe as mp
import numpy as np
from ultralytics import YOLO
import time
from collections import deque
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import json

In [14]:
class PoseDetector:
    def __init__(self):
        # MediaPipe and YOLO initialization
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            min_detection_confidence=0.7,
            min_tracking_confidence=0.7,
            model_complexity=2
        )
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.yolo_model = YOLO('yolov8n-pose.pt')
        
        # Performance monitoring
        self.fps_history = deque(maxlen=30)
        self.detection_history = deque(maxlen=30)
        self.joint_consistency = deque(maxlen=30)
        
        # Model switching parameters
        self.current_model = 'mediapipe'
        self.last_switch_time = time.time()
        self.switch_cooldown = 10
        self.switch_pending = False
        
        # Joint tracking
        self.key_joints = {
            'shoulders': ([11, 12], [5, 2]),
            'hips': ([23, 24], [11, 8]),
            'knees': ([25, 26], [12, 9])
        }
        self.previous_positions = None
        
        # Threshold values
        self.movement_threshold = 10
        self.shoulder_movement_threshold = 5
        self.squat_threshold = 0.25
        self.standing_hip_knee_ratio = 0.9
        self.sitting_hip_knee_ratio = 0.4
        self.arm_raised_threshold = 0.2
        
        # State tracking
        self.posture_history = deque(maxlen=5)
        self.blocked_counter = 0
        self.blocked_threshold = 3
        
        # Visualization parameters
        self.box_color = (0, 255, 0)  # Green
        self.box_thickness = 2
        self.text_color = (255, 255, 255)  # White
        self.text_bg_color = (0, 0, 0)  # Black
        self.font_scale = 0.8
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        
    def detect_posture(self, landmarks, model_type='mediapipe'):
        """
        Detect the current posture based on pose landmarks
        Returns: string indicating the detected posture
        """
        # Get frame dimensions
        h, w = self.current_frame_shape[:2]
        
        # Extract keypoints based on model type
        if model_type == 'mediapipe':
            keypoints = np.array([[lm.x * w, lm.y * h, lm.visibility] 
                                for lm in landmarks.landmark])
            # Check visibility for MediaPipe
            visible_points = [kp[2] > 0.5 for kp in keypoints]
        else:  # YOLO
            # YOLO keypoints are already in pixel coordinates
            keypoints = landmarks  # landmarks is already the numpy array
            # Check confidence for YOLO
            visible_points = [kp[2] > 0.5 for kp in keypoints]
            
        if sum(visible_points) < 15:  # Require at least 15 visible keypoints
            self.blocked_counter += 1
            if self.blocked_counter > self.blocked_threshold:
                return "blocked"
            return "uncertain"
            
        self.blocked_counter = 0
        
        # Get relevant joint positions
        if model_type == 'mediapipe':
            # MediaPipe indices
            left_shoulder = keypoints[11]
            right_shoulder = keypoints[12]
            left_hip = keypoints[23]
            right_hip = keypoints[24]
            left_knee = keypoints[25]
            right_knee = keypoints[26]
            left_ankle = keypoints[27]
            right_ankle = keypoints[28]
            left_wrist = keypoints[15]
            right_wrist = keypoints[16]
        else:
            # YOLO indices
            left_shoulder = keypoints[5]
            right_shoulder = keypoints[6]
            left_hip = keypoints[11]
            right_hip = keypoints[12]
            left_knee = keypoints[13]
            right_knee = keypoints[14]
            left_ankle = keypoints[15]
            right_ankle = keypoints[16]
            left_wrist = keypoints[9]
            right_wrist = keypoints[10]
        
        # Calculate average heights
        shoulder_height = (left_shoulder[1] + right_shoulder[1]) / 2
        hip_height = (left_hip[1] + right_hip[1]) / 2
        knee_height = (left_knee[1] + right_knee[1]) / 2
        ankle_height = (left_ankle[1] + right_ankle[1]) / 2
        
        # Calculate relative heights
        hip_knee_distance = abs(hip_height - knee_height)
        knee_ankle_distance = abs(knee_height - ankle_height)
        
        # Check if arms are raised
        wrist_shoulder_threshold = shoulder_height + (self.arm_raised_threshold * h)
        arms_raised = (left_wrist[1] < wrist_shoulder_threshold or 
                      right_wrist[1] < wrist_shoulder_threshold)
        
        # Calculate hip-knee ratio for posture detection
        hip_knee_ratio = hip_knee_distance / knee_ankle_distance if knee_ankle_distance > 0 else 0
        
        # Store current pose in history
        current_pose = {
            'shoulder_height': shoulder_height,
            'hip_height': hip_height,
            'knee_height': knee_height,
            'hip_knee_ratio': hip_knee_ratio,
            'arms_raised': arms_raised
        }
        self.posture_history.append(current_pose)
        
        # Determine posture based on measurements and history
        if len(self.posture_history) >= 3:
            recent_ratios = [pose['hip_knee_ratio'] for pose in self.posture_history]
            avg_ratio = sum(recent_ratios) / len(recent_ratios)
            
            if avg_ratio > self.standing_hip_knee_ratio:
                if arms_raised:
                    return "standing with raised arms"
                return "standing"
            elif avg_ratio < self.sitting_hip_knee_ratio:
                return "sitting"
            elif self.sitting_hip_knee_ratio <= avg_ratio <= self.standing_hip_knee_ratio:
                if hip_height < knee_height:
                    return "squatting"
                else:
                    return "bending"
        
        return "transitioning"
    
    def calculate_bounding_box(self, landmarks, model_type='mediapipe'):
        """Calculate bounding box coordinates from landmarks"""
        if model_type == 'mediapipe':
            points = []
            for landmark in landmarks.landmark:
                points.append([landmark.x, landmark.y])
            points = np.array(points)
            
            # Convert normalized coordinates to pixel coordinates
            h, w = self.current_frame_shape[:2]
            points[:, 0] *= w
            points[:, 1] *= h
        else:  # YOLO
            # For YOLO, landmarks is already a numpy array of shape (17, 3)
            points = landmarks[:, :2]  # Take only x, y coordinates

        # Add padding to bounding box
        padding = 30
        x_min = max(0, int(np.min(points[:, 0])) - padding)
        x_max = min(self.current_frame_shape[1], int(np.max(points[:, 0])) + padding)
        y_min = max(0, int(np.min(points[:, 1])) - padding)
        y_max = min(self.current_frame_shape[0], int(np.max(points[:, 1])) + padding)

        return x_min, y_min, x_max, y_max

    def draw_action_box(self, frame, bbox, action):
        """Draw bounding box and action text"""
        x_min, y_min, x_max, y_max = bbox
        
        # Draw the bounding box
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), 
                     self.box_color, self.box_thickness)

        # Prepare action text
        text = f"Action: {action}"
        text_size = cv2.getTextSize(text, self.font, self.font_scale, 2)[0]

        # Draw text background
        text_x = x_min
        text_y = y_min - 10 if y_min - 10 > text_size[1] else y_max + 30
        cv2.rectangle(frame, 
                     (text_x, text_y - text_size[1] - 5),
                     (text_x + text_size[0] + 10, text_y + 5),
                     self.text_bg_color, -1)

        # Draw text
        cv2.putText(frame, text, (text_x + 5, text_y),
                    self.font, self.font_scale, self.text_color, 2)

    def process_frame(self, frame):
        """Process a single frame and return the annotated frame"""
        self.current_frame_shape = frame.shape
        start_time = time.time()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Process with both models
        mp_results = self.pose.process(frame_rgb)
        yolo_results = self.yolo_model(frame)
        
        # Calculate FPS
        fps = 1.0 / (time.time() - start_time)
        self.fps_history.append(fps)
        
        # Get confidence scores
        mp_confidence = 0
        yolo_confidence = 0
        posture = "No pose detected"
        
        if mp_results.pose_landmarks:
            visibilities = [lm.visibility for lm in mp_results.pose_landmarks.landmark]
            mp_confidence = np.mean(visibilities) * (1 + np.std(visibilities))
            
        if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
            yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
            if len(yolo_kpts) > 0:  # Check if any keypoints were detected
                yolo_kpts = yolo_kpts[0]  # Shape: (17, 3) for 17 keypoints with x, y, conf
                confidences = yolo_kpts[:, 2]  # Get confidence values
                yolo_confidence = np.mean(confidences) * (1 + np.std(confidences))
        
        # Model switching logic
        current_time = time.time()
        if current_time - self.last_switch_time >= self.switch_cooldown:
            if mp_confidence > yolo_confidence and self.current_model != 'mediapipe':
                self.current_model = 'mediapipe'
                self.last_switch_time = current_time
            elif yolo_confidence > mp_confidence and self.current_model != 'yolo':
                self.current_model = 'yolo'
                self.last_switch_time = current_time

        # Draw landmarks and calculate bounding box
        bbox = None
        if self.current_model == 'mediapipe':
            if mp_results.pose_landmarks:
                posture = self.detect_posture(mp_results.pose_landmarks, 'mediapipe')
                self.mp_drawing.draw_landmarks(
                    frame,
                    mp_results.pose_landmarks,
                    self.mp_pose.POSE_CONNECTIONS,
                    self.mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=2),
                    self.mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                )
                bbox = self.calculate_bounding_box(mp_results.pose_landmarks, 'mediapipe')
        else:  # YOLO
            if len(yolo_results) > 0 and yolo_results[0].keypoints is not None:
                yolo_kpts = yolo_results[0].keypoints.data.cpu().numpy()
                if len(yolo_kpts) > 0:
                    yolo_kpts = yolo_kpts[0]  # Get first person's keypoints
                    posture = self.detect_posture(yolo_kpts, 'yolo')
                    # Draw keypoints
                    for x, y, conf in yolo_kpts:
                        if conf > 0.5:
                            cv2.circle(frame, (int(x), int(y)), 4, (0, 255, 0), -1)
                    bbox = self.calculate_bounding_box(yolo_kpts, 'yolo')
                
        # Draw bounding box and action state if person is detected
        if bbox and posture != "blocked" and posture != "No pose detected":
            self.draw_action_box(frame, bbox, posture)
        
        # Check for blocked state
        if posture == "blocked":
            cv2.putText(frame, "BLOCKED", (frame.shape[1]//2 - 60, frame.shape[0]//2),
                       self.font, 1.5, (0, 0, 255), 3)
        
        # Calculate cooldown remaining
        cooldown_remaining = max(0, self.switch_cooldown - (current_time - self.last_switch_time))
        
        # Display metrics
        metrics_text = [
            f"FPS: {np.mean(self.fps_history):.1f}",
            f"MediaPipe Conf: {mp_confidence:.2f}",
            f"YOLO Conf: {yolo_confidence:.2f}",
            f"Current Model: {self.current_model}",
            f"Switch Cooldown: {cooldown_remaining:.1f}s",
            f"Posture: {posture}"
        ]
        
        for i, text in enumerate(metrics_text):
            cv2.putText(frame, text, (10, 30 + i * 30),
                       self.font, 0.7, (0, 255, 0), 2)
        
        return frame

In [16]:
def main():
    """Enhanced main function with analytics"""
    detector = PoseDetector()
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    try:
        while True:
            success, frame = cap.read()
            if not success:
                print("Error: Failed to capture frame. Exiting...")
                break

            # Process the frame using the pose detector
            processed_frame = detector.process_frame(frame)

            # Display the processed frame
            cv2.imshow('Advanced Pose Detection', processed_frame)

            # Break the loop if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Exiting...")
                break
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Release resources and close windows
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()




  _torch_pytree._register_pytree_node(


0: 480x640 1 person, 162.0ms
Speed: 5.6ms preprocess, 162.0ms inference, 14.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 2.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 



Speed: 1.0ms preprocess, 64.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 63.0ms
Speed: 1.0ms preprocess, 63.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.0ms
Speed: 1.0ms preprocess, 70.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.0ms
Speed: 1.0ms preprocess, 67.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.6ms
Speed: 1.0ms preprocess, 64.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,

Speed: 2.0ms preprocess, 70.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.0ms
Speed: 1.0ms preprocess, 69.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.0ms
Speed: 1.0ms preprocess, 69.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 69.0ms
Speed: 1.0ms preprocess, 69.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 73.0ms
Speed: 1.0ms preprocess, 73.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.0ms
Speed: 2.0ms preprocess, 70.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 70.0ms
Speed: 1.0ms preprocess, 70.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,


0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 73.0ms
Speed: 1.0ms preprocess, 73.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 65.0ms
Speed: 1.0ms preprocess, 65.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.0ms
Speed: 2.0ms preprocess, 67.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 64.0ms
Speed: 1.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 48

Speed: 1.0ms preprocess, 66.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.1ms
Speed: 1.0ms preprocess, 67.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.8ms
Speed: 1.0ms preprocess, 68.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 77.1ms
Speed: 1.0ms preprocess, 77.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.1ms
Speed: 2.0ms preprocess, 66.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 68.0ms
Speed: 1.0ms preprocess, 68.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 67.3ms
Speed: 1.0ms preprocess, 67.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person,