In [1]:
import cv2
import mediapipe as mp
import numpy as np

In [2]:
# MediaPipe handles shared by the rest of the notebook: the drawing helpers,
# the Face Mesh solution module, and one configured Face Mesh instance.
mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh

# Track at most one face with refined landmarks; detection and tracking
# both use a 0.5 confidence cut-off.
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    static_image_mode=False,
    refine_landmarks=True,
    min_tracking_confidence=0.5,
    min_detection_confidence=0.5,
)

How a Baseline Calibration Helps

Personal Variation:
Every individual has unique facial geometry and a natural resting pose. The generic model might, for example, consider a slight tilt as "not focused," even though that tilt is normal for that person.

Baseline Calibration:
Capture a short run of frames at start-up (say, the first 30) while the user is assumed to be looking directly at the screen, and compute the average Euler angles (pitch, yaw, and roll) over those frames. This average serves as the user's personal baseline.

Relative Comparison:
For subsequent frames, instead of checking whether each angle lies within an absolute ±10° range, you compare the current angles to the baseline. If any difference exceeds its threshold, you can conclude that the head position has changed (e.g., the user may be distracted or may have tilted or turned their head).

In [12]:
# Head-pose focus monitor.
#
# Phase 1 (calibration) averages the head pose over the first successful
# samples to obtain a personal baseline. Phase 2 (monitoring) flags every frame
# whose pose deviates from that baseline by more than a threshold.

# --- Settings ----------------------------------------------------------------
calibration_frames = 30     # number of successful pose samples in the baseline
ANGLE_THRESHOLD_DEG = 20    # max allowed deviation from the baseline, per angle
WINDOW_NAME = "Head Pose Estimation"

# Landmark indices used for pose estimation, in the same order as MODEL_POINTS:
# nose tip, chin, left eye left corner, right eye right corner,
# left mouth corner, right mouth corner.
POSE_LANDMARK_IDS = (1, 152, 33, 263, 61, 291)

# Corresponding 3D points in a generic face coordinate system.
MODEL_POINTS = np.array([
    (0.0, 0.0, 0.0),         # Nose tip
    (0.0, -63.6, -12.5),     # Chin
    (-43.3, 32.7, -26.0),    # Left eye left corner
    (43.3, 32.7, -26.0),     # Right eye right corner
    (-28.9, -28.9, -24.1),   # Left mouth corner
    (28.9, -28.9, -24.1),    # Right mouth corner
], dtype="double")

DIST_COEFFS = np.zeros((4, 1))  # Assuming no lens distortion


def estimate_head_pose(face_landmarks, frame_width, frame_height):
    """Estimate the head pose of one detected face.

    Args:
        face_landmarks: One entry of ``results.multi_face_landmarks``; its
            ``landmark[i].x`` / ``.y`` values are scaled by the frame size to
            get pixel coordinates.
        frame_width: Frame width in pixels (also used as the focal length).
        frame_height: Frame height in pixels.

    Returns:
        ``(pitch, yaw, roll)`` as floats taken from ``cv2.RQDecomp3x3``
        (degrees), or ``None`` when ``cv2.solvePnP`` reports failure.
    """
    image_points = np.array(
        [
            (face_landmarks.landmark[idx].x * frame_width,
             face_landmarks.landmark[idx].y * frame_height)
            for idx in POSE_LANDMARK_IDS
        ],
        dtype="double",
    )

    # Approximate pinhole camera: focal length = frame width, principal point
    # at the frame centre.
    focal_length = frame_width
    camera_matrix = np.array([
        [focal_length, 0, frame_width / 2],
        [0, focal_length, frame_height / 2],
        [0, 0, 1],
    ], dtype="double")

    success, rotation_vector, _translation_vector = cv2.solvePnP(
        MODEL_POINTS, image_points, camera_matrix, DIST_COEFFS,
        flags=cv2.SOLVEPNP_ITERATIVE,
    )
    if not success:
        return None

    # Rotation vector -> rotation matrix -> Euler angles.
    rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
    euler_angles = np.array(cv2.RQDecomp3x3(rotation_matrix)[0]).flatten()
    return float(euler_angles[0]), float(euler_angles[1]), float(euler_angles[2])


def circular_mean_deg(angles_deg):
    """Return the mean of angles in degrees, respecting the ±180° wrap-around.

    A plain arithmetic mean of e.g. 179° and -179° gives 0°, although both
    describe almost the same orientation; averaging unit vectors avoids that.
    """
    radians = np.radians(angles_deg)
    return float(np.degrees(np.arctan2(np.mean(np.sin(radians)),
                                       np.mean(np.cos(radians)))))


def angle_difference_deg(angle, reference):
    """Return the smallest absolute difference between two angles, in degrees."""
    return abs((angle - reference + 180.0) % 360.0 - 180.0)


def read_mirrored_frame(capture):
    """Read one frame and flip it for a selfie view; ``None`` if the read fails."""
    ret, frame = capture.read()
    if not ret:
        return None
    return cv2.flip(frame, 1)


# Start capturing video from the webcam.
cap = cv2.VideoCapture(0)
calibration_angles = []
user_quit = False

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open the webcam (device 0).")

    # --- Calibration phase: collect pose samples for the baseline -------------
    while len(calibration_angles) < calibration_frames:
        frame = read_mirrored_frame(cap)
        if frame is None:
            break
        h, w, _ = frame.shape

        results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for face_landmarks in results.multi_face_landmarks or []:
            angles = estimate_head_pose(face_landmarks, w, h)
            if angles is not None:
                calibration_angles.append(angles)

        # Show progress so the user knows what to do and can abort with 'q'
        # if no face is ever detected.
        progress = f"Calibrating: look at the screen ({len(calibration_angles)}/{calibration_frames})"
        cv2.putText(frame, progress, (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        cv2.imshow(WINDOW_NAME, frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            user_quit = True
            break

    if not user_quit:
        if not calibration_angles:
            # Without samples the baseline would be NaN and every later
            # comparison would silently evaluate to "Not Focused".
            raise RuntimeError("Calibration failed: no head pose could be estimated.")

        # Compute average baseline angles (wrap-aware).
        baseline_pitch = circular_mean_deg([angle[0] for angle in calibration_angles])
        baseline_yaw = circular_mean_deg([angle[1] for angle in calibration_angles])
        baseline_roll = circular_mean_deg([angle[2] for angle in calibration_angles])

        # --- Monitoring phase ------------------------------------------------
        while True:
            frame = read_mirrored_frame(cap)
            if frame is None:
                break
            h, w, _ = frame.shape

            results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            for face_landmarks in results.multi_face_landmarks or []:
                head_facing_forward = False  # Default state
                angles = estimate_head_pose(face_landmarks, w, h)

                if angles is not None:
                    pitch, yaw, roll = angles

                    # Deviation of each angle from the personal baseline.
                    delta_pitch = angle_difference_deg(pitch, baseline_pitch)
                    delta_yaw = angle_difference_deg(yaw, baseline_yaw)
                    delta_roll = angle_difference_deg(roll, baseline_roll)

                    # Focused only if every angle stays within the threshold.
                    head_facing_forward = (
                        delta_pitch < ANGLE_THRESHOLD_DEG
                        and delta_yaw < ANGLE_THRESHOLD_DEG
                        and delta_roll < ANGLE_THRESHOLD_DEG
                    )
                    status = "Focused" if head_facing_forward else "Not Focused"

                    # Display the computed angles on the frame.
                    cv2.putText(frame, f"Pitch: {pitch:.1f}", (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                    cv2.putText(frame, f"Yaw: {yaw:.1f}", (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                    cv2.putText(frame, f"Roll: {roll:.1f}", (30, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                else:
                    status = "Pose Not Detected"

                # Show the status: green when focused, red otherwise.
                status_color = (0, 255, 0) if head_facing_forward else (0, 0, 255)
                cv2.putText(frame, status, (30, 130), cv2.FONT_HERSHEY_SIMPLEX, 1, status_color, 2)

                # Draw the face mesh on the frame.
                mp_drawing.draw_landmarks(
                    frame, face_landmarks, mp_face_mesh.FACEMESH_TESSELATION,
                    landmark_drawing_spec=None,
                    connection_drawing_spec=mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=1, circle_radius=1),
                )

            cv2.imshow(WINDOW_NAME, frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close windows, even on errors or a kernel
    # interrupt, so the device is not left locked for the next run.
    cap.release()
    cv2.destroyAllWindows()