In [7]:
pip show mediapipe


Name: mediapipe
Version: 0.10.18
Summary: MediaPipe is the simplest way for researchers and developers to build world-class ML solutions and applications for mobile, edge, cloud and the web.
Home-page: https://github.com/google/mediapipe
Author: The MediaPipe Authors
Author-email: mediapipe@google.com
License: Apache 2.0
Location: /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages
Requires: absl-py, attrs, flatbuffers, jax, jaxlib, matplotlib, numpy, opencv-contrib-python, protobuf, sentencepiece, sounddevice
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [9]:
import cv2
import mediapipe as mp

In [10]:
# Short aliases for the MediaPipe solution modules used by the cells below
mp_face_mesh = mp.solutions.face_mesh
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils

# One-pixel green pen with radius-1 landmark dots
drawing_spec = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=1, circle_radius=1)

def getLandmarks(image, face_mesh):
    """
    Run the face-mesh model on an image and collect its landmarks.

    Args:
        image: Image handed unchanged to ``face_mesh.process``; its ``shape``
            supplies the height (``shape[0]``) and width (``shape[1]``) used
            for pixel conversion.
        face_mesh: Object exposing ``process(image)`` whose result has a
            ``multi_face_landmarks`` attribute.

    Returns:
        A tuple ``(landmarks, relative_landmarks, results)``:
        ``landmarks`` is the landmark collection of the first detected face
        (an empty list when nothing is detected), ``relative_landmarks`` is a
        list of integer pixel ``(x, y)`` tuples covering the landmarks of
        every detected face, and ``results`` is the raw ``process`` output.
    """
    results = face_mesh.process(image)
    faces = results.multi_face_landmarks

    # No detection: return empty containers next to the raw results.
    if not faces:
        return [], [], results

    # Normalized coordinates are scaled by width (x) and height (y) and truncated.
    relative_landmarks = [
        (int(point.x * image.shape[1]), int(point.y * image.shape[0]))
        for face in faces
        for point in face.landmark
    ]
    return faces[0].landmark, relative_landmarks, results


In [22]:
def main(camera_index=1):
    """
    Run the live webcam demo: face mesh, iris markers and per-eye gaze labels.

    Frames are read from the camera, mirrored horizontally, passed through
    MediaPipe FaceMesh and displayed in an OpenCV window until 'q' is pressed.
    The camera, the FaceMesh instance and the OpenCV windows are released even
    when the loop is interrupted or raises.

    Args:
        camera_index: Index handed to ``cv2.VideoCapture``. Defaults to 1
            (the previously hard-coded value); use 0 for the default camera.
    """
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        # Without this check the loop below would keep printing
        # 'Ignoring empty camera frame.' with no way to leave it.
        cap.release()
        print(f'Could not open camera {camera_index}.')
        return

    # Named property ids instead of the raw numbers 3, 4 and 10.
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 420)
    cap.set(cv2.CAP_PROP_BRIGHTNESS, 100)

    try:
        # The context manager closes the FaceMesh graph when the loop ends.
        with mp_face_mesh.FaceMesh(
            max_num_faces=1,
            refine_landmarks=True,  # Enables iris landmarks
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        ) as face_mesh:
            while True:
                # Capture webcam frames
                success, frame = cap.read()
                if not success:
                    print('Ignoring empty camera frame.')
                    continue

                # Flip the frame horizontally and convert BGR to RGB
                frame = cv2.flip(frame, 1)
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Get landmarks and results
                landmarks, relative_landmarks, results = getLandmarks(rgb_frame, face_mesh)

                # Draw on a copy so the captured frame stays untouched.
                frame_output = frame.copy()

                if len(landmarks) > 0:
                    # NOTE(review): getRightEye / getRightEyeRect / getLeftEye /
                    # getLeftEyeRect are not defined in the visible part of this
                    # notebook; they must be provided by another cell.
                    # RIGHT EYE
                    rightEyeImg = getRightEye(frame_output, landmarks)
                    # These dimensions are overwritten by the rect call just below.
                    rightEyeHeight, rightEyeWidth, _ = rightEyeImg.shape

                    xRightEye, yRightEye, rightEyeWidth, rightEyeHeight = getRightEyeRect(frame_output, landmarks)
                    cv2.rectangle(frame_output, (xRightEye, yRightEye),
                                  (xRightEye + rightEyeWidth, yRightEye + rightEyeHeight), (200, 21, 36), 2)

                    # LEFT EYE
                    leftEyeImg = getLeftEye(frame_output, landmarks)
                    leftEyeHeight, leftEyeWidth, _ = leftEyeImg.shape

                    xLeftEye, yLeftEye, leftEyeWidth, leftEyeHeight = getLeftEyeRect(frame_output, landmarks)
                    cv2.rectangle(frame_output, (xLeftEye, yLeftEye),
                                  (xLeftEye + leftEyeWidth, yLeftEye + leftEyeHeight), (200, 21, 36), 2)

                    for face_landmarks in results.multi_face_landmarks:
                        # Draw the face mesh tessellation
                        mp_drawing.draw_landmarks(
                            image=frame_output,
                            landmark_list=face_landmarks,
                            connections=mp_face_mesh.FACEMESH_TESSELATION,
                            landmark_drawing_spec=None,
                            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
                        )

                        # Draw irises
                        draw_iris(frame_output, face_landmarks.landmark, RIGHT_IRIS, (0, 255, 0))  # Right iris
                        draw_iris(frame_output, face_landmarks.landmark, LEFT_IRIS, (0, 0, 255))   # Left iris

                        # Gaze detection
                        image_shape = frame_output.shape

                        # Get bounding boxes for both eyes
                        right_eye_bbox = get_eye_bbox(face_landmarks.landmark, RIGHT_EYE, image_shape)
                        left_eye_bbox = get_eye_bbox(face_landmarks.landmark, LEFT_EYE, image_shape)

                        # Get iris landmarks
                        right_iris = [face_landmarks.landmark[i] for i in RIGHT_IRIS]
                        left_iris = [face_landmarks.landmark[i] for i in LEFT_IRIS]

                        # Detect gaze for each eye
                        right_gaze = detect_gaze(right_iris, right_eye_bbox, image_shape)
                        left_gaze = detect_gaze(left_iris, left_eye_bbox, image_shape)

                        # Draw bounding boxes and annotate gaze direction
                        cv2.rectangle(frame_output, (right_eye_bbox[0], right_eye_bbox[1]),
                                      (right_eye_bbox[0] + right_eye_bbox[2], right_eye_bbox[1] + right_eye_bbox[3]),
                                      (0, 255, 0), 2)
                        cv2.rectangle(frame_output, (left_eye_bbox[0], left_eye_bbox[1]),
                                      (left_eye_bbox[0] + left_eye_bbox[2], left_eye_bbox[1] + left_eye_bbox[3]),
                                      (0, 255, 0), 2)

                        cv2.putText(frame_output, f"Right Eye: {right_gaze}", (50, 50),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                        cv2.putText(frame_output, f"Left Eye: {left_gaze}", (50, 100),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

                # Display the resulting frame
                cv2.imshow('MediaPipe FaceMesh', frame_output)

                # Press 'q' to exit the loop
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Runs on normal exit, on exceptions and on a kernel interrupt alike.
        cap.release()
        cv2.destroyAllWindows()

# NOTE: the cells defining draw_iris, get_eye_bbox, detect_gaze and the
# RIGHT_/LEFT_ eye and iris index lists appear further down in this notebook;
# they must have been executed before main() is started here.
if __name__ == "__main__":
    main()


In [12]:
# Landmark indices used as iris markers: 469..472 and 474..477 (four per iris)
RIGHT_IRIS = list(range(469, 473))
LEFT_IRIS = list(range(474, 478))

def draw_iris(image, landmarks, indices, color, radius=2):
    """
    Draw a filled circle on the image for each selected landmark.

    Args:
        image: Image to draw on (modified in place by ``cv2.circle``). Only
            the first two entries of ``image.shape`` (height, width) are
            read, so both 2-D and 3-D images are accepted.
        landmarks: Indexable collection whose items expose normalized
            ``x`` and ``y`` attributes.
        indices: Iterable of positions in ``landmarks`` to mark.
        color: Colour passed straight to ``cv2.circle``.
        radius: Circle radius in pixels. Defaults to 2, the previously
            hard-coded value.
    """
    # Slice instead of a three-way unpack so a single-channel image does not raise.
    image_height, image_width = image.shape[:2]
    for idx in indices:
        point = landmarks[idx]
        center = (int(point.x * image_width), int(point.y * image_height))
        cv2.circle(image, center, radius, color, -1)  # thickness -1 draws a filled circle

In [13]:

def drawFaceMesh(image, results):
    """
    Overlay the face-mesh tessellation of every detected face on the image.

    The image is marked writeable, drawn on in place using the module-level
    ``drawing_spec`` for both landmarks and connections, and returned.
    When ``results.multi_face_landmarks`` is empty or None nothing is drawn.
    """
    image.flags.writeable = True
    tesselation = mp.solutions.face_mesh.FACEMESH_TESSELATION if results.multi_face_landmarks else None
    for face in results.multi_face_landmarks or ():
        mp_drawing.draw_landmarks(
            image=image,
            landmark_list=face,
            connections=tesselation,
            landmark_drawing_spec=drawing_spec,
            connection_drawing_spec=drawing_spec,
        )
    return image

In [14]:
# MediaPipe module aliases (re-assigned here so this cell also runs on its own)
mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh

# Landmark indices outlining each eye, consumed by get_eye_bbox
RIGHT_EYE = [33, 133, 160, 159, 158, 144, 153, 154, 155]
LEFT_EYE = [362, 263, 387, 386, 385, 373, 380, 374, 381]

# Landmark indices of the four points tracked on each iris
LEFT_IRIS = [474, 475, 476, 477]
RIGHT_IRIS = [469, 470, 471, 472]

def get_eye_bbox(landmarks, indices, image_shape):
    """
    Compute the pixel bounding box enclosing the selected landmarks.

    Args:
        landmarks: Indexable collection whose items expose normalized
            ``x`` and ``y`` attributes.
        indices: Positions in ``landmarks`` that outline the eye.
        image_shape: Sequence whose first two entries are image height
            and width.

    Returns:
        ``(x, y, width, height)`` as integers, where ``x``/``y`` is the
        top-left corner after truncating the extreme coordinates.
    """
    xs = []
    ys = []
    for idx in indices:
        point = landmarks[idx]
        xs.append(point.x * image_shape[1])  # scale by width
        ys.append(point.y * image_shape[0])  # scale by height

    # Truncate each extreme separately, then derive the extent from them.
    left, right = int(min(xs)), int(max(xs))
    top, bottom = int(min(ys)), int(max(ys))

    return (left, top, right - left, bottom - top)

def detect_gaze(iris_landmarks, eye_bbox, image_shape):
    """
    Describe gaze direction from where the iris centre sits inside the eye box.

    The iris centre is the mean of the given landmarks, converted to pixels.
    Its position relative to the eye bounding box is split into three zones
    per axis (below 0.4, between 0.4 and 0.6, above 0.6).

    Args:
        iris_landmarks: Non-empty sequence of objects with normalized ``x``
            and ``y`` attributes.
        eye_bbox: ``(x, y, width, height)`` of the eye in pixels.
        image_shape: Sequence whose first two entries are image height
            and width.

    Returns:
        One of "Looking Straight Ahead", "Looking <Left|Right>",
        "Looking <Up|Down>" or "Looking <Up|Down>-<Left|Right>".
        An axis on which the eye box has zero extent is reported as centred
        instead of raising ZeroDivisionError.
    """
    # Mean of the normalized iris landmarks, converted to pixel coordinates
    count = len(iris_landmarks)
    iris_center_x = sum(landmark.x for landmark in iris_landmarks) / count * image_shape[1]
    iris_center_y = sum(landmark.y for landmark in iris_landmarks) / count * image_shape[0]

    # Eye bounding box edges
    eye_left = eye_bbox[0]
    eye_right = eye_bbox[0] + eye_bbox[2]
    eye_top = eye_bbox[1]
    eye_bottom = eye_bbox[1] + eye_bbox[3]

    # A box truncated to integer pixels can collapse to zero width or height
    # (for example when the lid landmarks share a pixel row); the relative
    # position is undefined there, so fall back to the neutral midpoint.
    span_x = eye_right - eye_left
    span_y = eye_bottom - eye_top
    iris_x_relative = (iris_center_x - eye_left) / span_x if span_x != 0 else 0.5
    iris_y_relative = (iris_center_y - eye_top) / span_y if span_y != 0 else 0.5

    # Thresholds separating the three gaze zones on each axis
    horizontal_thresholds = (0.4, 0.6)  # Left, center, right
    vertical_thresholds = (0.4, 0.6)    # Up, center, down

    # Determine horizontal gaze direction
    if iris_x_relative < horizontal_thresholds[0]:
        horizontal_gaze = "Left"
    elif iris_x_relative > horizontal_thresholds[1]:
        horizontal_gaze = "Right"
    else:
        horizontal_gaze = "Center"

    # Determine vertical gaze direction
    if iris_y_relative < vertical_thresholds[0]:
        vertical_gaze = "Up"
    elif iris_y_relative > vertical_thresholds[1]:
        vertical_gaze = "Down"
    else:
        vertical_gaze = "Center"

    # Combine horizontal and vertical directions for detailed gaze description
    if horizontal_gaze == "Center" and vertical_gaze == "Center":
        return "Looking Straight Ahead"
    elif horizontal_gaze == "Center":
        return f"Looking {vertical_gaze}"
    elif vertical_gaze == "Center":
        return f"Looking {horizontal_gaze}"
    else:
        return f"Looking {vertical_gaze}-{horizontal_gaze}"



In [15]:
import numpy as np
import cv2

def head_pose_estimation(face_landmarks, image_shape):
    """
    Estimate head orientation from six facial landmarks via OpenCV solvePnP.

    Args:
        face_landmarks: Indexable collection whose items expose normalized
            ``x`` and ``y`` attributes; indices 1, 152, 33, 263, 78 and 308
            are read.
        image_shape: Sequence whose first two entries are image height
            and width.

    Returns:
        ``(pitch, yaw, roll)`` taken from the Euler angles reported by
        ``cv2.decomposeProjectionMatrix`` (in degrees).
    """
    height, width = image_shape[0], image_shape[1]

    # Pinhole camera approximation: focal length equals the image width and
    # the principal point is the image centre.
    camera_matrix = np.array(
        [
            [width, 0, width // 2],
            [0, width, height // 2],
            [0, 0, 1],
        ],
        dtype="double",
    )

    # Assume no lens distortion
    dist_coeffs = np.zeros((4, 1))

    # Landmark index paired with its 3D position on a generic face model
    reference_points = (
        (1, (0.0, 0.0, 0.0)),             # Nose tip
        (152, (0.0, -330.0, -65.0)),      # Chin
        (33, (-225.0, 170.0, -135.0)),    # Left eye corner
        (263, (225.0, 170.0, -135.0)),    # Right eye corner
        (78, (-150.0, -150.0, -125.0)),   # Left mouth corner
        (308, (150.0, -150.0, -125.0)),   # Right mouth corner
    )
    model_points = np.array([position for _, position in reference_points])

    # Matching 2D pixel coordinates of those landmarks
    image_points = np.array(
        [
            (face_landmarks[index].x * width, face_landmarks[index].y * height)
            for index, _ in reference_points
        ],
        dtype="double",
    )

    # Solve PnP for the rotation and translation of the face model
    success, rotation_vector, translation_vector = cv2.solvePnP(
        model_points, image_points, camera_matrix, dist_coeffs
    )

    # Rotation vector -> 3x3 rotation matrix -> 3x4 [R|t] matrix
    rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
    pose_matrix = np.hstack((rotation_matrix, translation_vector))

    # Element 6 of the decomposition holds the Euler angles
    euler_angles = cv2.decomposeProjectionMatrix(pose_matrix)[6]
    pitch, yaw, roll = euler_angles

    return pitch[0], yaw[0], roll[0]


In [16]:
def combine_head_pose_and_iris(yaw, pitch, iris_x_relative, iris_y_relative):
    """
    Merge head pose and iris position into a single gaze description.

    Each input is sorted into a low / centre / high zone. On each axis the
    head-pose zone wins unless it is centred, in which case the iris zone
    is used.

    Args:
        yaw (float): Rotation around the vertical axis; below -15 is Left,
            above 15 is Right.
        pitch (float): Rotation around the horizontal axis; below -10 is Up,
            above 10 is Down.
        iris_x_relative (float): Iris x-position within the eye
            (0 = left, 1 = right); below 0.4 is Left, above 0.6 is Right.
        iris_y_relative (float): Iris y-position within the eye
            (0 = top, 1 = bottom); below 0.4 is Up, above 0.6 is Down.
    Returns:
        str: "Looking Straight Ahead", "Looking <direction>" or
        "Looking <vertical>-<horizontal>".
    """
    def zone(value, bounds, below, above):
        # Strict comparisons: values equal to a bound count as centred.
        lower, upper = bounds
        if value < lower:
            return below
        if value > upper:
            return above
        return "Center"

    # Head pose zones
    head_horizontal = zone(yaw, (-15, 15), "Left", "Right")
    head_vertical = zone(pitch, (-10, 10), "Up", "Down")

    # Iris zones (relative to the eye)
    iris_horizontal = zone(iris_x_relative, (0.4, 0.6), "Left", "Right")
    iris_vertical = zone(iris_y_relative, (0.4, 0.6), "Up", "Down")

    # Large movements come from the head; the iris refines a centred head.
    final_horizontal = head_horizontal if head_horizontal != "Center" else iris_horizontal
    final_vertical = head_vertical if head_vertical != "Center" else iris_vertical

    # Vertical part first, then horizontal, skipping centred axes.
    parts = [label for label in (final_vertical, final_horizontal) if label != "Center"]
    if not parts:
        return "Looking Straight Ahead"
    return f"Looking {'-'.join(parts)}"
