In [1]:
import cv2
import numpy as np
import mediapipe as mp

In [2]:
# Constants intended for writing text and drawing on the images.
# NOTE(review): the functions visible in this notebook pass literal colors,
# thicknesses and font scales to OpenCV instead of using these names.
RED = (0, 0, 255)  # Red, expressed in OpenCV's (B, G, R) channel order
THICKNESS = 2  # Presumably a stroke thickness in pixels -- TODO confirm
FONT_SCALE = 0.75  # Presumably the font scale factor for cv2.putText -- TODO confirm

In [3]:
def face_detector_init():
    """
    Initialize the face detector.

    The detector is built from the frontal-face Haar cascade file located
    in the cv2.data.haarcascades directory.

    :return face_detector: Object of type cv2.CascadeClassifier.
    :raises Exception: If the cascade classifier is empty after loading.
    """
    cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    face_detector = cv2.CascadeClassifier(cascade_path)

    # A classifier that is not empty has been loaded successfully
    if not face_detector.empty():
        return face_detector

    raise Exception("Failed to load cascade classifier.")

In [4]:
def camera_feed_init(video_name=None):
    """
    Create the video capture object used as camera feed.

    :param video_name: Source handed to cv2.VideoCapture (e.g. the path of
                       a video file). If it is falsy (None, empty string),
                       the capture device with index 0 is used instead.
    :return camera_feed: Object of type cv2.VideoCapture. It is returned
                         as-is, without checking that it could be opened.
    """
    # Fall back on the capture device 0 when no video has been specified
    source = video_name if video_name else 0

    return cv2.VideoCapture(source)

In [5]:
def get_image_dimensions(camera_feed):
    """
    Get the dimensions of the images delivered by the camera feed.

    One frame is read from camera_feed to measure its size, so this frame
    is consumed and will not be returned by the next call to read().

    :param camera_feed: Capture object exposing isOpened() and read()
                        (e.g. a cv2.VideoCapture).
    :return image_height: Height of the images read from camera_feed [px].
    :return image_width: Width of the images read from camera_feed [px].
    :return image_area: Area of the images read from camera_feed [px^2].
    :raises Exception: If the feed is not opened or if no frame could be
                       read from it.
    """
    # A closed feed used to make the function silently return None, which
    # then failed in the caller when unpacking the three expected values
    if not camera_feed.isOpened():
        raise Exception("Failed to get the dimension of the image: the camera feed is not opened.")

    frame_retrieved, frame = camera_feed.read()

    if not frame_retrieved:
        raise Exception("Failed to get the dimension of the image.")

    # The first two entries of the shape are the number of rows and columns
    image_height, image_width = frame.shape[:2]
    image_area = image_height * image_width

    return image_height, image_width, image_area

In [6]:
def compute_closeness(frame, image_area, face_detector, display_face_bounding_box, x_min_threshold, x_max_threshold):
    """
    Compute the closeness of the human with respect to the camera, defined
    as the area of the face bounding box divided by image_area.

    :param frame: An OpenCV (BGR) image that has been read from the camera feed.
                  The bounding box is drawn on it if requested.
    :param image_area: Area of the images read from the camera feed [px^2].
    :param face_detector: Detector exposing detectMultiScale()
                          (e.g. a cv2.CascadeClassifier).
    :param display_face_bounding_box: Boolean to draw the face bounding
                                      box on the image.
    :param x_min_threshold: Smallest accepted x coordinate of the left
                            edge of the bounding box [px].
    :param x_max_threshold: Largest accepted x coordinate of the right
                            edge of the bounding box [px].
    :return closeness: Ratio between the bounding box area and image_area,
                       or None if no face has been detected or if the first
                       detected face lies outside the horizontal range.
    """
    # The face detector works on grayscale images
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detections = face_detector.detectMultiScale(gray_frame, scaleFactor=1.1,
                                                minNeighbors=5, minSize=(10, 10))

    # No face in the image
    if len(detections) < 1:
        return None

    # Only the first detection is used: (column, row, width, height)
    left, top, box_width, box_height = detections[0]
    right = left + box_width

    # Discard faces whose bounding box leaves the accepted horizontal range
    if left < x_min_threshold or right > x_max_threshold:
        return None

    closeness = (box_width * box_height) / image_area

    if display_face_bounding_box:
        # Red bounding box drawn on the original frame
        cv2.rectangle(frame, (left, top), (right, top + box_height), (0, 0, 255), 2)

    return closeness


In [7]:
def closeness_to_proxemics_space(closeness):
    """
    Map a closeness value to a proxemics space and to the corresponding
    attention score.

    :param closeness: Closeness value, compared against fixed lower bounds.
    :return proxemics_space: Name of the proxemics space ("Intimate space",
                             "Personal space", "Social space" or
                             "Public space").
    :return closeness_attention_score: Attention score associated with the
                                       proxemics space (1.0, 0.75, 0.5 or 0.25).
    """
    # (lower closeness bound [-], proxemics space, attention score),
    # sorted from the closest space to the farthest one
    PROXEMICS_LEVELS = (
        (0.1,    "Intimate space", 1.0),
        (0.02,   "Personal space", 0.75),
        (0.0025, "Social space",   0.5),
    )

    # The first lower bound reached by the closeness gives the space
    for lower_bound, proxemics_space, closeness_attention_score in PROXEMICS_LEVELS:
        if closeness >= lower_bound:
            return proxemics_space, closeness_attention_score

    # Below every bound: the human is in the public space
    return "Public space", 0.25

User's head orientation

In [8]:
# Generic 3D model of a human face, with the nose tip as origin.
# These rows are passed as object points to cv2.solvePnP in
# compute_gaze_orientation, where row i is paired by position with the
# i-th 2D image point: the row order must therefore stay the same as the
# order of MODEL_LANDMARKS_3D_IDX.
# NOTE(review): the unit of the coordinates is not stated -- presumably
# arbitrary model units; confirm before relying on the translation vector.
MODEL_LANDMARKS_3D = np.array([(0.0, 0.0, 0.0),            # Nose tip
                               (225.0, 170.0, -135.0),     # Right eye right corner
                               (150.0, -150.0, -125.0),    # Right mouth corner
                               (0.0, -330.0, -65.0),       # Chin
                               (-225.0, 170.0, -135.0),    # Left eye left corner
                               (-150.0, -150.0, -125.0)])  # Left mouth corner

In [9]:
def face_landmarker_init():
    """
    Initialize the face landmarker.

    :return face_landmarker: An object of type mediapipe.solutions.face_mesh.FaceMesh,
                             configured for at most one face, with refined
                             landmarks and detection/tracking confidence
                             thresholds of 0.5.
    """
    # Options handed to the FaceMesh constructor
    landmarker_options = dict(static_image_mode=False,
                              max_num_faces=1,
                              refine_landmarks=True,
                              min_detection_confidence=0.5,
                              min_tracking_confidence=0.5)

    return mp.solutions.face_mesh.FaceMesh(**landmarker_options)

In [10]:
# Indices of the MediaPipe landmarks that we use in our 3D model
NOSE_LANDMARK_IDX = 1            # Nose tip
RIGHT_EYE_RIGHT_CORNER_LANDMARK_IDX = 33    # Right eye, right corner
RIGHT_MOUTH_CORNER_LANDMARK_IDX = 61        # Right mouth corner
CHIN_LANDMARK_IDX = 199                     # Chin
LEFT_EYE_LEFT_CORNER_LANDMARK_IDX = 263     # Left eye, left corner
LEFT_MOUTH_CORNER_LANDMARK_IDX = 291        # Left mouth corner

# The indices are listed in ascending order, which is also the order of the
# rows of MODEL_LANDMARKS_3D. compute_gaze_orientation collects the 2D
# landmarks while walking the MediaPipe landmarks in ascending index order,
# so both orders have to be kept in sync.
MODEL_LANDMARKS_3D_IDX = [NOSE_LANDMARK_IDX, RIGHT_EYE_RIGHT_CORNER_LANDMARK_IDX,
                          RIGHT_MOUTH_CORNER_LANDMARK_IDX, CHIN_LANDMARK_IDX,
                          LEFT_EYE_LEFT_CORNER_LANDMARK_IDX, LEFT_MOUTH_CORNER_LANDMARK_IDX]

In [11]:
def get_camera_intrinsics(image_width, image_height):
    """
    Approximate the intrinsic parameters and the distortion coefficients
    of the camera from the dimensions of its images.

    :param image_width: Width of the images read from the camera feed [px].
    :param image_height: Height of the images read from the camera feed [px].
    :return distortion_coeffs: The distortion coefficients of the camera
                               (4x1 array of zeros).
    :return camera_matrix: The 3x3 matrix containing the intrinsic
                           parameters of the camera.
    """

    # The lens is assumed to be free of distortion
    distortion_coeffs = np.zeros((4, 1))

    # The focal length is approximated by the width of the image, the
    # optical center by the center of the image, and there is no skew
    focal_length = image_width
    optical_center_x = image_width / 2
    optical_center_y = image_height / 2
    skew = 0

    camera_matrix = np.array([[focal_length, skew,         optical_center_x],
                              [0,            focal_length, optical_center_y],
                              [0,            0,            1]],
                             dtype=np.float64)

    return distortion_coeffs, camera_matrix

In [12]:
def compute_gaze_orientation(frame, face_landmarker, image_width, image_height, distortion_coeffs, camera_matrix, display_gaze_orientation):
    """
    Compute the gaze orientation of the human with respect to the camera.

    :param frame: An OpenCV (BGR) image that has been read from the camera
                  feed. The gaze orientation is drawn on it if requested.
    :param face_landmarker: Object of type mediapipe.solutions.face_mesh.FaceMesh.
    :param image_width: Width of the images read from camera_feed [px]
    :param image_height: Height of the images read from camera_feed [px].
    :param distortion_coeffs: The distortion coefficients of the camera.
    :param camera_matrix: The matrix containing the intrinsic parameters
                           of the camera.
    :param display_gaze_orientation: Boolean to display the gaze 
                                     orientation on the image.
    :return gaze_orientation: Tuple containing the pitch, yaw and roll 
                              angles of the human face with respect to 
                              the camera (Euler angles returned by
                              cv2.RQDecomp3x3), or None if no face has
                              been detected in the image or if the pose
                              estimation failed.
    """

    # Convert the frame to RGB for MediaPipe (OpenCV uses the BGR format)
    RGB_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Compute the face landmarks of the faces detected in the image
    face_landmarker_results = face_landmarker.process(RGB_image)

    # No face has been detected
    if not face_landmarker_results.multi_face_landmarks:
        return None

    # We only consider the first detected head
    face_landmarks = face_landmarker_results.multi_face_landmarks[0]

    # Get the 2D coordinates of the landmarks of interest in the image referential.
    # The landmarks are looked up directly in the order of MODEL_LANDMARKS_3D_IDX,
    # so that row i of the 2D points always corresponds to row i of MODEL_LANDMARKS_3D.
    # MediaPipe normalizes the landmarks with respect to the image dimension,
    # so we convert them back to (truncated) pixel coordinates.
    model_landmarks_2D = np.array(
        [(int(face_landmarks.landmark[idx].x * image_width),
          int(face_landmarks.landmark[idx].y * image_height))
         for idx in MODEL_LANDMARKS_3D_IDX],
        dtype=np.float64)

    # Solve the pose estimation optimization problem
    # Use cv2.solvePnP to get rotation and translation vectors
    success, rotation_vector, translation_vector = cv2.solvePnP(
        MODEL_LANDMARKS_3D,              # 3D model points
        model_landmarks_2D,              # 2D image points
        camera_matrix,                   # Camera matrix (intrinsic parameters)
        distortion_coeffs,               # Distortion coefficients
        flags=cv2.SOLVEPNP_ITERATIVE
    )

    # Without a valid pose there is no meaningful orientation to report
    if not success:
        return None

    # Get the rotation matrix from the rotation vector
    rotation_matrix, _ = cv2.Rodrigues(rotation_vector)

    # Get the Euler angles corresponding to the rotation matrix (gaze orientation);
    # the remaining outputs of the decomposition are not needed
    gaze_orientation = cv2.RQDecomp3x3(rotation_matrix)[0]

    if display_gaze_orientation:
        # Project a 3D point (0, 0, 1000.0) onto the image plane.
        # We use this to draw a line sticking out of the nose.
        nose_end_point_3D = np.array([(0.0, 0.0, 1000.0)])
        nose_end_point_2D, _ = cv2.projectPoints(
            nose_end_point_3D, rotation_vector, translation_vector, camera_matrix, distortion_coeffs)

        # The nose tip is the first landmark of the model
        nose_landmark = (int(model_landmarks_2D[0][0]), int(model_landmarks_2D[0][1]))
        nose_end_point_2D_xy = (int(nose_end_point_2D[0][0][0]), int(nose_end_point_2D[0][0][1]))

        # Draw a line representing the gaze orientation
        cv2.line(frame, nose_landmark, nose_end_point_2D_xy, (0, 0, 255), 2)

        # Draw the landmarks of interest
        for landmark in model_landmarks_2D:
            cv2.circle(frame, (int(landmark[0]), int(landmark[1])), 3, (0, 0, 255), -1)

    return gaze_orientation


In [13]:
import numpy as np

def compute_gaze_orientation_attention_score(gaze_orientation):
    """
    Compute an attention score based on the gaze orientation (pitch and yaw angles).

    The pitch score is 1 for a pitch of 5 degrees and decreases linearly
    with the distance to this value, reaching 0 at a distance of 55 degrees
    (i.e. for a pitch of 60 or -50 degrees). The yaw score is 1 for a yaw
    of 0 degrees and decreases linearly to 0 at +/- 40 degrees. The final
    score is the product of both scores. Nothing is printed.

    NOTE(review): the pitch score is maximal at the "minimum" pitch
    threshold rather than null below it -- confirm that this is intended.

    :param gaze_orientation: Sequence whose first two entries are the pitch
                             and yaw angles [deg].
    :return gaze_orientation_attention_score: Attention score corresponding
                                              to the gaze orientation
                                              (within [0, 1]).
    """
    pitch_angle = gaze_orientation[0]
    yaw_angle = gaze_orientation[1]

    # Angles [deg] used to normalize the pitch and the yaw
    min_pitch_threshold = 5
    max_pitch_threshold = 60
    max_yaw_threshold = 40

    # Relative distance of the pitch to the minimum pitch threshold
    pitch_range = max_pitch_threshold - min_pitch_threshold
    pitch_deviation = abs(pitch_angle - min_pitch_threshold) / pitch_range

    # Relative distance of the yaw to the straight-ahead direction
    yaw_deviation = abs(yaw_angle) / max_yaw_threshold

    # Each score is clipped so that it cannot become negative
    pitch_score = max(0, 1 - pitch_deviation)
    yaw_score = max(0, 1 - yaw_deviation)

    return pitch_score * yaw_score



Attention estimator

In [14]:
def compute_attention_estimation(closeness_attention_score, gaze_orientation_attention_score, alpha):
    """
    Combine the closeness and the gaze orientation attention scores into a
    single attention score.

    :param closeness_attention_score: Attention score corresponding 
                                      to the proxemics space.
    :param gaze_orientation_attention_score: Attention score corresponding
                                             to the human gaze orientation.
    :param alpha: Weight given to the gaze orientation score; the closeness
                  score is weighted by (1 - alpha).
    :return attention_score: alpha * gaze_orientation_attention_score
                             + (1 - alpha) * closeness_attention_score.
    """

    # Contribution of each score to the final attention score
    gaze_contribution = alpha * gaze_orientation_attention_score
    closeness_contribution = (1 - alpha) * closeness_attention_score

    return gaze_contribution + closeness_contribution


In [15]:
def main(video_name="Week-2/Participant_3/camera2.avi"):
    """
    Main loop of the program. For each frame of the camera feed, compute 
    the closeness and the gaze orientation of the human in front of the camera 
    and use those to compute the human attention score.

    The scores and angles of the last frame in which a face was found are
    displayed on every frame. A "Pay attention" message is displayed for a
    fixed number of frames whenever the pitch or the yaw leaves its
    accepted range. The loop stops when no more frame can be read (e.g. at
    the end of a video file) or when 'q' is pressed.

    :param video_name: Source forwarded to camera_feed_init (path of a
                       video file, or None to use the capture device 0).
    """

    # Gaze limits [deg] outside of which the warning message is triggered
    MIN_PITCH_ANGLE = 5
    MAX_PITCH_ANGLE = 60
    MAX_YAW_ANGLE = 40
    # Number of frames during which the message stays displayed once triggered
    MESSAGE_DURATION = 100

    # Initialize the face detector
    face_detector = face_detector_init()

    # Open the camera feed
    camera_feed = camera_feed_init(video_name=video_name)
    if camera_feed is None or not camera_feed.isOpened():
        print("Error: Could not open camera feed.")
        return

    face_landmarker = None
    try:
        # Initialize the face landmarker
        face_landmarker = face_landmarker_init()

        # Get the dimensions of the images captured by the camera and
        # approximate the intrinsic parameters of the camera.
        image_height, image_width, image_area = get_image_dimensions(camera_feed)
        distortion_coeffs, camera_matrix = get_camera_intrinsics(image_width, image_height)

        # Variables to store the last detected scores and angles
        last_closeness_attention_score = None
        last_gaze_orientation_attention_score = None
        last_attention_score = None
        last_pitch_angle = None
        last_yaw_angle = None

        # Remaining number of frames during which the message is displayed
        message_counter = 0

        while camera_feed.isOpened():
            # Acquire image from camera
            frame_retrieved, frame = camera_feed.read()

            # No frame left (end of the video or lost camera): isOpened()
            # can stay True in that case, so leave the loop explicitly
            # instead of spinning forever
            if not frame_retrieved:
                break

            # Compute closeness and gaze orientation
            closeness = compute_closeness(frame, image_area, face_detector, display_face_bounding_box=True,
                                          x_min_threshold=10, x_max_threshold=1200)
            gaze_orientation = compute_gaze_orientation(frame, face_landmarker, image_width, image_height,
                                                        distortion_coeffs, camera_matrix, display_gaze_orientation=True)

            # If a face has been detected in the image, and thus if a 
            # closeness and a gaze orientation could be computed
            if closeness is not None and gaze_orientation is not None:
                proxemics_space, closeness_attention_score = closeness_to_proxemics_space(closeness)
                gaze_orientation_attention_score = compute_gaze_orientation_attention_score(gaze_orientation)
                attention_score = compute_attention_estimation(closeness_attention_score,
                                                               gaze_orientation_attention_score, alpha=0.7)

                pitch_angle, yaw_angle = gaze_orientation[0], gaze_orientation[1]

                # Update the last known scores and angles
                last_closeness_attention_score = closeness_attention_score
                last_gaze_orientation_attention_score = gaze_orientation_attention_score
                last_attention_score = attention_score
                last_pitch_angle = pitch_angle
                last_yaw_angle = yaw_angle

                # Trigger the warning when the pitch or the yaw is out of range.
                # The yaw is compared in absolute value so that looking away to
                # either side is caught, like in the gaze attention score.
                pitch_in_range = MIN_PITCH_ANGLE <= pitch_angle <= MAX_PITCH_ANGLE
                yaw_in_range = abs(yaw_angle) <= MAX_YAW_ANGLE
                if not (pitch_in_range and yaw_in_range):
                    message_counter = MESSAGE_DURATION

            # Display the closeness, gaze orientation, and attention scores on the frame
            overlays = (
                ("Closeness Attention Score: {:.2f}", last_closeness_attention_score, (30, 30)),
                ("Gaze Orientation Attention Score: {:.2f}", last_gaze_orientation_attention_score, (30, 60)),
                ("Final Attention Score: {:.2f}", last_attention_score, (30, 90)),
                ("Pitch Angle: {:.2f} degrees", last_pitch_angle, (700, 30)),
                ("Yaw Angle: {:.2f} degrees", last_yaw_angle, (700, 60)),
            )
            for template, value, position in overlays:
                # Nothing to display until a face has been found once
                if value is not None:
                    cv2.putText(frame, template.format(value), position,
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

            # Display the persistent message if triggered
            if message_counter > 0:
                cv2.putText(frame, "Pay attention! You can do it!", 
                            (200, 120), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 2)
                message_counter -= 1

            # Show the image with scores
            cv2.imshow("Attention Estimation", frame)

            # Press q to stop the program
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera feed and the face landmarker, and destroy the
        # OpenCV window, even if an error occurred in the loop
        camera_feed.release()
        if face_landmarker is not None:
            face_landmarker.close()
        cv2.destroyAllWindows()


In [16]:
# Run the attention estimation on the camera feed; the loop in main()
# exits when 'q' is pressed while the OpenCV window has the focus.
main()

