In [6]:
import numpy as np
import cv2
import dlib

In [8]:
# dlib's frontal face detector; get_combined_eyes calls it on a grayscale frame.
detector = dlib.get_frontal_face_detector()
# Landmark predictor loaded from a model file whose path is relative to the
# working directory. get_combined_eyes reads landmark indices 36-47 from its
# result to locate the two eyes.
predictor = dlib.shape_predictor('./backend/shape_predictor_68_face_landmarks.dat')

In [10]:
def preprocess_eye_region(frame, eye_coords, target_size=(30, 36)):
    """
    Crop the eye region out of a frame, resize it and scale it by 1/255.

    Args:
        frame: Image array indexed as frame[y, x] (the caller passes a BGR frame).
        eye_coords: Iterable of (x, y) landmark coordinates outlining the eye.
        target_size: Passed unchanged to cv2.resize as its dsize argument,
            i.e. (width, height) of the returned patch.
    Returns:
        The resized crop as a float32 array divided by 255.
    Raises:
        ValueError: If eye_coords is empty, or if the landmark bounding box
            has no area inside the frame.
    """
    # Materialise once so one-shot iterables (generators) also work.
    points = list(eye_coords)
    if not points:
        raise ValueError("eye_coords must contain at least one (x, y) point.")

    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    frame_height, frame_width = frame.shape[:2]

    # Clamp the bounding box to the frame. Landmarks can fall outside the
    # image, and an unclamped negative bound would make the slice below wrap
    # around and silently crop the wrong pixels.
    x_min = min(max(min(xs), 0), frame_width)
    x_max = min(max(max(xs), 0), frame_width)
    y_min = min(max(min(ys), 0), frame_height)
    y_max = min(max(max(ys), 0), frame_height)

    # cv2.resize fails with an opaque assertion on an empty array; report the
    # actual problem instead.
    if x_max <= x_min or y_max <= y_min:
        raise ValueError(
            f"Eye bounding box x=[{x_min}, {x_max}) y=[{y_min}, {y_max}) "
            f"has no area inside a {frame_width}x{frame_height} frame."
        )

    # Cropping the eye region based on the extremities of the landmarks
    cropped_eye = frame[y_min:y_max, x_min:x_max]

    # Resizing the cropped eye region to the target size
    resized_eye = cv2.resize(cropped_eye, target_size)

    return resized_eye.astype(np.float32) / 255.0

In [11]:

def normalize_head_pose(head_pose_data, rotation_scale=180, translation_max_displacement=None):
    """
    Normalizes the head pose data.

    Args:
        head_pose_data: Sequence whose first three entries are the rotation
            components and whose remaining entries are the translation components.
        rotation_scale: Divisor applied to every rotation component
            (e.g. 180 for degrees, np.pi for radians).
        translation_max_displacement: Optional (max_x, max_y, max_z); each
            translation component is divided by the matching maximum. If None
            (or empty), the translation is z-scored instead.

    Returns:
        A flat list: normalized rotation followed by normalized translation.

    Notes:
        The z-score statistics are computed across the translation components
        of this single sample, not across a dataset. When those components are
        all equal (zero spread) the normalized translation is all zeros rather
        than NaN.
    """
    # Normalize rotation vectors
    normalized_rotation = np.asarray(head_pose_data[:3], dtype=float) / rotation_scale

    translation_vector = np.asarray(head_pose_data[3:], dtype=float)

    # Normalize translation vectors. Compare against None and length explicitly
    # so that a NumPy array can be passed without an ambiguous-truth-value error.
    if translation_max_displacement is not None and len(translation_max_displacement) > 0:
        max_x, max_y, max_z = translation_max_displacement
        normalized_translation = translation_vector / np.array([max_x, max_y, max_z], dtype=float)
    elif translation_vector.size == 0:
        normalized_translation = translation_vector
    else:
        # Standard deviation normalization
        std_dev = np.std(translation_vector)
        if std_dev == 0:
            # Dividing by a zero spread would produce NaN in the targets.
            normalized_translation = np.zeros_like(translation_vector)
        else:
            mean_val = np.mean(translation_vector)
            normalized_translation = (translation_vector - mean_val) / std_dev

    return np.concatenate([normalized_rotation, normalized_translation]).tolist()

In [12]:
def get_combined_eyes(frame):
    """
    Build a side-by-side image of both eyes for the first detected face.

    Args:
        frame: The input image frame (converted from BGR to grayscale for detection).
    Returns:
        The left and right eye patches stacked horizontally, or None when the
        detector finds no face.
    Raises:
        ValueError: If the stacked image is not 60 columns wide.
    """
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Only the first detection is ever used.
    first_face = next(iter(detector(grayscale)), None)
    if first_face is None:
        return None

    shape = predictor(grayscale, first_face)

    def _landmark_points(start, stop):
        # (x, y) pairs for landmark indices in [start, stop).
        return [(shape.part(idx).x, shape.part(idx).y) for idx in range(start, stop)]

    left_points = _landmark_points(36, 42)
    right_points = _landmark_points(42, 48)

    # Crop and resize each eye from the original (colour) frame.
    left_patch = preprocess_eye_region(frame, left_points)
    right_patch = preprocess_eye_region(frame, right_points)

    # Place the two patches next to each other.
    eyes_pair = np.hstack([left_patch, right_patch])

    # Guard against an unexpected patch width.
    if eyes_pair.shape[1] != 60:
        raise ValueError("Combined eyes region does not match the expected width.")

    return eyes_pair

In [43]:
import cv2
import os
import pandas as pd
import json
from glob import glob

# Assuming normalize_head_pose and get_combined_eyes are defined as before

def get_screen_size(metadata_file_path):
    """
    Read the screen dimensions from a JSON metadata file.

    Args:
        metadata_file_path: Path to a JSON file containing the keys
            'screenWidth' and 'screenHeight'.
    Returns:
        A (screenWidth, screenHeight) tuple with the values as stored in the file.
    """
    with open(metadata_file_path, 'r') as handle:
        screen_info = json.load(handle)
    width = screen_info['screenWidth']
    height = screen_info['screenHeight']
    return width, height

def parse_head_pose_data(row):
    """
    Parse the two comma-separated head pose fields of a row into floats.

    Args:
        row: Mapping-like object with string fields 'head_pose' and
            'head_translation'; surrounding double quotes are stripped.
    Returns:
        A single list: the rotation values followed by the translation values.
    """
    # Look up both fields before parsing either one.
    raw_fields = (row['head_pose'], row['head_translation'])

    values = []
    for raw in raw_fields:
        values.extend(float(token) for token in raw.strip('"').split(','))
    return values

def prepare_dataset(base_dir):
    """
    Build the eye-image inputs and normalized targets from recorded sessions.

    Every immediate subdirectory of base_dir is treated as one session and is
    expected to contain 'metadata.json', 'data.csv' and an 'images' folder.
    Sessions missing any of these are reported and skipped. Rows whose image
    cannot be read, or in which no face is detected, are reported and skipped
    so that X never contains None.

    Args:
        base_dir: Directory containing one subdirectory per session.
    Returns:
        (X, Y) where X is a list of combined-eye images and Y is a list of
        rows: the cursor position divided by the screen size, followed by the
        12 normalized eye values and the normalized head pose.
    """
    X, Y = [], []

    column_names = ['image_path', 'cursor_x', 'cursor_y', 'eye_x1', 'eye_y1', 'eye_x2', 'eye_y2', 'eye_x3', 'eye_y3', 'eye_x4', 'eye_y4', 'eye_x5', 'eye_y5', 'eye_x6', 'eye_y6', 'head_pose', 'head_translation']

    for subdir in glob(os.path.join(base_dir, '*/')):
        metadata_file_path = os.path.join(subdir, 'metadata.json')
        # Check before opening: one incomplete session must not abort the run.
        if not os.path.exists(metadata_file_path):
            print(f"Metadata file not found for directory: {subdir}")
            continue

        screen_width, screen_height = get_screen_size(metadata_file_path)
        print(f"Screen size: {screen_width}x{screen_height}")

        data_file_path = os.path.join(subdir, 'data.csv')
        img_folder = os.path.join(subdir, 'images')

        if not os.path.exists(data_file_path) or not os.path.exists(img_folder):
            print(f"Data file or image folder not found for directory: {subdir}")
            continue

        data = pd.read_csv(data_file_path, header=None, names=column_names)

        for _, row in data.iterrows():
            # The path is used exactly as stored in the CSV; it is not joined
            # with img_folder.
            img_path = row['image_path']

            img = cv2.imread(img_path)
            if img is None:
                print(f"Image not found: {img_path}")
                continue

            combined_eyes = get_combined_eyes(img)
            if combined_eyes is None:
                # Appending None would leave X and Y aligned but unusable.
                print(f"No face detected, skipping: {img_path}")
                continue

            cursor_x, cursor_y = row['cursor_x'], row['cursor_y']
            # Columns 3..14 by position (eye_x1 .. eye_y6).
            eye_box_pupil_data = row.iloc[3:15].tolist()
            head_pose_data = parse_head_pose_data(row)

            # Even positions are x values (scaled by width), odd positions are
            # y values (scaled by height).
            normalized_eye_box_pupil_data = [
                float(coord) / screen_width if i % 2 == 0 else float(coord) / screen_height
                for i, coord in enumerate(eye_box_pupil_data)
            ]
            normalized_head_pose_data = normalize_head_pose(head_pose_data)

            # Append to datasets
            Y.append([cursor_x / screen_width, cursor_y / screen_height] + normalized_eye_box_pupil_data + normalized_head_pose_data)
            X.append(combined_eyes)
    return X, Y

# Example usage: build the dataset from every session folder under ./data
# (path is relative to the notebook's working directory). The "Screen size"
# lines in the output are printed by prepare_dataset, one per session.
base_dir = './data'
X, Y = prepare_dataset(base_dir)


Screen size: 1707x960
Screen size: 1536x864
Screen size: 1707x960
Screen size: 1280x720
Screen size: 1707x960
Screen size: 1440x900
Screen size: 1707x960
Screen size: 1707x960


In [44]:
# Sanity check: X and Y are appended together, so their lengths should match.
len(X), len(Y)

(195, 195)

In [48]:
# Inspect one target row: 2 cursor values, then 12 eye values, then the
# normalized head pose (rotation followed by translation).
Y[1]

[0.9982425307557118,
 0.004166666666666667,
 0.15700058582308143,
 0.184375,
 0.14586994727592267,
 0.17708333333333334,
 0.01757469244288225,
 0.014583333333333334,
 0.19390743995313414,
 0.18541666666666667,
 0.18922085530169888,
 0.18020833333333333,
 0.018746338605741066,
 0.013541666666666667,
 -0.016105093800901846,
 -0.0004740759114885289,
 0.0026191724543021613,
 -0.7459235665832105,
 -0.6675662195088178,
 1.4134897860920284]