In [1]:
import os
import cv2
import pickle
import h5py
import numpy as np

In [2]:
def preprocess_frame(frame, target_size=(200, 100)):
    """
    Crops a frame to its largest non-black region, resizes it and scales it to [0, 1].

    Args:
    - frame: Input image as loaded by OpenCV (assumed to be BGR)
    - target_size: Output size passed to cv2.resize, given as (width, height)

    Returns:
    - float32 array holding the cropped and resized frame divided by 255

    Raises:
    - ValueError: if frame is None or no contour is found after thresholding
    """
    if frame is None:
        raise ValueError("Invalid input frame")

    # Build a foreground mask: every pixel brighter than 1 is set to 255
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    mask = cv2.threshold(grayscale, 1, 255, cv2.THRESH_BINARY)[1]

    # Only outer contours are needed to locate the content region
    outlines, _hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        raise ValueError("No contours found in the frame")

    # Bounding box of the contour with the largest area
    left, top, box_w, box_h = cv2.boundingRect(max(outlines, key=cv2.contourArea))

    # Crop from the original (colour) frame, then bring it to the target size
    region = frame[top:top + box_h, left:left + box_w]
    scaled = cv2.resize(region, target_size)

    return scaled.astype(np.float32) / 255.0

In [3]:
def get_h5_file_path(subdir):
    """
    Finds the screen-size .h5 file in the 'Calibration' folder next to subdir.

    Args:
    - subdir: Path of a data directory; its parent directory is expected to
      contain a 'Calibration' directory

    Returns:
    - Path of the first file (in sorted name order) that ends with '.h5' and
      contains 'screenSize' in its name, or None when the 'Calibration'
      directory or such a file does not exist
    """
    # The parent of 'subdir' is treated as the user's data directory
    user_data_dir = os.path.dirname(subdir)
    calibration_dir = os.path.join(user_data_dir, 'Calibration')

    # A missing Calibration folder means "no file", not an exception
    if not os.path.isdir(calibration_dir):
        return None

    # os.listdir order is arbitrary; sort so the result is reproducible
    for file in sorted(os.listdir(calibration_dir)):
        if file.endswith('.h5') and 'screenSize' in file:
            return os.path.join(calibration_dir, file)

    return None

In [4]:

def ask_to_continue(current_dir, processed_dirs_count, subdir_limit=5):
    """
    Prompts the user for confirmation every subdir_limit processed directories.

    Args:
    - current_dir: Directory name shown in the prompt
    - processed_dirs_count: Number of directories processed so far
    - subdir_limit: Prompt whenever the count is a multiple of this value

    Returns:
    - True when no prompt is due or the user answers 'y' (case-insensitive),
      False for any other answer
    """
    at_checkpoint = processed_dirs_count % subdir_limit == 0
    if not at_checkpoint:
        return True

    reply = input(f"Processed {processed_dirs_count} directories up to {current_dir}. Continue? [y/n]: ")
    return reply.lower() == 'y'


In [1]:
def read_h5(h5_file_path):
    """
    Loads every top-level entry of an HDF5 file into memory.

    Args:
    - h5_file_path: Path of the .h5 file to open read-only

    Returns:
    - Dict mapping each top-level key to the result of slicing that entry with [:]
    """
    contents = {}
    with h5py.File(h5_file_path, 'r') as handle:
        for name in handle.keys():
            # Slice while the file is still open so the data is copied out of it
            contents[name] = handle[name][:]
    return contents

def normalize_annotations(annotation, width_pixel, height_pixel):
    """
    Scales a pixel coordinate pair by the screen dimensions.

    Args:
    - annotation: Sequence whose first two items are the x and y values
      (anything float() accepts, e.g. strings read from a text file)
    - width_pixel: Divisor for the x value
    - height_pixel: Divisor for the y value

    Returns:
    - [x / width_pixel, y / height_pixel]
    """
    extents = (width_pixel, height_pixel)
    return [float(annotation[axis]) / extent for axis, extent in enumerate(extents)]

def append_to_pickle(data, filename):
    """
    Appends samples to the pickle file at filename, creating it if necessary.

    Args:
    - data: Mapping whose 'X' and 'Y' entries are iterables of samples to append
    - filename: Path of a pickle file holding a {'X': [...], 'Y': [...]} dict

    The whole file is loaded, extended and rewritten on every call. The new
    content is first written to a temporary sibling file ("<filename>.tmp")
    and then moved over the target, so a failure while dumping leaves the
    previously stored data untouched.
    """
    try:
        with open(filename, 'rb') as file:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # files that were produced by this notebook.
            existing_data = pickle.load(file)
    except FileNotFoundError:
        existing_data = {'X': [], 'Y': []}

    existing_data['X'].extend(data['X'])
    existing_data['Y'].extend(data['Y'])

    # Opening the target directly with 'wb' would truncate it before the dump
    # has succeeded; write next to it and swap the files afterwards instead.
    temp_filename = f"{filename}.tmp"
    try:
        with open(temp_filename, 'wb') as file:
            pickle.dump(existing_data, file)
        os.replace(temp_filename, filename)
    except BaseException:
        # Do not leave a half-written temporary file behind
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
        raise


In [2]:
def process_directory(subdir_info):
    """
    Builds preprocessed images and normalized annotations for one directory.

    Args:
    - subdir_info: Tuple (subdir, base_path); only subdir is used here. A single
      tuple argument is used so the function can be passed to Pool.map.

    Returns:
    - {'X': [preprocessed images], 'Y': [normalized annotations]}, or None when
      the screen-size .h5 file or 'annotation.txt' is missing.

    Images are expected to be named "<index>.jpg", where index is the line
    number (0-based) of the matching entry in 'annotation.txt'. Files that do
    not follow this pattern, have no matching annotation line, or cannot be
    read by OpenCV are skipped.
    """
    subdir, _base_path = subdir_info
    X, Y = [], []  # Collected locally so each worker returns its own result

    h5_file_path = get_h5_file_path(subdir)
    # get_h5_file_path returns None when nothing matches, and
    # os.path.exists(None) raises TypeError, so test for None explicitly.
    if h5_file_path is None or not os.path.exists(h5_file_path):
        print(f"No .h5 file found for user in {subdir}. Skipping.")
        return None  # Skip this directory

    h5_data = read_h5(h5_file_path)
    width_pixel, height_pixel = h5_data['width_pixel'][0], h5_data['height_pixel'][0]

    annotation_file = os.path.join(subdir, 'annotation.txt')
    if not os.path.exists(annotation_file):
        print(f"No annotation file found for directory {subdir}. Skipping.")
        return None  # Skip this directory

    with open(annotation_file, 'r') as ann_file:
        annotations = [line.strip().split() for line in ann_file]

    # os.listdir order is arbitrary; sort so repeated runs yield the same order
    for file in sorted(os.listdir(subdir)):
        if not file.endswith('.jpg'):
            continue

        try:
            image_index = int(file.split('.')[0])
        except ValueError:
            # Not an "<index>.jpg" frame (e.g. "preview.jpg"); nothing to pair it with
            continue
        if not 0 <= image_index < len(annotations):
            continue

        # Columns 24 and 25 are taken as the x/y target in pixels
        # (presumably the on-screen gaze point - TODO confirm against the dataset docs)
        annotation = normalize_annotations(annotations[image_index][24:26], width_pixel, height_pixel)

        image_path = os.path.join(subdir, file)
        image = cv2.imread(image_path)
        if image is not None:
            X.append(preprocess_frame(image))
            Y.append(annotation)

    return {'X': X, 'Y': Y}

In [None]:
from multiprocessing import Pool, cpu_count

def process_images_and_annotations_parallel(base_path, max_dirs=100, batch_size=500):
    """
    Preprocesses image directories under base_path in parallel and appends the
    merged samples to 'data_batch.pkl'.

    Args:
    - base_path: Root directory that is walked recursively for folders containing .jpg files
    - max_dirs: Maximum number of directories handled in one call
    - batch_size: Currently unused; kept for interface compatibility

    Returns:
    - None; the merged result is written through append_to_pickle.

    NOTE: this function name is defined again in later cells of the notebook;
    whichever definition was executed last is the one that is active.
    NOTE(review): multiprocessing workers may fail to resolve functions defined
    in an interactive session on platforms that spawn processes - confirm on
    the target platform.
    """
    # os.walk already yields every directory path prefixed with base_path, so it
    # must not be joined with base_path again (doing so duplicates the prefix
    # whenever base_path is relative). Sorted for a reproducible selection.
    subdirs = sorted(
        (dirpath, base_path)
        for dirpath, _, files in os.walk(base_path)
        if any(file.endswith('.jpg') for file in files)
    )

    # Limit the number of directories to process to prevent excessive memory usage
    subdirs = subdirs[:max_dirs]

    with Pool(processes=cpu_count()) as pool:
        results = pool.map(process_directory, subdirs)

    # process_directory returns None for directories it had to skip
    all_X, all_Y = [], []
    for result in results:
        if result is None:
            continue
        all_X.extend(result['X'])
        all_Y.extend(result['Y'])

    append_to_pickle({'X': all_X, 'Y': all_Y}, 'data_batch.pkl')


In [None]:
from multiprocessing import Pool, cpu_count
import os

def load_processed_dirs(log_file_path):
    """
    Reads the log of already processed directories.

    Args:
    - log_file_path: Path of a text file with one directory path per line

    Returns:
    - Set of the lines found in the file, or an empty set when the file does not exist
    """
    if not os.path.exists(log_file_path):
        return set()

    with open(log_file_path, 'r') as log_handle:
        # A set gives constant-time membership checks when filtering directories
        return set(log_handle.read().splitlines())

def update_processed_dirs(log_file_path, subdir):
    """
    Records a directory as processed by appending it to the log file.

    Args:
    - log_file_path: Path of the log file (created if it does not exist)
    - subdir: Directory path to record; written on a line of its own
    """
    with open(log_file_path, mode='a') as log_handle:
        entry = f"{subdir}\n"
        log_handle.write(entry)

def process_images_and_annotations_parallel(base_path, max_dirs=100, batch_size=500):
    """
    Preprocesses not-yet-processed image directories under base_path in
    parallel and appends the merged samples to 'data_batch.pkl'.

    Directories that completed are recorded in 'processed_dirs.log' so that a
    later call continues with the remaining ones.

    Args:
    - base_path: Root directory that is walked recursively for folders containing .jpg files
    - max_dirs: Maximum number of directories handled in one call
    - batch_size: Currently unused; kept for interface compatibility

    Returns:
    - None; the merged result is written through append_to_pickle.

    NOTE: directories for which process_directory returns None are never
    logged, so they are picked up again (and count towards max_dirs) on every
    call.
    """
    log_file_path = 'processed_dirs.log'
    processed_dirs = load_processed_dirs(log_file_path)

    # os.walk already yields every directory path prefixed with base_path, so it
    # must not be joined with base_path again (doing so duplicates the prefix
    # whenever base_path is relative). Sorted for a reproducible selection.
    subdirs = sorted(
        (dirpath, base_path)
        for dirpath, _, files in os.walk(base_path)
        if dirpath not in processed_dirs
        and any(file.endswith('.jpg') for file in files)
    )

    # Limit the number of directories to process
    subdirs = subdirs[:max_dirs]

    with Pool(processes=cpu_count()) as pool:
        results = pool.map(process_directory, subdirs)

    # Merge the results of every directory that was not skipped
    all_X, all_Y = [], []
    completed_dirs = []
    for (dirpath, _), result in zip(subdirs, results):
        if result is None:
            continue
        all_X.extend(result['X'])
        all_Y.extend(result['Y'])
        completed_dirs.append(dirpath)

    # Save first: if writing the pickle fails, nothing has been marked as
    # processed yet and the same directories are retried on the next call.
    append_to_pickle({'X': all_X, 'Y': all_Y}, 'data_batch.pkl')

    for dirpath in completed_dirs:
        update_processed_dirs(log_file_path, dirpath)


In [None]:
def process_images_and_annotations_parallel(base_path, batch_size=500, subdir_limit=1):
    """
    Preprocesses every image directory under base_path in parallel and merges
    the results in memory.

    Args:
    - base_path: Root directory that is walked recursively for folders containing .jpg files
    - batch_size: Currently unused; kept for interface compatibility
    - subdir_limit: Currently unused; kept for interface compatibility

    Returns:
    - {'X': [...], 'Y': [...]} with the samples of all directories that were
      not skipped. Nothing is written to disk here; persist the result as needed.
    """
    # process_directory unpacks its argument as (subdir, base_path), so the
    # directory has to come first. os.walk already yields paths prefixed with
    # base_path, so they are used as-is. Sorted for a reproducible order.
    subdirs = sorted(
        (dirpath, base_path)
        for dirpath, _, files in os.walk(base_path)
        if any(file.endswith('.jpg') for file in files)
    )

    with Pool(processes=os.cpu_count()) as pool:
        results = pool.map(process_directory, subdirs)

    all_X, all_Y = [], []
    for result in results:
        # process_directory returns None for directories it had to skip
        if result is None:
            continue
        all_X.extend(result['X'])
        all_Y.extend(result['Y'])

    # Hand the merged data back instead of discarding it
    return {'X': all_X, 'Y': all_Y}
