# In [None]:
import os
from PIL import Image, UnidentifiedImageError
import cv2
import numpy as np

# Load OpenCV's pre-trained Haar Cascade classifier for frontal-face detection.
# The model file is looked up in the directory given by cv2.data.haarcascades
# and is loaded once here so every crop_face() call reuses the same classifier.
# NOTE(review): presumably CascadeClassifier does not raise when the XML file
# is missing — confirm, and consider checking face_cascade.empty() if needed.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def crop_face(img, margin_ratio=0.1):
    """
    Detect the largest face in a BGR image and crop it with a margin.

    Args:
        img: Image as a NumPy array in OpenCV's BGR channel order (this is
            the layout preprocess_images passes in).
        margin_ratio: Fraction of the detected face width that is added as
            padding around the bounding box. Defaults to 0.1 (10%).

    Returns:
        The cropped face region of ``img`` (same channel order), or None if
        no face is detected.
    """
    # The input is BGR, so the grayscale conversion must use the BGR code;
    # the RGB code would apply the red and blue luminance weights swapped.
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(img_gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    # detectMultiScale may return an array, so test the length explicitly
    # instead of relying on truthiness.
    if len(faces) == 0:
        return None  # No faces detected

    # Pick the detection with the largest bounding-box area.
    x, y, w, h = max(faces, key=lambda rect: rect[2] * rect[3])

    img_h, img_w = img.shape[:2]
    margin = int(margin_ratio * w)

    # Pad the box by `margin` on every side, clamping the origin at the
    # top-left image corner and the extent at the bottom-right corner.
    x_new = max(0, x - margin)
    y_new = max(0, y - margin)
    w_new = min(img_w - x_new, w + 2 * margin)
    h_new = min(img_h - y_new, h + 2 * margin)

    return img[y_new:y_new + h_new, x_new:x_new + w_new]

def preprocess_images(base_dir, output_dir, size=(224, 224)):
    """
    Crop, resize and save the largest face of every image under base_dir.

    Walks base_dir recursively, runs crop_face on each file that Pillow can
    open, resizes the crop to ``size`` and writes it as a PNG under
    output_dir, mirroring the directory structure of base_dir. Files that
    cannot be read or contain no detectable face are reported and skipped.

    Args:
        base_dir: Root directory containing the input images.
        output_dir: Root directory for the cropped PNGs; created if missing.
        size: (width, height) passed to PIL's ``Image.resize``.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)
    output_root = os.path.abspath(output_dir)

    for root, dirs, files in os.walk(base_dir):
        # If output_dir lives inside base_dir, prune it from the walk so the
        # crops written by this run are not picked up as inputs again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_root
        ]

        # Compute relative path for output structure
        rel_path = os.path.relpath(root, base_dir)
        output_subdir = os.path.join(output_dir, rel_path)
        os.makedirs(output_subdir, exist_ok=True)

        for image_file in files:
            image_path = os.path.join(root, image_file)
            # Every output is a PNG regardless of the input extension.
            output_image_name = os.path.splitext(image_file)[0] + '.png'
            output_image_path = os.path.join(output_subdir, output_image_name)
            try:
                with Image.open(image_path) as img:
                    # Normalise to 3-channel RGB, then switch to the BGR
                    # layout that crop_face (OpenCV) works with.
                    img_rgb = img.convert('RGB')
                    img_array = cv2.cvtColor(np.array(img_rgb), cv2.COLOR_RGB2BGR)
                    face_img = crop_face(img_array)
                    if face_img is not None:
                        # Back to RGB before handing the crop to Pillow.
                        face_pil = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB))
                        face_pil = face_pil.resize(size)
                        face_pil.save(output_image_path, format='PNG')
                        print(f"Processed and saved {output_image_path}")
                    else:
                        print(f"No face detected in {image_file}")
            except UnidentifiedImageError:
                # Non-image files in the tree are expected; skip them.
                print(f"Could not identify image file {image_file}. Skipping...")
            except Exception as e:
                # Best-effort batch job: report the failure and keep going.
                print(f"An error occurred with {image_file}: {e}")

if __name__ == "__main__":
    # Two filtering passes, run in order: the first crops faces out of the
    # raw images, the second runs the same detection on those crops and
    # keeps only the ones where a face is found again.
    # Update the raw-image path with your actual dataset directory.
    filter_rounds = (
        ('./demographic_group1_raw_images', './demographic_group1_faces_v1'),
        ('./demographic_group1_faces_v1', './demographic_group1_faces_v2'),
    )
    for base_dir, output_dir in filter_rounds:
        preprocess_images(base_dir, output_dir)