In [1]:
import os
import sys
import cv2
import numpy as np
from tqdm import tqdm

In [3]:
# ==============================================================================
# --- CONFIGURATION: UPDATE THESE PATHS ---
# ==============================================================================
# All paths below are absolute and machine-specific; edit them before running
# on another machine.

# 1. Folder containing the original, full-sized images.
#    It is scanned recursively, so images may live in nested subfolders.
#    NOTE(review): this path is on a different root (/Volumes/...) than the two
#    folders below (/Users/...) -- confirm that is intentional.
ORIGINAL_IMAGES_FOLDER = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped"

# 2. Folder containing the 1,400+ badly cropped face images.
#    Only the file names are used: the part before '_face_' identifies the
#    source image to re-process. The crop files themselves are never opened.
BAD_CROP_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_revisit"

# 3. Folder where the new, correctly cropped faces will be saved.
#    Created automatically if it does not exist; crops are written as
#    '<source base name>_face_<n>.jpg'.
RESCUED_FACES_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/gemini_rescue"

# 4. Path to the YuNet face detection model.
#    This is a relative path, so it is resolved against the current working
#    directory of the running kernel/process.
YUNET_MODEL_PATH = "face_detection_yunet_2023mar.onnx"

# --- TUNING PARAMETERS ---
# Increases the crop area around the face. 0.35 means 35% padding on each side:
# 35% of the detected box width is added left and right, and 35% of its height
# is added top and bottom (the result is limited to the image bounds).
PADDING_FACTOR = 0.35

In [4]:
# ==============================================================================
# --- HELPER FUNCTIONS ---
# ==============================================================================
def find_source_image(base_filename, source_folder):
    """Locate a source image by base name, trying the common image extensions.

    Extensions are tried in priority order: .jpg, .jpeg, .png, .bmp, .tiff.
    The exact lowercase extension is checked first; if none exists, the
    directory is scanned for a file with the same stem whose extension matches
    case-insensitively (e.g. 'photo.JPG'), which mirrors the case-insensitive
    matching done by create_source_image_map.

    Args:
        base_filename: File name without extension (may include a relative
            sub-path inside source_folder).
        source_folder: Directory to search. It is not searched recursively.

    Returns:
        The path of the first matching image, or None when no match exists or
        the directory cannot be listed.
    """
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')

    # Fast path: exact lowercase extension, no directory listing needed.
    for ext in extensions:
        path = os.path.join(source_folder, base_filename + ext)
        if os.path.exists(path):
            return path

    # Fallback: case-insensitive extension match. On case-sensitive
    # filesystems 'photo.JPG' is not found by the probe above.
    search_dir, stem = os.path.split(os.path.join(source_folder, base_filename))
    try:
        # Sorted so the pick is deterministic when several spellings exist.
        entries = sorted(os.listdir(search_dir or os.curdir))
    except OSError:
        return None

    by_extension = {}
    for entry in entries:
        entry_stem, entry_ext = os.path.splitext(entry)
        if entry_stem == stem:
            by_extension.setdefault(entry_ext.lower(), entry)

    for ext in extensions:
        if ext in by_extension:
            return os.path.join(search_dir, by_extension[ext])
    return None


def create_source_image_map(source_folder):
    """Recursively index image files by base name.

    Walks source_folder and every subdirectory, and maps each image's file
    name without extension (e.g. 'image_123' for 'image_123.jpg') to its full
    path. Extensions are matched case-insensitively.

    Directories and file names are visited in sorted order, so when several
    files share a base name (same name in different folders, or the same stem
    with different extensions) the same one wins on every run instead of
    depending on filesystem enumeration order. Later duplicates are ignored
    and their count is reported.

    Args:
        source_folder: Root directory to scan.

    Returns:
        dict mapping base name -> full path of the first file found for it.
        Empty when the folder is missing or contains no images.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_map = {}
    duplicate_count = 0

    print("  Scanning source folder and all subdirectories for images...")
    if not os.path.isdir(source_folder):
        # os.walk silently yields nothing for a missing folder; say so, since
        # otherwise every later lookup fails with no hint as to why.
        print(f"  WARNING: Source folder not found: '{source_folder}'")

    # Materialise and sort the walk: gives tqdm a total and a stable order.
    walked = sorted(os.walk(source_folder), key=lambda entry: entry[0])
    for dirpath, _, filenames in tqdm(walked, desc="Mapping sources"):
        for f in sorted(filenames):
            # Process only files with image extensions
            if not f.lower().endswith(image_extensions):
                continue
            base_name = os.path.splitext(f)[0]
            if base_name in image_map:
                duplicate_count += 1
            else:
                image_map[base_name] = os.path.join(dirpath, f)

    print(f"  Mapped {len(image_map)} unique source images.")
    if duplicate_count:
        print(f"  NOTE: Ignored {duplicate_count} additional file(s) whose base name was already mapped.")
    return image_map

In [5]:
# ==============================================================================
# --- UPDATED MAIN RESCUE LOGIC ---
# ==============================================================================
def rescue_improper_crops():
    print("--- Starting Face Rescue Operation ---")
    
    # --- Model and Folder Setup ---
    if not os.path.exists(YUNET_MODEL_PATH):
        print(f"FATAL ERROR: Model file not found at '{YUNET_MODEL_PATH}'")
        sys.exit()

    try:
        face_detector = cv2.FaceDetectorYN.create(
            model=YUNET_MODEL_PATH, config="", input_size=(320, 320),
            score_threshold=0.7
        )
        print("  Successfully initialized OpenCV FaceDetectorYN.")
    except Exception as e:
        print(f"  Could not initialize the face detector. Error: {e}")
        sys.exit()

    os.makedirs(RESCUED_FACES_FOLDER, exist_ok=True)

    # --- Pre-scan and map all source images from the nested folders ---
    source_image_map = create_source_image_map(ORIGINAL_IMAGES_FOLDER)

    # --- Identify Unique Source Images to Process ---
    bad_crop_files = os.listdir(BAD_CROP_FOLDER)
    source_filenames_to_process = set()
    for filename in bad_crop_files:
        if '_face_' in filename:
            base_name = filename.split('_face_')[0]
            source_filenames_to_process.add(base_name)
    
    print(f"  Found {len(source_filenames_to_process)} unique source images to re-process.")

    # --- Process Each Source Image ---
    rescued_count = 0
    for base_name in tqdm(list(source_filenames_to_process), desc="Rescuing faces"):
        # --- MODIFIED LINE: Use the map for a fast lookup instead of searching ---
        source_image_path = source_image_map.get(base_name)
        
        if not source_image_path:
            print(f"\n  WARNING: Could not find mapped source for '{base_name}'. Skipping.")
            continue
            
        try:
            image = cv2.imread(source_image_path)
            if image is None: continue

            img_height, img_width, _ = image.shape
            face_detector.setInputSize((img_width, img_height))
            _, faces = face_detector.detect(image)

            if faces is not None:
                for i, face_data in enumerate(faces):
                    box = list(map(int, face_data[:4]))
                    x, y, w, h = box
                    
                    pad_w = int(w * PADDING_FACTOR)
                    pad_h = int(h * PADDING_FACTOR)
                    
                    x1 = max(0, x - pad_w)
                    y1 = max(0, y - pad_h)
                    x2 = min(img_width, x + w + pad_w)
                    y2 = min(img_height, y + h + pad_h)
                    
                    rescued_crop = image[y1:y2, x1:x2]
                    
                    if rescued_crop.size > 0:
                        output_filename = f"{base_name}_face_{i+1}.jpg"
                        output_path = os.path.join(RESCUED_FACES_FOLDER, output_filename)
                        cv2.imwrite(output_path, rescued_crop)
                        rescued_count += 1

        except Exception as e:
            print(f"\n  ERROR: Failed to process {source_image_path}: {e}")

    print("\n--- Rescue Operation Complete ---")
    print(f"  Successfully rescued and saved {rescued_count} faces to '{RESCUED_FACES_FOLDER}'.")

In [None]:
# ==============================================================================
# --- EXECUTION BLOCK ---
# ==============================================================================
# Run the rescue when this code executes as the main module (which is the case
# for a notebook cell or a script); the guard prevents the run from starting
# if these definitions are imported from another module.
if __name__ == "__main__":
    rescue_improper_crops()