In [None]:
import os
import sys
import cv2
import hashlib
import imagehash
import numpy as np
from PIL import Image
from tqdm import tqdm

In [None]:
# ==============================================================================
# --- Main Configuration ---
# ==============================================================================
# NOTE(review): both folders are absolute, machine-specific paths; edit them
# before running on another machine.
# INPUT_FOLDER is walked recursively for source images; OUTPUT_FOLDER receives
# the cropped faces and is also the folder that the duplicate-removal step scans.
INPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped"
OUTPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_curated"
# Relative path: resolved against the current working directory of the kernel.
YUNET_MODEL_PATH = "face_detection_yunet_2023mar.onnx"

# 1. MINIMUM SIZE (pixels): compared against the detected face box *before*
#    padding is added. Lowered to 72 to capture smaller faces in group shots.
MIN_WIDTH = 72
MIN_HEIGHT = 72

# 2. BLUR THRESHOLD (sharpness): minimum variance of the Laplacian of the
#    grayscale crop. Higher is sharper; crops scoring below this are discarded.
BLUR_THRESHOLD = 100.0

# 3. CONTRAST THRESHOLD: minimum standard deviation of the grayscale crop's
#    pixel values. Crops scoring below this are discarded.
CONTRAST_THRESHOLD = 15.0

# --- Landmark and Pose Filtering ---
# When True, detections with fewer than 10 landmark coordinates are skipped.
REQUIRE_ALL_LANDMARKS = True
# Maximum absolute angle, in degrees, of the line joining the two eye
# landmarks relative to horizontal. Faces tilted more than this are skipped.
MAX_EYE_ANGLE = 10.0

# Maximum difference between two perceptual hashes for the images to be treated
# as duplicates (imagehash reports the difference as a count of differing bits).
# 0-1 is very strict (nearly identical images only); higher values are more lenient.
PERCEPTUAL_HASH_THRESHOLD = 1

# PADDING FACTOR: enlarges the crop around the detected face box. 0.35 adds 35%
# of the box width to the left and right and 35% of the box height to the top
# and bottom; the padded crop is then clamped to the image bounds.
PADDING_FACTOR = 0.35

In [None]:
# ==============================================================================
# STEP 1: Detect, Filter, and Crop High-Quality Faces (with Padding)
# ==============================================================================
def _padded_crop_bounds(box, img_width, img_height, padding_factor):
    """Expand a face box by a padding fraction and clamp it to the image.

    Args:
        box: ``(x, y, w, h)`` of the detected face, in integer pixels.
        img_width: Width of the source image in pixels.
        img_height: Height of the source image in pixels.
        padding_factor: Fraction of the box width/height added on each side.

    Returns:
        ``(x1, y1, x2, y2)`` crop corners, clamped to the image bounds.
    """
    x, y, w, h = box
    pad_w = int(w * padding_factor)
    pad_h = int(h * padding_factor)
    return (
        max(0, x - pad_w),
        max(0, y - pad_h),
        min(img_width, x + w + pad_w),
        min(img_height, y + h + pad_h),
    )


def _unique_output_name(file_path, face_index, used_names):
    """Build the output filename for a crop, avoiding collisions within a run.

    The usual name is ``<source basename>_face_<index>.jpg``. Source images are
    gathered recursively, so two sources can share a basename (different
    subfolders, or ``a.png`` next to ``a.jpg``); when the usual name has already
    been handed out in this run, a short digest of the full source path is
    inserted so the earlier crop is not silently overwritten.

    Args:
        file_path: Path of the source image.
        face_index: 1-based index of the face within the source image.
        used_names: Set of names already handed out; updated in place.

    Returns:
        The filename (not the full path) to write the crop to.
    """
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    name = f"{base_filename}_face_{face_index}.jpg"
    if name in used_names:
        digest = hashlib.sha1(os.fsencode(file_path)).hexdigest()[:8]
        name = f"{base_filename}_{digest}_face_{face_index}.jpg"
    used_names.add(name)
    return name


def crop_all_faces(input_folder, output_folder, model_path):
    """Detect faces in every image under ``input_folder`` and save quality crops.

    Each detection must pass, in order: the landmark-count check, the eye-tilt
    check, lying fully inside the image, the minimum-size check, and (after
    padding and cropping) the sharpness and contrast checks. Thresholds are read
    from the module-level constants ``REQUIRE_ALL_LANDMARKS``, ``MAX_EYE_ANGLE``,
    ``MIN_WIDTH``, ``MIN_HEIGHT``, ``PADDING_FACTOR``, ``BLUR_THRESHOLD`` and
    ``CONTRAST_THRESHOLD``.

    Args:
        input_folder: Folder searched recursively for source images
            (``~`` is expanded).
        output_folder: Folder the cropped faces are written to as JPEG files;
            created if missing (``~`` is expanded).
        model_path: Path to the YuNet ONNX face-detection model.

    Raises:
        SystemExit: With status 1 if the model file is missing or the detector
            cannot be initialized.
    """
    print("--- Starting Step 1: Face Detection and Cropping ---")
    if not os.path.exists(model_path):
        print(f"FATAL ERROR: Model file not found at '{model_path}'")
        # Non-zero status so a fatal error is not reported as a clean exit.
        sys.exit(1)

    try:
        face_detector = cv2.FaceDetectorYN.create(
            model=model_path, config="", input_size=(320, 320),
            score_threshold=0.7
        )
        print("  Successfully initialized OpenCV FaceDetectorYN.")
    except Exception as e:
        print(f"  Could not initialize the face detector. Error: {e}")
        sys.exit(1)

    # Expand "~" on both folders so they are treated consistently.
    input_folder = os.path.expanduser(input_folder)
    output_folder = os.path.expanduser(output_folder)
    os.makedirs(output_folder, exist_ok=True)

    total_faces_saved = 0
    used_names = set()

    # Every suffix carries its dot; a bare 'jpeg' would also match names such
    # as "notesjpeg" that are not image files.
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    # Sorted so processing order (and collision naming) is reproducible.
    all_image_paths = sorted(
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(input_folder)
        for filename in filenames
        if filename.lower().endswith(image_extensions)
    )
    print(f"  Found {len(all_image_paths)} total images to process.")

    for file_path in tqdm(all_image_paths, desc="Step 1: Cropping faces"):
        try:
            image = cv2.imread(file_path)
            if image is None:
                continue

            img_height, img_width, _ = image.shape
            # The detector must be told the size of each image before detect().
            face_detector.setInputSize((img_width, img_height))
            _, faces = face_detector.detect(image)
            if faces is None:
                continue

            for i, face_data in enumerate(faces):
                box = list(map(int, face_data[:4]))
                landmarks = list(map(int, face_data[4:14]))

                if REQUIRE_ALL_LANDMARKS and len(landmarks) < 10:
                    continue

                # Tilt of the line through the two eye landmarks, in degrees.
                right_eye = (landmarks[0], landmarks[1])
                left_eye = (landmarks[2], landmarks[3])
                angle = np.degrees(np.arctan2(left_eye[1] - right_eye[1],
                                              left_eye[0] - right_eye[0]))
                if abs(angle) > MAX_EYE_ANGLE:
                    continue

                # Reject detections whose unpadded box leaves the image.
                x, y, w, h = box
                if x < 0 or y < 0 or x + w > img_width or y + h > img_height:
                    continue

                if w < MIN_WIDTH or h < MIN_HEIGHT:
                    continue

                x1, y1, x2, y2 = _padded_crop_bounds(
                    box, img_width, img_height, PADDING_FACTOR
                )
                cropped_face = image[y1:y2, x1:x2]

                # Skip if the final crop is somehow invalid.
                if cropped_face.size == 0:
                    continue

                gray_face = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2GRAY)

                sharpness_score = cv2.Laplacian(gray_face, cv2.CV_64F).var()
                if sharpness_score < BLUR_THRESHOLD:
                    continue

                contrast_score = gray_face.std()
                if contrast_score < CONTRAST_THRESHOLD:
                    continue

                output_filename = _unique_output_name(file_path, i + 1, used_names)
                output_path = os.path.join(output_folder, output_filename)
                # imwrite signals failure by returning False rather than
                # raising, so only count crops that were actually written.
                if cv2.imwrite(output_path, cropped_face):
                    total_faces_saved += 1
                else:
                    print(f"\n  WARNING: Could not write cropped face to {output_path}")
        except Exception as e:
            # Best effort: one unreadable or odd image must not stop the run.
            print(f"\n  WARNING: An error occurred on {file_path}: {e}")
            continue

    print(f"--- Step 1 Complete ---")
    print(f"  Total high-quality faces found and saved: {total_faces_saved}\n")

In [None]:
# ==============================================================================
# STEP 2: Remove Perceptual Duplicates
# ==============================================================================
def remove_perceptual_duplicates(target_folder, hash_threshold):
    """Delete images under ``target_folder`` that nearly duplicate an earlier one.

    Images are visited in sorted path order. Each image's perceptual hash is
    compared against the hashes of all images kept so far; if the difference to
    any of them is at most ``hash_threshold`` the image is marked as a duplicate
    and deleted once hashing has finished. The first image of each group of
    near-duplicates (in sorted order) is the one that is kept.

    Args:
        target_folder: Folder searched recursively for ``.png``, ``.jpg`` and
            ``.jpeg`` files (``~`` is expanded). Files are deleted in place.
        hash_threshold: Largest hash difference still treated as a duplicate.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    print(f"--- Starting Step 2: Removing Perceptual Duplicates (Threshold: {hash_threshold}) ---")

    # Every suffix carries its dot; a bare 'jpeg' would also match names such
    # as "notesjpeg" that are not image files.
    image_extensions = ('.png', '.jpg', '.jpeg')
    # Sorted so that the surviving copy does not depend on directory order.
    image_paths = sorted(
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(os.path.expanduser(target_folder))
        for filename in filenames
        if filename.lower().endswith(image_extensions)
    )

    kept_hashes = []
    duplicates_to_remove = []

    for file_path in tqdm(image_paths, desc="Step 2: Hashing images"):
        try:
            # Close the file promptly: an open handle leaks descriptors and
            # can prevent the later os.remove on some platforms.
            with Image.open(file_path) as img:
                image_hash = imagehash.phash(img)

            is_duplicate = any(
                (image_hash - seen_hash) <= hash_threshold
                for seen_hash in kept_hashes
            )
        except Exception as e:
            print(f"\n  WARNING: Could not process {file_path} for duplicate check: {e}")
            continue

        if is_duplicate:
            duplicates_to_remove.append(file_path)
        else:
            kept_hashes.append(image_hash)

    if duplicates_to_remove:
        print(f"  Found {len(duplicates_to_remove)} perceptual duplicate images. Removing...")
        removed_count = 0
        for dup_path in duplicates_to_remove:
            try:
                os.remove(dup_path)
                removed_count += 1
            except Exception as e:
                print(f"  Failed to remove duplicate {dup_path}: {e}")
        print(f"  Removed {removed_count} of {len(duplicates_to_remove)} duplicate images.")
    else:
        print("  No duplicate files were found.")
    print(f"--- Step 2 Complete ---\n")

In [None]:
# ==============================================================================
# Main Execution Block
# ==============================================================================
if __name__ == "__main__":
    # Opening banner: title line followed by a rule.
    print(
        "Starting Tuned Image Curation Pipeline...",
        "=========================================",
        sep="\n",
    )

    # Step 1 writes the cropped faces; step 2 then prunes near-duplicates
    # from that same output folder.
    crop_all_faces(INPUT_FOLDER, OUTPUT_FOLDER, YUNET_MODEL_PATH)
    remove_perceptual_duplicates(
        OUTPUT_FOLDER,
        hash_threshold=PERCEPTUAL_HASH_THRESHOLD,
    )

    # Closing banner: rule followed by the completion line.
    print(
        "=========================================",
        "Pipeline finished successfully!",
        sep="\n",
    )