In [None]:
import os
import sys
import cv2
import hashlib
import imagehash
import numpy as np
from PIL import Image
from tqdm import tqdm

In [None]:
# ==============================================================================
# --- Main Configuration ---
# ==============================================================================
# Root folder of the source images; crop_all_faces walks it recursively.
INPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped"
# Destination for the face crops. The duplicate-removal step later deletes
# files from this same folder, so do not point it at data you want to keep.
OUTPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_curated"
# Face-detector model file. This is a relative path, so it is resolved against
# the kernel's current working directory.
YUNET_MODEL_PATH = "face_detection_yunet_2023mar.onnx"

# 1. MINIMUM SIZE (pixels): Lowered to 72 to capture smaller faces in group shots.
#    Crops are square, so keep both values equal; crops smaller than this are skipped.
MIN_WIDTH = 72
MIN_HEIGHT = 72

# 2. BLUR THRESHOLD (Sharpness): Variance of the Laplacian of the grayscale crop.
#    Higher is sharper; crops scoring below this value are skipped.
BLUR_THRESHOLD = 100.0

# 3. CONTRAST THRESHOLD: Standard deviation of the grayscale crop's pixel values.
#    Crops scoring below this value are skipped as too flat.
CONTRAST_THRESHOLD = 15.0

# --- Landmark and Pose Filtering ---
# When True, detections with fewer than 10 landmark coordinates (5 points) are skipped.
REQUIRE_ALL_LANDMARKS = True
# Maximum absolute tilt, in degrees, of the line joining the two eyes.
# Faces tilted more than this are skipped.
MAX_EYE_ANGLE = 10.0

# How similar two images can be to be considered duplicates.
# A value of 0-1 is very strict (nearly identical). Higher values are more lenient.
# Compared against the difference between two perceptual hashes
# (presumably a Hamming distance in bits - confirm against the imagehash docs).
PERCEPTUAL_HASH_THRESHOLD = 1

In [None]:
# # ==============================================================================
# # DIAGNOSTIC TOOL: Visualize Face Detection
# # ==============================================================================
# def run_detection_debug_mode(image_paths, output_folder, model_path):
#     """
#     Runs the face detector on specific images and saves a copy
#     with the detected bounding boxes and confidence scores drawn on it.
#     """
#     print("--- Running in Detector Debug Mode ---")
#     if not os.path.exists(model_path):
#         print(f"FATAL ERROR: Model file not found at '{model_path}'")
#         return

#     try:
#         # Initialize the detector with a LOWER threshold to find less confident faces
#         face_detector = cv2.FaceDetectorYN.create(
#             model=model_path, config="", input_size=(320, 320), score_threshold=0.5
#         )
#         print("  Initialized detector with a lower confidence threshold for debugging.")
#     except Exception as e:
#         print(f"  Could not initialize the face detector. Error: {e}")
#         return

#     os.makedirs(output_folder, exist_ok=True)

#     for file_path in image_paths:
#         print(f"  Analyzing '{os.path.basename(file_path)}'...")
#         image = cv2.imread(file_path)
#         if image is None:
#             print(f"    - Could not read image. Skipping.")
#             continue

#         height, width, _ = image.shape
#         face_detector.setInputSize((width, height))
#         _, faces = face_detector.detect(image)
        
#         if faces is not None:
#             print(f"    - Found {len(faces)} faces.")
#             # Draw results on a copy of the image
#             for face_data in faces:
#                 box = list(map(int, face_data[0:4]))
                
#                 # <--- THIS IS THE CORRECTED LINE ---
#                 confidence = face_data[14] # Corrected index from 15 to 14
                
#                 cv2.rectangle(image, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 4)
                
#                 label = f"Confidence: {confidence:.2f}"
#                 cv2.putText(image, label, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)
#         else:
#             print("    - No faces found at this confidence level.")

#         # Save the debug image
#         base_filename = os.path.splitext(os.path.basename(file_path))[0]
#         output_filename = f"{base_filename}_DETECTED.jpg"
#         output_path = os.path.join(output_folder, output_filename)
#         cv2.imwrite(output_path, image)
#         print(f"    - Saved debug image to: {output_path}")

In [None]:
# ==============================================================================
# STEP 1: Detect, Filter, and Crop High-Quality Faces (with Tuned Thresholds)
# ==============================================================================
def crop_all_faces(input_folder, output_folder, model_path,
                   margin_ratio=0.55, score_threshold=0.7):
    """
    Detect faces in every image under ``input_folder`` and save filtered crops.

    Each detection is kept only if it passes, in order: the landmark-count
    check (REQUIRE_ALL_LANDMARKS), the eye-tilt check (MAX_EYE_ANGLE), the
    minimum-size check (MIN_WIDTH / MIN_HEIGHT), the sharpness check
    (BLUR_THRESHOLD) and the contrast check (CONTRAST_THRESHOLD). Surviving
    crops are written to ``output_folder`` as
    ``<source basename>_face_<n>.jpg``.

    Args:
        input_folder: Root folder of source images; searched recursively.
            A leading ``~`` is expanded.
        output_folder: Folder that receives the crops; created if missing.
            A leading ``~`` is expanded.
        model_path: Path to the face-detector model file.
        margin_ratio: Extra margin added around the face when building the
            square crop box: 0.45 is tighter, 0.70 is looser.
        score_threshold: Minimum detector confidence for a face to be reported.

    Raises:
        SystemExit: If the model file is missing or the detector cannot be
            created (exit status 1).

    Note:
        Output names are derived from the source basename only, so two source
        images with the same name in different subfolders write to the same
        output files.
    """
    print("--- Starting Step 1: Face Detection and Cropping ---")
    if not os.path.exists(model_path):
        print(f"FATAL ERROR: Model file not found at '{model_path}'")
        sys.exit(1)  # non-zero status: this is a failure, not a clean exit

    # Initialize the detector with a more lenient score_threshold
    try:
        face_detector = cv2.FaceDetectorYN.create(
            model=model_path, config="", input_size=(320, 320),
            score_threshold=score_threshold
        )
        print("  Successfully initialized OpenCV FaceDetectorYN with tuned threshold.")
    except Exception as e:
        print(f"  Could not initialize the face detector. Error: {e}")
        sys.exit(1)

    # Expand "~" for the output the same way it is expanded for the input.
    output_folder = os.path.expanduser(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    total_faces_saved = 0

    ALLOWED_EXTS = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')

    all_image_paths = [
        os.path.join(dp, f)
        for dp, dn, fn in os.walk(os.path.expanduser(input_folder))
        for f in fn
        if f.lower().endswith(ALLOWED_EXTS)
    ]
    print(f"  Found {len(all_image_paths)} total images to process in all subfolders.")

    for file_path in tqdm(all_image_paths, desc="Step 1: Cropping faces"):
        try:
            image = cv2.imread(file_path)
            if image is None:
                continue

            height, width = image.shape[:2]
            # The detector must be told the size of each image before detect().
            face_detector.setInputSize((width, height))
            _, faces = face_detector.detect(image)
            if faces is None:
                continue

            base_filename = os.path.splitext(os.path.basename(file_path))[0]

            for i, face_data in enumerate(faces):
                # Detection row layout used here: [0:4] box as x, y, w, h;
                # [4:14] five landmark points as x, y pairs.
                x, y, w, h = map(int, face_data[:4])
                landmarks = list(map(int, face_data[4:14]))
                has_all_landmarks = len(landmarks) >= 10

                if REQUIRE_ALL_LANDMARKS and not has_all_landmarks:
                    continue

                # Pose filter: reject faces whose eye line is tilted too far.
                # Skipped when the eye coordinates are not available.
                if len(landmarks) >= 4:
                    right_eye = (landmarks[0], landmarks[1])
                    left_eye = (landmarks[2], landmarks[3])
                    angle = np.degrees(np.arctan2(left_eye[1] - right_eye[1],
                                                  left_eye[0] - right_eye[0]))
                    if abs(angle) > MAX_EYE_ANGLE:
                        continue

                # Build a square, margin-padded crop box. Prefer landmarks to
                # anchor the face; fall back to the detector box otherwise.
                if has_all_landmarks:
                    x_sq, y_sq, w_sq, h_sq = _expand_square_bbox_from_landmarks(
                        landmarks, margin_ratio, width, height
                    )
                else:
                    side = int(round(max(w, h) * (1.0 + margin_ratio)))
                    x_sq = x + w // 2 - side // 2
                    y_sq = y + h // 2 - side // 2
                    w_sq = h_sq = side

                # Enforce the minimum saved size (keeps out tiny detections).
                if w_sq < MIN_WIDTH or h_sq < MIN_HEIGHT:
                    continue

                # Crop with safe padding so forehead/chin/ears aren't cut at the edges.
                cropped_face, _used_padding = _safe_crop_with_padding(
                    image, x_sq, y_sq, w_sq, h_sq
                )

                # Quality checks on the grayscale crop.
                gray_face = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2GRAY)
                sharpness_score = cv2.Laplacian(gray_face, cv2.CV_64F).var()
                if sharpness_score < BLUR_THRESHOLD:
                    continue

                contrast_score = gray_face.std()
                if contrast_score < CONTRAST_THRESHOLD:
                    continue

                output_filename = f"{base_filename}_face_{i+1}.jpg"
                output_path = os.path.join(output_folder, output_filename)
                # imwrite reports failure through its return value; only count
                # crops that actually reached disk.
                if cv2.imwrite(output_path, cropped_face):
                    total_faces_saved += 1
                else:
                    print(f"\n  WARNING: Could not write {output_path}")

        except Exception as e:
            # Best effort: one bad image must not abort the whole run.
            print(f"\n  WARNING: An error occurred on {file_path}: {e}")
            continue

    print(f"--- Step 1 Complete ---")
    print(f"  Total high-quality faces found and saved: {total_faces_saved}\n")


# ---- full-face square cropping helpers ----
def _expand_square_bbox_from_landmarks(landmarks, margin_ratio, img_w, img_h):
    """
    landmarks: [x_r_eye, y_r_eye, x_l_eye, y_l_eye,
                x_nose, y_nose, x_r_mouth, y_r_mouth, x_l_mouth, y_l_mouth]
    Returns: x, y, w, h for a square box centered on landmarks, with margin.
    """
    xs = landmarks[0::2]
    ys = landmarks[1::2]
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)

    cx = (x_min + x_max) / 2.0
    cy = (y_min + y_max) / 2.0
    side = max(x_max - x_min, y_max - y_min)
    side = int(round(side * (1.0 + margin_ratio)))  # add forehead/chin/ears margin

    half = side // 2
    x0 = int(round(cx - half))
    y0 = int(round(cy - half))
    return x0, y0, side, side


def _safe_crop_with_padding(image, x, y, w, h, pad_mode=cv2.BORDER_REFLECT_101):
    """
    Cut the w-by-h window whose top-left corner is (x, y) out of ``image``.

    When the window extends past any image edge, the image is first extended
    on the affected sides with cv2.copyMakeBorder using ``pad_mode``, so the
    result always has the requested size.

    Returns (crop, padded) where ``padded`` is True if a border had to be added.
    """
    img_h, img_w = image.shape[:2]

    # Amount by which the window overshoots each edge (0 when it does not).
    pad_left = max(0, -x)
    pad_top = max(0, -y)
    pad_right = max(0, x + w - img_w)
    pad_bottom = max(0, y + h - img_h)

    needs_padding = pad_left > 0 or pad_top > 0 or pad_right > 0 or pad_bottom > 0
    if not needs_padding:
        return image[y:y+h, x:x+w], False

    bordered = cv2.copyMakeBorder(image, pad_top, pad_bottom, pad_left, pad_right, pad_mode)
    # Shift the window origin by the border added on the left/top.
    col0 = x + pad_left
    row0 = y + pad_top
    return bordered[row0:row0+h, col0:col0+w], True

In [None]:
# ==============================================================================
# STEP 2: Remove Perceptual Duplicates
# ==============================================================================
def remove_perceptual_duplicates(target_folder, hash_threshold):
    """
    Delete images in ``target_folder`` that are near-duplicates of one another.

    Every image (searched recursively) gets a perceptual hash. Files are
    visited in sorted path order; the first file of each group of similar
    images is kept and the later ones are deleted from disk.

    Args:
        target_folder: Folder to clean up in place. A leading ``~`` is expanded.
        hash_threshold: Maximum difference between two perceptual hashes for
            the images to count as duplicates. 0 requires identical hashes;
            larger values are more lenient.

    Files that cannot be opened or hashed are reported and left untouched.
    """
    print(f"--- Starting Step 2: Removing Perceptual Duplicates (Threshold: {hash_threshold}) ---")

    image_exts = ('.png', '.jpg', '.jpeg')
    # Sorted so the choice of which copy survives is reproducible between runs
    # (os.walk order depends on the filesystem).
    image_paths = sorted(
        os.path.join(dp, f)
        for dp, _dirs, fn in os.walk(os.path.expanduser(target_folder))
        for f in fn
        if f.lower().endswith(image_exts)
    )

    kept_hashes = []
    duplicates_to_remove = []

    for file_path in tqdm(image_paths, desc="Step 2: Hashing images"):
        try:
            # Context manager closes the file handle as soon as hashing is done.
            with Image.open(file_path) as img:
                # Use perceptual hash (phash) which is robust against compression and small changes
                h = imagehash.phash(img)
        except Exception as e:
            print(f"\n  WARNING: Could not process {file_path} for duplicate check: {e}")
            continue

        # Compare against every image kept so far; the first close match wins.
        if any((h - seen_hash) <= hash_threshold for seen_hash in kept_hashes):
            duplicates_to_remove.append(file_path)
        else:
            kept_hashes.append(h)

    if duplicates_to_remove:
        print(f"  Found {len(duplicates_to_remove)} perceptual duplicate images. Removing...")
        for dup_path in duplicates_to_remove:
            try:
                os.remove(dup_path)
            except Exception as e:
                print(f"  Failed to remove duplicate {dup_path}: {e}")
    else:
        print("  No duplicate files were found.")
    print(f"--- Step 2 Complete ---\n")

In [None]:
# ==============================================================================
# Main Execution Block
# ==============================================================================
if __name__ == "__main__":
    # Opening banner.
    print("Starting Tuned Image Curation Pipeline...",
          "=========================================",
          sep="\n")

    # Stage 1: detect faces and write the crops that pass every quality filter.
    crop_all_faces(
        input_folder=INPUT_FOLDER,
        output_folder=OUTPUT_FOLDER,
        model_path=YUNET_MODEL_PATH,
    )

    # Stage 2: prune visually similar crops from the output folder, using the
    # similarity limit defined in the configuration cell.
    remove_perceptual_duplicates(OUTPUT_FOLDER, PERCEPTUAL_HASH_THRESHOLD)

    # Closing banner.
    print("=========================================",
          "Pipeline finished successfully!",
          sep="\n")