Adding the LFW headshots can be very useful, but there are some important considerations:

1. **Improved Robustness:**  
   - **Diverse, in‑the‑wild data** from LFW can help your model generalize better. Even if it doesn’t push accuracy to 100% on a controlled test set, it will likely perform more reliably on real-world images.
   - More diversity in the training data often improves the model’s resilience to variations (lighting, pose, background) which is critical for deployment.

2. **Accuracy Trade-Off:**  
   - Achieving near 100% accuracy is rare, especially when you introduce challenging, "wild" images. Adding LFW may lower raw accuracy on a held‑out set (if that set is drawn from easier images), but it improves real-world performance.
   - In other words, while your measured accuracy might drop compared to training only on controlled datasets, the model becomes more robust and practical.

3. **Cost Efficiency:**  
   - Manually sorting LFW headshots is a cost-effective way to expand your dataset, compared with spending more expensive GPU time on additional training iterations.
   - More high-quality data can reduce the need for extensive hyperparameter tuning and retraining.

### **Conclusion:**
To save money and build a more robust model, it is **worth finishing the LFW sorting and integrating those photos** with your existing FERCKJA dataset. This will help your model learn a wider variety of facial expressions under different conditions, which is crucial for real-world performance—even if it makes near-perfect accuracy an unrealistic target.

Would you like any further guidance on how to integrate and balance the datasets?

In [None]:
#!/usr/bin/env python
# coding: utf-8

import hashlib
import os
import sys

import cv2
from PIL import Image

# --- Main Configuration ---
# Root folder of source images; it is walked recursively by crop_all_faces.
INPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_scraped"
# Destination for the cropped faces; steps 2 and 3 then delete files from this folder.
OUTPUT_FOLDER = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/pexels_dataset_archive/pexels_curated"
# Relative path, so the ONNX model is looked up in the current working directory.
YUNET_MODEL_PATH = "face_detection_yunet_2023mar.onnx"
# Minimum crop dimensions in pixels; anything smaller is deleted by delete_small_images.
MIN_WIDTH = 48
MIN_HEIGHT = 48

# ==============================================================================
# STEP 1: Detect and Crop Faces using the High-Level FaceDetectorYN Class
# ==============================================================================
def crop_all_faces(input_folder, output_folder, model_path):
    """Detect faces in every image under *input_folder* and save each one as a JPEG crop.

    The folder is walked recursively. Each detected face whose bounding box lies
    completely inside the image is written to *output_folder* as
    ``<name>_face_<n>.jpg``. For images found in a subfolder, the relative
    subfolder path (separators replaced by ``_``) is prepended to ``<name>`` so
    that same-named files from different subfolders do not overwrite each other.

    Args:
        input_folder: Root directory containing the source images.
        output_folder: Directory that receives the crops (created if missing).
        model_path: Path to the YuNet ONNX model file.

    Raises:
        SystemExit: With status 1 if the model file is missing or the detector
            cannot be initialized.
    """
    print("--- Starting Step 1: Face Detection and Cropping (CPU Mode) ---")

    if not os.path.exists(model_path):
        print(f"FATAL ERROR: Model file not found at '{model_path}'")
        # Exit non-zero so shells and schedulers can see the failure.
        sys.exit(1)

    # Initialize the high-level detector. It handles all the complexity internally.
    try:
        face_detector = cv2.FaceDetectorYN.create(
            model=model_path,
            config="",
            input_size=(320, 320),  # Placeholder; reset per image below
            score_threshold=0.9,
        )
        print("  Successfully initialized OpenCV FaceDetectorYN on CPU.")
    except Exception as e:
        print(f"  Could not initialize the face detector. Error: {e}")
        sys.exit(1)

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    os.makedirs(output_folder, exist_ok=True)
    processed_files = 0
    total_faces_found = 0

    for dirpath, _, filenames in os.walk(input_folder):
        # All crops land in one flat folder, so disambiguate by source subfolder.
        rel_dir = os.path.relpath(dirpath, input_folder)
        if rel_dir == os.curdir:
            name_prefix = ""
        else:
            name_prefix = rel_dir.replace(os.sep, "_") + "_"

        for filename in filenames:
            if not filename.lower().endswith(image_extensions):
                continue

            file_path = os.path.join(dirpath, filename)

            # Best-effort per file: one unreadable image must not stop the whole run.
            try:
                image = cv2.imread(file_path)
                if image is None:
                    print(f"  WARNING: Could not read image {filename}. Skipping.")
                    continue

                height, width = image.shape[:2]
                # Set the detector's input size to the image's actual size for better accuracy
                face_detector.setInputSize((width, height))

                # The .detect() method handles everything: blob creation, forward pass, and post-processing
                _, faces = face_detector.detect(image)

                # The result is None if no faces are found
                if faces is not None:
                    base_filename = name_prefix + os.path.splitext(filename)[0]
                    for i, face_data in enumerate(faces):
                        # Bounding box format is already [x, y, w, h]
                        x, y, w, h = map(int, face_data[:4])

                        # A zero-area box would make imwrite raise and abort the
                        # remaining faces of this image, so skip it explicitly.
                        if w <= 0 or h <= 0:
                            continue
                        # Faces that extend past the image border are skipped.
                        if x < 0 or y < 0 or x + w > width or y + h > height:
                            continue

                        cropped_face = image[y:y + h, x:x + w]
                        output_filename = f"{base_filename}_face_{i+1}.jpg"
                        output_path = os.path.join(output_folder, output_filename)
                        # imwrite reports failure through its return value; only
                        # count crops that were actually written.
                        if cv2.imwrite(output_path, cropped_face):
                            total_faces_found += 1
                        else:
                            print(f"  WARNING: Could not write {output_path}. Skipping.")

                processed_files += 1
                if processed_files % 100 == 0:
                    print(f"  Processed {processed_files} images...")

            except Exception as e:
                print(f"  ERROR: An unexpected error occurred on {file_path}: {e}")
                continue

    print("--- Step 1 Complete ---")
    print(f"  Total source images processed: {processed_files}")
    print(f"  Total faces found and saved: {total_faces_found}\n")

# ==============================================================================
# STEP 2 and 3: Remove Duplicate Faces and Delete Undersized Images
# ==============================================================================
def remove_duplicate_faces(target_folder):
    """Delete byte-identical duplicate files under *target_folder*.

    Every file in the tree is fingerprinted with SHA-256 over its full
    contents. The first file seen with a given fingerprint is kept and every
    later one is deleted. Traversal is sorted by name, so the surviving copy is
    the same on every run.

    Args:
        target_folder: Root directory to scan recursively.
    """
    print("--- Starting Step 2: Removing Duplicate Faces ---")
    seen_digests = set()
    duplicates_to_remove = []
    for root, dirs, files in os.walk(target_folder):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                # A cryptographic digest avoids deleting a distinct file on a
                # collision of the 64-bit built-in hash(); chunked reads keep
                # memory use flat for large files.
                digest = hashlib.sha256()
                with open(file_path, "rb") as f:
                    for chunk in iter(lambda: f.read(1 << 20), b""):
                        digest.update(chunk)
            except OSError as e:
                print(f"Could not process {file_path} for duplicate check: {e}")
                continue
            fingerprint = digest.digest()
            if fingerprint in seen_digests:
                duplicates_to_remove.append(file_path)
            else:
                seen_digests.add(fingerprint)
    if duplicates_to_remove:
        print(f"  Found {len(duplicates_to_remove)} duplicate images. Removing...")
        for dup_path in duplicates_to_remove:
            try:
                os.remove(dup_path)
            except OSError as e:
                print(f"Failed to remove duplicate {dup_path}: {e}")
    else:
        print("  No duplicate files were found.")
    print("--- Step 2 Complete ---\n")

def delete_small_images(target_folder, min_width, min_height):
    """Remove every image under *target_folder* that is below the minimum size.

    An image is deleted when its width is less than *min_width* or its height
    is less than *min_height*. Files that raise while being opened or removed
    are reported and left in place.

    Args:
        target_folder: Root directory to scan recursively.
        min_width: Smallest acceptable image width.
        min_height: Smallest acceptable image height.
    """
    print(f"--- Starting Step 3: Deleting Images Smaller Than {min_width}x{min_height} ---")
    removed = 0
    for folder, _, names in os.walk(target_folder):
        for name in names:
            candidate = os.path.join(folder, name)
            try:
                # Read the dimensions, then let the context manager close the
                # file before it is removed.
                with Image.open(candidate) as picture:
                    img_w, img_h = picture.size
                too_small = img_w < min_width or img_h < min_height
                if too_small:
                    os.remove(candidate)
                    removed += 1
            except Exception as err:
                print(f"Error processing file {candidate} for size check: {err}")
    if removed > 0:
        print(f"  Deleted {removed} images that were too small.")
    else:
        print("  No images were smaller than the minimum size.")
    print("--- Step 3 Complete ---\n")

if __name__ == "__main__":
    # Refuse to run while the template placeholder paths are still in place.
    if "path/to/your" in INPUT_FOLDER or "path/to/your" in OUTPUT_FOLDER:
        print("Error: Please update the INPUT_FOLDER and OUTPUT_FOLDER variables at the top of the script before running.")
        # Exit non-zero so the misconfiguration is not reported as success.
        sys.exit(1)

    print("Starting Image Curation Pipeline...")
    print("===================================")

    # Order matters: crops must exist before they can be deduplicated and pruned.
    crop_all_faces(INPUT_FOLDER, OUTPUT_FOLDER, YUNET_MODEL_PATH)
    remove_duplicate_faces(OUTPUT_FOLDER)
    delete_small_images(OUTPUT_FOLDER, min_width=MIN_WIDTH, min_height=MIN_HEIGHT)

    print("===================================")
    print("Pipeline finished successfully!")