In [None]:
import json
import os

import cv2
import numpy as np
from insightface.app import FaceAnalysis

# Initialize the InsightFace FaceAnalysis model once at import time; the
# module-level `app` is what detect_faces_retinaface() calls for every image.
# NOTE(review): the file treats this as a RetinaFace detector -- which models
# FaceAnalysis() actually loads by default is not visible here; confirm.
# NOTE(review): ctx_id=-1 presumably selects CPU execution and det_size the
# detector input resolution -- confirm against the InsightFace documentation.
app = FaceAnalysis()
app.prepare(ctx_id=-1, det_size=(640, 640))

def is_valid_image(file_path):
    """
    Check if a file is a valid image.

    The file's bytes are read in full and handed to ``cv2.imdecode``; the
    file counts as valid only when OpenCV returns a decoded array.

    Args:
        file_path (str): Path to the image file.

    Returns:
        bool: True if the file is a valid image, False otherwise (the file
        cannot be read, is empty, or holds bytes OpenCV cannot decode).
    """
    # Read on its own so that only genuine I/O failures are reported as an
    # invalid image. Catching every exception here would also hide
    # programming errors (e.g. a missing import) and silently mark every
    # file in the dataset as invalid.
    try:
        with open(file_path, 'rb') as f:
            raw_bytes = f.read()
    except OSError as e:
        print(f"Invalid image {file_path}: {e}")
        return False

    # An empty file cannot be an image; return early rather than handing
    # OpenCV an empty buffer.
    if not raw_bytes:
        return False

    try:
        img = cv2.imdecode(np.frombuffer(raw_bytes, np.uint8), cv2.IMREAD_COLOR)
    except cv2.error as e:
        print(f"Invalid image {file_path}: {e}")
        return False

    # imdecode signals undecodable data by returning None.
    return img is not None

def detect_faces_retinaface(image_path):
    """
    Detect faces in an image using RetinaFace.

    Failures are raised rather than reported as "no face": process_batch()
    deletes every file for which this function returns False, so an image
    that merely failed to load or to be analysed must not look like a
    face-less image.

    Args:
        image_path (str): Path to the image file.

    Returns:
        bool: True if faces are detected, False otherwise.

    Raises:
        ValueError: If OpenCV cannot load the image at ``image_path``.
        Exception: Any error raised by the detector is propagated unchanged.
    """
    img = cv2.imread(image_path)
    # cv2.imread does not raise on an unreadable file; it returns None.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    faces = app.get(img)
    return len(faces) > 0

def process_images_in_batches(dataset_folder, processed_files_log, batch_size=100):
    """
    Walk the dataset folder and hand unprocessed, valid images to process_batch().

    Files already listed in the JSON log, and files that is_valid_image()
    rejects, are skipped. The remaining paths are collected and passed to
    process_batch() in groups of ``batch_size``; a final partial group is
    flushed after the walk. Totals are printed at the end.

    Args:
        dataset_folder (str): Path to the dataset folder containing images.
        processed_files_log (str): Path to a JSON file to log processed files.
        batch_size (int): Number of files to process per batch.

    Returns:
        None
    """
    # Start from an empty record and overwrite it when a log already exists.
    processed_files = set()
    if os.path.exists(processed_files_log):
        with open(processed_files_log, "r") as log_file:
            processed_files = set(json.load(log_file))

    pending = []
    processed_total = 0
    deleted_total = 0

    for dirpath, _, filenames in os.walk(dataset_folder):
        for filename in filenames:
            candidate = os.path.join(dirpath, filename)

            # The log lookup comes first so already-processed files are
            # never decoded again by is_valid_image().
            is_new_and_valid = candidate not in processed_files and is_valid_image(candidate)
            if not is_new_and_valid:
                print(f"Skipping already processed or invalid file: {candidate}")
                continue

            pending.append(candidate)
            if len(pending) < batch_size:
                continue

            # Batch is full: process it and start a new one.
            removed = process_batch(pending, processed_files, processed_files_log)
            processed_total += len(pending)
            deleted_total += removed
            pending = []

    # Flush whatever is left over after the walk.
    if pending:
        removed = process_batch(pending, processed_files, processed_files_log)
        processed_total += len(pending)
        deleted_total += removed

    print(f"Total images processed: {processed_total}")
    print(f"Total images deleted (non-human faces): {deleted_total}")

def process_batch(batch, processed_files, processed_files_log):
    """
    Process a batch of images, deleting those without human faces.

    Each path is passed to detect_faces_retinaface(); files for which it
    returns False are removed from disk. Every file handled without an
    exception is added to ``processed_files`` (mutated in place), and the
    whole set is then written to the JSON log.

    Args:
        batch (list): List of image file paths to process.
        processed_files (set): Set of processed files to update.
        processed_files_log (str): Path to a JSON file to log processed files.

    Returns:
        int: Number of images deleted.
    """
    deleted_count = 0

    for file_path in batch:
        # Per-file boundary: one failing file is reported and left out of the
        # log so it can be retried, without aborting the rest of the batch.
        try:
            if not detect_faces_retinaface(file_path):  # Use RetinaFace detection
                os.remove(file_path)
                deleted_count += 1
                print(f"Deleted non-human face image: {file_path}")
            else:
                print(f"Human face detected in: {file_path}")

            # Add file to processed log
            processed_files.add(file_path)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    # Save progress to the JSON log. Write to a sibling temp file and swap it
    # in with os.replace so that an interruption mid-write cannot leave a
    # truncated log behind (which would make the next run fail in json.load).
    tmp_log_path = f"{processed_files_log}.tmp"
    with open(tmp_log_path, "w") as log_file:
        json.dump(list(processed_files), log_file)
    os.replace(tmp_log_path, processed_files_log)
    print(f"Progress saved for {len(batch)} files.")

    return deleted_count

def main():
    """
    Run the batch face-filtering pass with the script's fixed settings.

    The dataset folder, progress-log path and batch size are hard-coded
    here and passed straight to process_images_in_batches().
    """
    dataset_root = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets"
    progress_log = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/processed_faces_log.json"
    files_per_batch = 100

    process_images_in_batches(
        dataset_folder=dataset_root,
        processed_files_log=progress_log,
        batch_size=files_per_batch,
    )

# Run the cleanup only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
