In [5]:
import os
import json
from PIL import Image
import shutil

In [6]:
def categorize_and_copy_images(source_folder, target_folder, processed_files_log, batch_size=50):
    """
    Categorize images by their pixel count and copy them into size-specific folders.

    Every file found under ``source_folder`` (walked recursively) is opened with
    PIL. Its ``width * height`` decides whether it is copied into
    ``combined_datasets_small``, ``combined_datasets_medium`` or
    ``combined_datasets_large`` inside ``target_folder``. The paths of copied
    files are stored in a JSON log so that later runs skip them.

    Args:
        source_folder (str): Path to the folder containing the images.
        target_folder (str): Path to the base target folder where categorized images will be copied.
        processed_files_log (str): Path to a JSON file (a list of source paths) used to log processed files.
        batch_size (int): Number of newly copied files between two writes of the log.

    Returns:
        None

    Notes:
        - Files are copied under their base name only, so two source files with
          the same name in different folders overwrite each other in the
          destination folder.
        - Files that cannot be opened or copied are reported and counted, but
          not logged, so they are retried on the next run.
        - A missing, empty or unparsable log is treated as "nothing processed
          yet" instead of aborting the run.
        - The log is written even when the run is interrupted, so completed
          copies are not repeated afterwards.
    """
    # Category boundaries, expressed as total pixel counts (width * height).
    small_max_pixels = 256 * 256
    medium_max_pixels = 4932 * 4932

    # Ensure target folders exist
    category_folders = {
        "small": os.path.join(target_folder, "combined_datasets_small"),
        "medium": os.path.join(target_folder, "combined_datasets_medium"),
        "large": os.path.join(target_folder, "combined_datasets_large"),
    }
    for folder in category_folders.values():
        os.makedirs(folder, exist_ok=True)

    processed_files = _load_processed_files(processed_files_log)

    total_files_detected = 0
    valid_images = 0
    skipped_files = 0
    already_processed = 0
    categorized_files = {"small": 0, "medium": 0, "large": 0}
    batch_count = 0

    try:
        # Process each image in the source folder
        for root, _, files in os.walk(source_folder):
            for file in files:
                total_files_detected += 1
                file_path = os.path.join(root, file)

                if file_path in processed_files:
                    print(f"Skipping already processed file: {file_path}")
                    already_processed += 1
                    continue

                try:
                    # Only the dimensions are needed; close the image before copying.
                    with Image.open(file_path) as img:
                        width, height = img.size
                    pixel_count = width * height

                    print(f"Processing {file}: {width}x{height}, {pixel_count} pixels")

                    if pixel_count <= small_max_pixels:
                        category = "small"
                    elif pixel_count <= medium_max_pixels:
                        category = "medium"
                    else:
                        category = "large"
                    destination = os.path.join(category_folders[category], file)

                    # Copy the image to the correct subfolder
                    shutil.copy2(file_path, destination)
                    print(f"Copied {file} to {destination}")
                except Exception as e:
                    # Best effort: non-image or unreadable files must not stop the run.
                    print(f"Failed to process {file_path}: {e}")
                    skipped_files += 1
                    continue

                categorized_files[category] += 1
                valid_images += 1
                processed_files.add(file_path)
                batch_count += 1

                # Log progress every batch_size files
                if batch_count >= batch_size:
                    try:
                        _save_processed_files(processed_files_log, processed_files)
                    except OSError as e:
                        # Keep going; the counter is not reset, so the next
                        # copied file triggers another attempt.
                        print(f"Could not write progress log {processed_files_log}: {e}")
                    else:
                        print(f"Logged progress for {batch_count} files.")
                        batch_count = 0
    finally:
        # Final log update; also runs when the loop is interrupted.
        _save_processed_files(processed_files_log, processed_files)

    # Summary of the operation
    print("\nImage categorization complete.")
    print(f"Total files detected: {total_files_detected}")
    print(f"Valid images processed: {valid_images}")
    print(f"Skipped files: {skipped_files}")
    print(f"Already processed files: {already_processed}")
    print("Categorization summary:")
    print(f"  Small images: {categorized_files['small']}")
    print(f"  Medium images: {categorized_files['medium']}")
    print(f"  Large images: {categorized_files['large']}")


def _load_processed_files(log_path):
    """Return the set of paths stored in the log, creating the log if it is missing."""
    if not os.path.exists(log_path):
        _save_processed_files(log_path, set())
        print(f"Created new log file: {log_path}")
        return set()

    try:
        with open(log_path, "r") as log_file:
            entries = json.load(log_file)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; an empty or truncated file lands here.
        print(f"Log file {log_path} is unreadable ({e}); starting with an empty log.")
        return set()

    if not isinstance(entries, list):
        print(f"Log file {log_path} does not contain a list; starting with an empty log.")
        return set()
    return {entry for entry in entries if isinstance(entry, str)}


def _save_processed_files(log_path, processed_files):
    """Write the processed paths to the log via a temp file and an atomic rename."""
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Writing next to the log and renaming keeps the previous log intact if the
    # write is interrupted half-way.
    temp_path = log_path + ".tmp"
    with open(temp_path, "w") as temp_file:
        json.dump(sorted(processed_files), temp_file)
    os.replace(temp_path, log_path)

In [7]:
def process_multiple_folders(base_folder, target_folder, processed_files_log, batch_size=50):
    """
    Run the image categorizer once for every direct subfolder of a base folder.

    Args:
        base_folder (str): Folder whose immediate subdirectories are processed;
            plain files directly inside it are ignored.
        target_folder (str): Base folder that receives the categorized copies.
        processed_files_log (str): Path to the JSON progress log shared by all subfolders.
        batch_size (int): Number of files to process before logging progress.

    Returns:
        None. Any exception raised while listing or processing is caught and
        reported on stdout as "Critical error: ..." rather than propagated.
    """
    try:
        # Collect the complete list of subdirectories before processing starts.
        folder_paths = []
        for entry_name in os.listdir(base_folder):
            candidate = os.path.join(base_folder, entry_name)
            if os.path.isdir(candidate):
                folder_paths.append(candidate)

        for folder_path in folder_paths:
            print(f"Processing folder: {folder_path}")
            categorize_and_copy_images(folder_path, target_folder, processed_files_log, batch_size)
    except Exception as error:
        print(f"Critical error: {error}")

In [8]:
# Example usage: run the categorizer over every subfolder of BASE_FOLDER.
if __name__ == "__main__":
    # Folder whose direct subfolders hold the source images.
    BASE_FOLDER = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split"
    # Folder that receives the combined_datasets_* output folders.
    TARGET_FOLDER = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets"
    # JSON progress log, reused across runs so finished files are skipped.
    PROCESSED_FILES_LOG = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/processed_files_log.json"

    process_multiple_folders(
        base_folder=BASE_FOLDER,
        target_folder=TARGET_FOLDER,
        processed_files_log=PROCESSED_FILES_LOG,
        batch_size=50,
    )

Processing folder: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201520.png_face1.jpg
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201521.png_face1.jpg
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201522.png_face1.jpg
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201523.png_face1.jpg
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201524.png_face1.jpg
Skipping already processed file: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_split/subfolder_100/201525.png_face1.jpg
Skippi