In [1]:
import os
import json
import shutil
from hashlib import md5
from PIL import Image
import time

In [2]:
def calculate_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in 4096-byte blocks so that large files never have to
    be held in memory in full.
    """
    digest = md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()

In [3]:
def load_processed_files(log_path):
    """Read the JSON log at ``log_path`` and return its entries as a set.

    Returns an empty set when no file exists at ``log_path``.
    """
    if not os.path.exists(log_path):
        return set()

    with open(log_path, "r") as handle:
        entries = json.load(handle)
    return set(entries)

In [4]:
def save_processed_files(log_path, processed_files):
    """Save processed files to a JSON log without clobbering it on failure.

    The entries are first serialised to ``<log_path>.tmp`` and that file is
    then moved over ``log_path`` with ``os.replace``. If serialisation fails
    (or the process is interrupted mid-write) the previous log is left
    untouched instead of being truncated.

    Args:
        log_path (str): Path of the JSON log file to write.
        processed_files (iterable): Entries to store; written as a JSON list.

    Raises:
        TypeError: If an entry is not JSON-serialisable (from ``json.dump``).
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{log_path}.tmp"
    try:
        with open(tmp_path, "w") as log_file:
            json.dump(list(processed_files), log_file)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, log_path)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # the partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [5]:
def count_images(folder_path, extensions=None):
    """
    Walk a folder tree and tally the files that look like images.

    A file counts as an image when its lower-cased extension is a member of
    ``extensions``; file contents are not inspected.

    Args:
        folder_path (str): Root folder to walk (sub-folders included).
        extensions (set): Accepted extensions, each with its leading dot.
            Defaults to .jpg, .jpeg, .png, .bmp, .gif and .tiff.

    Returns:
        int: Number of matching files found under ``folder_path``.
    """
    if extensions is None:
        extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}

    total = 0
    for _current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            suffix = os.path.splitext(name)[1].lower()
            if suffix in extensions:
                total += 1
    return total

In [6]:
def process_images(source_folder, dest_folders, log_path, batch_size=100, extensions=None, force_recheck=False, dest_hashes=None):
    """Process images from source to destination folders.

    Walks ``source_folder``, hashes every file whose extension is in
    ``extensions`` and hands the files that still need copying to
    ``process_batch`` in groups of ``batch_size``. Skipped files are appended
    to ``skipped_files.log`` in the current working directory, and the two
    totals are printed at the end.

    Args:
        source_folder (str): Root folder to scan for images.
        dest_folders (dict): Destination folders; its values are walked to
            collect the hashes of files that already exist, and the dict is
            passed on to ``process_batch``.
        log_path (str): Path to the JSON log of processed hashes.
        batch_size (int): Number of queued files that triggers a batch.
        extensions (set): Accepted lower-case extensions with leading dot.
            Defaults to .jpg, .jpeg, .png, .bmp, .gif and .tiff.
        force_recheck (bool): When True, no file is skipped and the
            destination folders are not hashed at all.
        dest_hashes (set): Optional precomputed hashes of the destination
            files. Passing one set to several calls avoids re-hashing the
            whole destination tree each time. It is only read, never
            modified. When None (the default) the hashes are computed here.
    """
    if extensions is None:
        extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}

    processed_files = load_processed_files(log_path)

    if force_recheck:
        # The destination hashes are only consulted by the skip check, which
        # is disabled in this mode, so do not pay for hashing every file.
        dest_hashes = set()
    elif dest_hashes is None:
        # Calculate hashes for all destination images
        dest_hashes = set()
        for folder in dest_folders.values():
            for root, _, files in os.walk(folder):
                for file in files:
                    dest_hashes.add(calculate_hash(os.path.join(root, file)))

    current_batch = []
    # Hashes waiting in current_batch. processed_files is only updated when a
    # batch is flushed, so without this set two identical files that land in
    # the same batch would both be copied.
    queued_hashes = set()
    total_processed = 0
    skipped = 0

    for root, _, files in os.walk(source_folder):
        for file in files:
            if os.path.splitext(file)[1].lower() not in extensions:
                continue

            file_path = os.path.join(root, file)
            file_hash = calculate_hash(file_path)

            # Check if the file should be skipped
            if not force_recheck:
                already_stored = file_hash in processed_files or file_hash in dest_hashes
                if already_stored or file_hash in queued_hashes:
                    skipped += 1
                    with open("skipped_files.log", "a") as skip_log:
                        if already_stored:
                            skip_log.write(f"Skipped {file_path} - Already processed or exists in destination\n")
                        else:
                            skip_log.write(f"Skipped {file_path} - Duplicate of a file already queued in this batch\n")
                    continue

            current_batch.append((file_path, file_hash))
            queued_hashes.add(file_hash)

            if len(current_batch) >= batch_size:
                total_processed += process_batch(current_batch, dest_folders, processed_files, log_path)
                current_batch = []
                # Flushed hashes are now covered by processed_files.
                queued_hashes.clear()

    # Process any remaining images in the batch
    if current_batch:
        total_processed += process_batch(current_batch, dest_folders, processed_files, log_path)

    print(f"Total new images processed: {total_processed}")
    print(f"Total images skipped: {skipped}")

In [7]:
def process_batch(batch, dest_folders, processed_files, log_path):
    """Process a batch of images.

    Each image is opened to read its dimensions and copied (with metadata,
    via ``shutil.copy2``) into the "small", "medium" or "large" destination
    folder according to its pixel count. Failures for individual files are
    appended to ``error_log.txt`` in the current working directory and do not
    stop the batch. The processed-hash log is saved once at the end.

    If a different file with the same name already exists in the destination
    folder, the new file is stored as ``<stem>_<hash><ext>`` instead of
    overwriting it.

    Args:
        batch (list): ``(file_path, file_hash)`` tuples to copy.
            ``file_hash`` is assumed to be the value ``calculate_hash``
            returns for ``file_path``; it is compared against the hash of an
            existing destination file to detect name collisions.
        dest_folders (dict): Folders keyed by "small", "medium" and "large".
        processed_files (set): Hashes already handled; updated in place with
            the hash of every successfully copied file.
        log_path (str): Path to the JSON log that ``processed_files`` is
            saved to.

    Returns:
        int: Number of files copied successfully.
    """
    processed_count = 0

    for file_path, file_hash in batch:
        try:
            # Only the dimensions are needed, so the image handle is released
            # before the copy starts.
            with Image.open(file_path) as img:
                width, height = img.size
            pixel_count = width * height

            if pixel_count <= 256 * 256:
                dest_folder = dest_folders["small"]
            elif pixel_count <= 4932 * 4932:
                dest_folder = dest_folders["medium"]
            else:
                dest_folder = dest_folders["large"]

            os.makedirs(dest_folder, exist_ok=True)

            file_name = os.path.basename(file_path)
            dest_path = os.path.join(dest_folder, file_name)
            # Different images from different source folders can share a file
            # name; never let one silently overwrite the other.
            if os.path.exists(dest_path) and calculate_hash(dest_path) != file_hash:
                stem, ext = os.path.splitext(file_name)
                dest_path = os.path.join(dest_folder, f"{stem}_{file_hash}{ext}")

            shutil.copy2(file_path, dest_path)

            processed_files.add(file_hash)
            processed_count += 1

        except Exception as e:
            # Best effort: record the failure and carry on with the batch.
            with open("error_log.txt", "a") as error_log:
                error_log.write(f"Error processing {file_path}: {e}\n")

    save_processed_files(log_path, processed_files)
    return processed_count

In [8]:
def main():
    """Merge the configured source datasets into the size-bucketed folders.

    Counts the images in the source and destination folders, prints both
    totals, runs ``process_images`` on every source folder when the totals
    differ, and finally prints the elapsed wall-clock time.
    """
    started_at = time.time()

    source_folders = [
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/celeba_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/ck_dataset_filtered",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/faces_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/fer_2013_dataset",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/flickr_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/google_images_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/humans_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/jaffe_dataset",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/lfw_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/pexels_dataset_curated",
        "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/wider_face_dataset_curated"
    ]
    dest_folders = {
        "small": "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small",
        "medium": "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_medium",
        "large": "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_large",
    }
    log_path = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/processed_files_log.json"

    # One extension set shared by both counts so they cannot drift apart.
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}

    total_source = 0
    for folder in source_folders:
        total_source += count_images(folder, image_extensions)

    total_dest = 0
    for folder in dest_folders.values():
        total_dest += count_images(folder, image_extensions)

    print(f"Total images in source folders: {total_source}")
    print(f"Total images in destination folders: {total_dest}")

    # NOTE(review): matching totals are treated as "nothing to do". Equal
    # counts do not by themselves show that every source image was copied -
    # confirm this shortcut is intended.
    if total_source == total_dest:
        print("All images are accounted for.")
    else:
        for folder in source_folders:
            process_images(folder, dest_folders, log_path, batch_size=100, force_recheck=False)

    elapsed_time = time.time() - started_at
    print(f"Total execution time: {elapsed_time:.2f} seconds")

In [9]:
# Entry point: run the full pipeline when this code is executed directly
# (as opposed to being imported as a module).
if __name__ == "__main__":
    main()

Total images in source folders: 481364
Total images in destination folders: 457857
Total new images processed: 0
Total images skipped: 111477
Total new images processed: 0
Total images skipped: 981
Total new images processed: 11
Total images skipped: 67590
Total new images processed: 2
Total images skipped: 35885
Total new images processed: 0
Total images skipped: 47559
Total new images processed: 0
Total images skipped: 965
Total new images processed: 0
Total images skipped: 132546
Total new images processed: 0
Total images skipped: 213
Total new images processed: 0
Total images skipped: 3126
Total new images processed: 0
Total images skipped: 33620
Total new images processed: 0
Total images skipped: 47389
Total execution time: 15198.92 seconds
