# In [None]:
import os
import json
import shutil
import time
from hashlib import md5
from PIL import Image

def calculate_hash(file_path):
    """
    Return the hexadecimal MD5 digest of the file at file_path.

    The file is read in 4096-byte pieces so it is never held in memory
    as a whole.
    """
    digest = md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                # End of file reached.
                break
            digest.update(block)
    return digest.hexdigest()

def load_processed_files(log_path):
    """
    Read the JSON log at log_path and return its entries as a set.

    Returns an empty set when the log file does not exist.
    """
    if not os.path.exists(log_path):
        return set()

    with open(log_path, "r") as handle:
        entries = json.load(handle)
    return set(entries)

def save_processed_files(log_path, processed_files):
    """
    Save processed files to a JSON log.

    The entries are written as a JSON list. The data is first written to a
    temporary file next to log_path and then moved into place, so a failure
    while writing (interruption, unserializable entry, full disk) leaves
    the previous log intact instead of a truncated file.

    Args:
        log_path: Path of the JSON log file to write.
        processed_files: Iterable of JSON-serializable entries to store.
    """
    # Same directory as the log so os.replace stays on one filesystem.
    tmp_path = f"{log_path}.tmp"
    try:
        with open(tmp_path, "w") as log_file:
            json.dump(list(processed_files), log_file)
        os.replace(tmp_path, log_path)
    finally:
        # After a successful replace the temp file is gone; after a failure
        # remove the partial file without masking the original exception.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def process_images_in_batches(source_folders, dest_folders, log_path, batch_size=1000, extensions=None):
    """
    Process images from source folders to destination folders in batches.

    Every source folder is walked recursively. Each file whose extension is
    in ``extensions`` is hashed and queued unless the same content is
    already recorded in the log, already present in a destination folder,
    or already waiting in the current batch. Whenever ``batch_size`` files
    are queued they are handed to ``process_batch`` and a progress line is
    printed; a final partial batch is flushed at the end.

    Args:
        source_folders: Iterable of directories to scan.
        dest_folders: Mapping forwarded to ``process_batch``; every folder
            in its values is hashed up front for duplicate detection.
        log_path: Path of the JSON log of processed file hashes.
        batch_size: Number of queued files that triggers a batch.
        extensions: File extensions (with leading dot, any case) to accept.
            Defaults to common image formats.
    """
    if extensions is None:
        extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'}
    # File extensions are lower-cased before the lookup, so normalise the
    # accepted set too; otherwise an entry such as ".JPG" could never match.
    extensions = {ext.lower() for ext in extensions}

    processed_files = load_processed_files(log_path)
    dest_hashes = set()

    # Calculate hashes for all destination images
    for folder in dest_folders.values():
        for root, _, files in os.walk(folder):
            for file in files:
                file_path = os.path.join(root, file)
                dest_hashes.add(calculate_hash(file_path))

    current_batch = []
    # Hashes waiting in current_batch. processed_files is only updated when
    # a batch is flushed, so without this set two identical files found in
    # the same batch would both be copied.
    queued_hashes = set()
    total_processed = 0
    total_skipped = 0
    start_time = time.time()

    for source_folder in source_folders:
        for root, _, files in os.walk(source_folder):
            for file in files:
                if os.path.splitext(file)[1].lower() not in extensions:
                    continue

                file_path = os.path.join(root, file)
                file_hash = calculate_hash(file_path)

                if (
                    file_hash in processed_files
                    or file_hash in dest_hashes
                    or file_hash in queued_hashes
                ):
                    total_skipped += 1
                    continue

                current_batch.append((file_path, file_hash))
                queued_hashes.add(file_hash)

                if len(current_batch) >= batch_size:
                    copied = process_batch(current_batch, dest_folders, processed_files, log_path)
                    total_processed += copied
                    current_batch = []
                    queued_hashes = set()

                    elapsed = time.time() - start_time
                    # Report what was actually copied; files that failed in
                    # process_batch are not counted.
                    print(f"Copied {copied} images. Total processed: {total_processed}. Time elapsed: {elapsed:.2f} seconds")

    if current_batch:
        total_processed += process_batch(current_batch, dest_folders, processed_files, log_path)

    elapsed = time.time() - start_time
    print(f"Processing complete. Total processed: {total_processed}, Total skipped: {total_skipped}, Time elapsed: {elapsed:.2f} seconds")

def _unique_dest_path(dest_folder, file_path, file_hash):
    """
    Return a path in dest_folder for file_path that does not exist yet.

    The original file name is kept when it is free. Otherwise a prefix of
    the content hash (and, if still taken, a counter) is inserted before
    the extension so an existing file is never overwritten.
    """
    base_name = os.path.basename(file_path)
    candidate = os.path.join(dest_folder, base_name)
    if not os.path.exists(candidate):
        return candidate

    stem, ext = os.path.splitext(base_name)
    tag = file_hash[:8]
    candidate = os.path.join(dest_folder, f"{stem}_{tag}{ext}")
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(dest_folder, f"{stem}_{tag}_{counter}{ext}")
        counter += 1
    return candidate


def process_batch(batch, dest_folders, processed_files, log_path):
    """
    Process a batch of images.

    Each image is opened to read its dimensions and copied into the
    "small" (at most 48*48 pixels), "medium" (at most 1000*1000 pixels) or
    "large" destination folder. Files from different sources that share a
    file name are stored under distinct names instead of overwriting each
    other. Failures are appended to ``error_log.txt`` (relative to the
    current working directory) and do not stop the batch.

    Args:
        batch: Iterable of ``(file_path, file_hash)`` pairs.
        dest_folders: Mapping with the keys "small", "medium" and "large"
            giving the destination directory for each size class.
        processed_files: Set of hashes; updated in place with every file
            that was copied successfully.
        log_path: Path of the JSON log that ``processed_files`` is saved to
            after the batch.

    Returns:
        The number of files copied successfully.
    """
    processed_count = 0

    for file_path, file_hash in batch:
        try:
            # Only the dimensions are needed, so release the image handle
            # before copying.
            with Image.open(file_path) as img:
                width, height = img.size
            pixel_count = width * height

            if pixel_count <= 48 * 48:
                dest_folder = dest_folders["small"]
            elif pixel_count <= 1000 * 1000:
                dest_folder = dest_folders["medium"]
            else:
                dest_folder = dest_folders["large"]

            os.makedirs(dest_folder, exist_ok=True)
            dest_path = _unique_dest_path(dest_folder, file_path, file_hash)
            shutil.copy2(file_path, dest_path)

            processed_files.add(file_hash)
            processed_count += 1

        except Exception as e:
            # Best effort per file: record the failure and continue with
            # the rest of the batch.
            with open("error_log.txt", "a") as error_log:
                error_log.write(f"Error processing {file_path}: {e}\n")

    save_processed_files(log_path, processed_files)
    return processed_count

def main():
    """
    Run the batch copy with the hard-coded dataset locations.
    """
    log_path = "/home/natalyagrokh/img_datasets/processed_files_log.json"

    # One destination directory per size class.
    dest_folders = dict(
        small="/home/natalyagrokh/img_datasets/combined_datasets_small",
        medium="/home/natalyagrokh/img_datasets/combined_datasets_medium",
        large="/home/natalyagrokh/img_datasets/combined_datasets_large",
    )

    # Dataset directories that are scanned for images.
    source_folders = []
    source_folders.append("/home/natalyagrokh/img_datasets/celeba_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/ck_dataset_filtered")
    source_folders.append("/home/natalyagrokh/img_datasets/faces_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/fer_2013_dataset")
    source_folders.append("/home/natalyagrokh/img_datasets/flickr_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/google_images_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/humans_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/jaffe_dataset")
    source_folders.append("/home/natalyagrokh/img_datasets/lfw_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/pexels_dataset_curated")
    source_folders.append("/home/natalyagrokh/img_datasets/wider_face_dataset_curated")

    process_images_in_batches(source_folders, dest_folders, log_path, batch_size=1000)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()