In [1]:
import os
import shutil
import random
import concurrent.futures
import time

# --- Configuration ---

# Root folder holding one sub-folder of image patches per class.
SOURCE_DIR = "/mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet_Patches_for_Culling"

# Output root: a "train" folder placed next to SOURCE_DIR (same parent folder).
DEST_DIR = os.path.join(os.path.dirname(SOURCE_DIR), "train")

# Class sub-folders to sample. Each name is looked up under SOURCE_DIR and
# mirrored under DEST_DIR.
FOLDERS = [
    "kuusi",
    "mänty",
    "marjakuusi",
    "thuja",
]

# Upper bound on the number of images taken from each class folder.
SAMPLE_SIZE = 2000

# File-name suffixes accepted as images; names are lower-cased before the
# comparison, so the match is case-insensitive.
IMG_EXTENSIONS = (
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".gif",
    ".tif",
    ".tiff",
)

# --- End of Configuration ---


def setup_directories():
    """
    Ensure DEST_DIR and one sub-folder per entry in FOLDERS exist.

    Existing directories are left untouched. Progress is printed to stdout.

    Returns True when every directory is in place, or False (after printing
    the error) if any step raised an exception.
    """
    try:
        os.makedirs(DEST_DIR, exist_ok=True)
        print(f"Destination directory created or already exists: {DEST_DIR}")

        for class_name in FOLDERS:
            os.makedirs(os.path.join(DEST_DIR, class_name), exist_ok=True)

        print("All class sub-directories are ready.")
    except Exception as e:
        print(f"Error creating directories: {e}")
        return False
    else:
        return True

def process_folder(folder_name):
    """
    Sample one class folder into the destination tree.

    Collects the regular files in SOURCE_DIR/<folder_name> whose names end
    with one of IMG_EXTENSIONS (case-insensitive), draws up to SAMPLE_SIZE of
    them at random without replacement, and copies each one to
    DEST_DIR/<folder_name> as "<folder_name>_<n><ext>", with n counting
    from 1 and <ext> taken from the original file name.

    Returns a one-line status string: "Processed ...", "SKIPPED ..." when the
    folder holds no images, or "ERROR ..." when listing or sampling fails.
    A failed copy is printed and simply left out of the copied count.
    """
    source_root = os.path.join(SOURCE_DIR, folder_name)
    target_root = os.path.join(DEST_DIR, folder_name)

    try:
        # Keep only regular files that carry an accepted image suffix.
        candidates = []
        for entry in os.listdir(source_root):
            if not os.path.isfile(os.path.join(source_root, entry)):
                continue
            if entry.lower().endswith(IMG_EXTENSIONS):
                candidates.append(entry)

        available = len(candidates)
        if available == 0:
            return f"SKIPPED '{folder_name}': No images found."

        # Never ask for more files than the folder actually holds.
        if available < SAMPLE_SIZE:
            print(f"WARNING: Folder '{folder_name}' has only {available} images. Taking all {available}.")
            wanted = available
        else:
            wanted = SAMPLE_SIZE

        picked = random.sample(candidates, wanted)

        successes = 0
        for index, original_name in enumerate(picked, start=1):
            # Keep the original extension, e.g. "kuusi_1.jpg".
            extension = os.path.splitext(original_name)[1]
            origin = os.path.join(source_root, original_name)
            target = os.path.join(target_root, f"{folder_name}_{index}{extension}")

            try:
                shutil.copy2(origin, target)  # copy2 also carries over file metadata
            except Exception as e:
                print(f"Error copying {origin} to {target}: {e}")
            else:
                successes += 1

        return f"Processed '{folder_name}': Copied {successes}/{wanted} images."

    except FileNotFoundError:
        return f"ERROR: Source folder not found: {source_root}"
    except Exception as e:
        return f"ERROR processing folder '{folder_name}': {e}"

def main():
    """
    Build the sampled training set.

    Prints the configuration, creates the destination folders, then runs
    process_folder for every entry in FOLDERS on a thread pool, printing each
    folder's status line as it finishes, and finally prints the elapsed time.

    Returns None. Stops early, after printing a message, when the destination
    directories cannot be created.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # a wall-clock adjustment that happens while the copy is running.
    start_time = time.perf_counter()
    print("--- Starting Dataset Creation ---")
    print(f"Source: {SOURCE_DIR}")
    print(f"Destination: {DEST_DIR}")
    print(f"Sampling {SAMPLE_SIZE} images from: {', '.join(FOLDERS)}\n")

    # Create all destination folders first
    if not setup_directories():
        print("Halting script due to directory setup failure.")
        return

    print("\n--- Starting Image Sampling (Multithreaded) ---")

    # One worker per class folder so every folder is processed concurrently.
    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so keep at
    # least one worker when FOLDERS is empty.
    worker_count = max(1, len(FOLDERS))
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Map each future back to its folder so a failure can be attributed.
        futures = {executor.submit(process_folder, folder): folder for folder in FOLDERS}

        # Report results in completion order rather than submission order.
        for future in concurrent.futures.as_completed(futures):
            try:
                print(future.result())
            except Exception as e:
                print(f"An exception occurred for folder {futures[future]}: {e}")

    elapsed = time.perf_counter() - start_time
    print("\n--- All tasks complete. ---")
    print(f"Total time taken: {elapsed:.2f} seconds.")

# --- Entry point ---
# Run the pipeline only when this code is executed directly: as a script or
# as a notebook cell (where __name__ is also "__main__"). The guard keeps a
# plain `import` of this module from starting the copy job as a side effect.
if __name__ == "__main__":
    main()

--- Starting Dataset Creation ---
Source: /mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet_Patches_for_Culling
Destination: /mnt/c/Users/Pavelishko/Pictures/Хвоя/train
Sampling 2000 images from: kuusi, mänty, marjakuusi, thuja

Destination directory created or already exists: /mnt/c/Users/Pavelishko/Pictures/Хвоя/train
All class sub-directories are ready.

--- Starting Image Sampling (Multithreaded) ---
Processed 'mänty': Copied 2000/2000 images.
Processed 'kuusi': Copied 2000/2000 images.
Processed 'marjakuusi': Copied 2000/2000 images.
Processed 'thuja': Copied 2000/2000 images.

--- All tasks complete. ---
Total time taken: 45.40 seconds.
