In [6]:
import os
import shutil
import random
from PIL import Image, ImageOps
from tqdm import tqdm # A library for nice progress bars

# --- CONFIGURATION ---

# 1. Original photos: one sub-folder per class under the same parent folder.
#    The sub-folder name is later used as the class name.
SOURCE_DIRECTORIES = [
    "/mnt/c/Users/Pavelishko/Pictures/Хвоя/OriginalSize/" + class_folder
    for class_folder in ("kuusi", "mänty", "marjakuusi", "thuja")
]

# 2. Where the NEW processed dataset is written
OUTPUT_DIRECTORY = "/mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet"

# 3. Output image edge length (pixels) and number of images per class in each split
TARGET_SIZE = 800
SPLITS = dict(train=104, validation=23, test=23)

# ------------------------------------------------

# File extensions (lower-case, with leading dot) that count as images
IMAGE_EXTENSIONS = ["." + suffix for suffix in ("jpg", "jpeg", "png", "bmp", "gif")]

print("Configuration loaded.")

Configuration loaded.


In [7]:
def process_and_crop_image(src_path, dest_path, size):
    """
    Open an image, square-crop it to `size` x `size`, and save it as a JPEG.

    Processing steps:
    - Applies the EXIF orientation tag so rotated camera photos end up upright
    - Converts to RGB (drops transparency, expands palette/greyscale modes)
    - Scales and center-crops to exactly `size` x `size` (LANCZOS filter)
    - Saves the result to `dest_path` as a JPEG file

    Args:
        src_path: Path of the image file to read.
        dest_path: Path the processed JPEG is written to.
        size: Edge length, in pixels, of the square output image.

    Returns:
        True if the image was processed and saved, False if any step
        failed (the error is printed, not re-raised).
    """
    try:
        with Image.open(src_path) as img:
            # Bake the EXIF orientation into the pixel data before cropping.
            img_upright = ImageOps.exif_transpose(img)

            # Convert BEFORE resizing: Pillow falls back to NEAREST
            # resampling for palette ("P") and 1-bit images, so resizing
            # first would silently skip the LANCZOS filter for those modes.
            if img_upright.mode != 'RGB':
                img_upright = img_upright.convert('RGB')

            # ImageOps.fit scales and center-crops to the exact size.
            img_processed = ImageOps.fit(img_upright, (size, size), Image.Resampling.LANCZOS)

            # Write real JPEG data, matching the documented contract and the
            # ".jpg" names used by the caller (previously PNG data was
            # written under a ".jpg" name).
            img_processed.save(dest_path, "JPEG", quality=95)
        return True
    except Exception as e:
        print(f"Error processing {src_path}: {e}")
        return False

print("Helper function defined2.")

Helper function defined2.


In [8]:
print("Starting dataset creation...")

# Calculate total images needed per class
total_needed_per_class = sum(SPLITS.values()) # 104 + 23 + 23 = 150

# Dedicated, seeded RNG: together with the sorted file listing below this
# makes the train/validation/test assignment identical on every
# "Restart & Run All" instead of depending on global random state.
split_rng = random.Random(42)

# --- 1. Create main output directories ---
print(f"Creating directory structure at {OUTPUT_DIRECTORY}")
for split_name in SPLITS:
    # This creates output/train, output/validation, output/test
    os.makedirs(os.path.join(OUTPUT_DIRECTORY, split_name), exist_ok=True)


# --- 2. Process each source folder (class) ---
for source_folder in SOURCE_DIRECTORIES:
    # Use the folder's name (e.g., "subjectA") as the class name.
    # normpath strips a trailing slash, which would otherwise give "".
    class_name = os.path.basename(os.path.normpath(source_folder))
    print(f"\nProcessing class: {class_name}...")

    # A missing folder skips this class instead of aborting the whole run
    if not os.path.isdir(source_folder):
        print(f"  !! Warning: Source folder not found: {source_folder}. Skipping this class.")
        continue

    # Find all images in the source folder. os.listdir order is arbitrary,
    # so sort to give the seeded shuffle a stable starting point.
    all_images = sorted(
        os.path.join(source_folder, f)
        for f in os.listdir(source_folder)
        if f.lower().endswith(tuple(IMAGE_EXTENSIONS))
    )

    print(f"  Found {len(all_images)} images.")

    # Check if we have enough images
    if len(all_images) < total_needed_per_class:
        print(f"  !! Warning: Not enough images for {class_name}. Found {len(all_images)}, need {total_needed_per_class}. Skipping this class.")
        continue # Move to the next source folder

    # Create class subdirectories in train/val/test only for classes that
    # are actually processed, so skipped classes leave no empty folders.
    # e.g., output/train/subjectA, output/validation/subjectA, etc.
    for split_name in SPLITS:
        os.makedirs(os.path.join(OUTPUT_DIRECTORY, split_name, class_name), exist_ok=True)

    # Shuffle the list for a random (but reproducible) split
    split_rng.shuffle(all_images)

    # --- 3./4. Split, process and copy files ---
    # All splits draw from one shared iterator: no source image can land in
    # two splits, and an image that fails to process is replaced by the next
    # unused one instead of leaving the split short.
    candidates = iter(all_images)
    image_counter = 1 # Start renaming at 1 (e.g., subjectA_1.jpg)

    for split_name, count in SPLITS.items():
        print(f"  Processing {split_name} split ({count} images)...")
        dest_dir = os.path.join(OUTPUT_DIRECTORY, split_name, class_name)

        saved = 0
        # Use tqdm for a progress bar (counts successfully saved images)
        with tqdm(total=count, desc=f'  {split_name:10}') as progress:
            while saved < count:
                src_path = next(candidates, None)
                if src_path is None:
                    break # No spare images left for this class

                # New filename format: subjectA_1.jpg, subjectA_2.jpg, etc.
                new_filename = f"{class_name}_{image_counter}.jpg"
                dest_path = os.path.join(dest_dir, new_filename)

                # Call our helper function to do the work
                if process_and_crop_image(src_path, dest_path, TARGET_SIZE):
                    image_counter += 1 # Only increment if processing was successful
                    saved += 1
                    progress.update(1)

        if saved < count:
            print(f"  !! Warning: Only {saved} of {count} images could be saved for {class_name}/{split_name}.")

print("\n\n✅ Dataset processing complete!")
print(f"Your new dataset is ready in: {OUTPUT_DIRECTORY}")

Starting dataset creation...
Creating directory structure at /mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet

Processing class: kuusi...
  Found 154 images.
  Processing train split (104 images)...


  train     : 100%|██████████| 104/104 [01:21<00:00,  1.28it/s]


  Processing validation split (23 images)...


  validation: 100%|██████████| 23/23 [00:18<00:00,  1.27it/s]


  Processing test split (23 images)...


  test      : 100%|██████████| 23/23 [00:18<00:00,  1.27it/s]



Processing class: mänty...
  Found 150 images.
  Processing train split (104 images)...


  train     : 100%|██████████| 104/104 [01:31<00:00,  1.13it/s]


  Processing validation split (23 images)...


  validation: 100%|██████████| 23/23 [00:19<00:00,  1.17it/s]


  Processing test split (23 images)...


  test      : 100%|██████████| 23/23 [00:20<00:00,  1.11it/s]



Processing class: marjakuusi...
  Found 158 images.
  Processing train split (104 images)...


  train     : 100%|██████████| 104/104 [01:31<00:00,  1.14it/s]


  Processing validation split (23 images)...


  validation: 100%|██████████| 23/23 [00:19<00:00,  1.16it/s]


  Processing test split (23 images)...


  test      : 100%|██████████| 23/23 [00:20<00:00,  1.13it/s]



Processing class: thuja...
  Found 164 images.
  Processing train split (104 images)...


  train     : 100%|██████████| 104/104 [01:15<00:00,  1.38it/s]


  Processing validation split (23 images)...


  validation: 100%|██████████| 23/23 [00:16<00:00,  1.43it/s]


  Processing test split (23 images)...


  test      : 100%|██████████| 23/23 [00:16<00:00,  1.40it/s]



✅ Dataset processing complete!
Your new dataset is ready in: /mnt/c/Users/Pavelishko/Pictures/Хвоя/NewSet



