In [9]:
import os
import shutil
import random
from pathlib import Path

def create_dir_if_not_exists(path):
    """Ensure that the directory ``path`` exists, creating parents as needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created for any other reason.
    """
    # exist_ok=True lets makedirs handle the "already there" case itself, so
    # there is no window between a separate existence check and the creation
    # in which another process could create the directory and make this fail.
    os.makedirs(path, exist_ok=True)

def split_data(source_dir, train_dir, val_dir, test_dir, train_size=0.8, val_size=0.1, test_size=0.1, random_seed=42):
    """Copy a class-per-folder image dataset into train/val/test directories.

    Every immediate subdirectory of ``source_dir`` is treated as one class.
    Its image files are shuffled and copied (the originals stay in place)
    into ``<split_dir>/<class_name>/``. Progress and a final summary are
    printed; nothing is returned.

    Per class, the train and validation counts are the split ratio times the
    number of images, rounded down; the test split receives the remainder.

    Files already present in the destination folders are never removed, so
    reusing output directories with a different seed or different ratios can
    leave the same image in more than one split.

    Args:
        source_dir: Directory whose subdirectories are the class folders.
        train_dir: Output directory for the training split.
        val_dir: Output directory for the validation split.
        test_dir: Output directory for the test split.
        train_size: Fraction of each class assigned to training.
        val_size: Fraction of each class assigned to validation.
        test_size: Fraction of each class assigned to testing.
        random_seed: Seed passed to ``random.seed`` before shuffling.

    Raises:
        AssertionError: If the three split fractions do not sum to 1.0
            (within 1e-6).
        OSError: If ``source_dir`` cannot be listed (for example it does not
            exist) or an output directory cannot be created.
    """
    random.seed(random_seed)

    # Validate split sizes. Raised explicitly instead of via ``assert`` so the
    # check still runs under ``python -O``; the exception type is unchanged.
    if not abs(train_size + val_size + test_size - 1.0) < 1e-6:
        raise AssertionError("Split ratios must add up to 1.0")

    # Convert to absolute paths
    source_dir = os.path.abspath(source_dir)
    train_dir = os.path.abspath(train_dir)
    val_dir = os.path.abspath(val_dir)
    test_dir = os.path.abspath(test_dir)

    print(f"Source directory: {source_dir}")
    print(f"Train directory: {train_dir}")
    print(f"Validation directory: {val_dir}")
    print(f"Test directory: {test_dir}")

    # List the source before creating anything so that a missing or unreadable
    # source fails without leaving empty output folders behind. The listing is
    # sorted because os.listdir returns entries in arbitrary order, which
    # would otherwise make the seeded shuffle depend on the filesystem.
    class_names = sorted(os.listdir(source_dir))

    # Create main output directories
    split_dirs = (train_dir, val_dir, test_dir)
    for path in split_dirs:
        os.makedirs(path, exist_ok=True)

    # Used to recognise output folders that live inside the source directory,
    # so that a rerun does not treat them as classes.
    output_dirs = {os.path.normcase(path) for path in split_dirs}

    # Image file extensions to include
    valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}

    total_images = 0
    total_train = 0
    total_val = 0
    total_test = 0
    copy_errors = 0

    # For each class folder in the source directory
    for class_name in class_names:
        class_path = os.path.join(source_dir, class_name)

        if not os.path.isdir(class_path):
            print(f"Skipping non-directory: {class_name}")
            continue

        if os.path.normcase(class_path) in output_dirs:
            print(f"Skipping output directory: {class_name}")
            continue

        print(f"\n📂 Processing class: {class_name}")

        # Create class subfolders in each split directory
        class_train_path = os.path.join(train_dir, class_name)
        class_val_path = os.path.join(val_dir, class_name)
        class_test_path = os.path.join(test_dir, class_name)
        for path in (class_train_path, class_val_path, class_test_path):
            os.makedirs(path, exist_ok=True)

        # Get all image files in class folder (filter by extension). Sorted so
        # the shuffle below starts from the same order on every machine.
        images = sorted(
            f for f in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, f))
            and os.path.splitext(f)[1].lower() in valid_extensions
        )

        if not images:
            print(f"⚠️  No valid image files found in {class_name}")
            continue

        # Shuffle images for random distribution
        random.shuffle(images)

        total = len(images)
        # The small epsilon keeps products such as 100 * 0.29
        # (28.999999999999996 in floating point) from truncating to 28.
        train_count = int(total * train_size + 1e-9)
        val_count = int(total * val_size + 1e-9)

        # The test split takes everything left over, so the three slices
        # always cover every image exactly once.
        train_imgs = images[:train_count]
        val_imgs = images[train_count:train_count + val_count]
        test_imgs = images[train_count + val_count:]

        print(f"Total: {total} | Train: {len(train_imgs)} | Val: {len(val_imgs)} | Test: {len(test_imgs)}")

        # Update totals
        total_images += total
        total_train += len(train_imgs)
        total_val += len(val_imgs)
        total_test += len(test_imgs)

        # Copy files to appropriate directories
        for img_list, target_dir, split_name in [
            (train_imgs, class_train_path, "train"),
            (val_imgs, class_val_path, "val"),
            (test_imgs, class_test_path, "test"),
        ]:
            for img in img_list:
                src_path = os.path.join(class_path, img)
                dst_path = os.path.join(target_dir, img)
                try:
                    shutil.copy2(src_path, dst_path)  # copy2 preserves metadata
                except OSError as e:
                    # Best effort: report the failure and keep going.
                    copy_errors += 1
                    print(f"Error copying {img} to {split_name}: {e}")

    def percent(count):
        # Guard against division by zero when no images were found at all.
        return count / total_images * 100 if total_images else 0.0

    if copy_errors:
        print(f"Dataset split completed with {copy_errors} copy error(s).")
    else:
        print("Dataset split completed successfully!")
    print("SUMMARY:")
    print(f"Total images processed: {total_images}")
    print(f"Training set: {total_train} ({percent(total_train):.1f}%)")
    print(f"Validation set: {total_val} ({percent(total_val):.1f}%)")
    print(f"Test set: {total_test} ({percent(total_test):.1f}%)")

# The working directory is used as the dataset source, so its immediate
# subfolders are the classes. The splits are written to a sibling "dataset"
# folder next to it.
current_dir = os.getcwd()
source_dir = current_dir
base_output_dir = os.path.join(os.path.dirname(current_dir), 'dataset')
train_dir, val_dir, test_dir = (
    os.path.join(base_output_dir, split_folder)
    for split_folder in ('train', 'val', 'test')
)

# 80% training / 10% validation / 10% test, with a fixed seed for the shuffle.
split_data(
    source_dir,
    train_dir,
    val_dir,
    test_dir,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    random_seed=42,
)

Source directory: d:\Projects\Rice Leaf Classification & Detection\rice_leaf_diseases
Train directory: d:\Projects\Rice Leaf Classification & Detection\dataset\train
Validation directory: d:\Projects\Rice Leaf Classification & Detection\dataset\val
Test directory: d:\Projects\Rice Leaf Classification & Detection\dataset\test

📂 Processing class: Bacterial leaf blight
Total: 40 | Train: 32 | Val: 4 | Test: 4

📂 Processing class: Brown spot
Total: 40 | Train: 32 | Val: 4 | Test: 4
Skipping non-directory: datasplit.ipynb

📂 Processing class: Leaf smut
Total: 40 | Train: 32 | Val: 4 | Test: 4
Dataset split completed successfully!
SUMMARY:
Total images processed: 120
Training set: 96 (80.0%)
Validation set: 12 (10.0%)
Test set: 12 (10.0%)
