In [1]:
import os
import shutil
import random
from pathlib import Path

In [2]:
def split_dataset(source_dir, output_dir, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
    """
    Split dataset into train/val/test folders with specified ratios.

    Every immediate sub-directory of ``source_dir`` is treated as one class.
    Its image files are sorted, shuffled with the module-level ``random``
    generator (seed it beforehand for a repeatable split) and copied to
    ``output_dir/<split>/<class_name>/``. The source files are left in place.

    Per class, the train and val sizes are ``floor(n * ratio)``; the test split
    receives everything that remains, so rounding leftovers always go to test.

    Files that already exist under ``output_dir`` are never deleted. A warning
    is printed when a target folder is not empty, because leftovers from an
    earlier run with a different seed or ratio can end up in several splits.

    Args:
        source_dir (str): Path to source directory containing class folders
        output_dir (str): Path to output directory where train/val/test will be created
        train_ratio (float): Ratio for training set (default: 0.6)
        val_ratio (float): Ratio for validation set (default: 0.2)
        test_ratio (float): Ratio for test set (default: 0.2)

    Raises:
        ValueError: If a ratio is outside [0, 1], the ratios do not sum to 1.0,
            or ``source_dir`` contains no class directories.
        OSError: If ``source_dir`` cannot be listed or a file cannot be copied.
    """

    # Validate ratios before touching the file system.
    # `not 0 <= r <= 1` also rejects NaN, which would slip through the sum check.
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(not 0 <= r <= 1 for r in ratios):
        raise ValueError("Each ratio must be between 0 and 1")
    if abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError("Ratios must sum to 1.0")

    source_path = Path(source_dir)
    output_path = Path(output_dir)

    splits = ['train', 'val', 'test']
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}

    # Guards against floating-point error such as 100 * 0.29 == 28.999999999999996,
    # which plain int() would truncate to 28 instead of 29.
    eps = 1e-9

    # Discover the classes BEFORE creating any output folder, so that an output
    # directory located inside the source is never mistaken for a class and a
    # missing source does not leave empty output folders behind.
    output_root = output_path.resolve()
    writes_into_source = output_root == source_path.resolve()
    class_dirs = sorted(
        (
            d for d in source_path.iterdir()
            if d.is_dir()
            and d.resolve() != output_root
            # When output_dir == source_dir, split folders from an earlier run
            # sit next to the class folders and must be skipped.
            and not (writes_into_source and d.name in splits)
        ),
        # iterdir() yields entries in arbitrary order; sorting keeps the split
        # reproducible for a given random seed.
        key=lambda d: d.name,
    )

    if not class_dirs:
        raise ValueError(f"No class directories found in {source_dir}")

    print(f"Found {len(class_dirs)} classes: {[d.name for d in class_dirs]}")

    # Create output directory structure
    for split in splits:
        (output_path / split).mkdir(parents=True, exist_ok=True)

    # Number of images copied into each split during this run
    copied = dict.fromkeys(splits, 0)

    # Process each class
    for class_dir in class_dirs:
        class_name = class_dir.name
        print(f"\nProcessing class: {class_name}")

        # Create class directories in each split
        for split in splits:
            target_dir = output_path / split / class_name
            target_dir.mkdir(parents=True, exist_ok=True)
            if any(target_dir.iterdir()):
                print(f"  Warning: {target_dir} is not empty; existing files are kept")

        # Get all image files, in a deterministic order
        images = sorted(
            f for f in class_dir.iterdir()
            if f.is_file() and f.suffix.lower() in image_extensions
        )

        if not images:
            print(f"  Warning: No images found in {class_name}")
            continue

        # Shuffle images randomly
        random.shuffle(images)
        total_images = len(images)

        # Calculate split indices (clamped so they never run past the list)
        train_end = min(total_images, int(total_images * train_ratio + eps))
        val_end = min(total_images, train_end + int(total_images * val_ratio + eps))

        # Split images; the test split takes the remainder
        train_images = images[:train_end]
        val_images = images[train_end:val_end]
        test_images = images[val_end:]

        print(f"  Total images: {total_images}")
        print(f"  Train: {len(train_images)}, Val: {len(val_images)}, Test: {len(test_images)}")

        # Copy images to respective directories
        for split_name, split_images in zip(splits, (train_images, val_images, test_images)):
            target_dir = output_path / split_name / class_name
            for img in split_images:
                shutil.copy2(img, target_dir / img.name)
            copied[split_name] += len(split_images)

    print(f"\nDataset split completed! Output saved to: {output_dir}")

    # Print summary of what this run copied (not what happens to be on disk)
    print("\nSummary:")
    for split in splits:
        print(f"  {split}: {copied[split]} images")

In [3]:
def main():
    """Run split_dataset with this notebook's fixed configuration.

    Prints the chosen settings, performs the split, and reports any exception
    as a single "Error: ..." line instead of letting it propagate.
    """
    # Input folder holding the class sub-folders, and the folder to write to.
    # Edit these two paths to match your own layout.
    source_dir = "dataset"
    output_dir = "dataset_split"

    # Share of each subset (60% / 20% / 20%); the three values must sum to 1.0
    ratios = {
        "train_ratio": 0.6,
        "val_ratio": 0.2,
        "test_ratio": 0.2,
    }

    # Set random seed for reproducibility
    random.seed(42)

    try:
        train_pct = ratios["train_ratio"] * 100
        val_pct = ratios["val_ratio"] * 100
        test_pct = ratios["test_ratio"] * 100

        banner = (
            "Starting dataset split...",
            f"Source directory: {source_dir}",
            f"Output directory: {output_dir}",
            f"Split ratios - Train: {train_pct}%, Val: {val_pct}%, Test: {test_pct}%",
        )
        for line in banner:
            print(line)

        split_dataset(source_dir=source_dir, output_dir=output_dir, **ratios)

    except Exception as err:
        print(f"Error: {err}")

# Entry point: call main() only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting dataset split...
Source directory: dataset
Output directory: dataset_split
Split ratios - Train: 60.0%, Val: 20.0%, Test: 20.0%
Found 2 classes: ['correct', 'incorrect']

Processing class: correct
  Total images: 2906
  Train: 1743, Val: 581, Test: 582

Processing class: incorrect
  Total images: 1546
  Train: 927, Val: 309, Test: 310

Dataset split completed! Output saved to: dataset_split

Summary:
  train: 2670 images
  val: 890 images
  test: 892 images
