## Dataset Splitting Function
Implements `split_dataset`, which:
- Randomly splits the images of each class into training, validation, and testing sets.
- Creates the output directories for the splits while preserving the per-class folders.
- Copies each image into its respective split folder.

In [1]:
import os
import shutil
import random

def split_dataset(base_dir, output_dir, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Randomly split a folder-per-class image dataset into train/val/test copies.

    Every immediate sub-directory of ``base_dir`` is treated as one class.
    The files directly inside it are shuffled, partitioned, and copied to
    ``output_dir/<split>/<class_name>/``. The originals are left untouched.

    Args:
        base_dir: Directory containing one sub-directory per class.
        output_dir: Directory that receives the ``train``, ``val`` and
            ``test`` folders. It may live inside ``base_dir``; it is then
            skipped rather than treated as a class.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``.
        test_ratio: Fraction of each class copied to ``test``. When the three
            ratios add up to 1 (or more) the test split simply receives every
            remaining image, so nothing is lost to rounding. When they add up
            to less than 1 the test split is capped at this fraction and the
            surplus images are not copied.
        seed: Optional seed for the shuffle. With a seed, repeated runs on the
            same data produce the same split, so re-running into an existing
            output folder cannot place one image in several splits. With the
            default ``None`` the module-level ``random`` state is used.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: If any ratio is outside the range [0, 1].
    """
    if not os.path.exists(base_dir):
        print(f"Base directory '{base_dir}' does not exist.")
        return

    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 or ratio > 1 for ratio in ratios):
        raise ValueError(
            f"Split ratios must be between 0 and 1, got train={train_ratio}, "
            f"val={val_ratio}, test={test_ratio}."
        )
    # Tolerate float noise such as 0.7 + 0.2 + 0.1 == 0.9999999999999999.
    covers_everything = sum(ratios) >= 1 - 1e-9

    # Create the output directory tree.
    train_dir = os.path.join(output_dir, 'train')
    val_dir = os.path.join(output_dir, 'val')
    test_dir = os.path.join(output_dir, 'test')
    for dir_path in (train_dir, val_dir, test_dir):
        os.makedirs(dir_path, exist_ok=True)

    # Folders produced by this function must never be mistaken for classes,
    # which would otherwise happen whenever output_dir is inside base_dir.
    reserved = {
        os.path.normcase(os.path.abspath(path))
        for path in (output_dir, train_dir, val_dir, test_dir)
    }

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Sorted listings make the result independent of the file system's
    # enumeration order, which is required for a seeded run to be repeatable.
    for class_name in sorted(os.listdir(base_dir)):
        class_path = os.path.join(base_dir, class_name)

        # Skip stray files and our own output folders.
        if not os.path.isdir(class_path):
            continue
        if os.path.normcase(os.path.abspath(class_path)) in reserved:
            continue

        # Gather the files directly inside the class folder.
        images = sorted(
            f for f in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, f))
        )
        shuffle(images)

        # Calculate split boundaries.
        total_images = len(images)
        train_count = int(total_images * train_ratio)
        val_end = train_count + int(total_images * val_ratio)
        if covers_everything:
            test_end = total_images
        else:
            test_end = val_end + int(total_images * test_ratio)

        splits = (
            (train_dir, images[:train_count]),
            (val_dir, images[train_count:val_end]),
            (test_dir, images[val_end:test_end]),
        )

        # Copy images to their split folder; a class folder is only created
        # inside a split when that split actually receives an image.
        for split_dir, split_images in splits:
            if not split_images:
                continue
            dest_dir = os.path.join(split_dir, class_name)
            os.makedirs(dest_dir, exist_ok=True)
            for image in split_images:
                shutil.copy(os.path.join(class_path, image), os.path.join(dest_dir, image))

        print(f"Processed class '{class_name}' with {total_images} images.")

# Script / notebook-cell entry point: split the dataset with the default
# ratios declared by split_dataset.
if __name__ == "__main__":
    # Hard-coded, machine-specific dataset root (raw string keeps the
    # Windows backslashes literal).
    base_dir = r'D:\Umar\Dermnet'
    # The split copy is written to a 'SplitedD' folder located inside the
    # dataset root itself.
    output_dir = os.path.join(base_dir, 'SplitedD')
    split_dataset(base_dir, output_dir)


Processed class 'Acne and Rosacea Photos' with 6890 images.
Processed class 'Actinic Keratosis Basal Cell Carcinoma and other Malignant Lesions' with 11144 images.
Processed class 'Atopic Dermatitis Photos' with 4840 images.
Processed class 'Bullous Disease Photos' with 4352 images.
Processed class 'Cellulitis Impetigo and other Bacterial Infections' with 2832 images.
Processed class 'Eczema Photos' with 11520 images.
Processed class 'Exanthems and Drug Eruptions' with 3816 images.
Processed class 'Hair Loss Photos Alopecia and other Hair Diseases' with 2328 images.
Processed class 'HealthyDataset' with 8639 images.
Processed class 'Herpes HPV and other STDs Photos' with 4032 images.
Processed class 'Light Diseases and Disorders of Pigmentation' with 5616 images.
Processed class 'Lupus and other Connective Tissue diseases' with 4160 images.
Processed class 'Melanoma Skin Cancer Nevi and Moles' with 4496 images.
Processed class 'Nail Fungus and other Nail Disease' with 10176 images.
Pro