In [6]:
import os
import shutil
import random

def split_dataset(input_folder, output_folder, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Copy a class-per-subfolder dataset into train/val/test folders.

    Every sub-directory of ``input_folder`` is treated as one class. Its files
    are shuffled with a fixed seed and copied (the originals are left in
    place) into ``output_folder/<split>/<class_name>/``.

    Args:
        input_folder: Directory whose immediate sub-directories are the classes.
        output_folder: Directory that receives the ``train``, ``val`` and
            ``test`` trees. Existing content is not cleared, so re-running
            with different ratios into the same folder can leave one file in
            more than one split.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``.
        test_ratio: Accepted for interface compatibility. The ``test`` split
            always receives every file not assigned to train/val (including
            rounding leftovers), so this value is only validated, not used
            to size the split.

    Raises:
        ValueError: If any ratio is negative or ``train_ratio + val_ratio``
            exceeds 1.
    """
    # Validate before touching the filesystem so a bad call creates nothing.
    if train_ratio < 0 or val_ratio < 0 or test_ratio < 0:
        raise ValueError("Split ratios must be non-negative.")
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1.")

    split_names = ('train', 'val', 'test')

    # Ensure output folders exist
    for split_name in split_names:
        os.makedirs(os.path.join(output_folder, split_name), exist_ok=True)

    # Go through each class (subdirectory) in the input folder
    for class_name in sorted(os.listdir(input_folder)):
        class_path = os.path.join(input_folder, class_name)

        # Skip if not a directory
        if not os.path.isdir(class_path):
            continue

        # Create corresponding class directories in output splits
        for split_name in split_names:
            os.makedirs(os.path.join(output_folder, split_name, class_name), exist_ok=True)

        # Only regular files can be copied with shutil.copy2; nested folders
        # are ignored. Sorting first makes the shuffle below independent of
        # the arbitrary order os.listdir returns on a given filesystem.
        files = sorted(
            name for name in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, name))
        )

        # A private generator, re-created per class, keeps the split
        # reproducible without reseeding the global `random` module state
        # that the rest of the notebook may rely on.
        random.Random(42).shuffle(files)

        # Calculate split indices
        total_files = len(files)
        train_end = int(total_files * train_ratio)
        val_end = train_end + int(total_files * val_ratio)

        # Split and copy files
        for i, filename in enumerate(files):
            if i < train_end:
                split_name = 'train'
            elif i < val_end:
                split_name = 'val'
            else:
                # Remainder (including rounding leftovers) goes to test.
                split_name = 'test'

            src_path = os.path.join(class_path, filename)
            dest_path = os.path.join(output_folder, split_name, class_name, filename)
            shutil.copy2(src_path, dest_path)

    print("Dataset split completed!")

# Use the function
# os.path.join yields the same 'chest_xray\train' string on Windows while
# also producing a valid path on systems that use '/' as the separator.
input_folder = os.path.join('chest_xray', 'train')
output_folder = os.path.join('chest_xray', 'dataset')

split_dataset(input_folder, output_folder)

Dataset split completed!


In [7]:
import os

def check_dataset_split(output_folder):
    """Print per-class and per-split file counts for a train/val/test tree.

    Looks for ``train``, ``val`` and ``test`` under ``output_folder``, counts
    the regular files inside each class sub-directory, and prints the counts,
    the per-split totals and each split's share of all files.

    Args:
        output_folder: Directory containing the ``train``/``val``/``test``
            folders.

    Returns:
        None. The report is written to standard output.
    """
    print("Dataset Split Summary:")
    print("-" * 40)

    # List of split folders to check
    split_folders = ['train', 'val', 'test']

    # Dictionary to store total counts
    total_counts = {}

    # Iterate through each split (train, val, test)
    for split in split_folders:
        split_path = os.path.join(output_folder, split)
        print(f"\n{split.upper()} Dataset:")

        # A missing split is reported and counted as empty instead of
        # aborting the whole summary.
        if not os.path.isdir(split_path):
            print("  (folder not found)")
            total_counts[split] = 0
            continue

        # Dictionary to store class-wise counts
        class_counts = {}

        # Sorted so the report order does not depend on the filesystem.
        for class_name in sorted(os.listdir(split_path)):
            class_path = os.path.join(split_path, class_name)

            # Stray files next to the class folders are not classes.
            if not os.path.isdir(class_path):
                continue

            # Count regular files in the class directory
            with os.scandir(class_path) as entries:
                file_count = sum(1 for entry in entries if entry.is_file())

            class_counts[class_name] = file_count
            print(f"  {class_name}: {file_count} images")

        # Store total count for this split
        total_counts[split] = sum(class_counts.values())

    # Print overall summary
    print("\nOVERALL SUMMARY:")
    print("-" * 40)
    for split, count in total_counts.items():
        print(f"{split.upper()} total images: {count}")

    # Calculate percentages
    total_images = sum(total_counts.values())
    print("\nPERCENTAGES:")
    print("-" * 40)
    for split, count in total_counts.items():
        # Guard against an empty dataset, which would otherwise divide by zero.
        percentage = (count / total_images) * 100 if total_images else 0.0
        print(f"{split.upper()}: {percentage:.2f}%")

# Use the function
# os.path.join keeps the Windows path identical while remaining valid on
# systems whose separator is '/'.
output_folder = os.path.join('chest_xray', 'dataset')
check_dataset_split(output_folder)

Dataset Split Summary:
----------------------------------------

TRAIN Dataset:
  NORMAL: 1266 images
  PNEUMONIA: 3418 images

VAL Dataset:
  NORMAL: 158 images
  PNEUMONIA: 427 images

TEST Dataset:
  NORMAL: 159 images
  PNEUMONIA: 428 images

OVERALL SUMMARY:
----------------------------------------
TRAIN total images: 4684
VAL total images: 585
TEST total images: 587

PERCENTAGES:
----------------------------------------
TRAIN: 79.99%
VAL: 9.99%
TEST: 10.02%
