## For Training + Validation + Test Data

In [1]:
import os
import random
import shutil

# Set your data path
main_data_path = 'Nih All d'
new_main_data_path = 'data_300_80_100'
train_val_test_folders = ['train', 'val', 'test']
sample_sizes = {'train': 300, 'val': 80, 'test': 100}
included_categories = ['Atelectasis', 'Effusion', 'Infiltration', 'No Finding', 'Nodule']
# Fixed seed: re-running the cell must reproduce the same split, otherwise a
# second run would copy a different random selection on top of the first one
# and the same image could end up in more than one of train/val/test.
split_seed = 42


def _partition(names, sizes, folders):
    """Slice *names* into consecutive, non-overlapping chunks, one per folder."""
    chunks = {}
    start = 0
    for folder in folders:
        end = start + sizes[folder]
        chunks[folder] = names[start:end]
        start = end
    return chunks


def split_dataset(src_root, dst_root, categories, sizes, folders, seed=None):
    """Copy a random, disjoint sample of each category into split folders.

    For every category in *categories*, the files found directly inside
    ``src_root/<category>`` are shuffled and copied to
    ``dst_root/<folder>/<category>`` for each folder in *folders*, taking
    ``sizes[folder]`` files per folder in the order given by *folders*.

    Args:
        src_root: Directory holding one sub-directory per category.
        dst_root: Directory that receives the split folders (created if
            missing).
        categories: Category names to process. A category without a
            directory under *src_root* is reported and skipped.
        sizes: Mapping from folder name to the number of files wanted.
        folders: Split folder names, in the order the shuffled files are
            handed out.
        seed: Seed for the shuffle. With a fixed seed and an unchanged
            source directory the split is reproducible; ``None`` gives a
            different split on every call.

    Returns:
        ``{category: {folder: number_of_files_copied}}`` for every category
        that was processed.

    Raises:
        FileNotFoundError: If *src_root* is not an existing directory.
    """
    if not os.path.isdir(src_root):
        raise FileNotFoundError(f"Source data folder not found: {src_root}")

    os.makedirs(dst_root, exist_ok=True)
    rng = random.Random(seed)
    wanted = sum(sizes[folder] for folder in folders)
    copied = {}

    for category in categories:
        category_path = os.path.join(src_root, category)
        if not os.path.isdir(category_path):
            print(f"Warning: category folder not found, skipping: {category_path}")
            continue

        # Keep regular files only: sub-directories (e.g. .ipynb_checkpoints)
        # would make shutil.copyfile fail. Sorting removes the dependence on
        # the OS-specific listing order so the seeded shuffle is repeatable.
        names = sorted(
            name for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )
        if len(names) < wanted:
            print(f"Warning: '{category}' has {len(names)} files but {wanted} "
                  f"are needed; later splits will be short or empty.")
        rng.shuffle(names)

        copied[category] = {}
        for folder, chunk in _partition(names, sizes, folders).items():
            destination_dir = os.path.join(dst_root, folder, category)
            os.makedirs(destination_dir, exist_ok=True)

            # Files left over from an earlier run are not deleted here, but
            # they can duplicate images across splits, so make them visible.
            leftovers = set(os.listdir(destination_dir)) - set(chunk)
            if leftovers:
                print(f"Warning: {destination_dir} already contains "
                      f"{len(leftovers)} entries that are not part of this split.")

            for name in chunk:
                shutil.copyfile(os.path.join(category_path, name),
                                os.path.join(destination_dir, name))
            copied[category][folder] = len(chunk)

    return copied


# In a notebook cell (and when run as a script) __name__ is "__main__", so the
# split still runs on execution; importing the definitions has no side effects.
if __name__ == "__main__":
    split_dataset(main_data_path, new_main_data_path, included_categories,
                  sample_sizes, train_val_test_folders, seed=split_seed)
    print(f"Data has been copied and split to {new_main_data_path}.")


Data has been copied and split to data_300_80_100.
