In [10]:
import os
import random
import shutil
from math import floor
import pandas as pd

# Source folders feeding each class label; one label may draw on several datasets
category_dirs = {
    'NORMAL': [
        r"D:\source\new\data\oct-Normal",
        r"D:\source\new\data\rsna-Normal",
        r"D:\source\new\data\shenzhen-Normal",
        r"D:\source\new\data\montgomerySet-Normal",
    ],
    'PNEUMONIA': [
        r"D:\source\new\data\oct-Pneumonia",
    ],
    'UNKNOWN': [
        r"D:\source\new\data\pavan-Unknown",
    ],
    'TUBERCULOSIS': [
        r"D:\source\new\data\shenzhen-TB",
        r"D:\source\new\data\montgomerySet-TB",
        r"D:\source\new\data\niaid-TB",
        r"D:\source\new\data\belarus",
    ],
}

# Root of the generated dataset and its three split folders
dataset_dir = r"D:\source\new\data\data"
train_dir, val_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ('train', 'val', 'test')
)

# Create the split folders up front (no-op when they already exist)
for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

# Fraction of each source folder held out for the test and validation splits
test_ratio, val_ratio = 0.1, 0.05

# Collects one metadata row per copied image; written to CSV at the end
csv_data = []

def create_image_metadata(directory, label, source,
                          extensions=(".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif")):
    """
    Scan a directory for image files and build one metadata record per image.

    Only regular files directly inside ``directory`` are considered (no recursion);
    a sub-directory whose name happens to end in an image suffix is skipped.
    Records are returned in sorted file-name order so the result does not depend
    on the arbitrary ordering of ``os.listdir``.

    Args:
        directory (str): Path to the directory containing the images.
        label (str): Label stored on every record (e.g. 'NORMAL', 'TUBERCULOSIS').
        source (str): Dataset the images come from (e.g. 'oct', 'rsna').
        extensions (tuple): File suffixes treated as images; matched
            case-insensitively against the end of the file name.

    Returns:
        list: One ``{"name", "label", "source"}`` dictionary per image file.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from ``os.listdir``).
    """
    # str.endswith needs a tuple; lower-casing both sides makes the match case-insensitive
    suffixes = tuple(ext.lower() for ext in extensions)
    return [
        {"name": entry, "label": label, "source": source}
        for entry in sorted(os.listdir(directory))
        if entry.lower().endswith(suffixes)
        and os.path.isfile(os.path.join(directory, entry))
    ]

# Iterate through input directories to create metadata and split the images.
# NOTE: random.shuffle is not seeded in this cell, so each run draws a different split.

# Destinations copied during this run; used to detect same-named files from
# different source folders landing on the same path and overwriting each other.
written_paths = set()

for category, dirs in category_dirs.items():
    category_train_dir = os.path.join(train_dir, category)
    category_val_dir = os.path.join(val_dir, category)
    category_test_dir = os.path.join(test_dir, category)

    for split_dir in (category_train_dir, category_val_dir, category_test_dir):
        os.makedirs(split_dir, exist_ok=True)

    # Each source folder is split on its own, so every dataset contributes
    # to train/val/test in the same proportions.
    for source_dir in dirs:
        if not os.path.exists(source_dir):
            print(f"Category folder not found: {source_dir}")
            continue

        # Source tag is the folder name up to the first '-' (e.g. "oct-Normal" -> "oct")
        source_name = os.path.basename(source_dir).split('-')[0]
        files_metadata = create_image_metadata(source_dir, category, source_name)

        # Shuffle the files for random split
        random.shuffle(files_metadata)

        # floor() rounds the held-out counts down; the remainder goes to train
        total_files = len(files_metadata)
        test_count = floor(total_files * test_ratio)
        val_count = floor(total_files * val_ratio)
        train_count = total_files - test_count - val_count

        test_files = files_metadata[:test_count]
        val_files = files_metadata[test_count:test_count + val_count]
        train_files = files_metadata[test_count + val_count:]

        print(f"Category: {category}, Total: {total_files}, Train: {train_count}, Val: {val_count}, Test: {test_count}")

        # Copy files to corresponding directories and add metadata.
        # Order (val, test, train) determines the row order of the CSV.
        name_clashes = 0
        for split_name, split_files, split_dir in (
            ("val", val_files, category_val_dir),
            ("test", test_files, category_test_dir),
            ("train", train_files, category_train_dir),
        ):
            for file_data in split_files:
                file_name = file_data["name"]
                src_path = os.path.join(source_dir, file_name)
                dest_path = os.path.join(split_dir, file_name)

                # normcase so that clashes are also caught on case-insensitive filesystems
                dest_key = os.path.normcase(dest_path)
                if dest_key in written_paths:
                    name_clashes += 1
                written_paths.add(dest_key)

                shutil.copy(src_path, dest_path)
                csv_data.append({**file_data, "split": split_name})

        if name_clashes:
            print(f"Warning: {name_clashes} file(s) from {source_dir} overwrote same-named files copied earlier in this run")

# Create a Pandas DataFrame from the collected metadata.
# Explicit columns keep the CSV header present even when no image was found.
df = pd.DataFrame(csv_data, columns=["name", "label", "source", "split"])

# Save the DataFrame to a CSV file
output_csv = os.path.join(dataset_dir, "dataset_splits_with_metadata.csv")
df.to_csv(output_csv, index=False)
print(f"CSV file with dataset splits and metadata saved to {output_csv}")


Category: NORMAL, Total: 1583, Train: 1346, Val: 79, Test: 158
Category: NORMAL, Total: 3500, Train: 2975, Val: 175, Test: 350
Category: NORMAL, Total: 326, Train: 278, Val: 16, Test: 32
Category: NORMAL, Total: 80, Train: 68, Val: 4, Test: 8
Category: PNEUMONIA, Total: 4273, Train: 3633, Val: 213, Test: 427
Category: UNKNOWN, Total: 1357, Train: 1155, Val: 67, Test: 135
Category: TUBERCULOSIS, Total: 336, Train: 287, Val: 16, Test: 33
Category: TUBERCULOSIS, Total: 58, Train: 51, Val: 2, Test: 5
Category: TUBERCULOSIS, Total: 3499, Train: 2976, Val: 174, Test: 349
Category: TUBERCULOSIS, Total: 304, Train: 259, Val: 15, Test: 30
CSV file with dataset splits and metadata saved to D:\source\new\data\data\dataset_splits_with_metadata.csv
