In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

In [2]:
def split_data(base_dir: str, output_dir: str, test_size: float = 0.2,
               random_state=42, copy: bool = False) -> None:
    """
    Splits dataset into train and test folders while preserving subdirectories.

    Every immediate subfolder of ``base_dir`` is treated as one class. Its files
    are split with ``train_test_split`` and placed in
    ``<output_dir>/train/<class>`` and ``<output_dir>/test/<class>``.

    Args:
        base_dir (str): The original dataset folder containing subfolders (classes).
        output_dir (str): The folder where "train" and "test" folders will be created.
        test_size (float): Proportion of the dataset to allocate to the test set.
        random_state (int or None): Seed forwarded to ``train_test_split``.
            File names are sorted before splitting, so a fixed seed yields the
            same split regardless of the order the filesystem lists them in.
        copy (bool): If True, files are copied and ``base_dir`` is left intact.
            The default (False) moves them out of ``base_dir``.

    Raises:
        FileNotFoundError, NotADirectoryError: From ``os.listdir`` when
            ``base_dir`` is missing or not a directory. Nothing is created in
            ``output_dir`` in that case.
        ValueError: From ``train_test_split`` when ``test_size`` cannot be
            applied to a class with two or more files.
    """
    train_dir = os.path.join(output_dir, "train")
    test_dir = os.path.join(output_dir, "test")

    # List the classes before creating anything so that a bad base_dir fails
    # without leaving empty output folders behind. Sorted for a stable order.
    categories = sorted(os.listdir(base_dir))

    # Create output directories if they do not exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # If output_dir is base_dir itself, "train" and "test" are not classes.
    reserved = {os.path.realpath(train_dir), os.path.realpath(test_dir)}

    transfer = shutil.copy2 if copy else shutil.move

    # Iterate through each subfolder (class/category)
    for category in categories:
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue  # Skip files, process only directories
        if os.path.realpath(category_path) in reserved:
            continue  # Skip our own output folders

        # os.listdir order is arbitrary; sort so the seeded split is reproducible.
        files = sorted(
            f for f in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, f))
        )

        if not files:
            print(f"Skipping empty folder: {category_path}")
            continue

        if len(files) == 1:
            # train_test_split raises ValueError for a single sample (the train
            # set would be empty), which would abort the run half-way through.
            print(f"Only one file in {category_path}; assigning it to train")
            train_files, test_files = files, []
        else:
            train_files, test_files = train_test_split(
                files, test_size=test_size, random_state=random_state
            )

        # Create category folders inside train and test directories
        train_category_path = os.path.join(train_dir, category)
        test_category_path = os.path.join(test_dir, category)
        os.makedirs(train_category_path, exist_ok=True)
        os.makedirs(test_category_path, exist_ok=True)

        # Move (or copy) files to their respective folders
        for destination, names in ((train_category_path, train_files),
                                   (test_category_path, test_files)):
            for name in names:
                transfer(os.path.join(category_path, name),
                         os.path.join(destination, name))

        print(f"Processed {category}: Train ({len(train_files)}), Test ({len(test_files)})")

In [3]:
# Example usage: split the JAFFE images into train/test folders (80/20).
# split_data moves files out of base_dir, which is presumably why it points at
# a copy of the dataset rather than the original.
base_dir = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/jaffe_dataset copy"
output_dir = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/jaffe_dataset_split"
split_data(base_dir=base_dir, output_dir=output_dir, test_size=0.2)

Processed anger: Train (24), Test (7)
Processed disgust: Train (23), Test (6)
Processed fear: Train (25), Test (7)
Processed happiness: Train (25), Test (7)
Processed neutral: Train (24), Test (7)
Processed sadness: Train (24), Test (7)
Processed surprise: Train (24), Test (6)
