In [5]:
import os
import random
import shutil
import warnings

In [6]:
def split_data_matching_names(data_dir, output_dir, train_ratio=0.7, test_ratio=0.15,
                              val_ratio=0.15, seed=None):
    """Copy matching image/label pairs into train/test/valid sub-folders.

    An entry ``<stem>.<ext>`` in ``<data_dir>/images`` is paired with
    ``<data_dir>/labels/<stem>.json``. Images without such a label, and entries
    that are not regular files, are skipped. The pairs are shuffled and copied
    (the sources are left in place) into
    ``<output_dir>/{train,test,valid}/{images,labels}``.

    Args:
        data_dir: Directory containing the ``images`` and ``labels`` folders.
        output_dir: Destination root; the subset folders are created if missing.
        train_ratio: Fraction of the pairs copied to ``train``.
        test_ratio: Fraction of the pairs copied to ``test``.
        val_ratio: Kept for backward compatibility. ``valid`` always receives
            every pair left over after ``train`` and ``test``, so no pair is
            dropped by rounding. A ``UserWarning`` is issued when the three
            ratios do not sum to 1, because in that case the effective
            validation share differs from ``val_ratio``.
        seed: Optional seed for a private ``random.Random`` used for the
            shuffle. With the default ``None`` the global ``random`` module
            state is used, exactly as before.

    Returns:
        None.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``shutil.copy``, e.g. when
            the ``images`` or ``labels`` folder does not exist.

    Note:
        Files already present in ``output_dir`` are not removed. Re-running
        with a different shuffle can therefore leave the same pair in more
        than one subset; use a fixed seed or a fresh ``output_dir``.
    """
    # Previously a val_ratio that disagreed with the other two was silently ignored.
    if abs(train_ratio + test_ratio + val_ratio - 1.0) > 1e-9:
        warnings.warn(
            "train_ratio + test_ratio + val_ratio does not sum to 1; "
            "the 'valid' subset receives the remainder after train and test.",
            UserWarning,
            stacklevel=2,
        )

    images_dir = os.path.join(data_dir, 'images')
    labels_dir = os.path.join(data_dir, 'labels')

    # Sorted so that the pre-shuffle order (and thus a seeded shuffle) does not
    # depend on the order in which the OS happens to list the directory.
    image_files = sorted(os.listdir(images_dir))
    # A set gives O(1) membership tests instead of scanning a list per image.
    label_files = set(os.listdir(labels_dir))

    for subset in ('train', 'test', 'valid'):
        os.makedirs(os.path.join(output_dir, subset, 'images'), exist_ok=True)
        os.makedirs(os.path.join(output_dir, subset, 'labels'), exist_ok=True)

    matching_pairs = []
    for image_file in image_files:
        label_file = os.path.splitext(image_file)[0] + '.json'
        if label_file not in label_files:
            continue
        # Skip sub-directories and other non-regular entries: shutil.copy
        # cannot copy them and would abort the whole split.
        if not os.path.isfile(os.path.join(images_dir, image_file)):
            continue
        if not os.path.isfile(os.path.join(labels_dir, label_file)):
            continue
        matching_pairs.append((image_file, label_file))

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(matching_pairs)

    total_pairs = len(matching_pairs)
    train_split = int(total_pairs * train_ratio)
    test_split = int(total_pairs * (train_ratio + test_ratio))

    for i, (image_file, label_file) in enumerate(matching_pairs):
        if i < train_split:
            subset = 'train'
        elif i < test_split:
            subset = 'test'
        else:
            subset = 'valid'

        shutil.copy(os.path.join(images_dir, image_file),
                    os.path.join(output_dir, subset, 'images', image_file))
        shutil.copy(os.path.join(labels_dir, label_file),
                    os.path.join(output_dir, subset, 'labels', label_file))

In [7]:

# Input root (expected to hold `images/` and `labels/`) and the destination
# root under which the train/test/valid folders are created.
data_dir, output_dir = 'data', 'split_data'

In [8]:
# Seed the global RNG right before splitting so that re-running this cell
# reproduces the same train/test/valid assignment. Without it, each re-run
# reshuffles, and because existing output files are never removed the same
# pair can end up in several subsets.
random.seed(42)
split_data_matching_names(data_dir, output_dir)
print("Data split successfully (matching names)!")

Data split successfully (matching names)!
