In [1]:
import os
import shutil
import random


In [2]:
def create_new_folders(new_base_path, categories):
    """Create the per-category output directories under ``new_base_path``.

    For every entry in ``categories`` two directories are ensured to exist:
    ``<new_base_path>/images/<category>`` and
    ``<new_base_path>/labels/<category>``. Directories that already exist
    are left as they are (``exist_ok=True``).

    Args:
        new_base_path: Root directory of the new dataset layout.
        categories: Iterable of sub-folder names (e.g. 'train', 'val', 'test').
    """
    for split_name in categories:
        # Images first, then labels, for each category in turn.
        for kind in ('images', 'labels'):
            target_dir = os.path.join(new_base_path, kind, split_name)
            os.makedirs(target_dir, exist_ok=True)

In [3]:
def move_files(files, src_images_path, src_labels_path, dst_images_path, dst_labels_path):
    """Copy image/label pairs into the destination folders.

    Despite its name, this function copies (``shutil.copy``); the source
    files are left in place.

    For each image file name, the matching label is expected in
    ``src_labels_path`` under the same base name with a ``.txt`` extension.
    A pair is copied only when both the image and the label exist; otherwise
    the entry is skipped without raising.

    Args:
        files: Iterable of image file names (not full paths).
        src_images_path: Directory containing the source images.
        src_labels_path: Directory containing the source label files.
        dst_images_path: Existing directory that receives the images.
        dst_labels_path: Existing directory that receives the labels.
    """
    for file in files:
        # Swap only the real extension. str.replace('.jpg', '.txt') would also
        # rewrite a '.jpg' that appears earlier in the name
        # (e.g. 'scan.jpg_01.jpg'), producing a label name that does not exist.
        label_name = os.path.splitext(file)[0] + '.txt'

        image_file = os.path.join(src_images_path, file)
        label_file = os.path.join(src_labels_path, label_name)

        # Only copy complete pairs so images and labels stay in sync.
        if os.path.exists(image_file) and os.path.exists(label_file):
            shutil.copy(image_file, os.path.join(dst_images_path, file))
            shutil.copy(label_file, os.path.join(dst_labels_path, label_name))

In [4]:
def split_dataset(src_base_path, new_base_path, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=None):
    """Randomly split an image/label dataset into train, val and test folders.

    Reads ``.jpg`` file names from ``<src_base_path>/images``, shuffles them,
    and copies each image together with its label from
    ``<src_base_path>/labels`` into ``<new_base_path>/images/<split>`` and
    ``<new_base_path>/labels/<split>`` for the splits 'train', 'val', 'test'.

    Args:
        src_base_path: Directory holding the 'images' and 'labels' folders.
        new_base_path: Directory under which the split layout is created.
        train_ratio: Fraction of files assigned to the training split.
        val_ratio: Fraction of files assigned to the validation split.
        test_ratio: Kept for interface compatibility. It is not used in the
            computation: the test split always receives every file left
            over after the train and val splits are taken.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            performs the shuffle so the split is reproducible; when ``None``
            the module-level ``random.shuffle`` is used as before.

    Raises:
        ValueError: If a ratio is negative or train_ratio + val_ratio
            exceeds 1.
    """
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float error.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            'train_ratio and val_ratio must be non-negative and sum to at most 1, '
            'got train_ratio=%r, val_ratio=%r' % (train_ratio, val_ratio)
        )

    src_images_path = os.path.join(src_base_path, 'images')
    src_labels_path = os.path.join(src_base_path, 'labels')

    # List all image files. os.listdir returns entries in arbitrary order,
    # so sort first to give the shuffle a stable starting point.
    files = sorted(f for f in os.listdir(src_images_path) if f.endswith('.jpg'))

    # Shuffle the file order.
    if seed is None:
        random.shuffle(files)
    else:
        random.Random(seed).shuffle(files)

    # Compute the split boundaries; counts are truncated towards zero and
    # the remainder goes to the test split.
    total_files = len(files)
    train_end = int(total_files * train_ratio)
    val_end = train_end + int(total_files * val_ratio)

    splits = [
        ('train', files[:train_end]),
        ('val', files[train_end:val_end]),
        ('test', files[val_end:]),
    ]

    # Create the train, val and test folders (and sub-folders) under the new path.
    create_new_folders(new_base_path, [split_name for split_name, _ in splits])

    # Copy each split's files into its folders.
    for split_name, split_files in splits:
        move_files(
            split_files,
            src_images_path,
            src_labels_path,
            os.path.join(new_base_path, 'images', split_name),
            os.path.join(new_base_path, 'labels', split_name),
        )

In [6]:

# Base path of the original dataset and base path of the new (split) dataset.
# Raw strings keep the Windows backslashes literal: in a normal string
# '\p', '\L' and '\d' are invalid escape sequences and trigger a warning.
src_base_path = r'E:\pyDLW\LUNGC\dataset\LIDCsix'        # original dataset path
new_base_path = r'E:\pyDLW\LUNGC\dataset'    # new dataset path

# Split the dataset and copy the files into the new folders.
split_dataset(src_base_path, new_base_path)