In [1]:
import os
from shutil import copy, rmtree
import random

In [2]:
def mk_file(file_path: str):
    """Create a fresh, empty directory at ``file_path``.

    If something already exists at that path it is removed with ``rmtree``
    first; the directory (including any missing parents) is then created
    with ``os.makedirs``.
    """
    path_taken = os.path.exists(file_path)
    if path_taken:
        # Wipe the existing directory tree so the new one starts out empty.
        rmtree(file_path)
    os.makedirs(file_path)

In [3]:
# Seed Python's global RNG so the random.sample calls further down start
# from a fixed state.
random.seed(0)

# Fraction of each class's images to hold out in this first split (10%).
split_rate = 0.1

# The dataset folder is expected to sit directly in the current working directory.
cwd = os.getcwd()
data_root = os.path.abspath(cwd)
origin_data_path = os.path.join(data_root, "NCT-CRC-HE-100K-NONORM")
# Stop early, naming the offending path, if the dataset folder is missing.
assert os.path.exists(origin_data_path), "path '{}' does not exist.".format(origin_data_path)

In [4]:
# Every sub-directory of the source dataset is treated as one class.
data_class = [
    entry
    for entry in os.listdir(origin_data_path)
    if os.path.isdir(os.path.join(origin_data_path, entry))
]

# Output roots for this split: the kept portion and the held-out portion.
train_root = os.path.join(data_root, "train702")
val_root = os.path.join(data_root, "test702")

# Recreate each output root from scratch, then add one empty folder per class.
for split_root in (train_root, val_root):
    mk_file(split_root)
    for class_name in data_class:
        mk_file(os.path.join(split_root, class_name))

In [5]:
# Copy every image of every class into either the held-out folder (val_root)
# or the kept folder (train_root).
#
# Classes and file names are processed in sorted order: os.listdir returns
# entries in arbitrary order, so without sorting the seeded random.sample
# could select different files on another machine or filesystem.
total_num = 0
for cla in sorted(data_class):
    cla_path = os.path.join(origin_data_path, cla)
    images = sorted(os.listdir(cla_path))
    num = len(images)
    total_num += num
    # Randomly pick the held-out file names. A set makes the membership test
    # below O(1) instead of scanning a list once per image.
    eval_index = set(random.sample(images, k=int(num * split_rate)))
    for index, image in enumerate(images):
        # Held-out images go under val_root, all others under train_root.
        target_root = val_root if image in eval_index else train_root
        image_path = os.path.join(cla_path, image)
        new_path = os.path.join(target_root, cla)
        copy(image_path, new_path)
        print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")  # processing bar
    print()

print(f"processing {total_num} done!")

[TUM] processing [14317/14317]
[LYM] processing [11557/11557]
[STR] processing [10446/10446]
[MUC] processing [8896/8896]
[BACK] processing [10566/10566]
[ADI] processing [10407/10407]
[NORM] processing [8763/8763]
[MUS] processing [13536/13536]
[DEB] processing [11512/11512]
processing 100000 done!


In [6]:
# Re-seed Python's global RNG so the second split's random.sample calls start
# from a fixed state.
random.seed(0)

# Fraction of each class's images to hold out in this second split (11%).
# NOTE(review): presumably chosen so the held-out part is roughly 10% of the
# original dataset -- confirm.
split_rate = 0.11

# This split reads from the "train702" folder inside the current working directory.
cwd = os.getcwd()
data_root = os.path.abspath(cwd)
origin_data_path = os.path.join(data_root, "train702")
# Stop early, naming the offending path, if the source folder is missing.
assert os.path.exists(origin_data_path), "path '{}' does not exist.".format(origin_data_path)

In [7]:
# Every sub-directory of the source folder is treated as one class.
data_class = [
    entry
    for entry in os.listdir(origin_data_path)
    if os.path.isdir(os.path.join(origin_data_path, entry))
]

# Output roots for this split: the training portion and the validation portion.
train_root = os.path.join(data_root, "train")
val_root = os.path.join(data_root, "val")

# Recreate each output root from scratch, then add one empty folder per class.
for split_root in (train_root, val_root):
    mk_file(split_root)
    for class_name in data_class:
        mk_file(os.path.join(split_root, class_name))

In [8]:
# Copy every image of every class into either the validation folder (val_root)
# or the training folder (train_root).
#
# Classes and file names are processed in sorted order: os.listdir returns
# entries in arbitrary order, so without sorting the seeded random.sample
# could select different files on another machine or filesystem.
total_num = 0
for cla in sorted(data_class):
    cla_path = os.path.join(origin_data_path, cla)
    images = sorted(os.listdir(cla_path))
    num = len(images)
    total_num += num
    # Randomly pick the validation file names. A set makes the membership test
    # below O(1) instead of scanning a list once per image.
    eval_index = set(random.sample(images, k=int(num * split_rate)))
    for index, image in enumerate(images):
        # Sampled images go under val_root, all others under train_root.
        target_root = val_root if image in eval_index else train_root
        image_path = os.path.join(cla_path, image)
        new_path = os.path.join(target_root, cla)
        copy(image_path, new_path)
        print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")  # processing bar
    print()

print(f"processing {total_num} done!")

[TUM] processing [12886/12886]
[LYM] processing [10402/10402]
[STR] processing [9402/9402]
[MUC] processing [8007/8007]
[BACK] processing [9510/9510]
[ADI] processing [9367/9367]
[NORM] processing [7887/7887]
[MUS] processing [12183/12183]
[DEB] processing [10361/10361]
processing 90005 done!


In [9]:
# Run the external helper script echo_num.sh on the `train` split. The script
# is not part of this notebook; judging by its output it prints a per-class
# file count and a total.
# Leftover alternative for a raw file count: !find val526/ -type f | wc -l
! ./echo_num.sh train

==train==
[33mADI:[0m 	8337
[33mBACK:[0m 	8464
[33mDEB:[0m 	9222
[33mLYM:[0m 	9258
[33mMUC:[0m 	7127
[33mMUS:[0m 	10843
[33mNORM:[0m 	7020
[33mSTR:[0m 	8368
[33mTUM:[0m 	11469
------------
Total:	80108


In [10]:
# Run the external helper script echo_num.sh on the `val` split; judging by
# its output it prints a per-class file count and a total.
! ./echo_num.sh val

==val==
[33mADI:[0m 	1030
[33mBACK:[0m 	1046
[33mDEB:[0m 	1139
[33mLYM:[0m 	1144
[33mMUC:[0m 	880
[33mMUS:[0m 	1340
[33mNORM:[0m 	867
[33mSTR:[0m 	1034
[33mTUM:[0m 	1417
------------
Total:	9897


In [11]:
# Run the external helper script echo_num.sh on the `test` split; judging by
# its output it prints a per-class file count and a total.
# NOTE(review): the cells above write the held-out images to `test702`, not
# `test` -- presumably the folder was renamed outside this notebook; confirm.
! ./echo_num.sh test

==test==
[33mADI:[0m 	1040
[33mBACK:[0m 	1056
[33mDEB:[0m 	1151
[33mLYM:[0m 	1155
[33mMUC:[0m 	889
[33mMUS:[0m 	1353
[33mNORM:[0m 	876
[33mSTR:[0m 	1044
[33mTUM:[0m 	1431
------------
Total:	9995
