# 划分训练集和测试集

张子豪 2023-2-22

In [1]:
import os
import shutil
import random
from tqdm import tqdm

In [2]:
# Root folder of the dataset; the notebook joins 'labelme_jsons' onto it below.
Dataset_root = "Ruler_15_Dataset"

In [3]:
# Work from inside the annotation folder so that later cells can use relative paths.
labelme_json_dir = os.path.join(Dataset_root, 'labelme_jsons')
os.chdir(labelme_json_dir)

In [4]:
# Report how many entries the current working directory holds.
json_count = len(os.listdir())
print('共有 {} 个 labelme 格式的 json 文件'.format(json_count))

共有 15 个 labelme 格式的 json 文件


## 删除系统自动生成的多余文件

### 查看待删除的多余文件

In [5]:
# List leftover '__MACOSX' entries (case-insensitive name match) under the current folder.
!find . -iname '__MACOSX' -print

In [6]:
# List leftover '.DS_Store' entries (case-insensitive name match) under the current folder.
!find . -iname '.DS_Store' -print

In [7]:
# List leftover '.ipynb_checkpoints' entries (case-insensitive name match) under the current folder.
!find . -iname '.ipynb_checkpoints' -print

### 删除多余文件

In [8]:
# Delete every '__MACOSX' entry found under the current folder.
# Paths are handed to rm NUL-separated, so names containing spaces or glob
# characters are removed intact instead of being split into several words.
# This form also avoids the loop variable `$i`, which IPython would replace
# with a Python variable named `i` if one existed in the notebook.
# -prune stops find from descending into a folder that is about to be removed.
!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf

In [9]:
# Delete every '.DS_Store' entry found under the current folder.
# Paths are handed to rm NUL-separated, so names containing spaces or glob
# characters are removed intact instead of being split into several words.
# This form also avoids the loop variable `$i`, which IPython would replace
# with a Python variable named `i` if one existed in the notebook.
!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf

In [10]:
# Delete every '.ipynb_checkpoints' entry found under the current folder.
# Paths are handed to rm NUL-separated, so names containing spaces or glob
# characters are removed intact instead of being split into several words.
# This form also avoids the loop variable `$i`, which IPython would replace
# with a Python variable named `i` if one existed in the notebook.
# -prune stops find from descending into a folder that is about to be removed.
!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf

### 验证多余文件已删除

In [11]:
# Verification: prints nothing when no '__MACOSX' entry is left.
!find . -iname '__MACOSX' -print

In [12]:
# Verification: prints nothing when no '.DS_Store' entry is left.
!find . -iname '.DS_Store' -print

In [13]:
# Verification: prints nothing when no '.ipynb_checkpoints' entry is left.
!find . -iname '.ipynb_checkpoints' -print

## 划分训练集、测试集

In [14]:
random.seed(123)  # fixed seed so the shuffle below can be reproduced
test_frac = 0.2   # fraction of the files assigned to the test set

In [15]:
# Folder whose entries are split: the current working directory.
folder = os.curdir

In [16]:
# Collect the annotation files to split.
# os.listdir() returns entries in arbitrary order, so the names are sorted
# before shuffling; without that, the fixed random seed does not reproduce the
# same split on another machine or filesystem.
# Only regular *.json files are kept, so re-running the notebook does not try
# to split the train/val output folders or other stray entries.
img_paths = sorted(
    name
    for name in os.listdir(folder)
    if name.lower().endswith('.json')
    and os.path.isfile(os.path.join(folder, name))
)
random.shuffle(img_paths)  # in-place shuffle, driven by the seed set earlier

val_number = int(len(img_paths) * test_frac)  # number of test files, rounded down
train_files = img_paths[val_number:]          # file names of the training set
val_files = img_paths[:val_number]            # file names of the test set

print('数据集文件总数', len(img_paths))
print('训练集文件个数', len(train_files))
print('测试集文件个数', len(val_files))

数据集文件总数 15
训练集文件个数 12
测试集文件个数 3


## 将训练集json文件移动到`train_labelme_jsons`文件夹

In [17]:
# Create the folder that receives the training-set labelme JSON annotations.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the folder is already there from an earlier run.
train_labelme_jsons_folder = 'train_labelme_jsons'
os.makedirs(train_labelme_jsons_folder, exist_ok=True)

In [18]:
# Move every training-set annotation from the working folder into the training folder.
for file_name in tqdm(train_files):
    shutil.move(
        os.path.join(folder, file_name),
        os.path.join(train_labelme_jsons_folder, file_name),
    )

100%|██████████| 12/12 [00:00<00:00, 2023.55it/s]


## 将测试集json文件移动到`val_labelme_jsons`文件夹

In [19]:
# Create the folder that receives the test-set (val) labelme JSON annotations.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the folder is already there from an earlier run.
val_labelme_jsons_folder = 'val_labelme_jsons'
os.makedirs(val_labelme_jsons_folder, exist_ok=True)

In [20]:
# Move every test-set annotation from the working folder into the val folder.
for file_name in tqdm(val_files):
    shutil.move(
        os.path.join(folder, file_name),
        os.path.join(val_labelme_jsons_folder, file_name),
    )

100%|██████████| 3/3 [00:00<00:00, 1142.03it/s]


In [21]:
# Sanity check: total number of entries across both split folders (shown as the cell result).
sum(
    len(os.listdir(split_dir))
    for split_dir in (train_labelme_jsons_folder, val_labelme_jsons_folder)
)

15

## 删除系统自动生成的多余文件

### 查看待删除的多余文件

In [22]:
# List any '__MACOSX' entries (case-insensitive name match) under the current folder.
!find . -iname '__MACOSX' -print

In [23]:
# List any '.DS_Store' entries (case-insensitive name match) under the current folder.
!find . -iname '.DS_Store' -print

In [24]:
# List any '.ipynb_checkpoints' entries (case-insensitive name match) under the current folder.
!find . -iname '.ipynb_checkpoints' -print

### 删除多余文件

In [25]:
# Delete every '__MACOSX' entry found under the current folder.
# NUL-separated paths keep names with spaces or glob characters intact, and
# no `$i` is left for the shell to word-split or for IPython to substitute.
# -prune stops find from descending into a folder that is about to be removed.
!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf

In [26]:
# Delete every '.DS_Store' entry found under the current folder.
# NUL-separated paths keep names with spaces or glob characters intact, and
# no `$i` is left for the shell to word-split or for IPython to substitute.
!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf

In [27]:
# Delete every '.ipynb_checkpoints' entry found under the current folder.
# NUL-separated paths keep names with spaces or glob characters intact, and
# no `$i` is left for the shell to word-split or for IPython to substitute.
# -prune stops find from descending into a folder that is about to be removed.
!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf

### 验证多余文件已删除

In [28]:
# Verification: prints nothing when no '__MACOSX' entry is left.
!find . -iname '__MACOSX' -print

In [29]:
# Verification: prints nothing when no '.DS_Store' entry is left.
!find . -iname '.DS_Store' -print

In [30]:
# Verification: prints nothing when no '.ipynb_checkpoints' entry is left.
!find . -iname '.ipynb_checkpoints' -print