# 座椅数据集重构

In [1]:
import json
import os

In [2]:
def get_dataset_paths(root_dir):
    """
    Locate the training and validation splits of a dataset.

    Each split is looked up at ``<root_dir>/<split>`` and its annotation file
    at ``<root_dir>/<split>/_annotations.coco.json``.

    Args:
        root_dir: Root directory of the dataset.

    Returns:
        A 4-tuple ``(train_dir, train_anno, valid_dir, valid_anno)``:
        train_dir: Path of the training data directory.
        train_anno: Path of the training annotation file.
        valid_dir: Path of the validation data directory.
        valid_anno: Path of the validation annotation file.

    Raises:
        FileNotFoundError: If a split directory is missing (or is not a
            directory) or an annotation file is missing (or is not a file).
    """
    # Training split
    train_dir = os.path.join(root_dir, 'train')
    train_anno = os.path.join(train_dir, '_annotations.coco.json')

    # Validation split
    valid_dir = os.path.join(root_dir, 'valid')
    valid_anno = os.path.join(valid_dir, '_annotations.coco.json')

    # Fail early, and check the kind of each path as well as its existence,
    # so that a stray file named "train" is not accepted as a directory.
    if not os.path.isdir(train_dir):
        raise FileNotFoundError(f"训练数据目录不存在: {train_dir}")
    if not os.path.isfile(train_anno):
        raise FileNotFoundError(f"训练数据标注文件不存在: {train_anno}")
    if not os.path.isdir(valid_dir):
        raise FileNotFoundError(f"验证数据目录不存在: {valid_dir}")
    if not os.path.isfile(valid_anno):
        raise FileNotFoundError(f"验证数据标注文件不存在: {valid_anno}")

    return train_dir, train_anno, valid_dir, valid_anno


In [3]:
def process_annotations(annotation_file):
    """
    Load a COCO annotation file and attach normalized YOLO boxes to each image.

    Every COCO box ``[x, y, w, h]`` (top-left corner plus size, in pixels) is
    converted to ``[x_center, y_center, width, height]`` with each value
    divided by the image width or height.

    Args:
        annotation_file: Path to COCO format annotation JSON file

    Returns:
        images_info: Dict mapping image id to that image's COCO entry. Images
            with at least one annotation additionally get two parallel lists:
            ``'bbox'`` (converted boxes) and ``'category_id'``. Images without
            annotations get neither key.

    Raises:
        KeyError: If an annotation refers to an image id that is not listed
            under ``images``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding (which is not UTF-8 on many Windows setups).
    with open(annotation_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    # Index image entries by id for direct lookup from annotations
    images_info = {image['id']: image for image in annotations['images']}

    for annotation in annotations['annotations']:
        image = images_info[annotation['image_id']]
        img_width = image['width']
        img_height = image['height']

        # COCO box: top-left corner + size -> YOLO box: normalized center + size
        x, y, w, h = annotation['bbox']
        yolo_bbox = [
            (x + w/2) / img_width,   # x_center
            (y + h/2) / img_height,  # y_center
            w / img_width,           # width
            h / img_height,          # height
        ]

        # The two lists stay index-aligned: entry i of each describes one object
        image.setdefault('bbox', []).append(yolo_bbox)
        image.setdefault('category_id', []).append(annotation['category_id'])

    return images_info


In [4]:
# Resolve the train/valid directories and their annotation files, then display them.
# NOTE(review): the dataset root is an absolute, machine-specific Windows path;
# the notebook will not run elsewhere without editing it.
train_dir , train_anno, valid_dir, valid_anno = get_dataset_paths(r"\\?\C:\Users\Rooki\Desktop\AI\CV\RT-DETR\chengdu_dataset")
train_dir , train_anno, valid_dir, valid_anno

('\\\\?\\C:\\Users\\Rooki\\Desktop\\AI\\CV\\RT-DETR\\chengdu_dataset\\train',
 '\\\\?\\C:\\Users\\Rooki\\Desktop\\AI\\CV\\RT-DETR\\chengdu_dataset\\train\\_annotations.coco.json',
 '\\\\?\\C:\\Users\\Rooki\\Desktop\\AI\\CV\\RT-DETR\\chengdu_dataset\\valid',
 '\\\\?\\C:\\Users\\Rooki\\Desktop\\AI\\CV\\RT-DETR\\chengdu_dataset\\valid\\_annotations.coco.json')

In [5]:
# Directory holding the training images of the source dataset
image_dir = r"\\?\C:\Users\Rooki\Desktop\AI\CV\RT-DETR\chengdu_dataset\train"
# COCO-format annotation file of the training split
annotation_file = r"\\?\C:\Users\Rooki\Desktop\AI\CV\RT-DETR\chengdu_dataset\train\_annotations.coco.json"

# Relative directory passed below as the dataset root for the generated YAML
new_image_dir = r"./chengdu_dataset/train"

In [6]:
# Parse the training annotations into a per-image dict with normalized boxes
image_info = process_annotations(annotation_file=annotation_file)

In [7]:
def generate_yaml_config(json_file, dataset_root, save_path):
    """Generate YAML configuration file for YOLO dataset

    The file lists the absolute dataset root, the ``images/train`` and
    ``images/valid`` directories below it, and one ``<id>: <name>`` line per
    entry of the annotation file's ``categories`` list. The category list is
    also printed.

    Args:
        json_file: Path to COCO format annotation JSON file
        dataset_root: Root directory of dataset; made absolute before it is
            written to the YAML file
        save_path: Path to save the generated YAML file; its parent directory
            is created when missing
    """
    # Read category information from JSON file. JSON is UTF-8 by
    # specification, so do not rely on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        annotation = json.load(f)
    print(annotation['categories'])
    dataset_root = os.path.abspath(dataset_root)
    # Create YAML content
    yaml_content = f"""
path: {dataset_root}  # dataset root dir
train:
  - {os.path.join(dataset_root, 'images', 'train')}   
val:
  - {os.path.join(dataset_root, 'images', 'valid')}  

# Classes
names:
"""
    yaml_content += "".join(
        f"  {category['id']}: {category['name']}\n"
        for category in annotation['categories']
    )

    # Make sure the target directory exists so the write below cannot fail
    # just because the output folder has not been created yet.
    save_parent = os.path.dirname(save_path)
    if save_parent:
        os.makedirs(save_parent, exist_ok=True)

    # Write YAML file
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(yaml_content)


In [8]:
# Write the dataset YAML from the categories of the training annotations.
# NOTE(review): the f-string concatenates the directory and the file name without a
# path separator, so the target is "./chengdu_dataset/trainchengdu.yaml"; this was
# probably meant to be os.path.join(new_image_dir, 'chengdu.yaml') - confirm.
generate_yaml_config(annotation_file, new_image_dir, f'{new_image_dir}chengdu.yaml')

[{'id': 0, 'name': '划痕', 'supercategory': 'none'}, {'id': 1, 'name': '吊紧', 'supercategory': 'none'}, {'id': 2, 'name': '拼接间隙', 'supercategory': 'none'}, {'id': 3, 'name': '水渍', 'supercategory': 'none'}, {'id': 4, 'name': '水珠', 'supercategory': 'none'}, {'id': 5, 'name': '爆线', 'supercategory': 'none'}, {'id': 6, 'name': '破损', 'supercategory': 'none'}, {'id': 7, 'name': '碰伤', 'supercategory': 'none'}, {'id': 8, 'name': '红标签', 'supercategory': 'none'}, {'id': 9, 'name': '线头', 'supercategory': 'none'}, {'id': 10, 'name': '织物外漏', 'supercategory': 'none'}, {'id': 11, 'name': '缝线鼓包(轻度)', 'supercategory': 'none'}, {'id': 12, 'name': '脏污', 'supercategory': 'none'}, {'id': 13, 'name': '褶皱（轻度）', 'supercategory': 'none'}, {'id': 14, 'name': '褶皱（重度）', 'supercategory': 'none'}, {'id': 15, 'name': '跳针', 'supercategory': 'none'}, {'id': 16, 'name': '针眼', 'supercategory': 'none'}]


In [9]:
# Display the function object (shows its signature); no call is made here
generate_yaml_config

<function __main__.generate_yaml_config(json_file, dataset_root, save_path)>

In [10]:
# Load the raw COCO annotation file for inspection. JSON is UTF-8 by
# specification, so the encoding is given explicitly instead of relying on the
# platform default.
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotation = json.load(f)
# Show the top-level sections of the annotation file
annotation.keys()

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])

In [11]:
# Inspect the license entries of the annotation file
annotation['licenses']

[{'url': 'n/a', 'id': 0, 'name': 'CC BY 4.0'}]

In [12]:
# Inspect the category list (id / name / supercategory) of the annotation file
annotation['categories']

[{'id': 0, 'name': '划痕', 'supercategory': 'none'},
 {'id': 1, 'name': '吊紧', 'supercategory': 'none'},
 {'id': 2, 'name': '拼接间隙', 'supercategory': 'none'},
 {'id': 3, 'name': '水渍', 'supercategory': 'none'},
 {'id': 4, 'name': '水珠', 'supercategory': 'none'},
 {'id': 5, 'name': '爆线', 'supercategory': 'none'},
 {'id': 6, 'name': '破损', 'supercategory': 'none'},
 {'id': 7, 'name': '碰伤', 'supercategory': 'none'},
 {'id': 8, 'name': '红标签', 'supercategory': 'none'},
 {'id': 9, 'name': '线头', 'supercategory': 'none'},
 {'id': 10, 'name': '织物外漏', 'supercategory': 'none'},
 {'id': 11, 'name': '缝线鼓包(轻度)', 'supercategory': 'none'},
 {'id': 12, 'name': '脏污', 'supercategory': 'none'},
 {'id': 13, 'name': '褶皱（轻度）', 'supercategory': 'none'},
 {'id': 14, 'name': '褶皱（重度）', 'supercategory': 'none'},
 {'id': 15, 'name': '跳针', 'supercategory': 'none'},
 {'id': 16, 'name': '针眼', 'supercategory': 'none'}]

In [13]:
def cocodateset_to_yolostddataset(coco_images_dir, annotation_file, save_dir, dataset_type='train'):
    """Convert one split of a COCO-format dataset into the standard YOLO layout.

    Steps:
    1. Create the output directories:
       save_dir/
         ├── images/
         │   └── <dataset_type>/
         └── labels/
             └── <dataset_type>/
    2. Copy every image, renaming it to its image id zero-padded to 6 digits.
       The original file extension is kept ('.jpg' is used when the source
       name has none).
    3. Write one label file per image; each line has the form
       <category_id> <x_center> <y_center> <width> <height>.
       Images without annotations get an empty label file.

    Args:
        coco_images_dir: Directory containing the COCO dataset images
        annotation_file: Path to the COCO format annotation file
        save_dir: Root directory in which the YOLO-format dataset is saved
        dataset_type: Name of the split sub-directory, e.g. 'train', 'valid'
            or 'test'; defaults to 'train'

    Note:
        Image ids are formatted with ``{k:06d}`` and ``k+1``, so they must be
        integers.
    """
    import shutil  # local import: the notebook's import cell does not provide it

    # Create the output directories
    save_images_dir = os.path.join(save_dir, "images", dataset_type)
    save_labels_dir = os.path.join(save_dir, "labels", dataset_type)
    os.makedirs(save_images_dir, exist_ok=True)
    os.makedirs(save_labels_dir, exist_ok=True)

    # Per-image info with boxes already normalized to YOLO format
    images_info = process_annotations(annotation_file)
    print(f"开始处理{len(images_info)}张图片...")

    for k, image in images_info.items():
        file_name = image['file_name']
        # Progress output
        print(f"处理第{k+1}张图片: {k:06d} {file_name}")

        # Copy the image under its numeric name. Keep the source extension so
        # that a non-JPEG image is not mislabeled as '.jpg'.
        extension = os.path.splitext(file_name)[1] or ".jpg"
        old_path = os.path.join(coco_images_dir, file_name)
        new_path = os.path.join(save_images_dir, f"{k:06d}{extension}")
        shutil.copy(old_path, new_path)

        # Write the label file; the two lists are index-aligned and are absent
        # for images without annotations, which yields an empty file.
        label_path = os.path.join(save_labels_dir, f"{k:06d}.txt")
        with open(label_path, 'w') as f:
            for category_id, bbox in zip(image.get('category_id', []), image.get('bbox', [])):
                # YOLO format: <category_id> <x_center> <y_center> <width> <height>
                f.write(f"{category_id} {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}\n")

    print(f"数据集转换完成! 共处理{len(images_info)}张图片")

In [14]:
# cocodateset_to_yolostddataset(image_dir, annotation_file, "chengdu_dataset")

In [15]:
def main(input_dir, output_dir, yaml_path):
    """
    Convert the train and valid splits of a COCO dataset and write the dataset YAML.

    Args:
        input_dir: Root directory of the source dataset, containing the
            'train' and 'valid' sub-directories
        output_dir: Root directory of the converted dataset
        yaml_path: Path of the YAML configuration file to generate
    """
    train_dir, train_anno, valid_dir, valid_anno = get_dataset_paths(input_dir)

    # Convert the training split first, then the validation split
    splits = (
        ("train", train_dir, train_anno),
        ("valid", valid_dir, valid_anno),
    )
    for split_name, images_dir, anno_path in splits:
        cocodateset_to_yolostddataset(images_dir, anno_path, output_dir, split_name)

    # The class list in the YAML is read from the training annotations
    generate_yaml_config(train_anno, output_dir, yaml_path)

In [16]:
# Source dataset root.
# NOTE(review): absolute, machine-specific Windows path.
input_dir = r"\\?\C:\Users\Rooki\Desktop\AI\CV\RT-DETR\chengdu_dataset"
# Root directory of the converted dataset, relative to the working directory
output_dir = "chengdu_coco_std"
# The dataset YAML is placed inside the output directory
yaml_path = output_dir + "/chengdu.yaml"

In [17]:
# Run the full conversion: both splits plus the dataset YAML
main(input_dir, output_dir, yaml_path)

开始处理5394张图片...
处理第1张图片: 000000 1744654551214_2d20cbda-dc0a-47a5-a7fc-ec59969b5627__p_0040_d75a83877b4645c6a5b34e993f021e59_6154901fb0bc4ee2b3_1744655549846.jpg
处理第2张图片: 000001 1744654551214_2d20cbda-dc0a-47a5-a7fc-ec59969b5627__p_0040_d75a83877b4645c6a5b34e993f021e59_6154901fb0bc4ee2b3.jpg
处理第3张图片: 000002 1744654551171_15a8e727-b9a5-4f85-8511-7ac32dfcefc8__p_0038_76aaf76c6231497e9a89580f69bea22b_cf48045bb804414589_1744655548542.jpg
处理第4张图片: 000003 1744654551171_15a8e727-b9a5-4f85-8511-7ac32dfcefc8__p_0038_76aaf76c6231497e9a89580f69bea22b_cf48045bb804414589.jpg
处理第5张图片: 000004 1744654551106_2f46f35b-e92b-478a-b565-02c45147a622__p_00381_76aaf76c6231497e9a89580f69bea22b_7994f54fb6a74e9484_1744655547319.jpg
处理第6张图片: 000005 1744654551106_2f46f35b-e92b-478a-b565-02c45147a622__p_00381_76aaf76c6231497e9a89580f69bea22b_7994f54fb6a74e9484.jpg
处理第7张图片: 000006 1744654551157_30445555-e1ec-4ee3-8ab3-131520a0c2c7__p_0038_76aaf76c6231497e9a89580f69bea22b_97c99fe4-083d-472e-8d40-3a5771e928b4_1744655547