In [6]:
import os
import random
import shutil
from pathlib import Path

def prepare_flower_dataset(input_dir, output_dir, train_ratio=0.8):
    """
    准备flower数据集，按照ImageNet格式组织
    
    参数:
        input_dir: 原始数据集目录，包含5个子目录(daisy, dandelion, rose, sunflower, tulip)
        output_dir: 输出目录，将在此目录下创建ImageNet格式的数据集
        train_ratio: 训练集比例，默认为0.8
        random_seed: 随机种子，默认为42

    """
    # 确保输出目录存在
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # 创建train和val目录
    train_dir = output_dir / 'train'
    val_dir = output_dir / 'val'
    train_dir.mkdir(exist_ok=True)
    val_dir.mkdir(exist_ok=True)
    
    # 类别列表
    classes = ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
    
    # 创建classes.txt文件
    with open(output_dir / 'classes.txt', 'w') as f:
        f.write('\n'.join(classes))
    
    # 准备训练集和验证集的标注文件
    train_annotations = []
    val_annotations = []
    
    for class_idx, class_name in enumerate(classes):
        # 创建train和val中的子目录
        (train_dir / class_name).mkdir(exist_ok=True)
        (val_dir / class_name).mkdir(exist_ok=True)
        
        # 获取原始目录中的所有图片
        src_dir = Path(input_dir) / class_name
        image_files = list(src_dir.glob('*.jpg')) + list(src_dir.glob('*.jpeg')) + list(src_dir.glob('*.png'))
        
        # 随机打乱图片顺序
        random.shuffle(image_files)
        
        # 计算分割点
        split_idx = int(len(image_files) * train_ratio)
        
        # 处理训练集
        for img_path in image_files[:split_idx]:
            dst_path = train_dir / class_name / img_path.name
            shutil.copy(img_path, dst_path)
            train_annotations.append(f"{class_name}/{img_path.name} {class_idx}")
        
        # 处理验证集
        for img_path in image_files[split_idx:]:
            dst_path = val_dir / class_name / img_path.name
            shutil.copy(img_path, dst_path)
            val_annotations.append(f"{class_name}/{img_path.name} {class_idx}")
    
    # 写入标注文件
    with open(output_dir / 'train.txt', 'w') as f:
        f.write('\n'.join(train_annotations))
    
    with open(output_dir / 'val.txt', 'w') as f:
        f.write('\n'.join(val_annotations))
    
    print(f"数据集准备完成，保存在: {output_dir}")
    print(f"训练集样本数: {len(train_annotations)}")
    print(f"验证集样本数: {len(val_annotations)}")



In [8]:
if __name__ == "__main__":
    input_directory = "/Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/flower_dataset"
    output_directory = "/Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/processed_flower_dataset"
    seed = 42
    prepare_flower_dataset(input_directory, output_directory)


数据集准备完成，保存在: /Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/processed_flower_dataset
训练集样本数: 2276
验证集样本数: 572


In [5]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split  # 需提前安装：

# 配置参数
原始数据集路径 = "/Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/flower_dataset"  
输出路径 = "/Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/processed_flower_dataset"  # 处理后的数据集根目录
训练验证比例 = 0.8  # 训练集占比80%
随机种子 = 42  # 固定随机种子确保可复现

# 创建输出目录结构
os.makedirs(f"{输出路径}/train", exist_ok=True)
os.makedirs(f"{输出路径}/val", exist_ok=True)
os.makedirs(f"{输出路径}/train/daisy", exist_ok=True)
# 重复上述代码创建其他类别文件夹（dandelion、rose、sunflower、tulip）

# 遍历每个类别，划分数据集
for 类别 in ["daisy", "dandelion", "rose", "sunflower", "tulip"]:
    图片路径列表 = [os.path.join(原始数据集路径, 类别, f) for f in os.listdir(os.path.join(原始数据集路径, 类别))]
    训练路径, 验证路径 = train_test_split(图片路径列表, train_size=训练验证比例, random_state=随机种子)
    
    # 移动训练集图片
    for 路径 in 训练路径:
        shutil.copy2(路径, f"{输出路径}/train/{类别}/")
    
    # 移动验证集图片
    for 路径 in 验证路径:
        shutil.copy2(路径, f"{输出路径}/val/{类别}/")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/niezhiqi/Desktop/课程/深度学习/hw1/EX1/processed_flower_dataset/val/daisy/'