# DermNet 数据预处理
简化版数据预处理流程：收集 DermNet 训练集与测试集的图像路径，编码类别标签，计算类别权重，并将结果保存为 CSV / JSON 文件，供后续训练直接使用。

## 1. 环境初始化

In [48]:
import json
import os
import random

import albumentations as A
import cv2
import numpy as np
import pandas as pd
import torch
from albumentations.pytorch import ToTensorV2
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Seed every random number generator this notebook can touch so that a fresh
# "Restart & Run All" is reproducible.  Python's built-in `random` module is
# seeded as well: numpy and torch seeding does not cover it, and third-party
# augmentation code may draw from it (NOTE(review): some albumentations
# releases are reported to do so -- confirm for the installed version).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print("环境初始化完成")

环境初始化完成


## 2. 路径配置

In [49]:
# Path configuration.  Every location is expressed relative to the current
# working directory; nothing here is an absolute path.
dataset_dir = "DermNet"
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ("train", "test")
)
processed_dir = "processed_data"

# Create the output directory up front; an already existing one is accepted.
os.makedirs(processed_dir, exist_ok=True)
print(f"输出目录: {processed_dir}")

输出目录: processed_data


## 3. 数据收集

In [50]:
def collect_data(data_dir, dataset_type):
    """Collect one record per image file found under ``data_dir``.

    ``data_dir`` is scanned one level deep: each immediate sub-directory is
    treated as a category and the image files inside it as its samples.
    Both listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; sorting keeps the returned list (and anything derived
    from its row order) identical across runs and machines.

    Args:
        data_dir: Directory whose immediate sub-directories are the categories.
        dataset_type: Tag stored verbatim in every record (e.g. ``"train"``).

    Returns:
        A list of dicts with the keys ``image_path`` (``data_dir``/category/
        file name, always written with forward slashes), ``category`` and
        ``dataset_type``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``data_dir`` cannot be
            listed (for example, it does not exist).
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
    data_list = []

    categories = sorted(
        d for d in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, d))
    )

    for category in categories:
        category_path = os.path.join(data_dir, category)
        image_files = sorted(
            f for f in os.listdir(category_path)
            if f.lower().endswith(image_extensions)
            # A directory that merely *looks* like an image is not a sample.
            and os.path.isfile(os.path.join(category_path, f))
        )

        for img_file in image_files:
            # The path is built from ``data_dir`` as given (relative stays
            # relative); backslashes are replaced so the stored value is the
            # same on Windows and POSIX.
            relative_path = os.path.join(data_dir, category, img_file).replace('\\', '/')

            data_list.append({
                'image_path': relative_path,
                'category': category,
                'dataset_type': dataset_type
            })

    return data_list

# Gather both splits; `all_data` is their concatenation (train records first).
train_data = collect_data(train_dir, "train")
test_data = collect_data(test_dir, "test")
all_data = train_data + test_data

print(f"训练集: {len(train_data)}, 测试集: {len(test_data)}, 总计: {len(all_data)}")

# Indexing an empty list below would only produce a bare IndexError; stop
# with a message that names the directory that was searched instead.
if not all_data:
    raise RuntimeError(f"未找到任何图片，请检查数据集目录: {dataset_dir}")
print(f"示例路径: {all_data[0]['image_path']}")

训练集: 15557, 测试集: 4002, 总计: 19559
示例路径: DermNet/train/Acne and Rosacea Photos/07Acne081101.jpg


In [51]:
# 如果已经存在CSV文件，将其中的绝对路径转换为相对路径
def convert_absolute_to_relative_paths(target_dir=None, root_name='DermNet'):
    """Rewrite absolute ``image_path`` values in existing CSV files as relative paths.

    For each of ``train.csv``, ``test.csv`` and ``full_dataset.csv`` that
    exists in ``target_dir``, the ``image_path`` column is rewritten in place:

    * every path is normalised to forward slashes;
    * an absolute path (POSIX style, Windows drive-letter style, or whatever
      ``os.path.isabs`` accepts on the current platform) that contains
      ``root_name`` as a path component is cut so that it starts at the first
      such component;
    * anything else keeps its (slash-normalised) value, and non-string cells
      such as NaN are passed through untouched.

    Windows-style paths are recognised regardless of the platform the
    notebook runs on, so a CSV produced on one OS can be fixed on another.

    Args:
        target_dir: Directory holding the CSV files.  Defaults to the
            notebook-level ``processed_dir``.
        root_name: Name of the dataset root component the relative paths
            should start with.
    """
    if target_dir is None:
        target_dir = processed_dir

    csv_files = ['train.csv', 'test.csv', 'full_dataset.csv']

    def convert_path(path):
        # Missing values (NaN) and other non-string cells are left alone.
        if not isinstance(path, str):
            return path

        normalized = path.replace('\\', '/')
        # Drive-letter form such as "C:/..." -- checked by hand because
        # os.path.isabs does not recognise it on POSIX systems.
        has_drive = (
            len(normalized) >= 3
            and normalized[0].isalpha()
            and normalized[1:3] == ':/'
        )
        is_absolute = os.path.isabs(path) or normalized.startswith('/') or has_drive
        if not is_absolute:
            return normalized

        parts = normalized.split('/')
        if root_name in parts:
            # Keep everything from the first occurrence of the dataset root.
            return '/'.join(parts[parts.index(root_name):])
        return normalized

    for csv_file in csv_files:
        csv_path = os.path.join(target_dir, csv_file)
        if not os.path.exists(csv_path):
            continue

        print(f"转换 {csv_file} 中的路径...")
        frame = pd.read_csv(csv_path)
        frame['image_path'] = frame['image_path'].apply(convert_path)
        frame.to_csv(csv_path, index=False)

        # A CSV with a header but no rows has no example path to show.
        if frame.empty:
            print("  转换完成，文件中没有数据行")
        else:
            print(f"  转换完成，示例路径: {frame.iloc[0]['image_path']}")
    
# Migrate CSV files left over from earlier runs, but only when the output
# directory is actually present.
output_dir_present = os.path.exists(processed_dir)
if output_dir_present:
    convert_absolute_to_relative_paths()

转换 train.csv 中的路径...
  转换完成，示例路径: DermNet/train/Acne and Rosacea Photos/07Acne081101.jpg
转换 test.csv 中的路径...
  转换完成，示例路径: DermNet/test/Acne and Rosacea Photos/07PerioralDermEye.jpg
转换 full_dataset.csv 中的路径...
  转换完成，示例路径: DermNet/train/Acne and Rosacea Photos/07Acne081101.jpg
  转换完成，示例路径: DermNet/train/Acne and Rosacea Photos/07Acne081101.jpg


## 4. 标签编码

In [52]:
# Build the label mapping: distinct category names in sorted order receive
# consecutive integer ids starting at 0.
categories = sorted({item['category'] for item in all_data})
category_to_idx = dict(zip(categories, range(len(categories))))
idx_to_category = dict(enumerate(categories))

# Persist the mapping.  JSON object keys are always strings, so the integer
# keys of `idx_to_category` end up as "0", "1", ... in the written file.
mapping_info = dict(
    categories=categories,
    category_to_idx=category_to_idx,
    idx_to_category=idx_to_category,
    num_classes=len(categories),
)

with open(os.path.join(processed_dir, 'category_mapping.json'), 'w', encoding='utf-8') as f:
    json.dump(mapping_info, f, ensure_ascii=False, indent=2)

print(f"类别数: {len(categories)}")

类别数: 23


## 5. 类别权重计算

In [53]:
# Assemble the full table (train + test) and attach each image's integer label.
df = pd.DataFrame(all_data)
df['label'] = df['category'].map(category_to_idx)

# Class weights use the "balanced" heuristic
#     weight_c = n_samples / (n_classes * count_c)
# and are derived from the training split only, so that test-set class
# frequencies do not leak into whatever consumes the weights
# (presumably the training loss -- confirm against the training notebook).
train_labels = df.loc[df['dataset_type'] == 'train', 'label']
class_counts = train_labels.value_counts().reindex(range(len(categories)), fill_value=0)
total_samples = len(train_labels)

# A category with no training images would divide by zero; mask it out and
# give it the neutral weight 1.0 so every class id still has an entry.
class_weights = (
    total_samples / (len(categories) * class_counts.where(class_counts > 0))
).fillna(1.0)
# Plain int/float values keep the dict JSON-serialisable.
class_weights_dict = {int(label): float(weight) for label, weight in class_weights.items()}

# Save the weights.
with open(os.path.join(processed_dir, 'class_weights.json'), 'w') as f:
    json.dump(class_weights_dict, f, indent=2)

print(f"权重范围: {class_weights.min():.3f} ~ {class_weights.max():.3f}")

权重范围: 0.484 ~ 3.209


## 6. 数据增强定义

In [54]:
def get_transforms(mode='train'):
    """Build the albumentations pipeline for the requested mode.

    ``mode == 'train'`` yields the augmenting pipeline (resize to 256x256,
    random 224x224 crop, horizontal flip, brightness/contrast jitter); any
    other value yields the plain pipeline (resize to 224x224).  Both end
    with the same normalisation step followed by ``ToTensorV2``.
    """
    # Steps shared by both pipelines; created anew on every call.
    closing_steps = [
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]

    if mode != 'train':
        return A.Compose([A.Resize(224, 224)] + closing_steps)

    augmenting_steps = [
        A.Resize(256, 256),
        A.RandomCrop(224, 224),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
    ]
    return A.Compose(augmenting_steps + closing_steps)

print("数据增强策略定义完成")

数据增强策略定义完成


## 7. 数据集保存

In [55]:
# Split the combined table back into its two partitions (independent copies).
train_df = df.loc[df['dataset_type'] == 'train'].copy()
test_df = df.loc[df['dataset_type'] == 'test'].copy()

# Write one CSV per partition plus the combined table, without the index column.
for csv_name, frame in (
    ('train.csv', train_df),
    ('test.csv', test_df),
    ('full_dataset.csv', df),
):
    frame.to_csv(os.path.join(processed_dir, csv_name), index=False)

print(f"数据集已保存:")
print(f"  训练集: {len(train_df)} 张")
print(f"  测试集: {len(test_df)} 张")

数据集已保存:
  训练集: 15557 张
  测试集: 4002 张


## 8. 数据集类定义

In [56]:
class DermNetDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs described by a DataFrame.

    Every row must provide an ``image_path`` column (read with OpenCV) and a
    ``label`` column.  An optional ``transform`` is called as
    ``transform(image=...)`` and its ``'image'`` entry becomes the sample.
    """

    def __init__(self, dataframe, transform=None):
        self.transform = transform
        # Drop the incoming index so rows are numbered 0..n-1.
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        path = record['image_path']

        # cv2.imread signals failure by returning None rather than raising.
        bgr = cv2.imread(path)
        if bgr is None:
            raise ValueError(f"无法读取图片: {path}")
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if self.transform:
            image = self.transform(image=image)['image']

        return image, record['label']

# Smoke test: push the first ten training rows through the training
# transforms and report the first batch, if there is one.
test_dataset = DermNetDataset(train_df.head(10), transform=get_transforms('train'))
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

first_batch = next(iter(test_loader), None)
if first_batch is not None:
    images, labels = first_batch
    print(f"测试成功: {images.shape}, 标签: {labels}")

测试成功: torch.Size([2, 3, 224, 224]), 标签: tensor([0, 0])


## 9. 完成总结

In [57]:
# Final recap.  The file list names every artefact written above, including
# full_dataset.csv, which is saved next to train.csv and test.csv.
summary_lines = [
    "\n=== 预处理完成 ===",
    f"总类别数: {len(categories)}",
    f"训练集: {len(train_df)} 张",
    f"测试集: {len(test_df)} 张",
    "输出文件:",
    "  - category_mapping.json",
    "  - class_weights.json",
    "  - train.csv",
    "  - test.csv",
    "  - full_dataset.csv",
    "\n可以开始训练了！",
]
print("\n".join(summary_lines))


=== 预处理完成 ===
总类别数: 23
训练集: 15557 张
测试集: 4002 张
输出文件:
  - category_mapping.json
  - class_weights.json
  - train.csv
  - test.csv

可以开始训练了！
