# 1. 整体规划
- 根据csv将图片和label相对应
- 将数据分成训练集和验证集
- 搭建对应的网络
- 训练网络并完成推理
- 将推理结果写入submission.csv

# 2. 加载
这里实现将csv读入并和每个图片文件名做label的对应绑定，可以选择用一个list实现对应数据的label记录

# 3. 区分训练集和验证集
将数据打乱后，数据总体的80%作为训练集，剩下的20%作为验证集
并准备好DataLoader

# 4. 搭建模型
模型的主体考虑以下方法：
1. 为了不打破整体的空间结构，使用深度卷积神经网络，建立net类
    a. 首先确定图像的输入大小是否一致，并将输入统一成3通道的RGB图像，确定每个图像的size是224*224
    b. 每一层的输出都要使用batch normalize，来加速收敛
2. 使用resnet思想

3. 输出通道和分类一致，总体的通道数是3，要分类的类别是176

4. 具体的网络结构
    a. 首先输入后，先使用一个5*5的conv层做stage，padding=2, stride=2,将输入的224\*224的图片变成112\*112的，但是通道数从3增加到6
    b. 再使用一个3\*3的conv层做stage， padding=1, stride=2, 将features变成56\*56的，通道数从6增加到22
    c. 使用一个resnet层做stage，conv1的kernel=3,padding=1,stride=1， conv2的kernel=3,padding=1,stride=2， conv3是前馈通道（shortcut）kernel=1,padding=0,stride=2， 将通道数增加到44，但是size缩小到28*28
    d. 再使用一个同样的resnet层做stage，把通道数增加到88，但是size缩小到14*14
    e. 再使用一个同样的resnet层做stage，把通道数增加到176，但是size缩小到7*7
    f. 最后使用一个max pool，将输出变成176通道的1*1
    g. 使用flatten，将176通道压缩成1维的176个输出的数据

In [90]:
import pandas as pd
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from random import shuffle
import torch.nn as nn

class LeavesDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs.

    Args:
        data: Sequence of ``(image_file_name, label_index)`` pairs.
        img_dir: Directory that ``image_file_name`` is joined onto.
        transform: Optional callable applied to the image tensor after it
            has been scaled and resized.
        image_size: ``(height, width)`` every image is resized to.
    """

    def __init__(self, data, img_dir, transform=None, image_size=(224, 224)):
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.image_size = tuple(image_size)

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _to_chw_tensor(image, image_size=(224, 224)):
        """Turn a decoded ``(H, W)`` / ``(H, W, C)`` array into a float ``(3, h, w)`` tensor.

        Values are scaled to ``[0, 1]`` and the result is bilinearly resized
        to ``image_size`` when its spatial size differs.
        """
        image = np.asarray(image)

        if image.ndim == 2:
            # Grayscale: replicate the single plane into three channels.
            image = np.stack([image] * 3, axis=-1)
        elif image.shape[-1] == 1:
            image = np.repeat(image, 3, axis=-1)
        elif image.shape[-1] > 3:
            # e.g. RGBA: drop the extra channel(s) so the result has 3 channels.
            image = image[..., :3]

        # Decide on scaling from the dtype first: a nearly black uint8 image
        # has max() <= 1 and would otherwise be left unscaled.
        is_uint8 = image.dtype == np.uint8

        # (H, W, C) -> (C, H, W)
        tensor = torch.from_numpy(np.ascontiguousarray(image)).permute(2, 0, 1).float()

        if is_uint8 or tensor.max() > 1:
            tensor = tensor / 255.0

        # Only resample when the spatial size actually differs.
        if tuple(tensor.shape[-2:]) != tuple(image_size):
            tensor = torch.nn.functional.interpolate(
                tensor.unsqueeze(0), size=tuple(image_size), mode='bilinear', align_corners=False
            ).squeeze(0)

        return tensor

    def __getitem__(self, idx):
        """Return ``(image_tensor, label_tensor)`` for the sample at ``idx``."""
        img_name, label = self.data[idx]
        img_path = os.path.join(self.img_dir, img_name)

        image = self._to_chw_tensor(plt.imread(img_path), self.image_size)

        if self.transform is not None:
            image = self.transform(image)

        # CrossEntropyLoss expects integer class indices as a long tensor.
        label = torch.tensor(label, dtype=torch.long)

        return image, label

class Residual(nn.Module):
    """Residual block assembled from a list of layer-spec dicts.

    Each spec has a ``'type'`` of ``'conv'``, ``'bn'`` or ``'relu'``; any
    other type is skipped. The main branch applies the layers in order. The
    input is added back (through a 1x1 convolution when the branch changes
    stride or channel count) and the sum is passed through ReLU.
    """

    def __init__(self, model_layers):
        super().__init__()
        self.model_layers = model_layers
        self.layers = nn.ModuleList()
        self.stride_product = 1

        # The first and last conv specs give the block's in/out channel counts.
        conv_specs = [spec for spec in model_layers if spec['type'] == 'conv']

        for spec in self.model_layers:
            kind = spec['type']
            if kind == 'conv':
                module = nn.Conv2d(
                    spec['in_channels'],
                    spec['out_channels'],
                    kernel_size=spec['kernel_size'],
                    stride=spec['stride'],
                    padding=spec['padding'],
                )
                # Accumulated stride of the branch; reused by the shortcut.
                self.stride_product *= spec['stride']
            elif kind == 'bn':
                module = nn.BatchNorm2d(spec['num_features'])
            elif kind == 'relu':
                module = nn.ReLU(inplace=False)  # keep it out-of-place
            else:
                continue
            self.layers.append(module)

        channels_differ = bool(conv_specs) and (
            conv_specs[0]['in_channels'] != conv_specs[-1]['out_channels']
        )
        if self.stride_product == 1 and not channels_differ:
            # Shapes already agree, so the input is added unchanged.
            self.shortcut = None
        else:
            # 1x1 projection so the skip path matches the branch output.
            self.shortcut = nn.Conv2d(
                in_channels=conv_specs[0]['in_channels'],
                out_channels=conv_specs[-1]['out_channels'],
                kernel_size=1,
                stride=self.stride_product,
                padding=0,
            )

    def forward(self, X):
        """Return ``relu(branch(X) + skip(X))``."""
        branch = X
        for module in self.layers:
            branch = module(branch)

        skip = X if self.shortcut is None else self.shortcut(X)

        # Out-of-place addition followed by a functional ReLU.
        return nn.functional.relu(branch + skip)
        

class LeavesNet(nn.Module):
    """Sequential network built from a list of layer-spec dicts.

    Every entry of ``model_layers`` is a dict whose ``'type'`` selects the
    layer (``conv``, ``bn``, ``relu``, ``maxpool``, ``fc``, ``residual`` or
    ``flatten``); the remaining keys carry that layer's constructor
    arguments. Entries with any other type are skipped. The modules are kept
    both in ``self.layers`` (for per-layer iteration) and chained in
    ``self.model``.
    """

    def __init__(self, model_layers):
        super().__init__()
        self.layers = nn.ModuleList()
        self.model_layers = model_layers

        # One factory per supported layer type.
        builders = {
            'conv': lambda cfg: nn.Conv2d(
                in_channels=cfg['in_channels'],
                out_channels=cfg['out_channels'],
                kernel_size=cfg['kernel_size'],
                stride=cfg['stride'],
                padding=cfg['padding'],
            ),
            'bn': lambda cfg: nn.BatchNorm2d(cfg['num_features']),
            'relu': lambda cfg: nn.ReLU(inplace=False),  # keep it out-of-place
            'maxpool': lambda cfg: nn.MaxPool2d(
                kernel_size=cfg['kernel_size'],
                stride=cfg['stride'],
                padding=cfg['padding'],
            ),
            'fc': lambda cfg: nn.Linear(
                in_features=cfg['in_features'],
                out_features=cfg['out_features'],
            ),
            'residual': lambda cfg: Residual(cfg['residual_layers']),
            'flatten': lambda cfg: nn.Flatten(),
        }

        for cfg in self.model_layers:
            build = builders.get(cfg['type'])
            if build is not None:
                self.layers.append(build(cfg))

        # Chain the collected modules once all of them exist.
        self.model = nn.Sequential(*self.layers)

    def forward(self, X):
        """Run ``X`` through every layer in order."""
        return self.model(X)
                

class ClassifyLeaves():
    """Pipeline object for the leaf-classification task.

    It reads the training CSV and the submission CSV, splits the labelled
    data 80/20 into training and validation sets, builds a ``LeavesNet``
    from ``model_layers``, trains it, and writes predictions for the
    submission images back to ``submission_path``.

    Args:
        csv_path: CSV with ``image`` and ``label`` columns (training data).
        submission_path: CSV whose ``image`` column lists the images to
            predict. ``submission()`` overwrites this same file.
        img_dir: Directory containing the image files.
        batch_size: Batch size used by every DataLoader.
        model_layers: Layer-spec list passed to ``LeavesNet``.
    """

    def __init__(self, csv_path, submission_path, img_dir, batch_size, model_layers):
        self.csv_path = csv_path
        self.submission_path = submission_path
        self.img_dir = img_dir
        self.label_list = []
        self.train_images = []
        self.val_images = []
        self.test_images = []
        self.submit_images = []
        self.batch_size = batch_size
        self.model_layers = model_layers

        # Prefer the first CUDA device when one is available.
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print(f'训练设备: {self.device}')

    def load_csv(self):
        """Read both CSV files and build the label mappings and sample lists.

        Sets ``label_to_idx``, ``idx_to_label``, ``num_classes``,
        ``label_list`` and ``submit_images``. Both lists hold
        ``(file_name, label_index)`` pairs.
        """
        df = pd.read_csv(self.csv_path)
        df_submit = pd.read_csv(self.submission_path)

        # Sorted so that the label <-> index mapping is deterministic.
        unique_labels = sorted(df['label'].unique())
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}
        self.num_classes = len(unique_labels)

        print(f'共有 {self.num_classes} 个类别')

        self.label_list = [
            (os.path.basename(image_path), self.label_to_idx[label])
            for image_path, label in zip(df['image'], df['label'])
        ]

        # submission() ignores the labels of the submission images, so a
        # missing "label" column or an unseen label maps to placeholder 0
        # instead of raising KeyError.
        if 'label' in df_submit.columns:
            submit_labels = [self.label_to_idx.get(label, 0) for label in df_submit['label']]
        else:
            submit_labels = [0] * len(df_submit)
        self.submit_images = [
            (os.path.basename(image_path), label_idx)
            for image_path, label_idx in zip(df_submit['image'], submit_labels)
        ]

    def prepare_data(self):
        """Shuffle the labelled samples, split them 80/20 and build the DataLoaders."""
        data = self.label_list.copy()
        shuffle(data)
        total_len = len(data)
        print(f'共有 {total_len} 张图片')
        split_idx = int(0.8 * total_len)
        self.train_images = data[:split_idx]
        self.val_images = data[split_idx:]
        self.train_dataset = LeavesDataset(self.train_images, self.img_dir, transform=None)
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        # "test_*" holds the validation split; accuracy does not depend on
        # sample order, so the loader is not shuffled.
        self.test_dataset = LeavesDataset(self.val_images, self.img_dir, transform=None)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

        # Must stay unshuffled: submission() pairs predictions with
        # self.submit_images by position.
        self.submit_dataset = LeavesDataset(self.submit_images, self.img_dir, transform=None)
        self.submit_loader = DataLoader(self.submit_dataset, batch_size=self.batch_size, shuffle=False)

    def build_model(self):
        """Create a fresh ``LeavesNet`` and move it to ``self.device``."""
        self.net = LeavesNet(self.model_layers)
        self.net = self.net.to(self.device)

    def train_model(self, num_epochs=10, lr=0.001, use_cosine_lr=True):
        """Train ``self.net`` with Adam and cross-entropy, validating after each epoch.

        Args:
            num_epochs: Number of passes over the training loader.
            lr: Initial learning rate.
            use_cosine_lr: When true, anneal the learning rate with
                ``CosineAnnealingLR`` down to ``lr * 0.01`` over
                ``num_epochs``; otherwise keep it fixed.
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.net.parameters(), lr=lr)

        if use_cosine_lr:
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=num_epochs, eta_min=lr*0.01
            )
            print(f'开始训练，共 {num_epochs} 个epoch，使用余弦学习率调度')
        else:
            scheduler = None
            print(f'开始训练，共 {num_epochs} 个epoch，使用固定学习率 {lr}')

        num_batches = len(self.train_loader)
        for epoch in range(num_epochs):
            # validate() switches to eval mode, so re-enable training mode.
            self.net.train()
            correct = 0
            total = 0

            # Learning rate in effect for this epoch (used in progress logs).
            current_lr = optimizer.param_groups[0]['lr']

            for batch_idx, (images, labels) in enumerate(self.train_loader):
                images = images.to(self.device)
                labels = labels.to(self.device)

                optimizer.zero_grad()
                outputs = self.net(images)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                # Running training accuracy over the epoch so far.
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

                if (batch_idx + 1) % 10 == 0:
                    print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{num_batches}], '
                          f'Loss: {loss.item():.4f}, Acc: {100.*correct/total:.2f}%, LR: {current_lr:.6f}')

            self.validate()

            if scheduler is not None:
                scheduler.step()
                print(f'学习率更新为: {optimizer.param_groups[0]["lr"]:.6f}')

    def validate(self):
        """Return the accuracy (in percent) of ``self.net`` on the validation loader.

        Returns ``0.0`` when the loader yields no samples.
        """
        self.net.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in self.test_loader:
                images = images.to(self.device)
                labels = labels.to(self.device)

                outputs = self.net(images)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

        # Guard against an empty validation split.
        accuracy = 100. * correct / total if total else 0.0
        print(f'验证集准确率: {accuracy:.2f}%')
        return accuracy

    def submission(self):
        """Predict a label for every submission image and write the CSV.

        The output has the columns ``image`` (``images/<file_name>``) and
        ``label`` and is written to ``self.submission_path``.
        """
        self.net.eval()

        predictions = []
        with torch.no_grad():
            for images, _ in self.submit_loader:
                outputs = self.net(images.to(self.device))
                _, predicted = outputs.max(1)
                predictions.extend(self.idx_to_label[idx] for idx in predicted.tolist())

        # The loader is expected to be unshuffled (see prepare_data), so
        # predictions line up with submit_images by position.
        rows = list(zip((name for name, _ in self.submit_images), predictions))
        frame = pd.DataFrame({
            'image': [f'images/{name}' for name, _ in rows],
            'label': [label for _, label in rows],
        })
        # to_csv quotes fields that contain separators.
        frame.to_csv(self.submission_path, index=False)

        print(f'提交文件已保存到: {self.submission_path}')

    def run(self, num_epochs=10, lr=0.001, use_cosine_lr=True):
        """Load the data, build the model and train it.

        This does not write the submission file; call ``submission()``
        afterwards for that.
        """
        self.load_csv()
        self.prepare_data()
        self.build_model()
        self.train_model(num_epochs, lr, use_cosine_lr)
        

        
    





In [91]:
# Build model_layers from the network description above.
def _conv_bn_relu(in_channels, out_channels, kernel_size, stride, padding):
    """Return the specs for one conv -> batch-norm -> ReLU group."""
    return [
        {'type': 'conv', 'in_channels': in_channels, 'out_channels': out_channels,
         'kernel_size': kernel_size, 'stride': stride, 'padding': padding},
        {'type': 'bn', 'num_features': out_channels},
        {'type': 'relu'},
    ]


def _residual_stage(in_channels, out_channels):
    """Return a residual spec: a stride-1 3x3 group, then a stride-2 3x3 group that widens the channels."""
    return {'type': 'residual', 'residual_layers': [
        *_conv_bn_relu(in_channels, in_channels, 3, 1, 1),
        *_conv_bn_relu(in_channels, out_channels, 3, 2, 1),
    ]}


model_layers = [
    # Stage 1: 5x5 conv, 3 -> 6 channels, 224 -> 112
    *_conv_bn_relu(3, 6, 5, 2, 2),

    # Stride-1 max pool (spatial size stays 112)
    {'type': 'maxpool', 'kernel_size': 3, 'stride': 1, 'padding': 1},

    # Stage 2: 3x3 conv, 6 -> 22 channels, 112 -> 56
    *_conv_bn_relu(6, 22, 3, 2, 1),

    # Stage 3: residual block, 22 -> 44 channels, 56 -> 28
    _residual_stage(22, 44),

    # Stage 4: residual block, 44 -> 88 channels, 28 -> 14
    _residual_stage(44, 88),

    # Stage 5: residual block, 88 -> 176 channels, 14 -> 7
    _residual_stage(88, 176),

    # Stage 6: 7x7 max pool, 7 -> 1
    {'type': 'maxpool', 'kernel_size': 7, 'stride': 1, 'padding': 0},

    # Stage 7: flatten, 176*1*1 -> 176
    {'type': 'flatten'},

    # Final fully connected layer producing 176 class scores
    {'type': 'fc', 'in_features': 176, 'out_features': 176},
]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'使用设备: {device}')

# Build the network and move it to the selected device.
net = LeavesNet(model_layers).to(device)

# Push one random input through the layers and print each output shape.
X = torch.rand(1, 3, 224, 224).to(device)  # the input must live on the same device
for layer in net.layers:
    X = layer(X)
    print(type(layer).__name__,'output shape:\t', X.shape)


使用设备: cuda:0
Conv2d output shape:	 torch.Size([1, 6, 112, 112])
BatchNorm2d output shape:	 torch.Size([1, 6, 112, 112])
ReLU output shape:	 torch.Size([1, 6, 112, 112])
MaxPool2d output shape:	 torch.Size([1, 6, 112, 112])
Conv2d output shape:	 torch.Size([1, 22, 56, 56])
BatchNorm2d output shape:	 torch.Size([1, 22, 56, 56])
ReLU output shape:	 torch.Size([1, 22, 56, 56])
Residual output shape:	 torch.Size([1, 44, 28, 28])
Residual output shape:	 torch.Size([1, 88, 14, 14])
Residual output shape:	 torch.Size([1, 176, 7, 7])
MaxPool2d output shape:	 torch.Size([1, 176, 1, 1])
Flatten output shape:	 torch.Size([1, 176])
Linear output shape:	 torch.Size([1, 176])


In [92]:
# Raw strings keep the backslash after the drive letter literal; in a normal
# string "\d" is an invalid escape sequence that Python warns about.
csv_path = r'f:\d2l/d2l-zh/data/classify_leaves/train.csv'
submission_path = r'f:\d2l/d2l-zh/data/classify_leaves/submission.csv'
img_dir = r'f:\d2l/d2l-zh/data/classify_leaves/images'  # directory holding the image files
batch_size = 256  # lower this if memory runs out

# Create the pipeline object, then load the CSVs and build the DataLoaders.
handler = ClassifyLeaves(csv_path, submission_path, img_dir, batch_size, model_layers)
handler.load_csv()
handler.prepare_data()

# Training is started from a later cell, after handler.build_model().
# Uncomment one of the calls below to train from here instead.

# Option 1: cosine learning-rate schedule (recommended) - for longer runs
# handler.train_model(num_epochs=20, lr=0.001, use_cosine_lr=True)

# Option 2: fixed learning rate - for quick experiments
# handler.train_model(num_epochs=5, lr=0.001, use_cosine_lr=False)

训练设备: cuda:0
共有 176 个类别
共有 18353 张图片


In [93]:
# Instantiate the network from model_layers and move it to the handler's device.
handler.build_model()
# handler.train_model(num_epochs=5, lr=0.001, use_cosine_lr=False)

In [98]:
# Train for 45 epochs from an initial learning rate of 0.01 with the cosine schedule enabled.
handler.train_model(num_epochs=45, lr=0.01, use_cosine_lr=True)

开始训练，共 45 个epoch，使用余弦学习率调度
Epoch [1/45], Step [10/58], Loss: 1.5703, Acc: 49.69%, LR: 0.010000
Epoch [1/45], Step [20/58], Loss: 1.2572, Acc: 52.19%, LR: 0.010000
Epoch [1/45], Step [30/58], Loss: 1.2360, Acc: 56.08%, LR: 0.010000
Epoch [1/45], Step [40/58], Loss: 1.0558, Acc: 58.77%, LR: 0.010000
Epoch [1/45], Step [50/58], Loss: 1.1295, Acc: 60.52%, LR: 0.010000
验证集准确率: 33.97%
学习率更新为: 0.009988
Epoch [2/45], Step [10/58], Loss: 0.9040, Acc: 72.07%, LR: 0.009988
Epoch [2/45], Step [20/58], Loss: 0.9423, Acc: 71.93%, LR: 0.009988
Epoch [2/45], Step [30/58], Loss: 0.8599, Acc: 72.64%, LR: 0.009988
Epoch [2/45], Step [40/58], Loss: 0.7867, Acc: 72.79%, LR: 0.009988
Epoch [2/45], Step [50/58], Loss: 0.9184, Acc: 72.97%, LR: 0.009988
验证集准确率: 12.59%
学习率更新为: 0.009952
Epoch [3/45], Step [10/58], Loss: 0.7123, Acc: 74.96%, LR: 0.009952
Epoch [3/45], Step [20/58], Loss: 0.7056, Acc: 75.43%, LR: 0.009952
Epoch [3/45], Step [30/58], Loss: 0.7837, Acc: 75.40%, LR: 0.009952
Epoch [3/45], Step [40/58

In [99]:
# Predict the submission images and write the results to submission_path
# (the same file load_csv read the image list from).
handler.submission()

提交文件已保存到: f:\d2l/d2l-zh/data/classify_leaves/submission.csv
