In [193]:
import numpy as np
import pandas as pd
import pathlib, sys, os, random, time
import cv2, gc
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import albumentations as A
from albumentations.pytorch import ToTensorV2

import rasterio
from rasterio.windows import Window

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as D
import torchvision
from torchvision import transforms as T

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch.cuda.amp as amp  # 导入自动混合精度模块

from torch.optim.lr_scheduler import CosineAnnealingLR  # 添加余弦退火学习率调度器

import segmentation_models_pytorch as smp

In [194]:
# RLE编码和解码函数
def rle_encode(im):
    '''
    im: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = im.flatten(order = 'F')
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

def rle_decode(mask_rle, shape=(512, 512)):
    '''
    mask_rle: run-length as string formated (start length)
    shape: (height,width) of array to return 
    Returns numpy array, 1 - mask, 0 - background
    '''
    if mask_rle == '' or pd.isna(mask_rle):
        return np.zeros(shape, dtype=np.uint8)
    
    s = mask_rle.split()
    starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
    starts -= 1
    ends = starts + lengths
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape, order='F')

In [195]:
# 配置参数
SEED = 42
EPOCHS = 15
BATCH_SIZE = 4
IMAGE_SIZE = 320  # 增加图像尺寸以获取更多细节
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
THRESHOLD = 0.5  # 二值化阈值，可以通过验证集调整

In [196]:
# 设置随机种子
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [197]:
# 增强的数据增强策略
# 减少预处理计算负担
train_transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.ShiftScaleRotate(p=0.5, shift_limit=0.1, scale_limit=0.2, rotate_limit=30, border_mode=0),
    A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
    ToTensorV2(),
])

# 验证集变换 - 加在此处
valid_transform = A.Compose([
    A.Resize(IMAGE_SIZE, IMAGE_SIZE),
    A.Normalize(
        mean=[0.625, 0.448, 0.688],
        std=[0.131, 0.177, 0.101],
    ),
    ToTensorV2(),
])

In [198]:
class BuildingSegmentationDataset(torch.utils.data.Dataset):
    def __init__(self, img_paths, mask_paths=None, transform=None):
        self.img_paths = img_paths
        self.mask_paths = mask_paths
        self.transform = transform
        
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        # 加载图像
        img = cv2.imread(self.img_paths[idx])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # 处理掩码数据
        if self.mask_paths is not None:
            mask_data = self.mask_paths[idx]
            
            # 使用已有的rle_decode函数处理RLE格式的掩码
            if isinstance(mask_data, str):
                mask = rle_decode(mask_data, shape=(512, 512))
            elif isinstance(mask_data, np.ndarray):
                mask = mask_data
            elif isinstance(mask_data, torch.Tensor):
                mask = mask_data.cpu().numpy()
            else:
                print(f"警告：未知的掩码数据类型 - {type(mask_data)}")
                mask = np.zeros((512, 512), dtype=np.uint8)
            
            # 应用变换
            if self.transform is not None:
                transformed = self.transform(image=img, mask=mask)
                img = transformed["image"]
                mask = transformed["mask"]
            
            # 关键修复: 始终确保mask是float类型的tensor
            if isinstance(mask, np.ndarray):
                mask = torch.from_numpy(mask).float()
            else:
                # 确保tensor是float类型
                mask = mask.float()
            
            if mask.dim() == 2:
                mask = mask.unsqueeze(0)
                
            return img, mask
        else:
            # 只返回图像（用于测试集）
            if self.transform is not None:
                transformed = self.transform(image=img)
                img = transformed["image"]
                
            return img

In [199]:
# U-Net模型定义 - 基础模块
class DoubleConv(nn.Module):
    """(Conv2D -> BN -> ReLU) * 2"""
    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.double_conv(x)

class Down(nn.Module):
    """Downscaling with maxpool then double conv"""
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels)
        )

    def forward(self, x):
        return self.maxpool_conv(x)

In [200]:
class Up(nn.Module):
    """Upscaling then double conv"""
    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()

        # if bilinear, use the normal convolutions to reduce the number of channels
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # input is CHW
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)

class OutConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)

In [201]:
class UNet(nn.Module):
    def __init__(self, n_channels=3, n_classes=1, bilinear=True):
        super(UNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear
        
        # 减少通道数
        self.inc = DoubleConv(n_channels, 32)
        self.down1 = Down(32, 64)
        self.down2 = Down(64, 128)
        self.down3 = Down(128, 256)
        self.down4 = Down(256, 512 // 2 if bilinear else 512)
        self.up1 = Up(512, 256 // 2 if bilinear else 256, bilinear)
        self.up2 = Up(256, 128 // 2 if bilinear else 128, bilinear)
        self.up3 = Up(128, 64 // 2 if bilinear else 64, bilinear)
        self.up4 = Up(64, 32, bilinear)
        self.outc = OutConv(32, n_classes)
        
    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)
        return self.outc(x)

In [202]:
class AdvancedUNet(nn.Module):
    def __init__(self, encoder_name="efficientnet-b4", in_channels=3, classes=1):
        """
        初始化具有预训练编码器的高级U-Net模型
        """
        super().__init__()
        
        # 创建模型主干 - 使用预训练的编码器
        self.model = smp.UnetPlusPlus(
            encoder_name=encoder_name,     # 从 'resnet50', 'efficientnet-b4' 等选择
            encoder_weights="imagenet",     # 使用预训练权重
            in_channels=in_channels,        # 输入通道数
            classes=classes,                # 输出通道数
            activation=None,                # 不需要激活，我们会在损失函数中处理
        )
    
    def forward(self, x):
        return self.model(x)
    
    @staticmethod
    def get_model_names():
        """返回可用的编码器名称列表"""
        return [
            "resnet18", "resnet101",
            "efficientnet-b3", "efficientnet-b0",
            "timm-resnest50d", "timm-mobilenetv3_large_100",
            "densenet121", "densenet169"
        ]

In [203]:
# 优化的损失函数 - Dice Loss
class DiceLoss(nn.Module):
    def __init__(self, smooth=1.0):
        super(DiceLoss, self).__init__()
        self.smooth = smooth
        
    def forward(self, pred, target):
        pred = torch.sigmoid(pred)
        
        # 平滑处理以避免0/0的情况
        intersection = (pred * target).sum(dim=(2,3))
        union = pred.sum(dim=(2,3)) + target.sum(dim=(2,3))
        
        dice = (2.0 * intersection + self.smooth) / (union + self.smooth)
        return 1.0 - dice.mean()

In [204]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.eps = 1e-7  # 添加极小值避免数值不稳定
        
    def forward(self, pred, target):
        # 不再手动应用sigmoid
        # 使用binary_cross_entropy_with_logits，它内部会稳定地计算sigmoid+BCE
        bce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
        
        # 应用focal loss公式
        pt = torch.exp(-bce)  # pt = p if y=1, pt = 1-p if y=0
        focal_loss = (1-pt)**self.gamma * bce
        
        return focal_loss.mean()

In [205]:
# 优化的组合损失函数
class CombinedLoss(nn.Module):
    def __init__(self, dice_weight=0.5, focal_weight=0.5):
        super(CombinedLoss, self).__init__()
        self.dice_weight = dice_weight
        self.focal_weight = focal_weight
        self.dice_loss = DiceLoss()
        self.focal_loss = FocalLoss()
        
    def forward(self, pred, target):
        dice = self.dice_loss(pred, target)
        focal = self.focal_loss(pred, target)
        return self.dice_weight * dice + self.focal_weight * focal

In [206]:
class EarlyStopping:
    """当验证集性能不再提升时提前停止训练"""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): 验证集性能不提升后等待多少轮停止训练
            verbose (bool): 是否打印详细信息
            delta (float): 性能变化的最小阈值
            path (str): 保存检查点路径
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        
    def __call__(self, val_loss, model):
        score = -val_loss
        
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'早停计数: {self.counter}/{self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            
    def save_checkpoint(self, val_loss, model):
        '''当验证损失减小时保存模型'''
        if self.verbose:
            print(f'验证损失从 ({self.val_loss_min:.6f} 降至 {val_loss:.6f})。保存模型...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [207]:
def debug_tensors(images, masks, outputs=None):
    """检查张量的形状和类型以及值的范围"""
    print(f"Images: shape={images.shape}, type={images.dtype}, device={images.device}")
    print(f"Masks: shape={masks.shape}, type={masks.dtype}, device={masks.device}")
    
    if outputs is not None:
        print(f"Outputs: shape={outputs.shape}, type={outputs.dtype}, device={outputs.device}")
        
        # 检查输出值的范围
        with torch.no_grad():
            outputs_sigmoid = torch.sigmoid(outputs)
            min_val = outputs_sigmoid.min().item()
            max_val = outputs_sigmoid.max().item()
            print(f"输出sigmoid后的值范围: [{min_val:.6f}, {max_val:.6f}]")
            
            # 检查是否有极端值
            if min_val < 0 or max_val > 1:
                print("警告: sigmoid后的输出值超出[0,1]范围!")
    
    # 检查是否包含NaN或Inf
    if torch.isnan(images).any():
        print("警告: 图像包含NaN值!")
    if torch.isnan(masks).any():
        print("警告: 掩码包含NaN值!")
    if outputs is not None and torch.isnan(outputs).any():
        print("警告: 输出包含NaN值!")
    if outputs is not None and torch.isinf(outputs).any():
        print("警告: 输出包含Inf值!")

In [208]:
def train_one_epoch(model, dataloader, optimizer, criterion, device, accumulation_steps=4):
    model.train()
    total_loss = 0
    
    # 创建混合精度训练的梯度缩放器
    scaler = amp.GradScaler()
    
    optimizer.zero_grad()
    
    for i, (images, masks) in enumerate(tqdm(dataloader)):
        images = images.to(device)
        masks = masks.to(device).float()
        
        # 仅在特定间隔打印调试信息
        if i % 500 == 0:
            debug_tensors(images, masks)
        
        # 使用自动混合精度上下文
        with amp.autocast():
            # 前向传播
            outputs = model(images)
            
            # 同样，限制输出频率
            if i % 500 == 0:
                debug_tensors(images, masks, outputs)
        
        # 在autocast上下文之外计算损失
        loss = criterion(outputs, masks)
        loss = loss / accumulation_steps
        
        # 检查损失是否有问题
        if torch.isnan(loss) or torch.isinf(loss):
            print(f"警告: 损失值异常: {loss.item()}")
            continue  # 跳过这个批次
        
        # 使用梯度缩放器进行反向传播    
        scaler.scale(loss).backward()
        
        if (i + 1) % accumulation_steps == 0:
            # 添加梯度剪裁（在优化器步骤之前）
            scaler.unscale_(optimizer)  # 在剪裁前取消梯度缩放
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            # 使用梯度缩放器更新权重
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
    
    return total_loss / len(dataloader)

In [209]:
def clear_memory():
    """清理可能的内存泄漏"""
    torch.cuda.empty_cache()
    gc.collect()

# 在train_with_checkpoints函数中每个epoch后调用
clear_memory()

In [210]:
# 验证函数
@torch.no_grad()
def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    dice_scores = []
    
    for images, masks in tqdm(dataloader):
        images = images.to(device)
        masks = masks.to(device)
        
        # 前向传播
        outputs = model(images)
        loss = criterion(outputs, masks)
        
        # 计算Dice分数
        preds = (torch.sigmoid(outputs) > THRESHOLD).float()
        dice = (2 * (preds * masks).sum()) / (preds.sum() + masks.sum() + 1e-8)
        dice_scores.append(dice.item())
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader), np.mean(dice_scores)

In [211]:
# 预测函数
@torch.no_grad()
def predict(model, dataloader, device, threshold=THRESHOLD):
    model.eval()
    results = []
    
    for images, filenames in tqdm(dataloader):
        images = images.to(device)
        outputs = model(images)
        preds = torch.sigmoid(outputs)
        
        # 处理每个批次的预测
        for pred, filename in zip(preds, filenames):
            pred = pred.cpu().numpy().squeeze()
            pred = cv2.resize(pred, (512, 512))  # 调整为原始大小
            mask = (pred > threshold).astype(np.uint8)
            rle = rle_encode(mask)
            results.append([filename, rle])
    
    return results

In [212]:
@torch.no_grad()
def predict_with_tta(model, image, device, threshold=THRESHOLD, tta_transforms=None):
    """测试时增强提高预测质量"""
    model.eval()
    
    # 如果没有提供TTA变换，则使用基本变换
    if tta_transforms is None:
        tta_transforms = [
            A.Compose([A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]), ToTensorV2()]),
            A.Compose([A.HorizontalFlip(p=1.0), A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]), ToTensorV2()]),
            A.Compose([A.VerticalFlip(p=1.0), A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]), ToTensorV2()]),
            A.Compose([A.Transpose(p=1.0), A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]), ToTensorV2()])
        ]
    
    # 应用所有变换并预测
    preds = []
    for transform in tta_transforms:
        augmented = transform(image=image)
        img_tensor = augmented['image'].unsqueeze(0).to(device)
        output = model(img_tensor)
        pred = torch.sigmoid(output).cpu().numpy().squeeze()
        
        # 还原变换
        if 'HorizontalFlip' in str(transform):
            pred = np.fliplr(pred)
        if 'VerticalFlip' in str(transform):
            pred = np.flipud(pred)
        if 'Transpose' in str(transform):
            pred = np.transpose(pred)
            
        preds.append(pred)
    
    # 平均所有预测结果
    final_pred = np.mean(preds, axis=0)
    return (final_pred > threshold).astype(np.uint8)

In [213]:
def train_with_checkpoints(model, train_loader, valid_loader, optimizer, 
                          criterion, scheduler, device, num_epochs, 
                          checkpoint_dir='checkpoints', accumulation_steps=4,
                          start_epoch=1, best_dice=0, best_model_epoch=0):  # 添加了这三个参数
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # 如果是继续训练，使用传入的最佳值
    best_dice = best_dice
    best_epoch = best_model_epoch
    
    # 创建日志 - 根据是否继续训练决定写入模式
    log_mode = "a" if start_epoch > 1 else "w"
    log_file = open(f"{checkpoint_dir}/training_log.csv", log_mode)
    
    # 仅在新训练时写入表头
    if start_epoch == 1:
        log_file.write("epoch,train_loss,val_loss,val_dice,learning_rate\n")
    
    # 初始化早停
    early_stopping = EarlyStopping(patience=7, verbose=True, 
                                   path=f"{checkpoint_dir}/early_stop_model.pth")
    
    # 修改循环范围，从start_epoch开始
    for epoch in range(start_epoch, start_epoch + num_epochs):
        print(f"第 {epoch}/{start_epoch + num_epochs - 1} 轮")
        
        # 训练 - 使用梯度累积
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, accumulation_steps)
        
        # 清理内存
        clear_memory()

        print("\n" + "="*50)
        print(f"完成第 {epoch} 轮训练，开始验证...")

        # 验证
        val_loss, val_dice = validate(model, valid_loader, criterion, device)

        print(f"验证完成! 损失: {val_loss:.4f}, Dice: {val_dice:.4f}")
        print("="*50 + "\n")

        # 清理内存
        clear_memory()
        
        # 记录学习率
        current_lr = optimizer.param_groups[0]['lr']
        
        # 写入日志
        log_file.write(f"{epoch},{train_loss:.4f},{val_loss:.4f},{val_dice:.4f},{current_lr:.8f}\n")
        log_file.flush()
        
        # 调整学习率
        scheduler.step()
        
        print(f"训练损失: {train_loss:.4f} | 验证损失: {val_loss:.4f} | Dice分数: {val_dice:.4f} | 学习率: {current_lr:.8f}")
        
        # 保存检查点 - 只保存必要信息以节省空间
        if epoch % 5 == 0 or epoch == start_epoch + num_epochs - 1:  # 每5个epoch保存一次完整检查点
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_dice': val_dice,
                'best_dice': best_dice,
                'best_epoch': best_epoch,
            }, f"{checkpoint_dir}/checkpoint_epoch_{epoch}.pth")
        
        # 保存最佳模型
        if val_dice > best_dice:
            print(f"Dice分数从 {best_dice:.4f} 提高到 {val_dice:.4f}. 正在保存模型...")
            best_dice = val_dice
            best_epoch = epoch
            torch.save(model.state_dict(), f"{checkpoint_dir}/best_model.pth")
        
        # 检查早停条件
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("触发早停! 训练停止。")
            break
    
    log_file.close()
    print(f"训练完成! 最佳Dice分数: {best_dice:.4f} (第{best_epoch}轮)")
    return best_dice, best_epoch

In [214]:
def resume_training(checkpoint_path=None, num_epochs=30):
    """从检查点恢复训练
    
    Args:
        checkpoint_path: 检查点路径，可以是以下几种:
            - None: 自动寻找model_checkpoints/best_model.pth
            - "best": 使用model_checkpoints/best_model.pth
            - 具体路径: 如"model_checkpoints/checkpoint_epoch_13.pth"
        num_epochs: 继续训练的轮数
    """
    # 确定要加载的检查点路径
    if checkpoint_path is None or checkpoint_path == "best":
        checkpoint_path = 'model_checkpoints/best_model.pth'
    
    print(f"从检查点 {checkpoint_path} 恢复训练...")
    
    # 加载数据加载器（与main函数相同）
    train_mask = pd.read_csv('数据集/train_mask.csv', sep='\t', names=['name', 'mask'])
    train_mask['name'] = train_mask['name'].apply(lambda x: '数据集/train/' + x)
    
    # 分割训练集和验证集
    train_idx, valid_idx = [], []
    for i in range(len(train_mask)):
        if i % 7 == 0:
            valid_idx.append(i)
        else:
            train_idx.append(i)
    
    train_df = train_mask.iloc[train_idx].reset_index(drop=True)
    valid_df = train_mask.iloc[valid_idx].reset_index(drop=True)
    
    train_ds = BuildingSegmentationDataset(
        train_df['name'].values,
        train_df['mask'].fillna('').values,
        transform=train_transform
    )
    
    valid_ds = BuildingSegmentationDataset(
        valid_df['name'].values,
        valid_df['mask'].fillna('').values,
        transform=valid_transform
    )
    
    train_loader = D.DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True, 
        num_workers=2, pin_memory=True
    )
    
    valid_loader = D.DataLoader(
        valid_ds, batch_size=BATCH_SIZE, shuffle=False, 
        num_workers=2, pin_memory=True
    )
    
    # 创建模型
    model = UNet(n_channels=3, n_classes=1, bilinear=False)
    
    # 区分是完整检查点还是仅模型权重
    if checkpoint_path.endswith('.pth'):
        try:
            # 尝试作为完整检查点加载
            checkpoint = torch.load(checkpoint_path)
            if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
                # 这是一个完整检查点
                model.load_state_dict(checkpoint['model_state_dict'])
                start_epoch = checkpoint.get('epoch', 0) + 1
                best_dice = checkpoint.get('best_dice', 0)
                best_epoch = checkpoint.get('best_epoch', 0)
                
                # 创建优化器并尝试加载状态
                optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4, eps=1e-8)
                if 'optimizer_state_dict' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                    print("已加载优化器状态")
                
                print(f"已从完整检查点加载。最佳Dice: {best_dice:.4f} (第{best_epoch}轮)")
                print(f"继续从第 {start_epoch} 轮开始训练")
            else:
                # 这只是模型权重
                model.load_state_dict(checkpoint)
                start_epoch = 1  # 从第1轮重新开始计数
                best_dice = 0
                best_epoch = 0
                optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4, eps=1e-8)
                print("已加载模型权重，但没有训练状态信息。从第1轮开始训练")
        except Exception as e:
            print(f"加载检查点时出错: {e}")
            print("创建新模型...")
            start_epoch = 1
            best_dice = 0
            best_epoch = 0
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4, eps=1e-8)
    else:
        raise ValueError(f"不支持的检查点路径: {checkpoint_path}")
    
    model.to(DEVICE)
    
    # 创建学习率调度器
    scheduler = CosineAnnealingLR(
        optimizer,
        T_max=num_epochs,
        eta_min=1e-6
    )
    
    # 损失函数
    criterion = CombinedLoss(dice_weight=0.8, focal_weight=0.2)
    
    # 继续训练
    best_dice, best_epoch = train_with_checkpoints(
        model, train_loader, valid_loader, optimizer,
        criterion, scheduler, DEVICE, num_epochs,
        checkpoint_dir='model_checkpoints',
        start_epoch=start_epoch,
        best_dice=best_dice,
        best_model_epoch=best_epoch
    )
    
    return model, best_dice, best_epoch

In [215]:
def train_multiple_models(encoders=["efficientnet-b0", "resnet18"], 
                         epochs=EPOCHS,  # 默认使用全局EPOCHS
                         save_dir="model_ensemble",
                         include_existing_model=True,
                         existing_model_path="model_checkpoints/best_model.pth",
                         batch_size=BATCH_SIZE,  # 默认使用全局BATCH_SIZE
                         image_size=IMAGE_SIZE,  # 默认使用全局IMAGE_SIZE
                         accumulation_steps=16):
    """训练多个不同编码器的模型用于集成，优化内存使用"""
    
    os.makedirs(save_dir, exist_ok=True)
    trained_models = []
    best_scores = []
    encoders_used = []
    
    # 如果包含现有模型，将其添加到列表中
    if include_existing_model and os.path.exists(existing_model_path):
        print(f"包含现有模型: {existing_model_path}")
        trained_models.append(existing_model_path)
        best_scores.append(0.86)
        encoders_used.append("unet")
    
    # 数据加载和转换 - 使用新的image_size
    train_transform_local = A.Compose([
        A.Resize(image_size, image_size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ShiftScaleRotate(p=0.5, shift_limit=0.1, scale_limit=0.2, rotate_limit=30, border_mode=0),
        A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
        ToTensorV2(),
    ])
    
    valid_transform_local = A.Compose([
        A.Resize(image_size, image_size),
        A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
        ToTensorV2(),
    ])
    
    # 加载数据
    try:
        print("正在加载训练数据...")
        train_mask = pd.read_csv('数据集/train_mask.csv', sep='\t', names=['name', 'mask'])
        train_mask['name'] = train_mask['name'].apply(lambda x: '数据集/train/' + x)
    except Exception as e:
        print(f"加载数据时出错: {e}")
        return [], [], []
    
    # 分割训练集和验证集
    train_idx, valid_idx = [], []
    for i in range(len(train_mask)):
        if i % 7 == 0:
            valid_idx.append(i)
        else:
            train_idx.append(i)
    
    train_df = train_mask.iloc[train_idx].reset_index(drop=True)
    valid_df = train_mask.iloc[valid_idx].reset_index(drop=True)
    
    print(f"训练集: {len(train_df)} 样本, 验证集: {len(valid_df)} 样本")
    
    # 创建数据集
    train_ds = BuildingSegmentationDataset(
        train_df['name'].values,
        train_df['mask'].fillna('').values,
        transform=train_transform_local
    )
    
    valid_ds = BuildingSegmentationDataset(
        valid_df['name'].values,
        valid_df['mask'].fillna('').values,
        transform=valid_transform_local
    )
    
    # 使用传入的batch_size
    train_loader = D.DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, 
        num_workers=2, pin_memory=True
    )
    
    valid_loader = D.DataLoader(
        valid_ds, batch_size=batch_size, shuffle=False, 
        num_workers=2, pin_memory=True
    )
    
    # 训练每个模型
    for i, encoder_name in enumerate(encoders):
        print(f"\n训练模型 {i+1}/{len(encoders)}: 使用编码器 {encoder_name}")
        
        # 在每个新模型训练前，清理内存
        clear_memory()
        
        try:
            # 创建优化后的模型
            model = smp.UnetPlusPlus(
                encoder_name=encoder_name,
                encoder_weights="imagenet",
                in_channels=3,
                classes=1,
                activation=None,
                decoder_channels=(256, 128, 64, 32, 16)  # 更小的解码器
            )
            model.to(DEVICE)
            
            # 打印模型信息
            total_params = sum(p.numel() for p in model.parameters())
            print(f"模型总参数量: {total_params:,}")
            
            # 优化器和调度器
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4, eps=1e-8)
            scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)
            
            # 损失函数
            criterion = CombinedLoss(dice_weight=0.8, focal_weight=0.2)
            
            # 训练
            model_save_dir = f"{save_dir}/model_{i+1}_{encoder_name}"
            os.makedirs(model_save_dir, exist_ok=True)
            
            best_dice, best_epoch = train_with_checkpoints(
                model, train_loader, valid_loader, optimizer,
                criterion, scheduler, DEVICE, epochs,
                checkpoint_dir=model_save_dir,
                accumulation_steps=accumulation_steps
            )
            
            print(f"模型 {i+1} 训练完成! 最佳Dice: {best_dice:.4f} (第{best_epoch}轮)")
            trained_models.append(f"{model_save_dir}/best_model.pth")
            best_scores.append(best_dice)
            encoders_used.append(encoder_name)
            
        except RuntimeError as e:
            if 'out of memory' in str(e).lower():
                print(f"在训练 {encoder_name} 时内存不足。尝试更小的编码器或增大累积步数。")
                clear_memory()
            else:
                print(f"训练 {encoder_name} 时出错: {e}")
        
        # 清理内存
        clear_memory()
    
    # 返回训练好的模型信息
    result_df = pd.DataFrame({
        'model_path': trained_models,
        'encoder': encoders_used,
        'best_dice': best_scores
    })
    result_df.to_csv(f"{save_dir}/ensemble_models_info.csv", index=False)
    
    return trained_models, encoders_used, best_scores

In [216]:
def fix_state_dict_keys(state_dict, model_type):
    """修复状态字典的键名以匹配模型架构
    
    Args:
        state_dict: 需要修复的状态字典
        model_type: 模型类型，"unet"或其他（如"efficientnet-b0"）
    
    Returns:
        修复后的状态字典
    """
    new_state_dict = {}
    
    if model_type == "unet":
        # 原始UNet不需要修改
        return state_dict
    else:
        # 为高级模型修复键名
        for key, value in state_dict.items():
            if key.startswith("encoder") or key.startswith("decoder") or key.startswith("segmentation_head"):
                # 添加"model."前缀
                new_key = f"model.{key}"
                new_state_dict[new_key] = value
            else:
                new_state_dict[key] = value
    
    return new_state_dict

In [217]:
def load_ensemble_models(model_paths, encoder_names):
    """加载多个模型用于集成，支持修复键名不匹配问题"""
    
    assert len(model_paths) == len(encoder_names), "模型路径和编码器名称数量必须相同"
    
    models = []
    for path, encoder in zip(model_paths, encoder_names):
        if encoder == "unet":
            # 原始UNet模型
            model = UNet(n_channels=3, n_classes=1, bilinear=False)
        else:
            # 高级模型
            model = AdvancedUNet(encoder_name=encoder)
        
        # 加载权重并修复键名
        state_dict = torch.load(path)
        fixed_state_dict = fix_state_dict_keys(state_dict, encoder)
        
        # 加载修复后的状态字典
        try:
            model.load_state_dict(fixed_state_dict)
            print(f"成功加载模型: {path} (编码器: {encoder})")
        except Exception as e:
            print(f"加载模型失败: {path} (编码器: {encoder})")
            print(f"错误: {str(e)}")
            continue
            
        model.to(DEVICE)
        model.eval()
        models.append(model)
    
    return models

In [218]:
def predict_with_ensemble(models, image_path, device=DEVICE, threshold=0.5):
    """使用模型集成进行预测"""
    # 读取图像
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # 应用测试时增强(TTA)
    tta_transforms = [
        A.Compose([
            A.Resize(IMAGE_SIZE, IMAGE_SIZE),
            A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
            ToTensorV2()
        ]),
        A.Compose([
            A.Resize(IMAGE_SIZE, IMAGE_SIZE),
            A.HorizontalFlip(p=1.0),
            A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
            ToTensorV2()
        ]),
        A.Compose([
            A.Resize(IMAGE_SIZE, IMAGE_SIZE),
            A.VerticalFlip(p=1.0),
            A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
            ToTensorV2()
        ])
    ]
    
    # 所有模型和所有TTA的预测结果
    all_preds = []
    
    with torch.no_grad():
        # 对每个模型进行预测
        for model in models:
            model_preds = []
            
            # 对每个TTA变换进行预测
            for transform in tta_transforms:
                # 应用变换
                augmented = transform(image=image)
                img_tensor = augmented['image'].unsqueeze(0).to(device)
                
                # 预测
                output = model(img_tensor)
                pred = torch.sigmoid(output).cpu().numpy().squeeze()
                
                # 还原变换
                if 'HorizontalFlip' in str(transform):
                    pred = np.fliplr(pred)
                if 'VerticalFlip' in str(transform):
                    pred = np.flipud(pred)
                
                # 调整大小到原始尺寸
                pred = cv2.resize(pred, (512, 512))
                model_preds.append(pred)
            
            # 平均单个模型的所有TTA预测
            avg_pred = np.mean(model_preds, axis=0)
            all_preds.append(avg_pred)
    
    # 平均所有模型的预测
    final_pred = np.mean(all_preds, axis=0)
    
    # 应用阈值得到二值分割图
    binary_mask = (final_pred > threshold).astype(np.uint8)
    
    return binary_mask, final_pred

In [219]:
def evaluate_ensemble_on_valid(model_paths, encoder_names, threshold=0.5):
    """在验证集上评估单个模型和集成的表现"""
    # 加载模型
    models = load_ensemble_models(model_paths, encoder_names)
    
    # 加载验证数据
    train_mask = pd.read_csv('数据集/train_mask.csv', sep='\t', names=['name', 'mask'])
    train_mask['name'] = train_mask['name'].apply(lambda x: '数据集/train/' + x)
    
    # 分割训练集和验证集
    valid_idx = []
    for i in range(len(train_mask)):
        if i % 7 == 0:
            valid_idx.append(i)
    
    valid_df = train_mask.iloc[valid_idx].reset_index(drop=True)
    
    # 评估每个单个模型
    individual_scores = []
    for i, model in enumerate(models):
        dice_scores = []
        model.eval()
        
        for idx, row in tqdm(valid_df.iterrows(), desc=f"评估模型 {i+1}/{len(models)}"):
            image_path = row['name']
            
            # 读取图像和真实掩码
            image = cv2.imread(image_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            
            mask_gt = rle_decode(row['mask'], shape=(512, 512))
            
            # 预测掩码
            with torch.no_grad():
                transformed = valid_transform(image=image)
                img_tensor = transformed['image'].unsqueeze(0).to(DEVICE)
                output = model(img_tensor)
                pred = torch.sigmoid(output).cpu().numpy().squeeze()
                pred = cv2.resize(pred, (512, 512))
                pred_mask = (pred > threshold).astype(np.uint8)
            
            # 计算Dice分数
            dice = (2.0 * (pred_mask * mask_gt).sum()) / (pred_mask.sum() + mask_gt.sum() + 1e-8)
            dice_scores.append(dice)
            
            # 每10个样本清理一次内存
            if (idx + 1) % 10 == 0:
                clear_memory()
        
        # 记录这个模型的分数
        model_score = np.mean(dice_scores)
        individual_scores.append(model_score)
        print(f"模型 {i+1} 平均Dice分数: {model_score:.4f}")
    
    # 评估集成模型
    ensemble_dice_scores = []
    for idx, row in tqdm(valid_df.iterrows(), desc="评估模型集成"):
        image_path = row['name']
        mask_gt = rle_decode(row['mask'], shape=(512, 512))
        
        try:
            # 使用集成预测
            pred_mask, _ = predict_with_ensemble(models, image_path, DEVICE, threshold)
            
            # 计算Dice分数
            dice = (2.0 * (pred_mask * mask_gt).sum()) / (pred_mask.sum() + mask_gt.sum() + 1e-8)
            ensemble_dice_scores.append(dice)
            
            # 每10个样本清理一次内存
            if (idx + 1) % 10 == 0:
                clear_memory()
        except Exception as e:
            print(f"处理图像 {image_path} 时出错: {str(e)}")
    
    # 计算集成的平均分数
    ensemble_score = np.mean(ensemble_dice_scores)
    print(f"\n集成模型的平均Dice分数: {ensemble_score:.4f}")
    
    # 打印比较
    print("\n性能比较:")
    for i, score in enumerate(individual_scores):
        print(f"模型 {i+1}: {score:.4f}")
    print(f"集成: {ensemble_score:.4f}")
    
    # 计算集成提升
    best_individual = max(individual_scores)
    improvement = ensemble_score - best_individual
    print(f"\n集成相比最好单模型提升: {improvement:.4f} ({improvement*100:.2f}%)")
    
    return individual_scores, ensemble_score

In [220]:
def main():
    # 设置随机种子
    seed_everything(SEED)
    
    # 加载数据
    try:
        print("正在加载训练数据...")
        train_mask = pd.read_csv('数据集/train_mask.csv', sep='\t', names=['name', 'mask'])
        train_mask['name'] = train_mask['name'].apply(lambda x: '数据集/train/' + x)
    except Exception as e:
        print(f"加载数据时出错: {e}")
        print("请确保'数据集/train_mask.csv'文件存在并且格式正确!")
        return
    
    print(f"已加载 {len(train_mask)} 条训练数据")
    
    # 分割训练集和验证集
    train_idx, valid_idx = [], []
    for i in range(len(train_mask)):
        if i % 7 == 0:
            valid_idx.append(i)
        else:
            train_idx.append(i)
    
    train_df = train_mask.iloc[train_idx].reset_index(drop=True)
    valid_df = train_mask.iloc[valid_idx].reset_index(drop=True)
    
    print(f"训练集: {len(train_df)} 样本, 验证集: {len(valid_df)} 样本")
    
    # 创建数据集和数据加载器
    train_ds = BuildingSegmentationDataset(
        train_df['name'].values,
        train_df['mask'].fillna('').values,
        transform=train_transform
    )
    
    valid_ds = BuildingSegmentationDataset(
        valid_df['name'].values,
        valid_df['mask'].fillna('').values,
        transform=valid_transform
    )
    
    train_loader = D.DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True, 
        num_workers=2, pin_memory=True
    )
    
    valid_loader = D.DataLoader(
        valid_ds, batch_size=BATCH_SIZE, shuffle=False, 
        num_workers=2, pin_memory=True
    )
    
    # 初始化模型
    model = UNet(n_channels=3, n_classes=1, bilinear=False)
    model.to(DEVICE)
    
    # 打印模型摘要
    print(f"模型已创建并加载到设备: {DEVICE}")
    total_params = sum(p.numel() for p in model.parameters())
    print(f"模型总参数量: {total_params:,}")
    
    # 使用余弦退火学习率调度
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4, eps=1e-8)
    
    # 替换为余弦退火学习率调度器
    # T_max为总周期数，即训练的总轮次
    # eta_min为最小学习率
    scheduler = CosineAnnealingLR(
        optimizer, 
        T_max=EPOCHS,  # 一个完整的余弦周期
        eta_min=1e-6   # 最小学习率
    )
    
    # 损失函数
    criterion = CombinedLoss(dice_weight=0.8, focal_weight=0.2)  # 增加Dice权重
    
    # 使用改进的训练循环
    best_dice, best_epoch = train_with_checkpoints(
        model, train_loader, valid_loader, optimizer, 
        criterion, scheduler, DEVICE, EPOCHS, checkpoint_dir='model_checkpoints'
    )
    
    print(f"训练完成! 最佳Dice分数: {best_dice:.4f} 在第{best_epoch}轮")
    return model

In [221]:
def predict_test_set():
    # 加载最佳模型进行预测
    model = UNet(n_channels=3, n_classes=1, bilinear=False)
    try:
        model.load_state_dict(torch.load('model_checkpoints/best_model.pth'))
    except:
        model.load_state_dict(torch.load('best_building_segmentation_model.pth'))
    
    model.to(DEVICE)
    model.eval()
    
    # 创建测试数据集
    test_paths = []
    # 确保这个路径指向您的测试图像目录
    for file in os.listdir('数据集/test_a'):
        if file.endswith('.jpg') or file.endswith('.tif'):
            test_paths.append(os.path.join('数据集/test_a', file))
    
    # 使用更小的批次
    test_batch_size = 1  # 单张预测避免内存问题
    
    test_ds = BuildingSegmentationDataset(
        test_paths,
        None,  # 测试集没有掩码数据
        transform=valid_transform
    )
    
    test_loader = D.DataLoader(
        test_ds, batch_size=test_batch_size, shuffle=False,
        num_workers=1, pin_memory=True  # 使用更少的worker和更小的批量
    )
    
    results = []
    test_files = [os.path.basename(p) for p in test_paths]
    
    with torch.no_grad():
        i = 0
        for images in tqdm(test_loader):
            images = images.to(DEVICE)
            outputs = model(images)
            preds = torch.sigmoid(outputs)
            
            # 处理每个批次的预测
            for pred in preds:
                if i >= len(test_files):  # 安全检查
                    break
                    
                pred = pred.cpu().numpy().squeeze()
                # 及时清理GPU内存
                clear_memory()
                
                pred = cv2.resize(pred, (512, 512))  # 调整为原始大小
                mask = (pred > THRESHOLD).astype(np.uint8)
                rle = rle_encode(mask)
                results.append([test_files[i], rle])
                i += 1
    
    submission = pd.DataFrame(results, columns=['name', 'mask'])
    submission.to_csv('submission.csv', index=False, header=False, sep='\t')
    print("预测完成! 结果已保存到 submission.csv")

In [222]:
def predict_test_with_ensemble(model_paths=None, encoder_names=None, threshold=0.5):
    """使用模型集成预测测试集，确保符合比赛提交格式要求"""
    # 如果没有提供模型路径，使用默认值
    if model_paths is None:
        model_paths = [
            "model_checkpoints/best_model.pth",
            "model_ensemble/model_1_efficientnet-b0/best_model.pth",
            "model_ensemble/model_1_resnet18/best_model.pth"
        ]
    
    # 如果没有提供编码器名称，使用默认值
    if encoder_names is None:
        encoder_names = ["unet", "efficientnet-b0", "resnet18"]
    
    # 加载模型
    models = load_ensemble_models(model_paths, encoder_names)
    if not models:
        print("没有可用的模型，无法进行预测。")
        return
    
    print(f"成功加载 {len(models)} 个模型进行集成预测")
    
    # 读取样本提交文件，确保顺序和格式一致
    try:
        # 使用header=None参数，防止第一行被当作列名
        sample_submit = pd.read_csv('数据集/test_a_samplesubmit.csv', header=None)
        print(f"读取样本提交文件成功，包含 {len(sample_submit)} 个测试样本")
        
        # 手动添加列名
        if len(sample_submit.columns) == 2:
            sample_submit.columns = ['name', 'mask']
        else:
            sample_submit.columns = ['name']
        
        # 清理文件名中的额外空格
        sample_submit['name'] = sample_submit['name'].str.strip()
        
        print(f"样本文件有 {len(sample_submit)} 行数据")
    except Exception as e:
        print(f"读取样本提交文件失败: {e}")
        print("尝试直接从测试文件夹读取图像...")
        test_dir = '数据集/test_a'
        sample_submit = pd.DataFrame({
            'name': [f for f in os.listdir(test_dir) if f.endswith('.jpg') or f.endswith('.tif')]
        })
    
    # 创建结果DataFrame
    results = pd.DataFrame({'name': sample_submit['name'].values})
    results['mask'] = ''  # 初始化为空字符串
    
    # 记录测试图像的完整路径
    test_dir = '数据集/test_a'
    test_paths = {}
    for f in os.listdir(test_dir):
        if f.endswith('.jpg') or f.endswith('.tif'):
            # 确保文件名的键也被清理
            test_paths[f.strip()] = os.path.join(test_dir, f)
    
    # 检查文件名匹配情况
    missing_files = []
    for filename in results['name']:
        if filename not in test_paths:
            missing_files.append(filename)
    
    if missing_files:
        print(f"警告: 有 {len(missing_files)} 个文件名在测试目录中找不到")
        print("前5个缺失的文件名:", missing_files[:5])
        print("测试目录中的第一个文件名:", list(test_paths.keys())[0])
        
        # 尝试进行更细致的匹配
        test_files_set = set(test_paths.keys())
        for missing in missing_files[:]:  # 创建副本进行遍历
            # 检查去除扩展名后是否匹配
            missing_base = os.path.splitext(missing)[0]
            for test_file in test_files_set:
                test_base = os.path.splitext(test_file)[0]
                if missing_base == test_base:
                    print(f"找到替代匹配: '{missing}' -> '{test_file}'")
                    # 更新test_paths字典以包含两种形式
                    test_paths[missing] = test_paths[test_file]
                    missing_files.remove(missing)
                    break
    
    # 对每个测试图像进行预测
    for idx, row in tqdm(results.iterrows(), desc="预测测试集"):
        filename = row['name']
        if filename not in test_paths:
            print(f"警告: 测试图像 '{filename}' 不存在，将使用空RLE")
            continue
            
        image_path = test_paths[filename]
        
        try:
            # 读取图像
            image = cv2.imread(image_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            
            # 使用集成模型预测
            all_preds = []
            with torch.no_grad():  # 添加这一行
                for model in models:
                    # 进行测试时增强预测
                    model_preds = []
                    
                    # 基本变换
                    augmented = valid_transform(image=image)
                    img_tensor = augmented['image'].unsqueeze(0).to(DEVICE)
                    output = model(img_tensor)
                    pred = torch.sigmoid(output).detach().cpu().numpy().squeeze()  # 添加.detach()
                    pred = cv2.resize(pred, (512, 512))
                    model_preds.append(pred)
                    
                    # 水平翻转
                    flip_h_transform = A.Compose([
                        A.HorizontalFlip(p=1.0),
                        A.Resize(IMAGE_SIZE, IMAGE_SIZE),
                        A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
                        ToTensorV2()
                    ])
                    augmented = flip_h_transform(image=image)
                    img_tensor = augmented['image'].unsqueeze(0).to(DEVICE)
                    output = model(img_tensor)
                    pred = torch.sigmoid(output).detach().cpu().numpy().squeeze()  # 添加.detach()
                    pred = cv2.resize(pred, (512, 512))
                    pred = np.fliplr(pred)
                    model_preds.append(pred)
                    
                    # 垂直翻转
                    flip_v_transform = A.Compose([
                        A.VerticalFlip(p=1.0),
                        A.Resize(IMAGE_SIZE, IMAGE_SIZE),
                        A.Normalize(mean=[0.625, 0.448, 0.688], std=[0.131, 0.177, 0.101]),
                        ToTensorV2()
                    ])
                    augmented = flip_v_transform(image=image)
                    img_tensor = augmented['image'].unsqueeze(0).to(DEVICE)
                    output = model(img_tensor)
                    pred = torch.sigmoid(output).detach().cpu().numpy().squeeze()  # 添加.detach()
                    pred = cv2.resize(pred, (512, 512))
                    pred = np.flipud(pred)
                    model_preds.append(pred)
                    
                    # 平均单个模型的各种预测
                    avg_pred = np.mean(model_preds, axis=0)
                    all_preds.append(avg_pred)
            
            # 平均所有模型的预测
            final_pred = np.mean(all_preds, axis=0)
            mask = (final_pred > threshold).astype(np.uint8)
            rle = rle_encode(mask)
            
            # 保存到结果DataFrame对应位置
            results.loc[idx, 'mask'] = rle
            
            # 每50个样本清理一次内存
            if (idx + 1) % 50 == 0:
                clear_memory()
                
        except Exception as e:
            print(f"处理图像 {filename} 时出错: {str(e)}")
            traceback.print_exc()  # 添加这行以获取更详细的错误信息
            # 错误时保留空字符串
    
    # 检查提交文件中是否有原始空格，确保保持与样本文件一致的格式
    original_sample = pd.read_csv('数据集/test_a_samplesubmit.csv', header=None)
    first_line = original_sample.iloc[0, 0]
    print(f"原始样本第一行: '{first_line}'")
    
    has_trailing_space = False
    if first_line.endswith(' '):
        print("检测到原始文件名后有空格，保持此格式...")
        has_trailing_space = True
        
    # 如果原始文件有空格，添加回去
    if has_trailing_space:
        results['name'] = results['name'].apply(lambda x: x + ' ')
    
    # 创建符合要求的提交文件
    # 使用逗号作为分隔符，不保留索引，不写入列名
    # 注意: 如果是空格分隔，需要修改sep参数
    sep_char = ' ' if first_line.endswith(' ') else ','
    results.to_csv('submission_ensemble.csv', index=False, header=False, sep=sep_char)
    print(f"预测完成! 结果保存为 submission_ensemble.csv，共 {len(results)} 个样本")
    print(f"使用分隔符: '{sep_char}'")
    
    # 检查生成的提交文件
    try:
        # 手动读取第一行以检查格式
        with open('submission_ensemble.csv', 'r') as f:
            first_line = f.readline().strip()
        print(f"生成文件的第一行: '{first_line}'")
        
        # 再次检查完整文件
        check_df = pd.read_csv('submission_ensemble.csv', header=None, sep=sep_char)
        print(f"生成的提交文件包含 {len(check_df)} 行")
        
        if len(check_df) != len(sample_submit):
            print(f"警告: 生成的文件行数 ({len(check_df)}) 与样本提交文件行数 ({len(sample_submit)}) 不一致!")
        else:
            print("行数检查通过!")
            
    except Exception as e:
        print(f"提交文件检查失败: {e}")

In [223]:
def execute_ensemble_pipeline(train_new=True, include_existing=True):
    """执行完整的集成模型训练和预测流程，可选包含现有模型"""
    
    # 设置随机种子
    seed_everything(SEED)
    
    # 模型编码器列表 - 只需要训练2个新模型，现有模型为第3个
    encoders = ["efficientnet-b0", "resnet18"]
    
    
    if train_new:
        print("开始训练模型用于集成...")
        model_paths, encoder_names, best_scores = train_multiple_models(
            encoders=encoders,
            save_dir="model_ensemble",
            include_existing_model=include_existing,
            existing_model_path="model_checkpoints/best_model.pth"  # 您现有模型的路径
        )
    else:
        # 如果已经训练好了，直接加载路径
        try:
            ensemble_info = pd.read_csv("model_ensemble/ensemble_models_info.csv")
            model_paths = ensemble_info['model_path'].tolist()
            encoder_names = ensemble_info['encoder'].tolist()
        except:
            # 如果CSV不存在，使用默认路径
            if include_existing:
                model_paths = [
                    "model_checkpoints/best_model.pth",  # 现有模型
                    "model_ensemble/model_1_efficientnet-b4/best_model.pth",
                    "model_ensemble/model_2_resnet50/best_model.pth"
                ]
                encoder_names = ["unet", "efficientnet-b4", "resnet50"]
            else:
                model_paths = [
                    "model_ensemble/model_1_efficientnet-b4/best_model.pth",
                    "model_ensemble/model_2_resnet50/best_model.pth"
                ]
                encoder_names = ["efficientnet-b4", "resnet50"]
    
    # 在验证集上评估集成性能
    _, ensemble_score = evaluate_ensemble_on_valid(
        model_paths, 
        encoder_names=encoder_names
    )
    
    # 生成测试集预测
    predict_test_with_ensemble(model_paths, encoder_names=encoder_names)
    
    return ensemble_score

In [224]:
if __name__ == "__main__":
    # 包含现有模型并训练新模型
    #score = execute_ensemble_pipeline(
    #    train_new=False,       # 是否训练新模型
    #    include_existing=True  # 是否包含现有模型
    #)
    #print(f"最终集成模型的Dice分数: {score:.4f}")
    predict_test_with_ensemble()

成功加载模型: model_checkpoints/best_model.pth (编码器: unet)
成功加载模型: model_ensemble/model_1_efficientnet-b0/best_model.pth (编码器: efficientnet-b0)
成功加载模型: model_ensemble/model_1_resnet18/best_model.pth (编码器: resnet18)
成功加载 3 个模型进行集成预测
读取样本提交文件成功，包含 2500 个测试样本
样本文件有 2500 行数据


预测测试集: 0it [00:00, ?it/s]

原始样本第一行: 'R05K5826G4.jpg	'
预测完成! 结果保存为 submission_ensemble.csv，共 2500 个样本
使用分隔符: ','
生成文件的第一行: 'R05K5826G4.jpg,20 24 532 25 1043 28 1555 29 2066 32 2577 34 3089 35 3599 38 4110 41 4621 43 5133 44 5644 46 6155 48 6667 48 7178 50 7690 50 8201 52 8712 54 9224 54 9735 56 10247 57 10758 60 11269 67 11780 84 12292 88 12803 91 13314 96 13826 102 14338 115 14850 121 15362 128 15873 136 16385 145 16897 151 17409 158 17921 165 18433 173 18945 181 19457 183 19969 187 20481 199 20993 202 21505 206 22017 216 22529 223 23041 232 23553 240 24065 247 24577 250 25089 253 25601 258 26113 271 26625 277 27137 283 27649 287 28161 293 28673 297 29185 300 29697 304 30209 307 30721 101 30824 206 31233 100 31337 209 31745 100 31850 210 32257 99 32362 214 32769 99 32875 219 33281 99 33388 221 33797 50 33855 37 33901 223 34310 46 34369 35 34415 223 34825 40 34883 33 34932 220 35338 37 35396 32 35449 219 35853 7 35868 18 35910 30 35966 219 36382 15 36423 29 36482 218 36896 12 36935 29 37001 214 37448 28 37519 211