# 翼状胬肉诊断模型

# 准备

## 导入必要的库
导入PyTorch、OpenCV、Pandas等必要的库，为图像分类模型做准备。

In [None]:
# --- Cross-validation parameters ---
K = 5  # number of folds
base_seed = 420 # random seed used for the KFold split
# --- List of CNN feature extractors to use ---
#CNN_FEATURE_EXTRACTORS = ['ResNet18Classifier', 'ResNet34Classifier', 'ResNet50Classifier']
CNN_FEATURE_EXTRACTORS = ['ResNet50Classifier']
# --- LightGBM hyperparameters (initial settings, still to be tuned) ---
# These parameters directly control model complexity and regularization strength.
# NOTE(review): in LightGBM, row subsampling ('subsample' / bagging_fraction) is documented to
# take effect only when 'subsample_freq' / bagging_freq is > 0; no such key is set here, so
# 'subsample': 0.7 may be a no-op — confirm whether that is intended.
lgbm_params = {
    'objective': 'multiclass', # multi-class classification task
    'num_class': 3,            # 3 classes
    'metric': 'multi_logloss', # log loss as the evaluation metric
    'boosting_type': 'gbdt',   # gradient boosted decision trees
    'n_estimators': 1000,      # number of trees (meant to be used with early stopping, so it can be large)
    'learning_rate': 0.03,     # learning rate
    'num_leaves': 20,          # limits tree complexity to reduce overfitting
    'max_depth': 7,            # maximum tree depth (-1 would mean unlimited); works together with num_leaves
    'seed': base_seed,         # random seed
    'n_jobs': -1,              # use all available cores
    'verbose': -1,             # do not print intermediate information
    'colsample_bytree': 0.7,   # fraction of features sampled for each tree
    'subsample': 0.7,          # fraction of rows sampled for each tree
    'reg_alpha': 0.3,          # L1 regularization
    'reg_lambda': 0.4,         # L2 regularization
    'min_child_samples': 30    # minimum number of samples in a leaf
}

# --- CNN fine-tuning parameters (used inside every fold) ---
cnn_micro_train_params = {
    'num_epochs': 10, # epochs of CNN fine-tuning per fold; a few are enough to adapt to the fold's data
    'lr': 5e-4,       # fine-tuning learning rate
    'weight_decay': 1e-4 # fine-tuning weight decay
}

# ================== Resize settings =================
TARGET_SIZE = (512, 512) # target size
output_format = "PNG" # output image format

# ================== Training data paths =================
# Local data path
image_dir =          r"f:/train"
# Colab paths
colab_zip_path = "/content/drive/My Drive/train.zip"
colab_extract_path = "/content/trains/"
# Kaggle paths
#kaggle_zip_path = "/kaggle/working/train.zip"
#kaggle_extract_path = "/kaggle/working/trains/"
kaggle_extract_path = "/kaggle/input/pterygium/train/"
kaggle_temp_path = "/kaggle/working/"

# =================== Validation data paths =================
# Local validation path
val_image_dir =      r"f:/val"
# Colab paths
#colab_val_zip_path = "/content/drive/My Drive/val.zip"
#colab_val_extract_path = "/content/val/"
# Kaggle path
kaggle_val_path = "/kaggle/input/pterygium/val_img/"

# =================== SHAP settings =================
shap_scaling_factor = 100

In [None]:
def setup_matplotlib_agg_backend_if_no_gui():
    """
    Select a Matplotlib backend that suits the current interpreter.

    Inside an IPython/Jupyter shell (detected through ``get_ipython``) the
    ``%matplotlib inline`` magic is enabled. In every other case the
    non-interactive 'Agg' backend is selected so that plotting does not
    require a GUI.

    Call this before the first import of ``matplotlib.pyplot``.

    Returns:
        bool: True when the inline backend was enabled inside an
        IPython/Jupyter shell, False otherwise (including on any failure).
    """
    # Imported locally so the helper does not depend on a module-level
    # ``import matplotlib`` having already run.
    import matplotlib

    try:
        # 'ZMQInteractiveShell' -> Jupyter Notebook / QtConsole,
        # 'TerminalInteractiveShell' -> IPython command line.
        shell = get_ipython().__class__.__name__  # type: ignore
    except NameError:
        # get_ipython is only defined inside IPython sessions.
        shell = ''

    if 'Shell' in shell:
        try:
            print('检测到jupyter环境')
            get_ipython().run_line_magic('matplotlib', 'inline')  # type: ignore
            return True
        except Exception as e:
            # Report the step that actually failed (the inline magic, not Agg).
            print(f"警告：启用 Matplotlib inline 后端时出错: {e}")
            return False

    print("检测到可能没有 GUI 环境，将 Matplotlib 后端设置为 'Agg'。")
    try:
        matplotlib.use('Agg')
    except Exception as e:
        # Previously this call ran inside an ``except NameError`` handler, so
        # a failure here escaped the generic handler and crashed the caller.
        print(f"警告：尝试将 Matplotlib 后端设置为 'Agg' 时出错: {e}")
    return False

In [None]:
import random
import subprocess
import torch
import time
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms.functional import to_pil_image
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset
import torch.backends.cudnn as cudnn
from torchvision import transforms, models
import pandas as pd
import boto3
import botocore
import shutil
import os
import zipfile
import shap
import sys
from PIL import Image
import platform
import numpy as np
import glob
from tqdm.autonotebook import tqdm # 好看！
import matplotlib
setup_matplotlib_agg_backend_if_no_gui()
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Pick DataLoader worker count and plotting font based on the host platform.
if platform.system() == "Windows":
    num_workers = 0
    print(f"检测到 Windows 系统，将 DataLoader 的 num_workers 设置为 {num_workers}。")
else:
    # Non-Windows systems (e.g. Linux/Colab)
    num_workers = 4
    print(f"检测到非 Windows 系统 ({platform.system()})，将 DataLoader 的 num_workers 设置为 {num_workers}。")
    # Install a Chinese font: download SimHei once into the working directory, then register it.
    # NOTE(review): this shells out to `wget`, which must be on PATH; check=True raises if the download fails.
    if not os.path.exists('simhei.ttf'):
        subprocess.run(['wget','-q','-O', 'simhei.ttf', "https://cdn.jsdelivr.net/gh/Haixing-Hu/latex-chinese-fonts/chinese/%E9%BB%91%E4%BD%93/SimHei.ttf"], check=True)
    matplotlib.font_manager.fontManager.addfont('simhei.ttf')
    matplotlib.rc('font', family='SimHei')
# Applied on every platform; on Windows this presumably relies on SimHei being installed system-wide.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign correctly when a CJK font is active.
plt.rcParams['axes.unicode_minus'] = False

# Configure the compute device (GPU when available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA 可用: {torch.cuda.is_available()}")
print(f"使用的设备: {device}")

if torch.cuda.is_available():
    # Enable cuDNN benchmark mode whenever CUDA is available.
    cudnn.benchmark = True
    print("cuDNN benchmark 模式已启用")

## 读取和准备数据
从train_classification_label.xlsx读取标签数据，并组织预处理后的图像数据路径。标签包括：0（健康）、1（建议观察）、2（建议手术）。

In [None]:
# Load R2 credentials from a local .env file when one is present.
if os.path.exists('.env'):
    from dotenv import load_dotenv
    load_dotenv('.env')

# Defaults come from environment variables; cloud environments override them below.
R2_ACCESS_KEY_ID = os.environ.get('R2_ACCESS_KEY_ID', '')
R2_SECRET_ACCESS_KEY = os.environ.get('R2_SECRET_ACCESS_KEY', '')
R2_BUCKET_NAME = os.environ.get('R2_BUCKET_NAME', '')
R2_ENDPOINT_URL = os.environ.get('R2_ENDPOINT_URL', '')

# When running in the cloud (Colab or Kaggle), switch to the cloud data paths and secrets.
if 'google.colab' in sys.modules or os.path.exists("/kaggle/working"):
    if 'google.colab' in sys.modules:
        print('在 Google Colab 环境中运行')
        image_dir = os.path.join(colab_extract_path,"train")
        label_file = os.path.join(image_dir,"train_classification_label.xlsx")
        zip_path = colab_zip_path
        extract_path = colab_extract_path

        # Mount Google Drive and read the R2 credentials from Colab user secrets.
        from google.colab import drive # type: ignore
        from google.colab import userdata # type: ignore
        drive.mount('/content/drive')
        R2_ACCESS_KEY_ID = userdata.get("R2_ACCESS_KEY_ID")
        R2_SECRET_ACCESS_KEY = userdata.get("R2_SECRET_ACCESS_KEY")
        R2_BUCKET_NAME = userdata.get("R2_BUCKET_NAME")
        R2_ENDPOINT_URL = userdata.get("R2_ENDPOINT_URL")
    else:
        print('在 Kaggle 环境中运行')
        # Kaggle reads the dataset directly from the input directory. The former
        # zip-download flow (Google Drive + gdown) was dropped because Google Drive
        # enforces a daily download quota that could make the download fail.
        image_dir = os.path.join(kaggle_extract_path,"train")
        label_file = os.path.join(image_dir,"train_classification_label.xlsx")
        val_image_dir = os.path.join(kaggle_val_path,"val_img")
        # No archive is configured for Kaggle (the zip paths are commented out in the
        # config), so mark both as absent instead of leaving the names undefined.
        zip_path = None
        extract_path = None

        from kaggle_secrets import UserSecretsClient # type: ignore
        user_secrets = UserSecretsClient()
        R2_ACCESS_KEY_ID = user_secrets.get_secret("R2_ACCESS_KEY_ID")
        R2_SECRET_ACCESS_KEY = user_secrets.get_secret("R2_SECRET_ACCESS_KEY")
        R2_BUCKET_NAME = user_secrets.get_secret("R2_BUCKET_NAME")
        R2_ENDPOINT_URL = user_secrets.get_secret("R2_ENDPOINT_URL")

    if not os.path.exists(label_file):
        if zip_path is None:
            # Previously this path hit a NameError on Kaggle because zip_path was never set.
            raise FileNotFoundError(
                f"标签文件不存在: {label_file}，且当前环境没有配置可解压的数据压缩包"
            )
        # Extract the dataset archive
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
else:
    print(f'不在云端环境中运行,使用本地数据路径{image_dir}')
# Needed for the local branch, which does not set label_file above.
label_file = os.path.join(image_dir,"train_classification_label.xlsx")

# 自定义数据集类，用于读取图像和标签
class PterygiumDataset(Dataset):
    """Dataset that pairs each row of a label spreadsheet with its image on disk.

    Images are looked up at ``<image_dir>/<id>/<id>.png`` where ``<id>`` is the
    'Image' column value formatted as a zero-padded 4-digit integer.
    """

    def __init__(self, label_file, image_dir, transform=None):
        """
        Read the label table and remember where the images live.

        :param label_file: path to the Excel file holding the image labels
        :param image_dir: root directory that contains the per-image folders
        :param transform: optional callable applied to every loaded image
        """
        self.transform = transform
        self.image_dir = image_dir
        self.labels_df = pd.read_excel(label_file)

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """
        Load the image and label stored at positional index ``idx``.

        :param idx: positional row index into the label table
        :return: ``(image, label)``; the image is transformed when a transform is set
        """
        record = self.labels_df.iloc[idx]
        raw_id = record['Image']
        label = record['Pterygium']

        # Folder and file share the same zero-padded identifier.
        stem = f"{int(raw_id):04d}"
        picture = Image.open(
            os.path.join(self.image_dir, stem, f"{stem}.png")
        ).convert("RGB")

        if self.transform:
            picture = self.transform(picture)
        return picture, label

## 数据 Resize
只在 Linux 环境（Colab / Kaggle）运行时使用，因为 Windows 仅用于测试。

### 准备R2

In [None]:
def create_r2_client():
    """Try to build a boto3 S3 client pointed at the configured R2 endpoint.

    The four module-level settings (endpoint URL, access key, secret key and
    bucket name) must all exist and be non-empty; otherwise R2 caching is
    skipped. The module-level flag ``r2_configured`` is set to True once the
    settings are found and reset to False if client creation fails.

    Returns:
        tuple: ``(client, True)`` on success, ``(None, False)`` otherwise.
    """
    global r2_configured  # this function updates the module-level flag

    settings = globals()
    needed = ('R2_ENDPOINT_URL', 'R2_ACCESS_KEY_ID', 'R2_SECRET_ACCESS_KEY', 'R2_BUCKET_NAME')
    if any(not settings.get(name) for name in needed):
        print("R2 配置不完整（缺少 Endpoint URL, Access Key, Secret Key 或 Bucket Name）。跳过 R2 缓存。")
        return None, False

    r2_configured = True
    try:
        print("正在创建 R2 (boto3 S3) 客户端...")
        # Signature version is pinned to s3v4; region is 'auto'.
        signing = botocore.config.Config(signature_version='s3v4')
        client = boto3.client(
            service_name='s3',
            endpoint_url=R2_ENDPOINT_URL,
            aws_access_key_id=R2_ACCESS_KEY_ID,
            aws_secret_access_key=R2_SECRET_ACCESS_KEY,
            region_name='auto',
            config=signing,
        )
        print("R2 客户端创建成功。")
    except Exception as err:
        print(f"创建 R2 客户端时出错: {err}")
        r2_configured = False  # a failed client means R2 is unusable
        return None, False
    return client, True

def check_r2_cache(s3_client, bucket_name, cache_key):
    """Report whether ``cache_key`` exists in the given bucket.

    Returns False when no client is supplied, when the object is missing
    (error code '404'), or when any other error occurs (which is printed).
    """
    if not s3_client:
        return False

    try:
        s3_client.head_object(Bucket=bucket_name, Key=cache_key)
    except botocore.exceptions.ClientError as err:
        # A '404' simply means "not cached"; anything else (e.g. permissions) is reported.
        if err.response['Error']['Code'] != '404':
            print(f"检查 R2 缓存时出错 (Key: {cache_key}): {err}")
        return False
    except Exception as err:
        print(f"检查 R2 缓存时发生未知错误: {err}")
        return False
    return True

def download_from_r2(s3_client, bucket_name, cache_key, local_path):
    """Download ``cache_key`` from the bucket to ``local_path`` with a progress bar.

    Args:
        s3_client: boto3 S3 client, or a falsy value when R2 is unavailable.
        bucket_name: name of the bucket to read from.
        cache_key: object key to download.
        local_path: destination file path on the local disk.

    Returns:
        bool: True when the download finished, False when no client was
        given or any error occurred. On error, a file left at ``local_path``
        is removed on a best-effort basis.
    """
    if not s3_client:
        return False

    def _discard_partial_file():
        # Best-effort cleanup: only filesystem errors are tolerated here, so
        # KeyboardInterrupt/SystemExit are no longer swallowed by a bare except.
        if os.path.exists(local_path):
            try:
                os.remove(local_path)
            except OSError as cleanup_err:
                print(f"警告: 无法删除不完整的文件 {local_path}: {cleanup_err}")

    try:
        # Fetch the object size first so the progress bar has a total.
        response = s3_client.head_object(Bucket=bucket_name, Key=cache_key)
        total_size = int(response.get('ContentLength', 0))

        print(f"正在从 R2 下载 {cache_key} 到 {local_path} ({total_size / (1024*1024):.2f} MB)...")
        with tqdm(total=total_size, unit='B', unit_scale=True, desc=cache_key, leave=False) as pbar:
            s3_client.download_file(
                Bucket=bucket_name,
                Key=cache_key,
                Filename=local_path,
                Callback=pbar.update,
            )
        print(f"文件 {cache_key} 下载完成。")
        return True
    except botocore.exceptions.ClientError as e:
        print(f"从 R2 下载文件时出错 (Key: {cache_key}): {e}")
        _discard_partial_file()
        return False
    except Exception as e:
        print(f"下载 R2 文件时发生未知错误: {e}")
        _discard_partial_file()
        return False

def upload_to_r2(s3_client, bucket_name, local_path, cache_key):
    """Upload the file at ``local_path`` to the bucket as ``cache_key`` with a progress bar.

    Returns True when the upload completed; False when the client is missing,
    the local file does not exist, or the upload raised an error.
    """
    if not (s3_client and os.path.exists(local_path)):
        print(f"上传 R2 失败：客户端未初始化或本地文件不存在 ({local_path})。")
        return False

    try:
        size_bytes = os.path.getsize(local_path)
        print(f"正在上传 {local_path} ({size_bytes / (1024*1024):.2f} MB) 到 R2 作为 {cache_key}...")
        progress = tqdm(total=size_bytes, unit='B', unit_scale=True, desc=cache_key, leave=False)
        with progress as bar:
            s3_client.upload_file(
                Filename=local_path,
                Bucket=bucket_name,
                Key=cache_key,
                Callback=lambda sent: bar.update(sent),
            )
        print(f"文件 {cache_key} 上传完成。")
    except botocore.exceptions.ClientError as err:
        print(f"上传文件到 R2 时出错 (Key: {cache_key}): {err}")
        return False
    except Exception as err:
        print(f"上传 R2 文件时发生未知错误: {err}")
        return False
    return True

def zip_directory(folder_path, zip_path):
    """Compress every file under ``folder_path`` into the zip file ``zip_path``.

    Entries are stored with paths relative to ``folder_path``. Empty
    directories are not stored. If ``zip_path`` itself lies inside
    ``folder_path`` it is skipped, so the archive never tries to contain
    itself.

    Args:
        folder_path: directory whose contents are archived.
        zip_path: destination path of the zip file.

    Returns:
        bool: True on success; False when the folder does not exist or
        compression failed (a partial zip file is then removed, best effort).
    """
    if not os.path.isdir(folder_path):
        print(f"错误：要压缩的文件夹不存在 {folder_path}")
        return False
    print(f"正在压缩目录 {folder_path} 到 {zip_path}...")

    archive_abspath = os.path.abspath(zip_path)
    try:
        # Collect the file list up front so the progress bar has a total.
        file_paths = []
        for root, _dirs, files in os.walk(folder_path):
            for filename in files:
                candidate = os.path.join(root, filename)
                if os.path.abspath(candidate) != archive_abspath:
                    file_paths.append(candidate)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Progress is counted per file.
            with tqdm(total=len(file_paths), desc="压缩文件", unit="file", leave=False) as pbar:
                for file in file_paths:
                    # Path of the file inside the archive, relative to the folder root.
                    zipf.write(file, os.path.relpath(file, folder_path))
                    pbar.update(1)
        print("目录压缩完成。")
        return True
    except Exception as e:
        print(f"压缩目录时出错: {e}")
        # Remove a possibly incomplete archive; tolerate filesystem errors only.
        if os.path.exists(zip_path):
            try:
                os.remove(zip_path)
            except OSError as cleanup_err:
                print(f"警告: 无法删除不完整的 zip 文件 {zip_path}: {cleanup_err}")
        return False

def unzip_directory(zip_path, extract_to_folder):
    """Extract the zip file at ``zip_path`` into ``extract_to_folder``.

    The target folder is created when needed. Members are extracted one by
    one so the progress bar can advance per entry. Returns True on success
    and False when the zip file is missing or extraction fails; a partially
    extracted folder is left in place.
    """
    if not os.path.exists(zip_path):
        print(f"错误：要解压的 zip 文件不存在 {zip_path}")
        return False

    print(f"正在解压缩文件 {zip_path} 到 {extract_to_folder}...")
    try:
        os.makedirs(extract_to_folder, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as archive:
            entries = archive.infolist()
            with tqdm(total=len(entries), desc="解压缩文件", unit="file", leave=False) as bar:
                for entry in entries:
                    archive.extract(entry, extract_to_folder)
                    bar.update(1)
        print("文件解压缩完成。")
    except Exception as err:
        print(f"解压缩文件时出错: {err}")
        return False
    return True

In [None]:
# Shared resize transform: bilinear interpolation to TARGET_SIZE with antialiasing enabled.
resize_transform = transforms.Resize(TARGET_SIZE, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True)

# --- Processing Function ---
def resize_and_save_image(img_info, base_input_dir, base_output_dir, transform, device):
    """
    Read one image, resize it (on ``device``) and save it under ``base_output_dir``.

    Two input layouts are supported:
      * flat:   ``<base_input_dir>/<id>.png``       -> ``<base_output_dir>/<id>.<ext>``
      * nested: ``<base_input_dir>/<id>/<id>.png``  -> ``<base_output_dir>/<id>/<id>.<ext>``
    where ``<id>`` is ``img_info['Image']`` as a zero-padded 4-digit integer
    and ``<ext>`` is the lower-cased module-level ``output_format``.

    Args:
        img_info: mapping-like object (e.g. a DataFrame row) with an 'Image' entry.
        base_input_dir: directory holding the source images.
        base_output_dir: directory receiving the resized images.
        transform: callable applied to the CxHxW image tensor.
        device: device the tensor is moved to before ``transform`` runs.

    Returns:
        bool: True when the image was written, False on any error (which is printed).
    """
    # Bound before the try block so the error handlers can always reference it,
    # even when the failure happens before the path is resolved (e.g. a
    # non-numeric image id used to raise UnboundLocalError inside the handler).
    input_path = "<unknown>"
    try:
        image_name = f"{int(img_info['Image']):04d}"
        output_name = f"{image_name}.{output_format.lower()}"
        flat_path = os.path.join(base_input_dir, f"{image_name}.png")
        if os.path.exists(flat_path):
            # Flat layout (validation-set images)
            input_path = flat_path
            os.makedirs(base_output_dir, exist_ok=True)
            output_path = os.path.join(base_output_dir, output_name)
        else:
            # Nested layout (training-set images): mirror the per-image folder.
            input_path = os.path.join(base_input_dir, image_name, f"{image_name}.png")
            output_folder_path = os.path.join(base_output_dir, image_name)
            os.makedirs(output_folder_path, exist_ok=True)
            output_path = os.path.join(output_folder_path, output_name)

        # 1. Read with PIL on the CPU and convert to a CxHxW tensor scaled to [0, 1].
        img_pil = Image.open(input_path).convert("RGB")
        img_tensor = transforms.functional.to_tensor(img_pil)

        # 2. Resize on the target device, then bring the result back to the CPU.
        resized_tensor_cpu = transform(img_tensor.to(device)).cpu()

        # 3. Convert back to a PIL image (expects CxHxW in [0, 1]) and save it.
        to_pil_image(resized_tensor_cpu).save(output_path, format=output_format)

        return True

    except FileNotFoundError:
        print(f"错误: 文件未找到 {input_path}")
        return False
    except Exception as e:
        print(f"错误处理图像 {input_path}: {e}")
        return False

In [None]:
# Resize driver: choose directories for the current cloud environment, then obtain the
# resized dataset from (in order) a non-empty local output directory, the R2 cache, or a
# local resize run (whose result is uploaded back to R2 when R2 is configured).
if 'google.colab' in sys.modules:
    print('在 Google Colab 环境中运行')
    original_image_dir = os.path.join(colab_extract_path,"train")
    output_dir = os.path.join(colab_extract_path,"train_resized")
    temp_dir = colab_extract_path
elif os.path.exists("/kaggle/working"):
    print('在 Kaggle 环境中运行')
    original_image_dir = os.path.join(kaggle_extract_path,"train")
    output_dir = os.path.join(kaggle_temp_path,"train_resized")
    temp_dir = kaggle_temp_path
else:
    print("错误: 无法识别的非 Windows 环境（可能是Linux），需要手动处理")
    # NOTE(review): exit(1) stops execution here, so the final `else` branch of the
    # `if original_image_dir:` statement below looks unreachable; this branch is also taken
    # on a local Windows run. sys.exit is used further down — confirm which is intended.
    exit(1)

if original_image_dir:
    print(f"原始输入目录: {original_image_dir}")
    print(f"目标输出目录: {output_dir}")
    print(f"临时文件目录: {temp_dir}")
    print(f"目标尺寸: {TARGET_SIZE}")

    # Create the R2 client and find out whether R2 is configured
    s3_client, r2_configured = create_r2_client()
    # The cache key encodes the target size, so different sizes use different cache objects.
    r2_cache_key = f"work1_resized_{TARGET_SIZE[0]}x{TARGET_SIZE[1]}.zip"
    print(f"生成的 R2 缓存键: {r2_cache_key}")
    r2_local_zip_path = os.path.join(temp_dir, r2_cache_key)
    resize_done = False
    # Any non-empty output directory counts as "already resized".
    # NOTE(review): a partially filled directory would also pass this check — confirm acceptable.
    if os.path.exists(output_dir) and os.listdir(output_dir):
        print("检测到已存在的resize数据在本地，跳过resize步骤")
        resize_done = True

    # --- If not found locally, try R2 cache ---
    os.makedirs(output_dir, exist_ok=True)
    if not resize_done and r2_configured:
        print(f"本地目录 {output_dir} 为空或不存在，尝试检查 R2 缓存...")
        if check_r2_cache(s3_client, R2_BUCKET_NAME, r2_cache_key):
            print(f"检测到 R2 缓存文件: {r2_cache_key}. 尝试下载...")
            # Download the cache
            if download_from_r2(s3_client, R2_BUCKET_NAME, r2_cache_key, r2_local_zip_path):
                print(f"R2 缓存下载成功。正在解压到 {output_dir}...")
                # Unzip the cache
                # Ensure output_dir is clean before extracting to avoid mixing old/new files
                if os.path.exists(output_dir):
                    try: shutil.rmtree(output_dir)
                    except Exception as e: print(f"警告: 清理旧的输出目录失败: {e}")
                os.makedirs(output_dir, exist_ok=True) # Recreate empty directory
                if unzip_directory(r2_local_zip_path, output_dir):
                    print("R2 缓存解压成功。跳过本地resize步骤。")
                    resize_done = True # Data loaded from R2 cache
                    # Clean up the temporary zip file after extraction
                    if os.path.exists(r2_local_zip_path):
                        try: os.remove(r2_local_zip_path)
                        except Exception as e: print(f"警告: 清理本地zip文件失败: {e}")
                else:
                    print("错误: R2 缓存解压失败。将执行本地resize。")
                    resize_done = False # Reset flag to perform local resize
                    # Clean up potentially incomplete extraction directory
                    if os.path.exists(output_dir):
                        try: shutil.rmtree(output_dir)
                        except Exception as e: print(f"警告: 清理不完整输出目录失败: {e}")
            else:
                print("错误: 从 R2 下载缓存失败。将执行本地resize。")
                resize_done = False # Reset flag to perform local resize
        else:
            print("未检测到 R2 缓存文件。将执行本地resize。")
            resize_done = False # Ensure flag is false
    elif not resize_done and not r2_configured:
        print("R2 未配置或初始化失败，将执行本地resize。")
        resize_done = False # Ensure flag is false
    # --- Perform local resizing if not done by cache ---
    if not resize_done:
        print("执行本地图像resize...")
        try:
            labels_df = pd.read_excel(label_file)
        except Exception as e:
            print(f"Error reading label file {label_file}: {e}")
            sys.exit(1)
        success_count = 0
        error_count = 0
        # Create the main output directory BEFORE starting the loop (done above, but ensure it exists)
        os.makedirs(output_dir, exist_ok=True)
        # Iterate through images listed in the label file
        # Use original_image_dir as input base for resizing
        for index, row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Resizing Images"):
            if resize_and_save_image(row, original_image_dir, output_dir, resize_transform, device):
                success_count += 1
            else:
                error_count += 1
        print(f"\n本地处理完成!")
        print(f"成功处理图像数: {success_count}")
        print(f"处理失败图像数: {error_count}")
        print(f"处理后的图像保存在: {output_dir}")
        # --- Upload resized data to R2 cache if configured and resizing was successful ---
        if r2_configured and success_count > 0: # Only upload if some files were processed successfully
            print(f"将本地resize后的数据上传到 R2 缓存 ({r2_cache_key})...")
            # Create a zip file of the output directory
            if zip_directory(output_dir, r2_local_zip_path):
                # Upload the zip file
                if upload_to_r2(s3_client, R2_BUCKET_NAME, r2_local_zip_path, r2_cache_key):
                    print("R2 缓存上传成功。")
                else:
                    print("错误: R2 缓存上传失败。")
                # Clean up the temporary local zip file after upload attempt
                if os.path.exists(r2_local_zip_path):
                    try: os.remove(r2_local_zip_path)
                    except Exception as e: print(f"警告: 清理本地zip文件失败: {e}")
            else:
                print("错误: 创建本地 zip 文件失败，跳过 R2 上传。")
        elif not r2_configured:
            print("R2 未配置，跳过上传resize后的数据。")
        elif success_count == 0:
            print("本地resize失败（成功处理图像数为0），跳过R2上传。")
    # From here on, the resized images are the dataset root used by the rest of the notebook.
    image_dir = output_dir
else:
    print("未识别的非 Windows 环境，跳过图片resize步骤。")
    pass

## 创建数据加载器
使用PyTorch的Dataset和DataLoader类创建数据集和加载器，包括数据增强和训练/验证集的划分。

### 模拟高光的数据增强策略

In [None]:
from PIL import Image, ImageDraw
import random
import torchvision.transforms.functional as TF

class AddRandomHighlight:
    """
    torchvision-style transform that randomly paints filled circular highlights on a PIL image.

    Args:
        p (float): probability of applying the transform (0 to 1).
        max_highlights (int): upper bound on highlights per image; the actual
            count is drawn uniformly from 1..max_highlights.
        radius_range (tuple): ``(min_radius, max_radius)`` as positive integers.
        color (tuple): ``(R, G, B)`` fill colour, each in [0, 255] (white by default).
    """

    def __init__(self, p=0.5, max_highlights=3, radius_range=(5, 15), color=(255, 255, 255)):
        probability_ok = 0.0 <= p <= 1.0
        if not probability_ok:
            raise ValueError(f"概率 p 必须在 [0, 1] 范围内, 但得到 {p}")

        count_ok = isinstance(max_highlights, int) and max_highlights >= 1
        if not count_ok:
            raise ValueError(f"最大高光数 max_highlights 必须是 >= 1 的整数, 但得到 {max_highlights}")

        radius_ok = (
            isinstance(radius_range, tuple)
            and len(radius_range) == 2
            and all(isinstance(bound, int) for bound in radius_range)
            and 0 < radius_range[0] <= radius_range[1]
        )
        if not radius_ok:
            raise ValueError(f"半径范围 radius_range 必须是 (min, max) 形式的正整数元组，且 min <= max, 但得到 {radius_range}")

        color_ok = (
            isinstance(color, tuple)
            and len(color) == 3
            and all(0 <= c <= 255 for c in color)
        )
        if not color_ok:
            raise ValueError(f"颜色 color 必须是 (R, G, B) 形式的元组，且值在 [0, 255] 范围内, 但得到 {color}")

        self.p = p
        self.max_highlights = max_highlights
        self.radius_range = radius_range
        self.color = color

    def __call__(self, img):
        """
        Apply the transform to a PIL image.

        Args:
            img (PIL.Image.Image): input image. Highlights are drawn directly
                onto this object (no copy is made).

        Returns:
            PIL.Image.Image: the same image object, possibly with highlights added.
        """
        # Skip the transform with probability 1 - p.
        if not random.random() < self.p:
            return img

        count = random.randint(1, self.max_highlights)
        width, height = img.size
        canvas = ImageDraw.Draw(img)
        min_radius, max_radius = self.radius_range

        for _ in range(count):
            radius = random.randint(min_radius, max_radius)
            # Centres stay inside the image; circles may be clipped at the border.
            cx = random.randint(0, width - 1)
            cy = random.randint(0, height - 1)
            # A square bounding box makes ellipse() draw a circle.
            bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
            canvas.ellipse(bbox, fill=self.color)

        return img

    def __repr__(self):
        # Compact, constructor-like representation for debugging.
        name = self.__class__.__name__
        return f"{name}(p={self.p}, max_highlights={self.max_highlights}, radius_range={self.radius_range}, color={self.color})"

### 应用数据增强

In [None]:
# 数据变换
# Training-time augmentation pipeline
train_transform = transforms.Compose([
    transforms.Resize((int(TARGET_SIZE[0]*1.2), int(TARGET_SIZE[1]*1.2))), # upscale slightly first
    transforms.RandomCrop(TARGET_SIZE), # random crop back to the target size
    transforms.RandomHorizontalFlip(p=0.5), # random horizontal flip
    transforms.RandomRotation(degrees=20), # random rotation
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), # random colour jitter
    AddRandomHighlight(p=0.3, max_highlights=3, radius_range=(5, 12)), # synthetic highlights, to discourage the model from relying on specular highlights
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Deterministic pipeline for validation / test data
val_transform = transforms.Compose([
    transforms.Resize(TARGET_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Split into training and validation sets and build the matching data loaders
from sklearn.model_selection import train_test_split

# Read the label file
labels_df = pd.read_excel(label_file)

# Stratified 80/20 train/validation split
train_df, val_df = train_test_split(labels_df, test_size=0.2, random_state=420, stratify=labels_df['Pterygium'])

# Persist the split so each subset can be loaded through PterygiumDataset.
# On Kaggle the files go to the writable working directory instead of image_dir.
train_label_file = os.path.join(image_dir, "train_classification_label_train.xlsx")
val_label_file = os.path.join(image_dir, "train_classification_label_val.xlsx")
if os.path.exists("/kaggle/working"):
    train_label_file = os.path.join(kaggle_temp_path, "train_classification_label_train.xlsx")
    val_label_file = os.path.join(kaggle_temp_path, "train_classification_label_val.xlsx")
train_df.to_excel(train_label_file, index=False)
val_df.to_excel(val_label_file, index=False)

# Dataset objects for both subsets (each with its own transform)
train_dataset = PterygiumDataset(label_file=train_label_file, image_dir=image_dir, transform=train_transform)
val_dataset = PterygiumDataset(label_file=val_label_file, image_dir=image_dir, transform=val_transform)

# Keyword arguments shared by both loaders.
_on_windows = platform.system() == "Windows"
_loader_kwargs = {
    'batch_size': 64,
    'num_workers': num_workers,
    'pin_memory': not _on_windows,
}
# prefetch_factor is only valid with worker processes: DataLoader raises ValueError when
# it is passed together with num_workers == 0, so it is added only when workers are used.
if num_workers > 0:
    _loader_kwargs['prefetch_factor'] = 2 if _on_windows else 10

# Data loaders: shuffle only the training data
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

# 构建 ResNet 模型
使用 PyTorch（torchvision）的预训练 ResNet 系列模型（ResNet18/34/50/101/152），将最后的全连接层替换为 Dropout + 全连接层，以适应 3 个类别的分类任务。

In [None]:
from torchvision.models import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights
class ResNet18Classifier(nn.Module):
    """ResNet-18 loaded with IMAGENET1K_V1 weights; its ``fc`` layer is replaced by Dropout + Linear."""

    def __init__(self, num_classes=3, dropout_rate=0.5):
        super().__init__()
        backbone = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
        # New head: dropout followed by a linear layer with num_classes outputs.
        head = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(backbone.fc.in_features, num_classes),
        )
        backbone.fc = head
        self.resnet18 = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.resnet18(x)

class ResNet34Classifier(nn.Module):
    """ResNet-34 loaded with IMAGENET1K_V1 weights; its ``fc`` layer is replaced by Dropout + Linear."""

    def __init__(self, num_classes=3, dropout_rate=0.5):
        super().__init__()
        backbone = models.resnet34(weights=ResNet34_Weights.IMAGENET1K_V1)
        # New head: dropout followed by a linear layer with num_classes outputs.
        head = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(backbone.fc.in_features, num_classes),
        )
        backbone.fc = head
        self.resnet34 = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.resnet34(x)

class ResNet50Classifier(nn.Module):
    """ResNet-50 loaded with IMAGENET1K_V1 weights; its ``fc`` layer is replaced by Dropout + Linear."""

    def __init__(self, num_classes=3, dropout_rate=0.5):
        super().__init__()
        backbone = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # New head: dropout followed by a linear layer with num_classes outputs.
        head = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(backbone.fc.in_features, num_classes),
        )
        backbone.fc = head
        self.resnet50 = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.resnet50(x)
    
class ResNet101Classifier(nn.Module):
    """ResNet-101 loaded with IMAGENET1K_V1 weights; its ``fc`` layer is replaced by Dropout + Linear."""

    def __init__(self, num_classes=3, dropout_rate=0.5):
        super().__init__()
        backbone = models.resnet101(weights=ResNet101_Weights.IMAGENET1K_V1)
        # New head: dropout followed by a linear layer with num_classes outputs.
        head = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(backbone.fc.in_features, num_classes),
        )
        backbone.fc = head
        self.resnet101 = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.resnet101(x)

class ResNet152Classifier(nn.Module):
    """ResNet-152 loaded with IMAGENET1K_V1 weights; its ``fc`` layer is replaced by Dropout + Linear."""

    def __init__(self, num_classes=3, dropout_rate=0.5):
        super().__init__()
        backbone = models.resnet152(weights=ResNet152_Weights.IMAGENET1K_V1)
        # New head: dropout followed by a linear layer with num_classes outputs.
        head = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(backbone.fc.in_features, num_classes),
        )
        backbone.fc = head
        self.resnet152 = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.resnet152(x)


In [None]:
def get_feature_extractor(model_name, device, dropout_rate=0.5):
    """
    Build a CNN feature extractor by dropping the last child module of a ResNet wrapper.

    Args:
        model_name: one of 'ResNet18Classifier', 'ResNet34Classifier',
            'ResNet50Classifier', 'ResNet101Classifier', 'ResNet152Classifier'.
        device: device the extractor is moved to.
        dropout_rate: forwarded to the wrapper class constructor.

    Returns:
        nn.Sequential in eval mode, containing every child of the wrapped
        network except the last one, moved to ``device``.

    Raises:
        ValueError: if ``model_name`` is not supported.
    """
    # Lazy builders: the wrapper class is only touched for the requested name.
    builders = {
        'ResNet18Classifier': lambda: ResNet18Classifier(num_classes=3, dropout_rate=dropout_rate).resnet18,
        'ResNet34Classifier': lambda: ResNet34Classifier(num_classes=3, dropout_rate=dropout_rate).resnet34,
        'ResNet50Classifier': lambda: ResNet50Classifier(num_classes=3, dropout_rate=dropout_rate).resnet50,
        'ResNet101Classifier': lambda: ResNet101Classifier(num_classes=3, dropout_rate=dropout_rate).resnet101,
        'ResNet152Classifier': lambda: ResNet152Classifier(num_classes=3, dropout_rate=dropout_rate).resnet152,
    }
    build_backbone = builders.get(model_name)
    if build_backbone is None:
        raise ValueError(f"Unsupported model name for feature extraction: {model_name}")

    layers = list(build_backbone().children())
    extractor = nn.Sequential(*layers[:-1])  # everything except the final (fc) child

    # Eval mode keeps BatchNorm/Dropout behaviour deterministic during extraction.
    extractor.eval()
    return extractor.to(device)

def extract_features(model, data_loader, device):
    """
    Run a feature-extractor model over a data loader and collect features and labels.

    Args:
        model: Feature extractor (a network whose classification layer was removed).
        data_loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(all_features, all_labels)`` of NumPy arrays concatenated along axis 0.
        Each batch output is flattened to ``(batch, -1)`` before being collected.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    # Enable CUDA autocast only when the data actually lives on a CUDA device;
    # requesting it on a CPU-only run just triggers a warning and is disabled anyway.
    use_amp = torch.device(device).type == 'cuda'

    features_list = []
    labels_list = []

    with torch.no_grad():
        for inputs, targets in tqdm(data_loader, desc="Extracting Features", leave=False):
            inputs = inputs.to(device)

            # The model is expected to be a headless extractor; its output is flattened per sample
            with torch.amp.autocast('cuda', enabled=use_amp):
                outputs = model(inputs)
                outputs = outputs.view(outputs.size(0), -1)

            features_list.append(outputs.cpu().numpy())
            labels_list.append(targets.cpu().numpy())

    if not features_list:
        # np.concatenate would fail with a generic message; say what actually went wrong
        raise ValueError("extract_features received an empty data_loader; nothing to extract")

    all_features = np.concatenate(features_list, axis=0)
    all_labels = np.concatenate(labels_list, axis=0)

    return all_features, all_labels

# 模型保存和加载

In [None]:
import joblib

# 保存 Boosting 模型
def save_boosting_model(model, path):
    """Persist ``model`` to ``path`` with ``joblib.dump`` and print a confirmation."""
    joblib.dump(model, path)
    print(f"Boosting 模型已保存到 {path}")

# 加载 Boosting 模型
def load_boosting_model(path):
    """
    Load and return the object stored at ``path`` with ``joblib.load``.

    NOTE: joblib files are pickle-based; only load files produced by a trusted source.
    """
    restored = joblib.load(path)
    print(f"Boosting 模型已从 {path} 加载")
    return restored

# 保存 CNN 模型参数 (如果需要在预测时加载 CNN 特征提取器，例如在最终预测阶段)
def save_cnn_state_dict(model, path):
    """Write ``model.state_dict()`` to ``path`` via ``torch.save`` and print a confirmation."""
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"CNN 模型参数已保存到 {path}")

# 加载 CNN 模型参数
def load_cnn_state_dict(model, path, device):
    """
    Load the state dict stored at ``path`` into ``model`` and move the model to ``device``.

    The file is read with ``weights_only=True`` and mapped onto ``device``.
    Returns the model after the ``.to(device)`` call.
    """
    state = torch.load(path, map_location=device, weights_only=True)
    model.load_state_dict(state)
    model = model.to(device)
    print(f"CNN 模型参数已从 {path} 加载")
    return model

# Boosting 基线模型 K 折交叉验证
K 折交叉验证 (K=5)，在每折中训练 CNN 特征提取器，提取特征，然后训练 LightGBM 模型，并评估

In [None]:
# --- Per-fold metric accumulators (filled in by the K-Fold loop) ---
fold_val_accuracies = []
fold_val_macro_precision_scores = []
fold_val_macro_f1_scores = []

# --- Load the complete dataset once; it drives the stratified split ---
try:
    full_labels_df = pd.read_excel(label_file)
    all_original_labels = full_labels_df['Pterygium'].values
    # Base dataset without any transform attached
    base_full_dataset_no_transform = PterygiumDataset(label_file=label_file, image_dir=image_dir, transform=None)

    # The dataset length must agree with the number of labels read from the sheet
    assert len(base_full_dataset_no_transform) == len(all_original_labels), "Dataset size and label count mismatch!"
    print(f"成功加载完整数据集，共 {len(base_full_dataset_no_transform)} 张图像。")
except Exception as e:
    print(f"Error preparing full dataset for K-Fold: {e}")
    K = 0  # signal to the code below that cross-validation should not run

# --- Create the stratified K-Fold splitter (only when there is something to split) ---
if K <= 0:
    print("Skipping K-Fold Cross-Validation due to data preparation error or K=0.")
else:
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=base_seed)
    print(f"开始进行 {K}-Fold Cross-Validation...")

# --- Fold counter used by the K-Fold loop ---
current_fold_num = 0

# --- K-Fold loop ---
# Iterate the (train, val) index pairs produced by the stratified splitter.
# When dataset preparation failed, K was reset to 0 and neither `skf` nor the dataset
# objects exist, so fall back to an empty sequence instead of raising NameError.
if K > 0:
    fold_splits = skf.split(np.arange(len(base_full_dataset_no_transform)), all_original_labels)
else:
    fold_splits = []

for train_idx_fold, val_idx_fold in fold_splits:
    current_fold_num += 1
    fold_id_str = f"Fold {current_fold_num}/{K}"
    print(f"\n--- 开始 {fold_id_str} ---")

    # --- 1. Build this fold's datasets and DataLoaders ---
    # Index-based views on the untransformed dataset (not consumed further in this cell)
    train_subset_fold = Subset(base_full_dataset_no_transform, train_idx_fold)
    val_subset_fold = Subset(base_full_dataset_no_transform, val_idx_fold)

    # PterygiumDataset is constructed from a label file, so the rows belonging to this
    # fold are written to temporary label files and fresh dataset instances are created
    # from them with the appropriate transform.
    fold_train_df = full_labels_df.iloc[train_idx_fold].copy()
    fold_val_df = full_labels_df.iloc[val_idx_fold].copy()

    # Pick the directory for the temporary label files
    if 'google.colab' in sys.modules or os.path.exists("/kaggle/working"):
        temp_dir_for_folds = os.path.join(kaggle_temp_path, "fold_labels")
    else:  # local or any other environment
        temp_dir_for_folds = "./temp_fold_labels"

    os.makedirs(temp_dir_for_folds, exist_ok=True)

    fold_train_label_file = os.path.join(temp_dir_for_folds, f"fold_{current_fold_num}_train_labels.xlsx")
    fold_val_label_file = os.path.join(temp_dir_for_folds, f"fold_{current_fold_num}_val_labels.xlsx")

    fold_train_df.to_excel(fold_train_label_file, index=False)
    fold_val_df.to_excel(fold_val_label_file, index=False)

    train_dataset_fold = PterygiumDataset(label_file=fold_train_label_file, image_dir=image_dir, transform=train_transform)
    val_dataset_fold = PterygiumDataset(label_file=fold_val_label_file, image_dir=image_dir, transform=val_transform)
    # Same training rows, but with the eval transform: used only for feature extraction so
    # the features are deterministic, matching how the final-model cell extracts them.
    train_dataset_fold_eval = PterygiumDataset(label_file=fold_train_label_file, image_dir=image_dir, transform=val_transform)

    # Settings shared by every loader of this fold
    fold_loader_kwargs = dict(
        batch_size=64,
        num_workers=num_workers,
        prefetch_factor=2 if platform.system() == "Windows" else 10,
        pin_memory=False if platform.system() == "Windows" else True,
    )
    train_loader_fold = DataLoader(train_dataset_fold, shuffle=True, **fold_loader_kwargs)
    val_loader_fold = DataLoader(val_dataset_fold, shuffle=False, **fold_loader_kwargs)
    # Unshuffled: every CNN must see the samples in the same order, otherwise the
    # per-CNN feature matrices cannot be concatenated row by row.
    train_feature_loader_fold = DataLoader(train_dataset_fold_eval, shuffle=False, **fold_loader_kwargs)
    print(f"Fold {current_fold_num}: Train size={len(train_dataset_fold)}, Val size={len(val_dataset_fold)}")

    # --- 2. Fine-tune the CNN feature extractors on this fold's training split ---
    print(f"--- Fold {current_fold_num}: 微调 CNN 特征提取器 ---")

    fold_cnn_feature_extractors = {}

    for cnn_name in CNN_FEATURE_EXTRACTORS:
        print(f"  微调 {cnn_name}...")
        # Fresh CNN instance with its classification head (the head is needed for training)
        cnn_model_for_finetuning = globals()[cnn_name](num_classes=3).to(device)

        # Optimizer, LR schedule and loss for this fine-tuning run
        cnn_optimizer = optim.AdamW(cnn_model_for_finetuning.parameters(),
                                    lr=cnn_micro_train_params['lr'],
                                    weight_decay=cnn_micro_train_params['weight_decay'])
        cnn_scheduler = optim.lr_scheduler.CosineAnnealingLR(cnn_optimizer,
                                                            T_max=cnn_micro_train_params['num_epochs'],
                                                            eta_min=1e-6)
        cnn_criterion = nn.CrossEntropyLoss()

        # Plain training loop: fixed number of epochs, no early stopping
        start_time_cnn_finetune = time.time()
        scaler_cnn = torch.amp.GradScaler('cuda')  # AMP gradient scaler for CNN training

        for cnn_epoch in range(cnn_micro_train_params['num_epochs']):
            cnn_model_for_finetuning.train()
            train_loss_cnn = 0
            train_correct_cnn = 0
            train_total_cnn = 0

            cnn_train_loader_tqdm = tqdm(train_loader_fold, desc=f'  {cnn_name} Epoch {cnn_epoch+1}/{cnn_micro_train_params["num_epochs"]}', leave=False)

            for batch_idx, (inputs, targets) in enumerate(cnn_train_loader_tqdm):
                inputs, targets = inputs.to(device), targets.to(device)
                cnn_optimizer.zero_grad()
                with torch.amp.autocast('cuda'):
                    outputs = cnn_model_for_finetuning(inputs)
                    loss = cnn_criterion(outputs, targets)
                scaler_cnn.scale(loss).backward()
                scaler_cnn.step(cnn_optimizer)
                scaler_cnn.update()
                train_loss_cnn += loss.item()
                _, predicted = outputs.max(1)
                train_total_cnn += targets.size(0)
                train_correct_cnn += predicted.eq(targets).sum().item()

                cnn_train_loader_tqdm.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{100. * train_correct_cnn / train_total_cnn:.2f}%',
                    'lr': f'{cnn_optimizer.param_groups[0]["lr"]:.1e}'
                })
            if cnn_epoch%3==0: print(f"    Fold {current_fold_num} {cnn_name} Epoch {cnn_epoch+1} Final Train Acc: {100. * train_correct_cnn / train_total_cnn:.2f}%, Avg Loss: {train_loss_cnn / len(train_loader_fold):.4f}")
            cnn_scheduler.step()  # advance the CNN learning-rate schedule

        end_time_cnn_finetune = time.time()
        print(f"  微调 {cnn_name} 完成，耗时: {end_time_cnn_finetune - start_time_cnn_finetune:.2f} 秒")

        # --- Evaluate the fine-tuned CNN classifier directly on the validation fold ---
        print(f"  评估微调后的 {cnn_name} 在验证集上...")
        cnn_model_for_finetuning.eval()
        val_correct_cnn = 0
        val_total_cnn = 0
        all_val_labels_cnn = []
        all_val_preds_cnn = []

        with torch.no_grad():
            cnn_val_loader_tqdm = tqdm(val_loader_fold, desc=f'  Evaluating {cnn_name}', leave=False)
            for batch_idx, (inputs, targets) in enumerate(cnn_val_loader_tqdm):
                inputs, targets = inputs.to(device), targets.to(device)

                with torch.amp.autocast('cuda'):  # same AMP setting as during training
                    outputs = cnn_model_for_finetuning(inputs)

                _, predicted = outputs.max(1)
                val_total_cnn += targets.size(0)
                val_correct_cnn += predicted.eq(targets).sum().item()

                all_val_labels_cnn.extend(targets.cpu().numpy())
                all_val_preds_cnn.extend(predicted.cpu().numpy())

        # Metrics; zero_division=0 covers a class that receives no predictions in this fold
        cnn_val_accuracy = 100. * val_correct_cnn / val_total_cnn
        cnn_val_macro_precision = precision_score(all_val_labels_cnn, all_val_preds_cnn, average='macro', zero_division=0)
        cnn_val_macro_f1 = f1_score(all_val_labels_cnn, all_val_preds_cnn, average='macro', zero_division=0)

        print(f"    Fold {current_fold_num} {cnn_name} Validation Accuracy: {cnn_val_accuracy:.2f}%")
        print(f"    Fold {current_fold_num} {cnn_name} Validation Macro Precision: {cnn_val_macro_precision:.4f}")
        print(f"    Fold {current_fold_num} {cnn_name} Validation Macro F1: {cnn_val_macro_f1:.4f}")

        # Build the headless feature extractor directly from the fine-tuned network.
        # Loading the wrapper's state_dict into a fresh nn.Sequential with strict=False
        # does NOT work: the wrapper's keys are prefixed with the backbone attribute name
        # (e.g. "resnet50.conv1.weight") while a Sequential uses positional keys
        # ("0.weight"), so no key matches and the ImageNet weights would be kept silently.
        backbone_attr = cnn_name.replace('Classifier', '').lower()  # 'ResNet50Classifier' -> 'resnet50'
        finetuned_backbone = getattr(cnn_model_for_finetuning, backbone_attr)
        feature_extractor = nn.Sequential(*list(finetuned_backbone.children())[:-1]).to(device)

        # Feature extraction must run in eval mode
        feature_extractor.eval()

        fold_cnn_feature_extractors[cnn_name] = feature_extractor

    # --- 3. Extract features ---
    print(f"--- Fold {current_fold_num}: 提取特征 ---")

    train_features_list_fold = []
    val_features_list_fold = []

    # One feature matrix per fine-tuned CNN, all in the same (unshuffled) sample order
    for cnn_name, feature_extractor in fold_cnn_feature_extractors.items():
        print(f"  使用 {cnn_name} 提取训练集特征...")
        train_features, train_labels = extract_features(feature_extractor, train_feature_loader_fold, device)
        train_features_list_fold.append(train_features)

        print(f"  使用 {cnn_name} 提取验证集特征...")
        val_features, val_labels = extract_features(feature_extractor, val_loader_fold, device)
        val_features_list_fold.append(val_features)

    # Concatenate the per-CNN features column-wise and standardize them
    # (scaler statistics come from the training split only)
    scaler = StandardScaler()
    X_train_fold = np.concatenate(train_features_list_fold, axis=1)
    X_train_fold = scaler.fit_transform(X_train_fold)
    y_train_fold = train_labels  # identical for every CNN because the loader is unshuffled

    X_val_fold = np.concatenate(val_features_list_fold, axis=1)
    X_val_fold = scaler.transform(X_val_fold)
    y_val_fold = val_labels  # identical for every CNN because the loader is unshuffled

    print(f"拼接后训练特征形状: {X_train_fold.shape}")
    print(f"拼接后验证特征形状: {X_val_fold.shape}")

    # --- 4. Train the boosting model (LightGBM) ---
    print(f"--- Fold {current_fold_num}: 训练 LightGBM 模型 ---")

    lgbm_model = lgb.LGBMClassifier(**lgbm_params)

    # Early stopping monitors log loss on this fold's validation split
    eval_set = [(X_val_fold, y_val_fold)]

    start_time_lgbm_train = time.time()
    lgbm_model.fit(X_train_fold, y_train_fold,
                eval_set=eval_set,
                eval_metric='multi_logloss',
                callbacks=[lgb.early_stopping(50, verbose=True)])
    end_time_lgbm_train = time.time()
    print(f"LightGBM 训练完成，耗时: {end_time_lgbm_train - start_time_lgbm_train:.2f} 秒")
    print(f"LightGBM 在训练集上的最佳迭代次数: {lgbm_model.booster_.best_iteration}")

    # --- 5. Evaluate the boosting model on this fold's validation split ---
    print(f"--- Fold {current_fold_num}: 评估 LightGBM 模型 ---")

    y_pred_fold = lgbm_model.predict(X_val_fold)

    fold_accuracy = accuracy_score(y_val_fold, y_pred_fold)
    # zero_division=0 mirrors the CNN metrics above (a class may get no predictions)
    fold_macro_precision = precision_score(y_val_fold, y_pred_fold, average='macro', zero_division=0)
    fold_macro_f1 = f1_score(y_val_fold, y_pred_fold, average='macro', zero_division=0)

    print(f"Fold {current_fold_num} 验证准确率: {fold_accuracy:.4f}")
    print(f"Fold {current_fold_num} 验证Macro Precision: {fold_macro_precision:.4f}")
    print(f"Fold {current_fold_num} 验证Macro F1: {fold_macro_f1:.4f}")

    # Record this fold's metrics
    fold_val_accuracies.append(fold_accuracy)
    fold_val_macro_precision_scores.append(fold_macro_precision)
    fold_val_macro_f1_scores.append(fold_macro_f1)

    # Remove this fold's temporary label files
    if os.path.exists(fold_train_label_file): os.remove(fold_train_label_file)
    if os.path.exists(fold_val_label_file): os.remove(fold_val_label_file)

# --- 6. Summarize the cross-validation results once the loop has finished ---
if K > 0 and fold_val_accuracies:  # only when at least one fold produced metrics
    print("\n--- K-Fold Cross-Validation 结果分析 ---")

    # Mean and standard deviation (np.std defaults) of each metric across folds
    mean_accuracy, std_accuracy = np.mean(fold_val_accuracies), np.std(fold_val_accuracies)
    mean_precision, std_precision = np.mean(fold_val_macro_precision_scores), np.std(fold_val_macro_precision_scores)
    mean_f1, std_f1 = np.mean(fold_val_macro_f1_scores), np.std(fold_val_macro_f1_scores)

    print(f"平均验证准确率 (across {K} folds): {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"平均验证Macro Precision (across {K} folds): {mean_precision:.4f} ± {std_precision:.4f}")
    print(f"平均验证Macro F1 (across {K} folds): {mean_f1:.4f} ± {std_f1:.4f}")

    # Per-fold breakdown
    print("\n每折的验证准确率:")
    for i, acc in enumerate(fold_val_accuracies):
        print(f"  Fold {i+1}: {acc:.4f}")
    print("\n每折的验证Macro F1:")
    for i, f1 in enumerate(fold_val_macro_f1_scores):
        print(f"  Fold {i+1}: {f1:.4f}")

# --- Remove the temporary label directory, if the loop ever created one ---
if 'temp_dir_for_folds' in locals() and os.path.exists(temp_dir_for_folds):
    print(f"清理临时文件夹: {temp_dir_for_folds}")
    try:
        shutil.rmtree(temp_dir_for_folds)
    except Exception as e:
        # Best effort: a failed cleanup is reported but must not abort the notebook
        print(f"警告: 清理临时文件夹失败: {e}")

# 训练最终用于提交的模型
使用整个训练集训练 CNN 特征提取器，提取特征，然后训练最终的 LightGBM 模型。

In [None]:
print("\n--- 开始训练最终提交模型 ---")

# Loader used to train the CNNs: the original train_loader
# NOTE(review): assumes train_loader covers the whole training set — confirm where it is built
final_cnn_train_loader = train_loader
# Separate loader over the full training set with the eval transform, used only for
# feature extraction so the extracted features are deterministic.
full_train_dataset_eval_transform = PterygiumDataset(label_file=label_file, image_dir=image_dir, transform=val_transform)
full_train_loader_eval_transform = DataLoader(full_train_dataset_eval_transform,
                                            batch_size=64,
                                            shuffle=False,  # feature extraction needs a stable order
                                            num_workers=num_workers,
                                            prefetch_factor=train_loader.prefetch_factor,  # reuse train_loader's setting
                                            pin_memory=train_loader.pin_memory)


# --- 1. Train the CNN feature extractors on the full training set ---
print("--- 训练或加载 CNN 特征提取器 (在整个训练集上) ---")

final_cnn_feature_extractors = {}

for cnn_name in CNN_FEATURE_EXTRACTORS:
    print(f"  训练 {cnn_name} 在完整数据集上...")
    # Fresh CNN instance with its classification head
    cnn_model_for_final_training = globals()[cnn_name](num_classes=3).to(device)

    # Optimizer, LR schedule and loss (same hyper-parameters as the per-fold fine-tuning)
    cnn_optimizer_final = optim.AdamW(cnn_model_for_final_training.parameters(),
                                        lr=cnn_micro_train_params['lr'],
                                        weight_decay=cnn_micro_train_params['weight_decay'])
    cnn_scheduler_final = optim.lr_scheduler.CosineAnnealingLR(cnn_optimizer_final,
                                                                T_max=cnn_micro_train_params['num_epochs'],
                                                                eta_min=1e-6)
    cnn_criterion_final = nn.CrossEntropyLoss()

    # Plain training loop: fixed number of epochs, no early stopping
    start_time_cnn_final_train = time.time()
    scaler_cnn_final = torch.amp.GradScaler('cuda')  # AMP gradient scaler for CNN training

    for cnn_epoch in range(cnn_micro_train_params['num_epochs']):
        cnn_model_for_final_training.train()
        train_loss_cnn_final = 0
        train_correct_cnn_final = 0
        train_total_cnn_final = 0

        cnn_train_loader_final_tqdm = tqdm(final_cnn_train_loader, desc=f'  {cnn_name} Final Epoch {cnn_epoch+1}/{cnn_micro_train_params["num_epochs"]}', leave=False)

        for batch_idx, (inputs, targets) in enumerate(cnn_train_loader_final_tqdm):
            inputs, targets = inputs.to(device), targets.to(device)
            cnn_optimizer_final.zero_grad()
            with torch.amp.autocast('cuda'):
                outputs = cnn_model_for_final_training(inputs)
                loss = cnn_criterion_final(outputs, targets)
            scaler_cnn_final.scale(loss).backward()
            scaler_cnn_final.step(cnn_optimizer_final)
            scaler_cnn_final.update()
            train_loss_cnn_final += loss.item()
            _, predicted = outputs.max(1)
            train_total_cnn_final += targets.size(0)
            train_correct_cnn_final += predicted.eq(targets).sum().item()

            cnn_train_loader_final_tqdm.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{100. * train_correct_cnn_final / train_total_cnn_final:.2f}%',
                'lr': f'{cnn_optimizer_final.param_groups[0]["lr"]:.1e}'
            })
        # Average loss over THIS loop's batches (the K-Fold cell's accumulator and loader
        # are unrelated here and may not even exist).
        if cnn_epoch%3==0: print(f"    Final Training {cnn_name} Epoch {cnn_epoch+1} Final Train Acc: {100. * train_correct_cnn_final / train_total_cnn_final:.2f}%, Avg Loss: {train_loss_cnn_final / len(final_cnn_train_loader):.4f}")
        cnn_scheduler_final.step()  # advance the CNN learning-rate schedule

    end_time_cnn_final_train = time.time()
    print(f"  训练 {cnn_name} 在完整数据集上完成，耗时: {end_time_cnn_final_train - start_time_cnn_final_train:.2f} 秒")

    # Build the headless feature extractor directly from the trained network.
    # Loading the wrapper's state_dict into a fresh nn.Sequential with strict=False does
    # NOT transfer anything: the wrapper's keys carry the backbone attribute prefix
    # (e.g. "resnet50.conv1.weight") while a Sequential uses positional keys ("0.weight").
    backbone_attr = cnn_name.replace('Classifier', '').lower()  # 'ResNet50Classifier' -> 'resnet50'
    trained_backbone = getattr(cnn_model_for_final_training, backbone_attr)
    feature_extractor = nn.Sequential(*list(trained_backbone.children())[:-1]).to(device)
    feature_extractor.eval()
    final_cnn_feature_extractors[cnn_name] = feature_extractor
    # Saved with Sequential (positional) keys, the layout get_feature_extractor() produces
    save_cnn_state_dict(feature_extractor, f'./final_{cnn_name}_feature_extractor.pth')

# --- 2. Extract features from the full training set ---
print("--- 提取整个训练集的特征 ---")

final_train_features_list = []

# One feature matrix per trained CNN, all read through the unshuffled eval-transform loader
for cnn_name, feature_extractor in final_cnn_feature_extractors.items():
    print(f"  使用 {cnn_name} 提取完整训练集特征...")
    features, labels = extract_features(feature_extractor, full_train_loader_eval_transform, device)
    final_train_features_list.append(features)

X_final_train = np.concatenate(final_train_features_list, axis=1)
final_scaler = StandardScaler()
X_final_train = final_scaler.fit_transform(X_final_train)
y_final_train = labels  # identical for every CNN because the loader is unshuffled

print(f"拼接后完整训练集特征形状: {X_final_train.shape}")

# --- 3. Train the final boosting model ---
print("--- 训练最终 LightGBM 模型 ---")

# Same LightGBM parameters as in the K-Fold run
final_lgbm_model = lgb.LGBMClassifier(**lgbm_params)

# Fit on all training data; no eval_set and no early stopping here
start_time_final_lgbm_train = time.time()
final_lgbm_model.fit(X_final_train, y_final_train)
end_time_final_lgbm_train = time.time()
print(f"最终 LightGBM 模型训练完成，耗时: {end_time_final_lgbm_train - start_time_final_lgbm_train:.2f} 秒")

# --- 4. Persist the final model ---
final_model_save_path = "./final_boosted_classifier.joblib"
save_boosting_model(final_lgbm_model, final_model_save_path)

# The scaler is saved too, since prediction must apply the same standardization
final_scaler_save_path = "./final_feature_scaler.joblib"
joblib.dump(final_scaler, final_scaler_save_path)
print(f"特征Scaler已保存到 {final_scaler_save_path}")

print("\n--- 最终提交模型训练完成 ---")
# -------------

# 最终模型预测
加载训练好的 Boosted 模型，并使用它对新图像进行预测。

In [None]:
# Restore the persisted LightGBM classifier and the feature scaler saved next to it
final_lgbm_model_loaded = load_boosting_model("./final_boosted_classifier.joblib")
final_scaler_loaded = joblib.load("./final_feature_scaler.joblib")
print("Scaler已从 ./final_feature_scaler.joblib 加载")

final_cnn_feature_extractors_loaded = {}
print("加载微调后的 CNN 特征提取器...")
for cnn_name in CNN_FEATURE_EXTRACTORS:
    print(f"  加载 {cnn_name} 特征提取器...")
    # Headless extractor instance created by get_feature_extractor
    feature_extractor = get_feature_extractor(cnn_name, device)
    final_cnn_state_dict_path = f'./final_{cnn_name}_feature_extractor.pth'
    if not os.path.exists(final_cnn_state_dict_path):
        # No saved weights: the extractor keeps whatever get_feature_extractor initialized it with
        print(f"警告: 未找到 {cnn_name} 的微调权重文件 {final_cnn_state_dict_path}。将使用预训练权重。")
    else:
        load_cnn_state_dict(feature_extractor, final_cnn_state_dict_path, device)
        print(f"  {cnn_name} 权重加载成功。")
    # Registered in both cases
    final_cnn_feature_extractors_loaded[cnn_name] = feature_extractor


def predict_image_boosted(cnn_feature_extractors, boosted_model, scaler, image_path, transform, device):
    """
    Predict the class of a single image with the CNN-features + boosted-model pipeline.

    Args:
        cnn_feature_extractors: Mapping of name -> feature-extractor module.
        boosted_model: Fitted model exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``.
        image_path: Path of the image to classify.
        transform: Callable applied to the loaded RGB image; its result is batched with ``unsqueeze(0)``.
        device: Device the image tensor is moved to.

    Returns:
        The first element of ``boosted_model.predict(...)``, or ``None`` when the image
        could not be loaded or transformed (the problem is printed).
    """
    # Step 1: load the image and turn it into a single-sample batch
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        batch = transform(rgb_image).unsqueeze(0).to(device)
    except FileNotFoundError:
        print(f"错误: 预测图像文件未找到 {image_path}")
        return None
    except Exception as e:
        print(f"错误处理预测图像 {image_path}: {e}")
        return None

    # Step 2: one flattened feature row per CNN, gathered without gradients
    per_model_features = []
    with torch.no_grad():
        for extractor in cnn_feature_extractors.values():
            extractor.eval()  # make sure the CNN is in eval mode
            with torch.amp.autocast('cuda'):  # AMP inference
                embedding = extractor(batch)
                flattened = embedding.view(embedding.size(0), -1)
                per_model_features.append(flattened.cpu().numpy())
    stacked_features = np.concatenate(per_model_features, axis=1)

    # Step 3: standardize, then Step 4: classify (predict returns an array; take its only entry)
    standardized = scaler.transform(stacked_features)
    return boosted_model.predict(standardized)[0]

# --- 预测示例 (使用你的 val_image_dir 结构) ---
def test_final_model_on_val_dir(cnn_feature_extractors, boosted_model, scaler, input_dir, transform, device):
    """
    Classify every ``*.png`` file in ``input_dir`` and write the results to an Excel sheet.

    Each file is passed to ``predict_image_boosted``; the file stem is converted to ``int``
    and stored as ``Image`` next to the predicted ``Pterygium`` value. Files that raise
    during processing are reported and left out. Rows are sorted by ``Image`` and saved
    to ``./Classification_Results.xlsx``.
    """
    png_files = glob.glob(os.path.join(input_dir, "*.png"))
    rows = []

    progress = tqdm(png_files, desc="Predicting images with Boosted Model", leave=True)
    for img_path in progress:
        try:
            label = predict_image_boosted(cnn_feature_extractors, boosted_model, scaler, img_path, transform, device)
            stem, _ext = os.path.splitext(os.path.basename(img_path))
            rows.append({"Image": int(stem), "Pterygium": label})
        except Exception as e:
            tqdm.write(f"处理预测图像 {img_path} 时出错: {e}")

    # Order the rows by numeric image id before building the table
    rows.sort(key=lambda row: row["Image"])
    df = pd.DataFrame(rows, columns=["Image", "Pterygium"])

    # Persist the table
    output_result_path = "./Classification_Results.xlsx"
    df.to_excel(output_result_path, index=False)
    print(f"\n分类结果已保存到 {output_result_path}")

# Run the prediction pass over the validation image directory
test_final_model_on_val_dir(
    cnn_feature_extractors=final_cnn_feature_extractors_loaded,
    boosted_model=final_lgbm_model_loaded,
    scaler=final_scaler_loaded,
    input_dir=val_image_dir,
    transform=val_transform,
    device=device,
)