In [None]:
# 导入必要的库
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import bitsandbytes as bnb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Seed the torch and numpy random generators so that the data split and the
# sampling done later in the notebook are repeatable.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Dataset location. The HAM10000_DIR environment variable overrides the
# original machine-specific path, which is kept as the default so existing
# setups keep working unchanged.
data_dir = os.environ.get("HAM10000_DIR", "/Users/mac/Desktop/lunwen/code/HAM10000_dataset")
metadata_path = os.path.join(data_dir, "HAM10000_metadata.csv")
images_dir = os.path.join(data_dir, "images")

# Fail early, with a hint about the override, instead of deep inside pandas
# or in the middle of the first training batch.
if not os.path.isfile(metadata_path):
    raise FileNotFoundError(
        f"Metadata file not found: {metadata_path!r}. "
        "Set the HAM10000_DIR environment variable to the dataset directory."
    )
if not os.path.isdir(images_dir):
    raise FileNotFoundError(
        f"Image directory not found: {images_dir!r}. "
        "Set the HAM10000_DIR environment variable to the dataset directory."
    )

# Load the metadata table (one row per image).
df = pd.read_csv(metadata_path)
print(f"数据集大小: {len(df)}")
print(df.head())

# Class distribution of the diagnosis column.
print(df['dx'].value_counts())

# Maps each diagnosis code in the `dx` column to the Chinese class name that
# is used as the answer text for the model.
class_dict = {
    'akiec': '光化性角化病和上皮内癌',
    'bcc': '基底细胞癌',
    'bkl': '良性角化病',
    'df': '皮肤纤维瘤',
    'mel': '黑色素瘤',
    'nv': '黑素细胞痣',
    'vasc': '血管病变'
}

# Every diagnosis code (including a missing value) must have a class name;
# otherwise the lookup would only fail later, on whichever sample hits it.
unmapped_codes = sorted(str(code) for code in set(df['dx'].unique()) - set(class_dict))
if unmapped_codes:
    raise ValueError(f"Diagnosis codes without an entry in class_dict: {unmapped_codes}")



In [None]:
# Custom dataset that pairs every HAM10000 image with a fixed question prompt.
class HAM10000Dataset(Dataset):
    """Torch dataset yielding tokenized prompt / answer pairs for HAM10000.

    Each item is a dict with:
        input_ids:      token ids of the question prompt only.
        attention_mask: attention mask matching ``input_ids``.
        images:         first element of ``tokenizer.process_images(image)``.
        labels:         token ids of the class name only.

    ``labels`` is therefore NOT aligned with ``input_ids`` and its length
    varies between classes; a consumer that computes a causal-LM loss or
    batches items has to combine / pad the two itself.

    Args:
        dataframe: table with at least the columns ``image_id`` and ``dx``.
        img_dir: directory containing ``<image_id>.jpg`` files.
        tokenizer: tokenizer used for the prompt, the answer and the image.
        max_length: truncation length applied to prompt and answer separately.
    """

    # The question is identical for every sample, so it is defined once.
    # NOTE(review): the prompt contains a bare ``<img>`` tag. Qwen-VL's
    # published usage references images as ``<img>path</img>`` inside the
    # text — confirm this tag is handled as intended.
    PROMPT = "<img>这是什么类型的皮肤病变？请从以下选项中选择：光化性角化病和上皮内癌、基底细胞癌、良性角化病、皮肤纤维瘤、黑色素瘤、黑素细胞痣、血管病变。"

    def __init__(self, dataframe, img_dir, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows (images) in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load image ``idx`` and return the tokenized prompt, image and answer.

        Raises:
            KeyError: if the row's ``dx`` code has no entry in ``class_dict``.
        """
        row = self.dataframe.iloc[idx]
        img_id = row['image_id']
        img_path = os.path.join(self.img_dir, f"{img_id}.jpg")

        # Close the file handle deterministically; ``convert`` returns an
        # independent in-memory image that stays valid after the file closes.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        # Answer text: the class name for this row's diagnosis code.
        dx_code = row['dx']
        if dx_code not in class_dict:
            raise KeyError(f"Unknown diagnosis code {dx_code!r} for image {img_id!r}")
        target = class_dict[dx_code]

        # Tokenize the prompt.
        inputs = self.tokenizer(
            self.PROMPT,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        )

        # Attach the image.
        # NOTE(review): ``process_images`` is not part of the standard
        # Hugging Face tokenizer API — confirm the tokenizer loaded with
        # trust_remote_code actually provides it.
        inputs["images"] = self.tokenizer.process_images(image)

        # Tokenize the answer separately from the prompt.
        target_encoding = self.tokenizer(
            target,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_length
        )

        # ``return_tensors="pt"`` adds a leading batch dimension of 1; the
        # ``[0]`` indexing drops it so items can be batched by a collator.
        return {
            "input_ids": inputs["input_ids"][0],
            "attention_mask": inputs["attention_mask"][0],
            "images": inputs["images"][0],
            "labels": target_encoding["input_ids"][0]
        }

# Load the Qwen-VL model and tokenizer.
model_name = "Qwen/Qwen-VL-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 8-bit loading (bitsandbytes) and fp16 mixed precision both require a CUDA
# device. Request them only when one is available so the notebook does not
# crash at load time / in TrainingArguments on CPU-only machines.
use_cuda = torch.cuda.is_available()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    load_in_8bit=use_cuda
)

# Prepare the model for LoRA fine-tuning.
model = prepare_model_for_kbit_training(model)

# Choose the LoRA target modules. The q/k/v/o_proj names are kept when the
# model actually has them; otherwise fall back to the names used by
# first-generation Qwen models (and by the Qwen-VL fine-tuning script), since
# PEFT raises when no target module is found.
# NOTE(review): confirm the fallback names against model.named_modules().
default_lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
leaf_module_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
if all(target in leaf_module_names for target in default_lora_targets):
    lora_target_modules = default_lora_targets
else:
    lora_target_modules = ["c_attn", "attn.c_proj", "w1", "w2"]

# LoRA configuration.
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,  # LoRA alpha (scaling) parameter
    target_modules=lora_target_modules,  # modules that receive adapters
    lora_dropout=0.05,  # dropout applied inside the adapters
    bias="none",  # do not train bias terms
    task_type="CAUSAL_LM"  # task type
)

# Wrap the model with the LoRA adapters.
model = get_peft_model(model, lora_config)
print(f"模型参数总数: {model.num_parameters()}")
print(f"可训练参数数量: {model.num_parameters(True)}")

# Stratified train / validation split on the diagnosis code.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['dx'])
print(f"训练集大小: {len(train_df)}, 验证集大小: {len(val_df)}")

# Build the datasets.
train_dataset = HAM10000Dataset(train_df, images_dir, tokenizer)
val_dataset = HAM10000Dataset(val_df, images_dir, tokenizer)

# Token ids used by the collator below. Some tokenizers define neither a pad
# nor an eos token, hence the chain of fallbacks (`eod_id` is looked up
# defensively because not every tokenizer has it).
_eos_token_id = getattr(tokenizer, "eos_token_id", None)
if _eos_token_id is None:
    _eos_token_id = getattr(tokenizer, "eod_id", None)
_pad_token_id = getattr(tokenizer, "pad_token_id", None)
if _pad_token_id is None:
    _pad_token_id = _eos_token_id if _eos_token_id is not None else 0


def collate_causal_lm(features):
    """Turn prompt-only / answer-only samples into a padded causal-LM batch.

    The dataset returns the prompt in ``input_ids`` and the answer in
    ``labels``. A causal LM needs one sequence ``prompt + answer`` with labels
    of the same length, so the two are concatenated here; prompt and padding
    positions are set to -100 so that only answer tokens contribute to the loss.

    Args:
        features: list of dataset items (dicts of 1-D tensors).

    Returns:
        dict with right-padded ``input_ids``, ``attention_mask`` and ``labels``
        and, when present on every item, stacked ``images``.
    """
    sequences, label_rows, mask_rows = [], [], []
    for feature in features:
        prompt_ids = feature["input_ids"]
        answer_ids = feature["labels"]
        # Terminate the answer so the model learns where to stop, unless the
        # tokenizer already appended the end token.
        if _eos_token_id is not None and (
            answer_ids.numel() == 0 or answer_ids[-1].item() != _eos_token_id
        ):
            answer_ids = torch.cat([answer_ids, answer_ids.new_tensor([_eos_token_id])])
        sequences.append(torch.cat([prompt_ids, answer_ids]))
        label_rows.append(torch.cat([torch.full_like(prompt_ids, -100), answer_ids]))
        mask_rows.append(torch.cat([feature["attention_mask"], torch.ones_like(answer_ids)]))

    pad = torch.nn.utils.rnn.pad_sequence
    batch = {
        "input_ids": pad(sequences, batch_first=True, padding_value=_pad_token_id),
        "attention_mask": pad(mask_rows, batch_first=True, padding_value=0),
        "labels": pad(label_rows, batch_first=True, padding_value=-100),
    }
    # Depending on `remove_unused_columns`, Trainer may strip keys the model's
    # forward() does not accept before this function runs, so `images` is
    # treated as optional.
    # NOTE(review): Qwen-VL's published usage passes images as
    # `<img>path</img>` inside the text; confirm the model's forward() accepts
    # an `images` tensor at all.
    if all("images" in feature for feature in features):
        batch["images"] = torch.stack([feature["images"] for feature in features])
    return batch


# Training hyper-parameters.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; keep whichever matches the installed version.
training_args = TrainingArguments(
    output_dir="/Users/mac/Desktop/lunwen/code/qwen_vl_ham10000_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    save_total_limit=3,
    push_to_hub=False,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=use_cuda,
)

# Trainer with the custom collator; the default collator cannot batch the
# variable-length answer tensors returned by the dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_causal_lm,
)

# Start training.
trainer.train()


In [None]:
# Persist the fine-tuned model under the "final" sub-directory of the run.
final_model_dir = "/Users/mac/Desktop/lunwen/code/qwen_vl_ham10000_lora/final"
trainer.save_model(final_model_dir)



In [None]:
# Model evaluation.

# Prediction recorded when the generated text cannot be mapped to any class.
_UNKNOWN_CLASS = "unknown"


def _match_class_key(text):
    """Return the first class key whose class name occurs in ``text``, else None."""
    for class_key, class_name in class_dict.items():
        if class_name in text:
            return class_key
    return None


def _closest_class_key(text):
    """Return the class key with the highest position-wise character overlap.

    Same heuristic as before: count characters equal at the same index and
    divide by the longer length. Returns None when nothing overlaps at all.
    """
    best_key, best_score = None, 0
    for class_key, class_name in class_dict.items():
        overlap = sum(1 for a, b in zip(text, class_name) if a == b)
        score = overlap / max(len(text), len(class_name))
        if score > best_score:
            best_key, best_score = class_key, score
    return best_key


def _collate_eval_batch(samples):
    """Stack the model inputs and keep the variable-length labels as a list.

    The answer tensors differ in length between classes, so the default
    DataLoader collation (which stacks every key) cannot batch them.
    Prompts are stacked directly and therefore must all have the same length.
    """
    return {
        "input_ids": torch.stack([s["input_ids"] for s in samples]),
        "attention_mask": torch.stack([s["attention_mask"] for s in samples]),
        "images": torch.stack([s["images"] for s in samples]),
        "labels": [s["labels"] for s in samples],
    }


def evaluate_model(model, dataset, tokenizer):
    """Generate an answer for every sample and score it against the label.

    Args:
        model: model exposing ``eval()`` and ``generate(...)``.
        dataset: dataset whose items carry ``input_ids``, ``attention_mask``,
            ``images`` and ``labels`` tensors.
        tokenizer: tokenizer used to decode generated ids and label ids.

    Returns:
        (accuracy, report): accuracy as returned by ``accuracy_score`` and the
        text report from ``classification_report`` over the keys of
        ``class_dict``.

    Raises:
        ValueError: if a decoded ground-truth label matches no class name.
    """
    model.eval()
    predictions = []
    true_labels = []

    dataloader = DataLoader(dataset, batch_size=4, collate_fn=_collate_eval_batch)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="评估中"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["images"].to(device)

            # Greedy generation.
            # NOTE(review): confirm the model's generate() accepts `images`.
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                images=images,
                max_new_tokens=50,
                do_sample=False
            )

            # generate() returns prompt + continuation. The prompt lists every
            # class name, so decoding it as well would make the substring match
            # below always hit the first class. Keep only the new tokens.
            prompt_length = input_ids.shape[1]
            pred_texts = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

            # Decode the ground-truth answers.
            true_texts = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Map generated text to a class key: exact substring first, then
            # the overlap heuristic, then the explicit "unknown" sentinel so
            # that no None ever reaches the metrics.
            for pred_text in pred_texts:
                pred_text = pred_text.strip()
                pred_key = _match_class_key(pred_text)
                if pred_key is None:
                    pred_key = _closest_class_key(pred_text)
                predictions.append(pred_key if pred_key is not None else _UNKNOWN_CLASS)

            # Map ground-truth text to a class key. Skipping an unmatched label
            # would silently misalign the two lists, so fail loudly instead.
            for true_text in true_texts:
                true_key = _match_class_key(true_text)
                if true_key is None:
                    raise ValueError(f"Ground-truth text {true_text!r} matches no class name")
                true_labels.append(true_key)

    # Pass the label set explicitly so the report rows stay tied to their
    # names even when a class is missing from this evaluation set or when
    # "unknown" predictions occur.
    class_keys = list(class_dict.keys())
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(
        true_labels,
        predictions,
        labels=class_keys,
        target_names=[class_dict[key] for key in class_keys],
        zero_division=0
    )

    return accuracy, report

# Score the model on the validation split and print the metrics.
val_metrics = evaluate_model(model, val_dataset, tokenizer)
accuracy, report = val_metrics
print(f"验证集准确率: {accuracy:.4f}")
print("分类报告:", report, sep="\n")

# Visualize a few predictions.
def visualize_predictions(
    model,
    dataset,
    tokenizer,
    num_samples=5,
    save_path="/Users/mac/Desktop/lunwen/code/qwen_vl_ham10000_predictions.png",
):
    """Plot randomly chosen samples next to the model's generated answer.

    One row per sample: the image titled with the true class name on the left,
    the matched class name and the full generated text on the right. The
    figure is written to ``save_path`` and then shown.

    Args:
        model: model exposing ``eval()`` and ``generate(...)``.
        dataset: dataset with ``dataframe`` and ``img_dir`` attributes whose
            items carry ``input_ids``, ``attention_mask`` and ``images``.
        tokenizer: tokenizer used to decode the generated ids.
        num_samples: number of samples to draw; capped at ``len(dataset)``.
        save_path: file the figure is saved to.
    """
    model.eval()

    # Sampling without replacement raises when asked for more items than
    # exist, so cap the request; with nothing to draw there is nothing to plot.
    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        return
    indices = np.random.choice(len(dataset), num_samples, replace=False)

    # squeeze=False keeps `axes` two-dimensional even for a single sample.
    fig, axes = plt.subplots(num_samples, 2, figsize=(15, 10), squeeze=False)

    for (image_ax, text_ax), idx in zip(axes, indices):
        sample = dataset[idx]

        # Re-open the original image for display.
        row = dataset.dataframe.iloc[idx]
        img_id = row['image_id']
        img_path = os.path.join(dataset.img_dir, f"{img_id}.jpg")
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        # Ground-truth class name.
        true_label = row['dx']
        true_label_name = class_dict[true_label]

        # Model prediction for this single sample.
        with torch.no_grad():
            input_ids = sample["input_ids"].unsqueeze(0).to(device)
            attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
            images = sample["images"].unsqueeze(0).to(device)

            # NOTE(review): confirm the model's generate() accepts `images`.
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                images=images,
                max_new_tokens=50,
                do_sample=False
            )

            # Drop the echoed prompt: it lists every class name, so matching
            # against it would always report the first class.
            generated_ids = outputs[0][input_ids.shape[1]:]
            pred_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

            # Extract the class name from the generated text.
            pred_label_name = "未知"
            for class_key, class_name in class_dict.items():
                if class_name in pred_text:
                    pred_label_name = class_name
                    break

        # Left panel: the image with its true class.
        image_ax.imshow(image)
        image_ax.set_title(f"真实: {true_label_name}")
        image_ax.axis('off')

        # Right panel: predicted class and the complete answer.
        text_ax.text(0.1, 0.5, f"预测: {pred_label_name}\n\n完整回答:\n{pred_text}", fontsize=10)
        text_ax.axis('off')

    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()

# Render the sample predictions for the validation split.
visualize_predictions(model=model, dataset=val_dataset, tokenizer=tokenizer)

# Final status line for the notebook run.
print("训练和评估完成！")