In [None]:
import json
import os
from dataclasses import dataclass
import torch
from datasets import Dataset
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)

from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

In [12]:
# Path configuration.
# Every location can be overridden with an environment variable so the notebook
# runs on another machine without editing this cell; when the variable is not
# set, the original hard-coded value is used unchanged.
#   YAOLAO_MODEL_NAME  - base model directory / hub id
#   YAOLAO_JSON        - training conversations JSON
#                        (e.g. /root/yaolao/train_data_filtered.json on the Linux box)
#   YAOLAO_OUTPUT_DIR  - where checkpoints and the LoRA adapter are written
MODEL_NAME = os.environ.get(
    "YAOLAO_MODEL_NAME",
    "/model/HuggingFace/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
)
YAOLAO_JSON = os.environ.get(
    "YAOLAO_JSON",
    r"E:\论文\train_data\train_data_prompted2.json",
)
OUTPUT_DIR = os.environ.get(
    "YAOLAO_OUTPUT_DIR",
    "lora_DeepSeek-R1-Distill-Qwen-14B_yaolao",
)

In [None]:
# ---- Training hyper-parameters ----

# Optimisation schedule
NUM_EPOCHS = 3                  # number of training epochs
LEARNING_RATE = 2e-4            # learning rate
WEIGHT_DECAY = 0.0              # weight decay

# Batching
PER_DEVICE_BATCH_SIZE = 2       # batch size per device (depends on GPU memory)
GRAD_ACCUM_STEPS = 8            # gradient accumulation steps
MAX_LENGTH = 1024               # maximum input length

# Logging / checkpointing cadence
LOGGING_STEPS = 50              # log every N steps
SAVE_STEPS = 500                # save a checkpoint every N steps

# ---- LoRA configuration ----
LORA_R = 16                     # rank
LORA_ALPHA = 32                 # alpha (scaling factor)
LORA_DROPOUT = 0.05             # dropout rate

In [13]:
def find_all_linear_names(model):
    """Return the leaf names of all ``torch.nn.Linear`` sub-modules of ``model``.

    The qualified name of every module that is an instance of
    ``torch.nn.Linear`` (subclasses included) is reduced to its last dotted
    component, duplicates are merged, and ``lm_head`` is removed.

    Args:
        model: object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.

    Returns:
        list[str]: unique leaf names in alphabetical order. Sorting makes the
        result reproducible across runs; iterating a set of strings is not.
    """
    # Modules that should not be targeted
    excluded = {"lm_head"}
    leaf_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }
    return sorted(leaf_names - excluded)

In [14]:
def load_data(json_path: str):
    """Load and return the parsed content of a JSON file.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding, which on a Chinese Windows installation would be a legacy code
    page and would mis-decode (or fail on) UTF-8 text. ``utf-8-sig`` is used
    so that a leading byte-order mark, if present, is ignored; files without
    a BOM are read exactly as plain UTF-8.

    Args:
        json_path: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the content is not valid JSON.
    """
    with open(json_path, "r", encoding="utf-8-sig") as f:
        return json.load(f)

In [15]:
def build_examples_from_conversations_single(conversations_list:list[dict]) -> list[dict]:
    """Build single-turn training samples from a list of conversation records.

    Each record is a dict whose ``"conversations"`` entry is a list of
    ``{"role": ..., "content": ...}`` messages. Only records whose first
    message has role ``system`` are used; every other record is skipped.
    Each ``user`` message directly followed by an ``assistant`` message yields
    one sample whose prompt contains the system message and that single user
    message (no earlier turns).

    Records that are not dicts, or that have a missing or empty
    ``"conversations"`` list, are skipped instead of raising. A message that
    does not start a user/assistant pair is skipped on its own, so one stray
    message does not shift the pairing of all the following turns.

    Args:
        conversations_list: list of conversation records.

    Returns:
        list of ``{"input_text": prompt, "target_text": assistant_reply}``.
    """
    examples = []
    for record in conversations_list:
        if not isinstance(record, dict):
            continue
        conv = record.get("conversations") or []
        # Single-turn samples always carry a system prompt, so a conversation
        # without a leading system message contributes nothing.
        if not conv or conv[0].get("role") != "system":
            continue
        system_content = conv[0].get("content", "")

        idx = 1
        while idx + 1 < len(conv):
            current, following = conv[idx], conv[idx + 1]
            if current.get("role") == "user" and following.get("role") == "assistant":
                user_content = current.get("content", "")
                prompt = (
                    f"<|system|>:\n{system_content}\n"
                    f"<|user|>:\n{user_content}\n<|assistant|>:\n"
                )
                examples.append({
                    "input_text": prompt,
                    "target_text": following.get("content", ""),
                })
                idx += 2
            else:
                # Not the start of a pair: advance by one message to re-sync
                idx += 1

    return examples

In [16]:
def build_examples_from_conversations_multiple(conversations:list[dict]) -> list[dict]:
    """Build multi-turn training samples from one conversation.

    ``conversations`` is a list of ``{"role": ..., "content": ...}`` messages.
    An optional leading ``system`` message becomes the prompt header. Every
    ``user`` message directly followed by an ``assistant`` message yields one
    sample:

    * ``input_text``  - system header (if any), all previously accepted
      user/assistant turns, then the current user message and an open
      ``<|assistant|>`` tag;
    * ``target_text`` - the assistant reply of that turn.

    Messages that do not start such a pair (a user without a reply, a lone
    assistant, or any other / missing role) are skipped one at a time, so the
    loop always makes progress.

    Args:
        conversations: ordered list of message dicts.

    Returns:
        list of ``{"input_text": ..., "target_text": ...}`` dicts, one per
        completed user/assistant turn, in conversation order.
    """
    examples = []
    total = len(conversations)
    idx = 0

    # `context` holds everything that precedes the current user message:
    # the optional system header plus all turns accepted so far.
    context = ""
    if total > 0 and conversations[0].get("role") == "system":
        system_content = conversations[0].get("content", "")
        if system_content:
            context = f"<|system|>:\n{system_content}\n"
        idx = 1

    while idx < total:
        current = conversations[idx]
        starts_pair = (
            current.get("role") == "user"
            and idx + 1 < total
            and conversations[idx + 1].get("role") == "assistant"
        )
        if not starts_pair:
            # Unpaired user, stray assistant or unknown role: skip exactly one
            # message. Without this, an unexpected role would never advance idx.
            idx += 1
            continue

        user_content = current.get("content", "")
        assistant_content = conversations[idx + 1].get("content", "")

        prompt = f"{context}<|user|>:\n{user_content}\n<|assistant|>:\n"
        examples.append({
            "input_text": prompt,
            "target_text": assistant_content,
        })

        # The completed turn becomes history for the following samples
        context = f"{prompt}{assistant_content}\n"
        idx += 2

    return examples

In [17]:
def prepare_data(conversations, single_turn=False):
    """Turn loaded conversation data into training samples.

    Args:
        conversations: either one conversation record (a dict with a
            ``"conversations"`` message list) or a list of such records.
            Any other type yields an empty result.
        single_turn: ``True`` builds single-turn samples via
            ``build_examples_from_conversations_single``; ``False`` (default)
            builds multi-turn samples via
            ``build_examples_from_conversations_multiple``.

    Returns:
        list of ``{"input_text": ..., "target_text": ...}`` dicts.
    """
    # Normalise both accepted input shapes to a list of records
    if isinstance(conversations, dict):
        records = [conversations]
    elif isinstance(conversations, list):
        records = conversations
    else:
        return []

    # Keep only well-formed records with a non-empty message list; the same
    # filter applies to both modes so neither builder sees malformed input.
    records = [
        record for record in records
        if isinstance(record, dict) and record.get("conversations")
    ]

    if single_turn:
        # The single-turn builder expects the list of records, not the
        # message list of one record.
        return build_examples_from_conversations_single(records)

    examples_all = []
    for record in records:
        examples_all.extend(
            build_examples_from_conversations_multiple(record["conversations"])
        )
    return examples_all

In [20]:
import json
def main():
    """Entry point: run the whole LoRA fine-tuning workflow.

    Steps, in order: load the conversation JSON, build multi-turn and
    single-turn samples, load the tokenizer and the 8-bit base model, attach
    LoRA adapters to the discovered linear layers, tokenize with the prompt
    tokens masked out of the loss, train with ``Trainer`` and save the adapter
    weights to ``OUTPUT_DIR``.
    """
    # Load the conversation data
    conversations = load_data(YAOLAO_JSON)
    print("数据集加载完成")

    # Build training samples: all multi-turn samples plus single-turn samples,
    # the latter capped at half the number of multi-turn samples
    examples_multiple = prepare_data(conversations)
    examples_single = prepare_data(conversations, single_turn=True)
    examples_all = examples_multiple + examples_single[:len(examples_multiple)//2]
    print("训练样本构造完成")
    print(f"构造了 {len(examples_all)} 个训练样本示例")

    # Load the tokenizer; fall back to EOS as the padding token if none is set
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.padding_side = "right"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("分词器加载完成")

    # Load the base model
    # NOTE(review): newer transformers releases prefer
    # `quantization_config=BitsAndBytesConfig(load_in_8bit=True)` over the bare
    # `load_in_8bit` keyword - confirm against the installed version.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,                     # allow custom model code
        device_map="auto",                          # automatic device placement
        load_in_8bit=True,                          # 8-bit weights
        torch_dtype=torch.float16                   # half precision
    )
    print("模型加载完成")

    # Discover the names of all linear layers to use as LoRA targets
    TARGET_MODULES = find_all_linear_names(model)
    print(f"找到的线性层模块: {TARGET_MODULES}")

    # Prepare the model for k-bit / 8-bit fine-tuning
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
    )
    model = get_peft_model(model, peft_config)

    # Report trainable vs. total parameter counts
    trainable_params = 0
    total_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count
    print(f"可训练参数: {trainable_params} / 总参数: {total_params}")

    def tokenize_and_build_labels(example):
        """Tokenize one sample and build labels that cover only the reply.

        ``input_ids`` is the prompt token ids followed by the target token
        ids and an EOS token, truncated to ``MAX_LENGTH``. ``labels`` has
        ``-100`` for every prompt position so that only the assistant reply
        (and its EOS) contributes to the loss.
        """
        prompt_ids = tokenizer(example["input_text"], add_special_tokens=False).input_ids
        target_ids = tokenizer(example["target_text"], add_special_tokens=False).input_ids

        # Terminate the reply with EOS so the model is trained to stop
        # generating; without it no end-of-sequence token is ever supervised.
        if tokenizer.eos_token_id is not None:
            target_ids = target_ids + [tokenizer.eos_token_id]

        # Concatenate, then truncate both sequences identically
        input_ids = (prompt_ids + target_ids)[:MAX_LENGTH]
        labels = ([-100] * len(prompt_ids) + target_ids)[:MAX_LENGTH]

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids)
        }

    # Build the tokenized dataset
    ds = Dataset.from_list(examples_all)
    ds_tokenized = ds.map(
        tokenize_and_build_labels,
        remove_columns=ds.column_names,
        num_proc=4
    )

    # A prompt of MAX_LENGTH tokens or more truncates the whole reply away,
    # leaving every label at -100; such rows carry no training signal.
    rows_before = len(ds_tokenized)
    ds_tokenized = ds_tokenized.filter(
        lambda row: any(label != -100 for label in row["labels"])
    )
    rows_dropped = rows_before - len(ds_tokenized)
    if rows_dropped:
        print(f"已丢弃 {rows_dropped} 个回复被完全截断的样本")

    # Pad batches with the data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True,
        pad_to_multiple_of=8  # pad lengths to a multiple of 8 for GPU efficiency
    )

    # Training configuration
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,                               # output directory
        # Batch size
        per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,   # batch size per GPU
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,        # gradient accumulation steps
        # Learning rate / regularisation
        warmup_ratio=0.03,                                   # warm up over the first 3% of steps
        learning_rate=LEARNING_RATE,                         # initial learning rate
        weight_decay=WEIGHT_DECAY,                           # was defined in the config but never applied
        # Schedule
        num_train_epochs=NUM_EPOCHS,                         # total number of epochs
        # Mixed precision
        fp16=True,                                           # FP16 mixed-precision training
        # Logging and checkpointing
        logging_steps=LOGGING_STEPS,                         # log every N steps
        save_strategy="steps",                               # checkpoint by step count
        save_steps=SAVE_STEPS,                               # checkpoint every N steps
        save_total_limit=3,                                  # keep at most 3 checkpoints
        # Column handling
        remove_unused_columns=False,                         # keep dataset columns as they are
        report_to="none",                                    # no external reporting (e.g. WandB)
        # Optimizer
        optim="paged_adamw_8bit",                            # paged 8-bit AdamW
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_tokenized,
        data_collator=data_collator,
    )

    # Train
    trainer.train()

    # Save the LoRA weights
    model.save_pretrained(OUTPUT_DIR)
    print(f"训练完成并将 LoRA 权重保存到 {OUTPUT_DIR}")

    # Release memory: drop the references, collect garbage first so the
    # tensors are actually freed, then return cached CUDA blocks.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

数据集加载完成
训练样本构造完成
构造了 1432 个训练样本示例


NameError: name 'AutoTokenizer' is not defined