In [None]:
# !git clone https://huggingface.co/Qwen/Qwen2.5-3B-Instruct

Cloning into 'Qwen2.5-3B-Instruct'...
Filtering content: 100% (2/2)
Filtering content: 100% (2/2), 5.74 GiB | 17.35 MiB/s
Filtering content: 100% (2/2), 5.74 GiB | 7.80 MiB/s, done.


In [1]:
import json

def convert_json_file(input_file, output_file):
    """Convert a JSON-lines multiple-choice file into instruction-tuning records.

    Every non-empty input line is parsed as one JSON object. The keys read are
    ``question``, ``exp``, ``cop``, ``opa``..``opd``, ``subject_name``,
    ``topic_name`` and ``choice_type``; missing keys default to ``""``
    (``0`` for ``cop``). ``cop`` values 1-4 select options A-D.

    One JSON object per record is written to ``output_file`` with the keys
    ``instruction``, ``input`` and ``output``.

    Args:
        input_file: Path of the UTF-8 JSON-lines source file.
        output_file: Path of the UTF-8 JSON-lines file to (over)write.

    Returns:
        None. Progress and errors are reported with ``print``; file-level
        errors are caught and printed rather than raised.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
            for line in infile:
                stripped = line.strip()

                # Blank lines carry no record; skip them without reporting.
                if not stripped:
                    continue

                try:
                    item = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"Skip invalid line: {stripped}")
                    continue

                # A line can be valid JSON without being an object (list,
                # string, number). Calling .get on it would raise and abort
                # the whole conversion, so skip it like any other bad line.
                if not isinstance(item, dict):
                    print(f"Skip invalid line: {stripped}")
                    continue

                # Extract the raw fields.
                question = item.get("question", "")
                exp = item.get("exp", "")
                cop = item.get("cop", 0)
                opa = item.get("opa", "")
                opb = item.get("opb", "")
                opc = item.get("opc", "")
                opd = item.get("opd", "")
                subject_name = item.get("subject_name", "")
                topic_name = item.get("topic_name", "")
                choice_type = item.get("choice_type", "")

                # Resolve the correct option; both stay empty when cop is not 1-4.
                correct_option_letter = ""
                correct_option = ""
                for number, letter, text in ((1, "A.", opa), (2, "B.", opb), (3, "C.", opc), (4, "D.", opd)):
                    if cop == number:
                        correct_option_letter = letter
                        correct_option = text
                        break

                # Without an explanation, fall back to the correct option text.
                if not exp:
                    exp = correct_option

                # Prefix the explanation with the answer.
                updated_exp = f"The correct answer is: {correct_option_letter} {correct_option}. {exp}"

                # Build the instruction-tuning record.
                converted_item = {
                    "instruction": f"Answer the following {choice_type}-choice questions which From the subject of {subject_name}, questions about the {topic_name}, provide the correct answers and explanations",
                    "input": f"{question}\nA. {opa}\nB. {opb}\nC. {opc}\nD. {opd}",
                    "output": updated_exp
                }

                # Write one record per line.
                json.dump(converted_item, outfile, ensure_ascii=False)
                outfile.write('\n')

        print(f"Completed！Save as {output_file}")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # Convert the raw training file into the JSON-lines file used for fine-tuning.
    input_path, output_path = "train.json", "converted_train.jsonl"
    convert_json_file(input_path, output_path)

Completed！Save as converted_train.jsonl


In [None]:
# !pip install swanlab
import getpass
import json
import os

import numpy as np
import pandas as pd
import swanlab
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)

# ---------------------------- Initialise SwanLab ----------------------------
# Credentials must not live in the notebook: read the key from the
# SWANLAB_API_KEY environment variable and prompt for it only when unset.
swanlab_api_key = os.environ.get("SWANLAB_API_KEY") or getpass.getpass("SwanLab API key: ")
swanlab.login(api_key=swanlab_api_key, save=True)

run = swanlab.init(
    project="Qwen2.5-LoRA-Law",
    experiment_name="3b-2",
    config={
        # Experiment metadata
        "model": "https://modelscope.cn/models/Qwen/Qwen2.5-3B-Instruct",
        "dataset": "https://drive.google.com/uc?export=download&id=15VkJdq5eyWIkfb_aoD3oS8i4tScbHYky",
        "github": "https://github.com/PeaceChoy/Medical-Questions-Answering-via-Fine-tuned-Qwen2.5-3B-Mode",
        # NOTE(review): recorded for reference only; process_func uses each
        # record's own `instruction` as the system message.
        "system_prompt": "You are a medical expert. Please provide professional answers based on the users' questions.",
        # LoRA settings (keep in sync with LoraConfig below)
        "lora_rank": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        # Training settings (keep in sync with TrainingArguments below)
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "learning_rate": 1e-4,
        "num_train_epochs": 2,
        "evaluation_strategy": "steps",
        "eval_steps": 60,
    },
)

# ---------------------------- 数据处理函数 ----------------------------
def process_func(example):
    """Tokenize one instruction/input/output record for causal-LM fine-tuning.

    The prompt is the chat-formatted system (``instruction``) and user
    (``input``) turns followed by the assistant header; the target is
    ``output`` followed by the tokenizer's pad token id. Prompt positions are
    masked with -100 in ``labels``. All three sequences are cut to at most
    384 tokens.

    Uses the module-level ``tokenizer``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists.
    """
    MAX_LENGTH = 384

    prompt_text = (
        f"<|im_start|>system\n{example['instruction']}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    prompt_enc = tokenizer(prompt_text, add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]
    end_id = tokenizer.pad_token_id  # appended after the answer as the final token

    features = {
        "input_ids": prompt_ids + answer_ids + [end_id],
        "attention_mask": prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1],
        # Only the answer (and the final token) contribute to the loss.
        "labels": [-100] * len(prompt_ids) + answer_ids + [end_id],
    }

    # Slicing is a no-op for sequences already within the limit.
    return {name: seq[:MAX_LENGTH] for name, seq in features.items()}

# ---------------------------- 预测函数 ----------------------------
def predict(messages, model, tokenizer):
    """Generate the assistant reply for a chat ``messages`` list.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        model: Causal LM exposing ``generate``, ``device``, ``training``,
            ``eval()`` and ``train()``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.

    Returns:
        The decoded text of the newly generated tokens (prompt stripped).
    """
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # This is called from training callbacks while the model is in train mode.
    # Generate in eval mode so dropout is off, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        # Pass the whole encoding so the attention mask reaches generate().
        generated_ids = model.generate(**model_inputs, max_new_tokens=512)
    finally:
        if was_training:
            model.train()

    # Drop the prompt tokens, keeping only the continuation.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# ---------------------------- 验证损失计算函数 ----------------------------
def compute_validation_loss(model, val_dataloader, device):
    """Return the sample-weighted average loss of ``model`` over ``val_dataloader``.

    Each batch is a dict of tensors that is moved to ``device`` and passed to
    the model as keyword arguments; the model output must expose ``.loss``.
    Batch losses are weighted by the number of rows in ``input_ids``.

    The model is evaluated in eval mode under ``torch.no_grad()`` and its
    previous train/eval mode is restored afterwards, even if a batch fails.

    Returns:
        The average loss as a float, or ``nan`` when the dataloader yields
        no samples.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_samples = 0

    try:
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                batch_size = len(batch["input_ids"])
                loss = model(**batch).loss

                total_loss += loss.item() * batch_size
                total_samples += batch_size
    finally:
        # Put the model back in the mode the caller had it in.
        model.train(was_training)

    # An empty dataloader has no defined average; avoid dividing by zero.
    if total_samples == 0:
        return float("nan")
    return total_loss / total_samples

# ---------------------------- 自定义回调 ----------------------------
class PredictionCallback(TrainerCallback):
    """Log sample generations to SwanLab at training start and after each epoch.

    Args:
        test_df: DataFrame with ``instruction``, ``input`` and ``output`` columns.
        model: Model passed to ``predict``.
        tokenizer: Tokenizer passed to ``predict``.
    """

    def __init__(self, test_df, model, tokenizer):
        self.test_df = test_df
        self.model = model
        self.tokenizer = tokenizer

    def _generate(self, row):
        """Run ``predict`` with the row's instruction as system and input as user turn."""
        messages = [
            {"role": "system", "content": row["instruction"]},
            {"role": "user", "content": row["input"]},
        ]
        return predict(messages, self.model, self.tokenizer)

    def on_train_begin(self, args, state, control, **kwargs):
        """Print and log predictions for the first three test rows at step 0."""
        print("\n训练开始，初始预测：")
        test_text_list = []
        for _, row in self.test_df[:3].iterrows():
            response = self._generate(row)
            result_text = f"【Q】{row['input']}\n【LLM】{response}\n"
            test_text_list.append(swanlab.Text(result_text, caption=response))
            print(result_text)
        swanlab.log({"Prediction": test_text_list}, step=0)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Log predictions for every test row; print only the first one."""
        # state.epoch is a float; round instead of truncating so a value that
        # lands just below the boundary is not reported as the previous epoch
        # (which would also reuse step 0 from on_train_begin).
        epoch = round(state.epoch)
        print(f"\nEpoch {epoch} 结束，开始预测：")
        test_text_list = []
        # iterrows() yields index labels, which are not 0-based when test_df
        # is a slice of a larger frame, so track the position explicitly.
        for position, (_, row) in enumerate(self.test_df.iterrows()):
            response = self._generate(row)
            result_text = f"【Q】{row['input']}\n【LLM】{response}\n【GT】{row['output']}"
            test_text_list.append(swanlab.Text(result_text, caption=response))
            if position == 0:
                print(result_text)  # print only the first sample
        swanlab.log({"Prediction": test_text_list}, step=epoch)

class ValidationCallback(TrainerCallback):
    """Compute and log the validation loss every ``eval_steps`` global steps."""

    def __init__(self, val_dataloader, model, eval_steps=60):
        self.val_dataloader, self.model, self.eval_steps = val_dataloader, model, eval_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Evaluate on the validation loader when the global step is a multiple of ``eval_steps``."""
        if state.global_step % self.eval_steps != 0:
            return

        val_loss = compute_validation_loss(self.model, self.val_dataloader, self.model.device)
        print(f"\nStep {state.global_step}: Validation Loss = {val_loss:.4f}")
        swanlab.log({"val_loss": val_loss}, step=state.global_step)

class SwanLabCallback(TrainerCallback):
    """Forward the Trainer's loss, learning rate and gradient norm to SwanLab."""

    # Keys copied from the Trainer's log dict when present.
    _TRACKED_KEYS = ("loss", "learning_rate", "grad_norm")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log the tracked metrics found in ``logs`` at the current global step."""
        if not logs:
            return

        # Not every log event carries these keys; send only real values
        # instead of logging None for the missing ones.
        metrics = {key: logs[key] for key in self._TRACKED_KEYS if logs.get(key) is not None}
        if metrics:
            swanlab.log(metrics, step=state.global_step)
        

if __name__ == "__main__":
    # Load the tokenizer and base model from the local checkpoint directory.
    model_dir = "./Qwen2.5-3B-Instruct/"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
    model.enable_input_require_grads()

    # Split the converted records by row position.
    # NOTE(review): row 5000 falls in none of the three slices — confirm intended.
    all_data = pd.read_json("./converted_train.jsonl", lines=True)
    train_df = all_data[:5000]
    test_df = all_data[5001:5006]
    val_df = all_data[5006:5021]  # 15 rows used as the validation set

    def _to_token_dataset(frame):
        """Tokenize every row of ``frame`` with process_func, dropping the raw columns."""
        return Dataset.from_pandas(frame).map(process_func, remove_columns=frame.columns.tolist())

    train_dataset = _to_token_dataset(train_df)
    val_dataset = _to_token_dataset(val_df)

    # The same padding collator serves the Trainer and the validation loader.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
    val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=data_collator)

    # LoRA adapters on the attention and MLP projection layers.
    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=lora_targets,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        inference_mode=False,
    )
    peft_model = get_peft_model(model, peft_config)

    # Training hyperparameters.
    training_args = TrainingArguments(
        output_dir="./new_output/Qwen2.5-3b",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        logging_steps=10,
        num_train_epochs=2,
        save_steps=100,
        learning_rate=1e-4,
        gradient_checkpointing=True,
        report_to="none",  # built-in reporters off; logging goes through the callbacks
        logging_dir="./logs",
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,  # warm up over the first 10% of steps
    )

    # Callbacks: sample predictions, training metrics, periodic validation loss.
    training_callbacks = [
        PredictionCallback(test_df, peft_model, tokenizer),
        SwanLabCallback(),
        ValidationCallback(val_dataloader, peft_model, eval_steps=60),
    ]

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
        callbacks=training_callbacks,
    )

    # Train, then close the SwanLab run.
    trainer.train()
    swanlab.finish()