# Qwen2-7B-Instruct 微调
微调的基本方法参照 [Qwen2-7B-Instruct Lora 微调](https://blog.csdn.net/xiaobing259/article/details/140594017)


In [1]:
import os
import re
import pdb
import json
import torch
import optuna
from datasets import Dataset
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig, EarlyStoppingCallback
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
# from data_processing.utils import extract_from_output

!nvidia-smi

GPU_ID = 4
os.environ["CUDA_VISIBLE_DEVICES"] = f"{GPU_ID}"  # 只使用一张显卡
device = torch.device(f"cuda:{GPU_ID}" if torch.cuda.is_available() else "cpu")  # 指定使用的设备
device_map = {"": f"cuda:{GPU_ID}"}
print(device)

# 多 GPU
# os.environ["CUDA_VISIBLE_DEVICES"] = '0,1'
# device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
# device_map = {: ''}

# 任务代号
TASK_ID = '5000'
# torch.cuda.device_count()
# # 指定使用第1号GPU
# torch.cuda.set_device(0)
# # 检查设置是否生效
# print(torch.cuda.current_device())  # 应该输出1
# print(torch.cuda.get_device_name(0))  # 输出第1号GPU的名称

  from .autonotebook import tqdm as notebook_tqdm


Fri Aug  9 11:51:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:4F:00.0 Off |                    0 |
| N/A   68C    P0             287W / 300W |  48472MiB / 81920MiB |     88%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:52:00.0 Off |  

In [2]:
# # 读取原数据集
# df1 = pd.read_json('/data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/raw/round1_train_data_instruction.json')
# /data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/train_data_raw_new_inst.json  # 包含原数据集噪声
# 读取新增数据集
# df2 = pd.read_json('/data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/new/train_data_new_inst.json')  # 人工减弱噪声后的数据集

# # # 读取生成数据集
# df3 = pd.read_json('/data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/gpt/gpt4_4590_806_inst.json')

# # 合并两个 DataFrame
# merged_df = pd.concat([df2, df3], ignore_index=True)
# print(len(merged_df))

# 数据集（指令集）路径
data_path = '/data/disk4/home/chenrui/ai-project/logical_reasoning/LLaMA-Factory/data/logical_problems_large.json'

# # 合并好的指令集
merged_df = pd.read_json(data_path)

# 检查DataFrame的长度
num_rows = len(merged_df)
print(f"The DataFrame has {num_rows} rows.")

# 如果DataFrame的行数大于或等于5000，随机抽取5000行
if num_rows >= 5000:
    random_seed = 42
    sample_df = merged_df.sample(n=5000, random_state=random_seed)
else:
    print("The DataFrame has fewer than 5000 rows. Returning the entire DataFrame.")
    sample_df = merged_df

# 划分训练集和验证集
train_data, val_data = train_test_split(sample_df, test_size=0.1, random_state=42)  # 80% 训练，20% 验证
# print(val_data)

# 将验证集保存为JSON格式到tmp文件夹
tmp_dir = '/data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/tmp'
os.makedirs(tmp_dir, exist_ok=True)  # 创建tmp文件夹（如果不存在的话）
val_data.to_json(os.path.join(tmp_dir, 'validation_data.json'), orient='records', force_ascii=False)

# 将数据集转换为Dataset对象
train_ds = Dataset.from_pandas(train_data)
val_ds = Dataset.from_pandas(val_data)

train_ds

The DataFrame has 35149 rows.


Dataset({
    features: ['instruction', 'input', 'output', '__index_level_0__'],
    num_rows: 4500
})

In [3]:
# 加载分词器和模型
model_path = '/data/disk4/home/chenrui/.cache/modelscope/hub/qwen/Qwen2-Math-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device_map, torch_dtype=torch.bfloat16)
model.enable_input_require_grads() # 开启梯度检查点时，要执行该方法
model.dtype # 查看精度
print(next(model.parameters()).device)
# model

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]

cuda:4





In [4]:
def process_func(example):
    MAX_LENGTH = 1800    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你是一个逻辑推理专家，擅长解决逻辑推理问题。<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
# 处理训练集和验证集
train_ds = train_ds.map(process_func, remove_columns=train_ds.column_names)
val_ds = val_ds.map(process_func, remove_columns=val_ds.column_names)
num_train_samples = len(train_ds)
train_ds, tokenizer.decode(train_ds[0]['input_ids']), tokenizer.decode(list(filter(lambda x: x != -100, train_ds[1]["labels"])))

Map: 100%|██████████| 4500/4500 [00:11<00:00, 407.47 examples/s]
Map: 100%|██████████| 500/500 [00:01<00:00, 332.69 examples/s]


(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 4500
 }),
 '<|im_start|>system\n你是一个逻辑推理专家，擅长解决逻辑推理问题。<|im_end|>\n<|im_start|>user\n\n你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。请逐步分析问题，最终只输出答案对应的选项字母，如"A"。题目如下：\n\n### 题目:\n**逻辑推理测试题：**\n\n以下是关于几种不同的职业人员的假设及其特性的推理系统。请根据已知条件和推理规则回答问题。\n\n已知条件：\n1. 穿白大褂的是医生。\n2. 持有教鞭的是教师。\n3. 穿制服并且戴帽子的是警察。\n4. 如果一个人在图书馆工作，那么他是图书管理员。\n5. 假设“John”穿着白大褂。\n6. 假设“Emma”持有教鞭。\n7. 假设“Oliver”穿着制服并且戴着帽子。\n8. 假设“Alice”在图书馆工作。\n\n请回答以下选择题：\n\n### 问题:\n选择题 2：\nEmma的职业是什么？\nA. 医生\nB. 教师\nC. 警察\nD. 图书管理员\n<|im_end|>\n<|im_start|>assistant\nB<|endoftext|>',
 'D<|endoftext|>')

In [5]:
# 设置 lora 参数
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=16,  # Lora alaph，具体作用参见 Lora 原理
    lora_dropout=0.3  # Dropout 比例
)
model = get_peft_model(model, config)

# 查看可训练参数
model.print_trainable_parameters()

# 多 GPU 分布式训练
# if torch.cuda.device_count() > 1:
#     model = torch.nn.DataParallel(model)
#     print(torch.cuda.device_count())

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


In [6]:
# 自定义 Trainer 类
class CustomTrainer(Trainer):
    def training_step(self, model, inputs):
        loss = super().training_step(model, inputs)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # 梯度裁剪
        return loss

# 设置训练轮次和批量大小
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
num_train_epochs = 4
gradient_accumulation_steps = 4  # 2 次前向传播才进行一次反向传播和参数更新，有利于防止梯度爆炸

# 查看训练总步数
total_training_steps = (num_train_samples // per_device_train_batch_size // gradient_accumulation_steps) * num_train_epochs
print(f"Total training steps: {total_training_steps}")

Total training steps: 1124


In [7]:
# 设置训练超参数
args = TrainingArguments(
    output_dir=f"./checkpoints/Qwen2_math_7B_instruct_lora_{TASK_ID}",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    logging_steps=10,
    save_steps=200,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    gradient_accumulation_steps=gradient_accumulation_steps,
    evaluation_strategy="steps",  # 设置评估策略
    eval_steps=100,
    weight_decay=0.01,
    # fp16=True,  # 使用混合精度训练，防止梯度爆炸
    load_best_model_at_end=True,  # 在训练结束时加载最佳模型
    metric_for_best_model="eval_loss",  # 监控的指标
    greater_is_better=False,  # 因为我们希望损失最小化
    logging_dir='./logs',  # TensorBoard 日志目录
    report_to="tensorboard",  # 启用 TensorBoard
    # dataloader_num_workers=4,
    # distributed_data_parallel=True,
)



In [8]:
# 训练 可用 CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,  # 添加验证集
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],  # 设置早停法，耐心为 3
)
torch.backends.cuda.enable_mem_efficient_sdp(False)
trainer.train()

[2024-08-09 11:52:02,955] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/data/disk4/home/chenrui/miniconda3/envs/datawhale/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
100,0.3877,0.387831
200,0.3231,0.304432
300,0.1765,0.328981
400,0.207,0.295358
500,0.1338,0.302061
600,0.0679,0.363511
700,0.0316,0.432238
800,0.0376,0.461664
900,0.0257,0.468061


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=900, training_loss=0.5953734336958991, metrics={'train_runtime': 3635.0512, 'train_samples_per_second': 4.952, 'train_steps_per_second': 0.309, 'total_flos': 1.8455826108716237e+17, 'train_loss': 0.5953734336958991, 'epoch': 3.2})

In [9]:
trainer.save_model(f"./output/Qwen2_math_7B_instruct_lora_{TASK_ID}_final")
!tensorboard --logdir=./logs --host=172.20.2.1



TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

TensorBoard 2.17.0 at http://172.20.2.1:6006/ (Press CTRL+C to quit)
^C


# 用微调后的模型对验证集进行推理
此时可以重启 Jupyter Kernel，释放显存

In [None]:
import os
import re
import pdb
import json
import torch
import shutil
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from collections import Counter

os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # 只使用第一张显卡
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")  # 指定使用的设备

!nvidia-smi

In [None]:
# 加载 lora 权重
model_path = '/data/disk4/home/chenrui/ai-project/Qwen2-7B-Instruct'
lora_path = '/data/disk4/home/chenrui/ai-project/logical_reasoning/output/Qwen2_7B_instruct_lora_1_final'  # 这里改称你的 lora 输出对应 checkpoint 地址

# 加载模型
model = AutoModelForCausalLM.from_pretrained(model_path, device_map={"":"cuda:2"}, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
model = PeftModel.from_pretrained(model, model_id=lora_path).to(device)  # 将 lora 权重加进原模型
print(f"Model is on device: {next(model.parameters()).device}")

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# 读取JSONL文件
data = []
input_file = '/data/disk4/home/chenrui/ai-project/logical_reasoning/data/input/tmp/validation_data.json'
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)  # 读取 JSON 数据
print(f"The size of validation data: {len(data)}")

In [None]:
# 批量推理并计算正确率
correct_count = 0
total_count = 0

for idx, item in enumerate(data):
    instruction = item['instruction']
    input_text = item['input']
    expected_output = item['output']
    # match = re.findall(r'[A-G]', output)
    # if match:
    #     expected_output = match[-1]

    prompt = instruction + input_text
    messages = [
        {"role": "user", "content": prompt}
    ]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True).to(device)

    gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
    
    # 存储多次调用的输出
    outputs_list = []
    
    # 三次调用模型
    for i in range(3):
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            match = re.findall(r'[A-G]', output)
            if match:
                output = match[-1]
            outputs_list.append(output)
            print(f"Model output {i}: {output}")

    # 进行多路投票
    vote_counts = Counter(outputs_list)
    final_output = vote_counts.most_common(1)[0][0]  # 选择出现次数最多的结果
    print(f"Final voted output: {final_output}, Expected output: {expected_output}")

    # 比较预测答案和正确答案
    if final_output == expected_output:
        correct_count += 1
    total_count += 1

    # 每处理 n 条数据输出一次实时的正确率
    if (total_count) % 50 == 0:
        accuracy = correct_count / total_count
        print(f"Processed {total_count} items, Current Accuracy: {accuracy:.2%}")

# 最终正确率
accuracy = correct_count / total_count
print(f"Final Accuracy: {accuracy:.2%}")

In [None]:
# 清除生成的临时文件
# if os.path.exists(tmp_dir):
#     shutil.rmtree(tmp_dir)
#     os.makedirs(tmp_dir)
#     print(f'The folder {tmp_dir} has been cleared.')
# else:
#     print(f'The folder {tmp_dir} does not exist.')

In [None]:
# # 模型合并存储
# new_model_directory = "./merged_model_an"
# merged_model = model.merge_and_unload()

# # 将权重保存为safetensors格式的权重, 且每个权重文件最大不超过2GB(2048MB)
# merged_model.save_pretrained(new_model_directory, max_shard_size="2048MB", safe_serialization=True)