## 导库

In [12]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch

## 读取数据

In [4]:
# Read the instruction-tuning records from JSON and wrap them in a Hugging Face Dataset.
df = pd.read_json('data/data.json')
ds = Dataset.from_pandas(df)
# Sanity check: number of samples on the first line, the first record on the second.
print(len(ds), ds[0], sep="\n")

3000
{'instruction': '保持健康的三个提示。', 'input': '', 'output': '以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。'}


## 处理数据

In [5]:
# Model download: https://huggingface.co/Qwen
# 'Qwen2-0.5B-Instruct' is presumably a local directory holding the downloaded model — adjust to your path.
# use_fast=False requests the pure-Python tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained('Qwen2-0.5B-Instruct', use_fast=False, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def process_func(example):
    """Turn one instruction/response record into a causal-LM training sample.

    The prompt (system message, user turn, assistant header) is rendered with
    the ``<|im_start|>`` / ``<|im_end|>`` layout and tokenized; the response is
    tokenized separately and appended, followed by one ``tokenizer.pad_token_id``
    that serves as the end-of-answer marker.

    Args:
        example: mapping with the fields ``instruction``, ``input`` and
            ``output``; ``instruction`` and ``input`` are joined with ``+``,
            so they are expected to be strings.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. Prompt
        positions in ``labels`` are ``-100``; all three lists are cut to at
        most ``MAX_LENGTH`` entries.
    """
    # The tokenizer may split a single Chinese character into several tokens,
    # so the cap is kept generous to avoid cutting samples short.
    MAX_LENGTH = 384

    # add_special_tokens=False: do not let the tokenizer prepend special tokens of its own.
    prompt_enc = tokenizer(f"<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt_enc["input_ids"]
    answer_ids = answer_enc["input_ids"]
    end_marker = [tokenizer.pad_token_id]

    full_ids = prompt_ids + answer_ids + end_marker
    # The end marker must be attended to as well, hence the trailing 1.
    full_mask = prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1]
    # Only the answer and the end marker are supervised; the prompt is masked out.
    full_labels = [-100] * len(prompt_ids) + answer_ids + end_marker

    # Slicing is a no-op for sequences that are already short enough.
    return dict(
        input_ids=full_ids[:MAX_LENGTH],
        attention_mask=full_mask[:MAX_LENGTH],
        labels=full_labels[:MAX_LENGTH],
    )

In [7]:
# Tokenize every record with process_func; the original text columns are removed from the result.
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
# Display the tokenized dataset summary (feature names and row count).
tokenized_id

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})

In [9]:
# Show the token ids of the first sample.
tokenized_id[0]['input_ids']

[151644,
 8948,
 198,
 56568,
 101909,
 115405,
 110498,
 151645,
 198,
 151644,
 872,
 198,
 100662,
 108136,
 101124,
 45139,
 1773,
 151645,
 198,
 151644,
 77091,
 198,
 114566,
 100662,
 108136,
 101124,
 45139,
 48443,
 16,
 13,
 220,
 100662,
 101099,
 99600,
 1773,
 101922,
 99190,
 102618,
 106214,
 101079,
 3837,
 29524,
 111261,
 5373,
 107530,
 57191,
 107140,
 3837,
 26232,
 101902,
 114718,
 99722,
 3837,
 101138,
 105640,
 101102,
 90395,
 105767,
 101940,
 107235,
 3407,
 17,
 13,
 4891,
 251,
 229,
 99967,
 104579,
 1773,
 101922,
 105086,
 104838,
 9370,
 104451,
 5373,
 104618,
 5373,
 35987,
 100203,
 52853,
 33108,
 105349,
 104982,
 99285,
 9370,
 107151,
 102153,
 3837,
 101153,
 44636,
 100443,
 5373,
 44636,
 105349,
 33108,
 101130,
 101083,
 3837,
 23031,
 100662,
 108136,
 104579,
 100784,
 3407,
 18,
 13,
 10236,
 251,
 94,
 101519,
 103119,
 1773,
 105552,
 113357,
 99722,
 107940,
 3837,
 113459,
 101922,
 50511,
 101907,
 220,
 22,
 12,
 23,
 58230,
 237

In [10]:
# Decode the first sample back to text to check the rendered prompt, response and end marker.
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n保持健康的三个提示。<|im_end|>\n<|im_start|>assistant\n以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<|endoftext|>'

In [11]:
# Decode only the supervised positions: label entries equal to -100 (the masked prompt) are dropped first.
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[0]["labels"])))

'以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<|endoftext|>'

## 创建模型

In [13]:
# Load the base model in BF16 precision to save GPU memory.
model = AutoModelForCausalLM.from_pretrained('Qwen2-0.5B-Instruct', device_map="auto",torch_dtype=torch.bfloat16)

In [14]:
# Display the module structure of the loaded model.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(in_featur

In [15]:
# Enable gradients on the input embeddings. This call does NOT switch on gradient checkpointing
# (that is requested by gradient_checkpointing=True in TrainingArguments); it is useful when
# fine-tuning adapter weights while the base model weights stay fixed.
# Background: https://blog.csdn.net/qq_30438779/article/details/135229610
model.enable_input_require_grads()

In [16]:
# List the name of every parameter tensor in the model.
for name,param in model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self

## 配置LoRA

In [17]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"], # choosing suitable target_modules: https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py#L78
    inference_mode=False, # training mode
    r=8, # LoRA rank
    lora_alpha=32, # LoRA alpha; see the LoRA method description for its role
    lora_dropout=0.1 # dropout ratio
)

In [18]:
# Wrap the base model with the LoRA adapters described by config.
model = get_peft_model(model, config)

In [19]:
# Show the share of trainable parameters.
model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


## 配置训练参数

In [24]:
args = TrainingArguments(
    output_dir="save_checkpoint",       # checkpoints are written under this directory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,      # 4 x 4 = 16 samples per optimizer step on each device
    logging_steps=10,                   # log metrics every 10 steps
    num_train_epochs=1,
    save_steps=100,                     # save a checkpoint every 100 steps
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True         # recompute activations in the backward pass to lower memory use
)

# More configurable arguments: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

In [25]:
# Build the Trainer; the collator pads the samples of each batch to a common length (padding=True).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [26]:
# Start training.
trainer.train()

  0%|          | 0/187 [00:00<?, ?it/s]

{'loss': 1.9092, 'grad_norm': 0.8530758619308472, 'learning_rate': 9.46524064171123e-05, 'epoch': 0.05}
{'loss': 1.8638, 'grad_norm': 0.9801871180534363, 'learning_rate': 8.930481283422461e-05, 'epoch': 0.11}
{'loss': 1.7289, 'grad_norm': 0.9955060482025146, 'learning_rate': 8.39572192513369e-05, 'epoch': 0.16}
{'loss': 1.7916, 'grad_norm': 0.8481578826904297, 'learning_rate': 7.86096256684492e-05, 'epoch': 0.21}
{'loss': 1.841, 'grad_norm': 1.144881248474121, 'learning_rate': 7.326203208556151e-05, 'epoch': 0.27}
{'loss': 1.7885, 'grad_norm': 1.0255229473114014, 'learning_rate': 6.79144385026738e-05, 'epoch': 0.32}
{'loss': 1.8207, 'grad_norm': 1.2602465152740479, 'learning_rate': 6.25668449197861e-05, 'epoch': 0.37}
{'loss': 1.8749, 'grad_norm': 0.9851454496383667, 'learning_rate': 5.721925133689839e-05, 'epoch': 0.43}
{'loss': 1.7461, 'grad_norm': 0.9647185206413269, 'learning_rate': 5.1871657754010694e-05, 'epoch': 0.48}
{'loss': 1.8435, 'grad_norm': 1.1139891147613525, 'learning_r



{'loss': 1.8332, 'grad_norm': 1.2603214979171753, 'learning_rate': 4.11764705882353e-05, 'epoch': 0.59}
{'loss': 1.8401, 'grad_norm': 1.3341035842895508, 'learning_rate': 3.582887700534759e-05, 'epoch': 0.64}
{'loss': 1.7855, 'grad_norm': 0.9131620526313782, 'learning_rate': 3.0481283422459894e-05, 'epoch': 0.69}
{'loss': 1.86, 'grad_norm': 0.9579825401306152, 'learning_rate': 2.5133689839572196e-05, 'epoch': 0.75}
{'loss': 1.8263, 'grad_norm': 1.0677027702331543, 'learning_rate': 1.9786096256684494e-05, 'epoch': 0.8}
{'loss': 1.8215, 'grad_norm': 1.197729229927063, 'learning_rate': 1.4438502673796791e-05, 'epoch': 0.85}
{'loss': 1.9077, 'grad_norm': 1.0631718635559082, 'learning_rate': 9.090909090909091e-06, 'epoch': 0.91}
{'loss': 1.8848, 'grad_norm': 1.343358039855957, 'learning_rate': 3.7433155080213903e-06, 'epoch': 0.96}
{'train_runtime': 322.9633, 'train_samples_per_second': 9.289, 'train_steps_per_second': 0.579, 'train_loss': 1.8280096972052426, 'epoch': 1.0}


TrainOutput(global_step=187, training_loss=1.8280096972052426, metrics={'train_runtime': 322.9633, 'train_samples_per_second': 9.289, 'train_steps_per_second': 0.579, 'total_flos': 1641397289816064.0, 'train_loss': 1.8280096972052426, 'epoch': 0.9973333333333333})

## 合并推理

In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = 'Qwen2-0.5B-Instruct'
lora_path = 'save_checkpoint/checkpoint-100'  # change this to the checkpoint directory your LoRA run produced

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
# Load the base model in BF16 and switch it to eval mode
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
# Attach the LoRA weights to the base model
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "你是谁？"
# The system prompt has to be sent with the "system" role. It was previously sent as a second
# "user" message, which rendered two user turns and no system turn — a layout that differs from
# the system -> user -> assistant prompt the training samples were built with.
messages = [
    {"role": "system", "content": "你是一个有用的助手"},
    {"role": "user", "content": prompt},
]
inputs = tokenizer.apply_chat_template(messages,
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       ).to('cuda')


# top_k=1 restricts sampling to the single most likely token at every step.
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Keep only the newly generated tokens (drop the prompt part).
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


我是来自阿里云的超大规模语言模型，我叫通义千问。


## 合并权重

In [30]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
from transformers.generation.utils import GenerationConfig

def apply_lora(model_name_or_path, output_path, lora_path, model_format="safetensors"):
    """Merge a LoRA adapter into its base model and write the merged model to disk.

    Args:
        model_name_or_path: base model name or path handed to ``from_pretrained``.
        output_path: directory that receives the merged weights and tokenizer files;
            it is created when missing.
        lora_path: adapter directory handed to ``PeftModel.from_pretrained``.
        model_format: ``"safetensors"`` saves through ``model.save_pretrained``;
            ``"bin"`` writes the state dict to ``<output_path>/pytorch_model.bin``
            and additionally saves the model config next to it.

    Raises:
        ValueError: if ``model_format`` is neither ``"safetensors"`` nor ``"bin"``.
    """
    import os  # local import: only this function needs it

    # Fail fast on an unsupported format. Previously such a call loaded and merged
    # the whole model and then returned without saving anything.
    if model_format not in ("safetensors", "bin"):
        raise ValueError(
            f"Unsupported model_format {model_format!r}; expected 'safetensors' or 'bin'"
        )

    print(f"Loading the base model from {model_name_or_path}")
    base_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False, trust_remote_code=True)
    base = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="cuda:0", torch_dtype=torch.bfloat16, trust_remote_code=True)

    print(f"Loading the LoRA adapter from {lora_path}")

    # Request the same dtype as the base model (it used to ask for float16 against a bfloat16 base).
    lora_model = PeftModel.from_pretrained(
        base,
        lora_path,
        torch_dtype=torch.bfloat16,
    )

    print("Applying the LoRA")
    model = lora_model.merge_and_unload()

    # torch.save does not create missing directories, so make sure the target exists.
    os.makedirs(output_path, exist_ok=True)

    if model_format == "safetensors":
        print(f"Saving the target model to {output_path}")
        model.save_pretrained(output_path)
    else:
        # Save the model weights to a .bin file
        bin_file_path = f"{output_path}/pytorch_model.bin"
        print(f"Saving the model weights to {bin_file_path}")
        torch.save(model.state_dict(), bin_file_path)
        # Store the config beside the weights so the directory is not weights-only.
        model.config.save_pretrained(output_path)

        print(f"Saving the tokenizer to {output_path}")
    base_tokenizer.save_pretrained(output_path)


if __name__ == "__main__":
    # Inputs: the trained adapter checkpoint and the base model.
    lora_path = 'save_checkpoint/checkpoint-100'
    model_path = 'Qwen2-0.5B-Instruct'
    # Output: target directory and on-disk format of the merged model.
    output = 'merge_checkpoint'
    model_format = 'safetensors'

    apply_lora(
        model_name_or_path=model_path,
        output_path=output,
        lora_path=lora_path,
        model_format=model_format,
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading the base model from Qwen2-0.5B-Instruct
Loading the LoRA adapter from save_checkpoint/checkpoint-100
Applying the LoRA
Saving the model weights to merge_checkpoint_bin/pytorch_model.bin
Saving the tokenizer to merge_checkpoint_bin


## 推理微调模型

In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the merged model and its tokenizer from the "merge_checkpoint" directory.
model = AutoModelForCausalLM.from_pretrained(
    "merge_checkpoint",
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("merge_checkpoint")
# Send the inputs to wherever device_map="auto" placed the model instead of assuming "cuda".
device = model.device

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the chat messages to a prompt string that ends with the assistant header.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Pass the whole encoding so generate() receives the attention mask together with input_ids.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Drop the prompt tokens from each returned sequence, keeping only the new tokens.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


"A large language model is an artificial intelligence system that can generate human-like text and respond to questions in natural language. These models are trained on vast amounts of data, which includes texts from various sources, such as social media, news articles, and scientific papers.\nThe primary goal of a large language model is to generate human-like responses to a wide range of questions. It can be used for tasks such as answering trivia questions, providing answers to complex questions, and generating creative content.\nOne of the key advantages of large language models is their ability to process vast amounts of information quickly and accurately. They can generate text at speeds up to several million words per second, making them ideal for many applications where speed is critical.\nIn addition to their ability to generate human-like responses, large language models have also been shown to have valuable insights into a variety of topics, including language and communicat

## FastAPI部署

部署代码：

In [None]:
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
import json
import datetime
import torch

# Device settings
DEVICE = "cuda"  # use CUDA
DEVICE_ID = "0"  # CUDA device ID; leave empty if not set
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE  # combined device string ("cuda:0" with the values above)

# GPU memory cleanup helper
def torch_gc():
    """Free cached CUDA memory on ``CUDA_DEVICE``; a no-op when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):  # operate on the configured device
        torch.cuda.empty_cache()  # release unused memory held by the caching allocator
        torch.cuda.ipc_collect()  # reclaim memory that was released by CUDA IPC

# Create the FastAPI application
app = FastAPI()

# Endpoint handling POST requests
@app.post("/")
async def create_item(request: Request):
    """Generate a chat completion for the ``prompt`` field of the posted JSON body.

    Reads the module-level ``model`` and ``tokenizer``.

    Returns:
        dict with ``response`` (generated text, or an error message), ``status``
        (200 on success, 400 when ``prompt`` is missing or not a string) and
        ``time`` (formatted as ``%Y-%m-%d %H:%M:%S``).
    """
    # request.json() already yields parsed Python data; no dumps/loads round-trip is needed.
    payload = await request.json()
    prompt = payload.get('prompt') if isinstance(payload, dict) else None

    # A missing or non-string prompt used to crash later with a TypeError; answer explicitly instead.
    if not isinstance(prompt, str):
        return {
            "response": "Request JSON must contain a string 'prompt' field.",
            "status": 400,
            "time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
    ]

    # Render the prompt text, tokenize it and generate on the device the model lives on.
    # NOTE(review): generate() is a blocking call inside an async handler, so other
    # requests wait while it runs.
    prompt_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs,max_new_tokens=512)
    # Drop the prompt tokens from each returned sequence, keeping only the new tokens.
    generated_ids = [
        output_ids[len(prompt_ids):] for prompt_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Build the response JSON
    answer = {
        "response": response,
        "status": 200,
        "time": timestamp
    }
    # Log line; repr() keeps newlines and quotes in the texts unambiguous.
    log = f"[{timestamp}] prompt: {prompt!r}, response: {response!r}"
    print(log)
    torch_gc()  # release cached GPU memory after each request
    return answer

# Main entry point
# NOTE(review): this cell has no execution count; presumably it is meant to be saved and run as a standalone script — confirm.
if __name__ == '__main__':
    # Load the tokenizer and model from the merged checkpoint directory
    model_name_or_path = "merge_checkpoint"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", torch_dtype=torch.bfloat16)

    # Start the FastAPI application
    # Port 6006 can be mapped from AutoDL to the local machine, so the API can be called locally
    uvicorn.run(app, host='127.0.0.1', port=6006, workers=1)  # serve the app on the given host and port

请求代码：

In [39]:
import requests
import json

def get_completion(prompt, url='http://127.0.0.1:6006', timeout=120):
    """POST ``prompt`` to the generation service and return the generated text.

    Args:
        prompt: text sent as the ``prompt`` field of the JSON body.
        url: address of the service; defaults to the local deployment on port 6006.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The ``response`` field of the JSON answer.

    Raises:
        requests.HTTPError: if the server answers with an HTTP error status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
    """
    # json= serializes the payload and sets the Content-Type: application/json header.
    response = requests.post(url=url, json={"prompt": prompt}, timeout=timeout)
    # Surface HTTP errors directly instead of failing later on a missing 'response' key.
    response.raise_for_status()
    return response.json()['response']

# Example call: send a greeting to the locally running service and print the reply.
if __name__ == '__main__':
    print(get_completion('你好'))

你好！有什么我可以帮助你的吗？
