## 导库

In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch

## 读取数据

In [2]:
df = pd.read_json('./data.json')
ds = Dataset.from_pandas(df)
print(len(ds))
print(ds[0])

3000
{'instruction': '保持健康的三个提示。', 'input': '', 'output': '以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。'}


## 处理数据

In [3]:
# 模型下载：https://huggingface.co/Qwen
tokenizer = AutoTokenizer.from_pretrained('./Qwen2-0.5B-Instruct', use_fast=False, trust_remote_code=True)

In [4]:
def process_func(example):
    MAX_LENGTH = 384    # 分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token也是要关注所以补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [5]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [6]:
tokenized_id

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})

In [7]:
tokenized_id[0]['input_ids']

[151644,
 8948,
 198,
 56568,
 101909,
 115405,
 110498,
 151645,
 198,
 151644,
 872,
 198,
 100662,
 108136,
 101124,
 45139,
 1773,
 151645,
 198,
 151644,
 77091,
 198,
 114566,
 100662,
 108136,
 101124,
 45139,
 48443,
 16,
 13,
 220,
 100662,
 101099,
 99600,
 1773,
 101922,
 99190,
 102618,
 106214,
 101079,
 3837,
 29524,
 111261,
 5373,
 107530,
 57191,
 107140,
 3837,
 26232,
 101902,
 114718,
 99722,
 3837,
 101138,
 105640,
 101102,
 90395,
 105767,
 101940,
 107235,
 3407,
 17,
 13,
 4891,
 251,
 229,
 99967,
 104579,
 1773,
 101922,
 105086,
 104838,
 9370,
 104451,
 5373,
 104618,
 5373,
 35987,
 100203,
 52853,
 33108,
 105349,
 104982,
 99285,
 9370,
 107151,
 102153,
 3837,
 101153,
 44636,
 100443,
 5373,
 44636,
 105349,
 33108,
 101130,
 101083,
 3837,
 23031,
 100662,
 108136,
 104579,
 100784,
 3407,
 18,
 13,
 10236,
 251,
 94,
 101519,
 103119,
 1773,
 105552,
 113357,
 99722,
 107940,
 3837,
 113459,
 101922,
 50511,
 101907,
 220,
 22,
 12,
 23,
 58230,
 237

In [8]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n保持健康的三个提示。<|im_end|>\n<|im_start|>assistant\n以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<|endoftext|>'

In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[0]["labels"])))

'以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<|endoftext|>'

## 创建模型

In [10]:
model = AutoModelForCausalLM.from_pretrained('./Qwen2-0.5B-Instruct', device_map="auto",torch_dtype=torch.bfloat16) # 以BF16精度加载，节省显存

In [11]:
# model

In [12]:
model.enable_input_require_grads() # 开启梯度检查点，具体解释： https://blog.csdn.net/qq_30438779/article/details/135229610

In [13]:
# 查看所有的块和名称
# for name,param in model.named_parameters():
#     print(name)

## 配置训练参数

In [15]:
args = TrainingArguments(
    output_dir="save_checkpoint",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

# 更多可设置参数：https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

In [16]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [17]:
# 开始训练, FULL SFT 显存占用13249 MiB, LoRA显存11377 MiB, QLoRA显存占用10833MiB
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,2.1829
20,2.2798
30,2.1086
40,2.1609
50,2.1784
60,2.1164
70,2.1153
80,2.1443
90,2.0153
100,2.093




TrainOutput(global_step=187, training_loss=2.090473399442785, metrics={'train_runtime': 76.1907, 'train_samples_per_second': 39.375, 'train_steps_per_second': 2.454, 'total_flos': 1638921392689152.0, 'train_loss': 2.090473399442785, 'epoch': 0.9973333333333333})

## 推理

In [28]:
!pip install markdownify

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting markdownify
  Downloading http://mirrors.aliyun.com/pypi/packages/6c/e9/6e2757a670b8c48bc48eff1c20cb9d71f1476e844038bdbdb76f17e6a12b/markdownify-0.13.1-py3-none-any.whl (10 kB)
Installing collected packages: markdownify
Successfully installed markdownify-0.13.1
[0m

In [34]:
from transformers import pipeline
from markdownify import markdownify as md

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

input_query = "用英文怎么说：以爱之名"

ipt = '<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n'.format(input_query)
response = pipe(ipt, max_length=512, do_sample=True)
text = (response[0]["generated_text"]).replace(ipt, "")
print(md(text))

In English, "For Love of" can be translated as "With Love".
