## 导库

In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch

## 读取数据

In [2]:
df = pd.read_json('data/data.json')
ds = Dataset.from_pandas(df)
print(len(ds))
print(ds[0])

3000
{'instruction': '保持健康的三个提示。', 'input': '', 'output': '以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。'}


## 处理数据

In [3]:
# 模型下载：https://huggingface.co/google/gemma-2-2b
tokenizer = AutoTokenizer.from_pretrained('gemma-2-2b', use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'

In [4]:
def process_func(example):
    MAX_LENGTH = 384    # 分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<bos><start_of_turn>user\n{example['instruction'] + example['input']}<end_of_turn>\n<start_of_turn>model\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}<end_of_turn>\n", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [5]:
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [6]:
tokenized_id

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})

In [7]:
tokenized_id[0]['input_ids']

[2,
 106,
 1645,
 108,
 39876,
 211280,
 47573,
 39340,
 235362,
 107,
 108,
 106,
 2516,
 108,
 176950,
 39876,
 211280,
 47573,
 39340,
 235465,
 109,
 235274,
 235265,
 26702,
 236028,
 27298,
 23515,
 235362,
 42005,
 235928,
 135432,
 158861,
 34965,
 235365,
 235744,
 237068,
 236202,
 235394,
 145786,
 236132,
 132760,
 235365,
 235587,
 102236,
 235675,
 113142,
 24049,
 235365,
 97492,
 102913,
 39009,
 235365,
 236203,
 181442,
 64050,
 90517,
 235362,
 109,
 235284,
 235265,
 152647,
 238599,
 101134,
 235362,
 42005,
 116307,
 115770,
 235370,
 83093,
 235394,
 80253,
 235394,
 235731,
 236892,
 235742,
 235581,
 86636,
 157628,
 204217,
 231755,
 47333,
 235365,
 50537,
 235673,
 237295,
 235394,
 235673,
 86636,
 235581,
 56327,
 55197,
 235365,
 235542,
 39876,
 211280,
 101134,
 67186,
 235362,
 109,
 235304,
 235265,
 235248,
 80023,
 166060,
 235362,
 80023,
 235735,
 125423,
 24049,
 236242,
 235988,
 17479,
 235365,
 124883,
 235444,
 42005,
 236087,
 56567,
 235248

In [8]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<bos><start_of_turn>user\n保持健康的三个提示。<end_of_turn>\n<start_of_turn>model\n以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<end_of_turn>\n<eos>'

In [9]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[0]["labels"])))

'以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<end_of_turn>\n<eos>'

## 创建模型

In [12]:
!pip install transformers==4.42.3

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting transformers==4.42.3
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/20/5c/244db59e074e80248fdfa60495eeee257e4d97c3df3487df68be30cd60c8/transformers-4.42.3-py3-none-any.whl (9.3 MB)
     ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.3 MB 130.4 kB/s eta 0:01:12
     ---------------------------------------- 0.0/9.3 MB 130.4 kB/s eta 0:01:12
     ---------------------------------------- 0.0/9.3 MB 122.9 kB/s eta 0:01:16
     ---------------------------------------- 0.1/9.3 MB 156.1 kB/s eta 0:01:00
     ---------------------------------------- 0.1/9.3 MB 156.1 kB/s eta 0:01:00
     ---------------------------------------- 0.1/9.3

DEPRECATION: Loading egg at e:\miniconda3\lib\site-packages\whisper_live-0.0.11-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
faster-whisper 0.10.0 requires tokenizers<0.16,>=0.13, but you have tokenizers 0.19.1 which is incompatible.
pyannote-audio 3.1.1 requires torchmetrics>=0.11.0, but you have torchmetrics 0.6.0 which is incompatible.
surya-ocr 0.4.7 requires torch<3.0.0,>=2.3.0, but you have torch 2.1.1+cu118 which is incompatible.


In [10]:
model = AutoModelForCausalLM.from_pretrained('gemma-2-2b', device_map="auto",torch_dtype=torch.bfloat16) # 以BF16精度加载，节省显存

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [11]:
model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm()
        (post_attention_layernorm): Gemma2RMSNorm()
        (pre_feedforward_

In [12]:
model.enable_input_require_grads() # 开启梯度检查点，具体解释： https://blog.csdn.net/qq_30438779/article/details/135229610

In [13]:
# 查看所有的块和名称
for name,param in model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.0.pre_feedforward_layernorm.weight
model.layers.0.post_feedforward_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.1.pre_feedforward_layernorm.weight
model.layers.1.post_feedforward_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.la

## 配置LoRA

In [14]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"], # 选择合适的target_modules：https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py#L78
    inference_mode=False, # 训练模式
    r=8, # LoRA 秩大小
    lora_alpha=32, # LoRA alaph，具体作用参见 LoRA 原理
    lora_dropout=0.1 # Dropout 比例
)

In [15]:
model = get_peft_model(model, config)

In [16]:
# 查看可训练的模型参数占比
model.print_trainable_parameters()

trainable params: 1,597,440 || all params: 2,615,939,328 || trainable%: 0.0611


## 配置训练参数

In [17]:
args = TrainingArguments(
    output_dir="save_checkpoint",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

# 更多可设置参数：https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

NotImplementedError: Cannot copy out of meta tensor; no data!

In [26]:
# 开始训练
trainer.train()

  0%|          | 0/187 [00:00<?, ?it/s]

{'loss': 1.9092, 'grad_norm': 0.8530758619308472, 'learning_rate': 9.46524064171123e-05, 'epoch': 0.05}
{'loss': 1.8638, 'grad_norm': 0.9801871180534363, 'learning_rate': 8.930481283422461e-05, 'epoch': 0.11}
{'loss': 1.7289, 'grad_norm': 0.9955060482025146, 'learning_rate': 8.39572192513369e-05, 'epoch': 0.16}
{'loss': 1.7916, 'grad_norm': 0.8481578826904297, 'learning_rate': 7.86096256684492e-05, 'epoch': 0.21}
{'loss': 1.841, 'grad_norm': 1.144881248474121, 'learning_rate': 7.326203208556151e-05, 'epoch': 0.27}
{'loss': 1.7885, 'grad_norm': 1.0255229473114014, 'learning_rate': 6.79144385026738e-05, 'epoch': 0.32}
{'loss': 1.8207, 'grad_norm': 1.2602465152740479, 'learning_rate': 6.25668449197861e-05, 'epoch': 0.37}
{'loss': 1.8749, 'grad_norm': 0.9851454496383667, 'learning_rate': 5.721925133689839e-05, 'epoch': 0.43}
{'loss': 1.7461, 'grad_norm': 0.9647185206413269, 'learning_rate': 5.1871657754010694e-05, 'epoch': 0.48}
{'loss': 1.8435, 'grad_norm': 1.1139891147613525, 'learning_r



{'loss': 1.8332, 'grad_norm': 1.2603214979171753, 'learning_rate': 4.11764705882353e-05, 'epoch': 0.59}
{'loss': 1.8401, 'grad_norm': 1.3341035842895508, 'learning_rate': 3.582887700534759e-05, 'epoch': 0.64}
{'loss': 1.7855, 'grad_norm': 0.9131620526313782, 'learning_rate': 3.0481283422459894e-05, 'epoch': 0.69}
{'loss': 1.86, 'grad_norm': 0.9579825401306152, 'learning_rate': 2.5133689839572196e-05, 'epoch': 0.75}
{'loss': 1.8263, 'grad_norm': 1.0677027702331543, 'learning_rate': 1.9786096256684494e-05, 'epoch': 0.8}
{'loss': 1.8215, 'grad_norm': 1.197729229927063, 'learning_rate': 1.4438502673796791e-05, 'epoch': 0.85}
{'loss': 1.9077, 'grad_norm': 1.0631718635559082, 'learning_rate': 9.090909090909091e-06, 'epoch': 0.91}
{'loss': 1.8848, 'grad_norm': 1.343358039855957, 'learning_rate': 3.7433155080213903e-06, 'epoch': 0.96}
{'train_runtime': 322.9633, 'train_samples_per_second': 9.289, 'train_steps_per_second': 0.579, 'train_loss': 1.8280096972052426, 'epoch': 1.0}


TrainOutput(global_step=187, training_loss=1.8280096972052426, metrics={'train_runtime': 322.9633, 'train_samples_per_second': 9.289, 'train_steps_per_second': 0.579, 'total_flos': 1641397289816064.0, 'train_loss': 1.8280096972052426, 'epoch': 0.9973333333333333})

## 合并推理

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import PeftModel

mode_path = 'gemma-2-9b'
lora_path = 'save_checkpoint/checkpoint-100' # 这里改称你的 lora 输出对应 checkpoint 地址

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

# 调用模型进行对话生成
chat = [
    { "role": "user", "content": '你好' },
]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)
outputs = tokenizer.decode(outputs[0])
response = outputs.split('model')[-1].replace('<end_of_turn>\n<eos>', '')
print(response)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


我是来自阿里云的超大规模语言模型，我叫通义千问。
