aplace_lora

In [61]:
import os
import sys
from typing import List

import fire
import torch
import transformers
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer

from utils.prompter import Prompter


In [62]:
base_model: str= "models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16"  # the only required argument
data_path: str = "data/aplaca-cleaned-en"
output_dir: str = "./lora-alpaca"
# training hyperparams
batch_size: int = 128 # 每次迭代中用于训练的样本数量。
micro_batch_size: int = 4 # 每个 GPU 核心上处理的样本数量。
num_epochs: int = 3 # 3 epochs
learning_rate: float = 3e-4
cutoff_len: int = 256 # max length of input to model
val_set_size: int = 2000 # size of validation set
# lora hyperparams
lora_r: int = 8 # rank of the low rank matrix
lora_alpha: int = 16 # alpha of the low rank matrix
lora_dropout: float = 0.05 # dropout of the low rank matrix
lora_target_modules: List[str] = [
    "q_proj",
    "v_proj",
] # 要应用 LoRA 方法的模型模块名称。QV模块
# llm hyperparams
train_on_inputs: bool = True  # if False, masks out inputs in loss是否在损失计算中包含输入。
add_eos_token: bool = False # 是否在输出中添加 EOS 标记。
group_by_length: bool = False  # faster, but produces an odd training loss curve是否根据序列长度对数据进行分组
# wandb params
wandb_project: str = "" # wandb project name
wandb_run_name: str = "" # wandb run name
wandb_watch: str = ""  # options: false | gradients | all 是否让 wandb 观察模型的梯度或全部参数。
wandb_log_model: str = ""  # options: false | true是否在 wandb 中记录模型的权重。
resume_from_checkpoint: str = None  # either training checkpoint or final adapter 表示从哪个检查点恢复训练，可以是训练过程中的检查点或最终的适配器。
prompt_template_name: str = "alpaca"  # The prompt template to use, will default to alpaca.    # 提示模板的名称，默认为 alpaca。

In [63]:
if int(os.environ.get("LOCAL_RANK", 0)) == 0:
    print(
        f"Training Alpaca-LoRA model with params:\n"
        f"base_model: {base_model}\n"
        f"data_path: {data_path}\n"
        f"output_dir: {output_dir}\n"
        f"batch_size: {batch_size}\n"
        f"micro_batch_size: {micro_batch_size}\n"
        f"num_epochs: {num_epochs}\n"
        f"learning_rate: {learning_rate}\n"
        f"cutoff_len: {cutoff_len}\n"
        f"val_set_size: {val_set_size}\n"
        f"lora_r: {lora_r}\n"
        f"lora_alpha: {lora_alpha}\n"
        f"lora_dropout: {lora_dropout}\n"
        f"lora_target_modules: {lora_target_modules}\n"
        f"train_on_inputs: {train_on_inputs}\n"
        f"add_eos_token: {add_eos_token}\n"
        f"group_by_length: {group_by_length}\n"
        f"wandb_project: {wandb_project}\n"
        f"wandb_run_name: {wandb_run_name}\n"
        f"wandb_watch: {wandb_watch}\n"
        f"wandb_log_model: {wandb_log_model}\n"
        f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
        f"prompt template: {prompt_template_name}\n"
    )
assert ( 
    base_model
), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"



Training Alpaca-LoRA model with params:
base_model: models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16
data_path: data/aplaca-cleaned-en
output_dir: ./lora-alpaca
batch_size: 128
micro_batch_size: 4
num_epochs: 3
learning_rate: 0.0003
cutoff_len: 256
val_set_size: 2000
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: ['q_proj', 'v_proj']
train_on_inputs: True
add_eos_token: False
group_by_length: False
wandb_project: 
wandb_run_name: 
wandb_watch: 
wandb_log_model: 
resume_from_checkpoint: False
prompt template: alpaca



In [64]:
gradient_accumulation_steps = batch_size // micro_batch_size # 计算梯度累积的步数 128/4=32 32个step后再更新梯度

prompter = Prompter(prompt_template_name) # 创建prompt模板

device_map = "auto" # 自动分配设备
world_size = int(os.environ.get("WORLD_SIZE", 1)) # 获取world_size参与训练的总设备数
ddp = world_size != 1 # 若使用分布式数据并行Distributed Data Parallel，需要多个训练设备
if ddp:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} # device_map 指定了模型的哪些部分应该放置在哪些设备上
        gradient_accumulation_steps = gradient_accumulation_steps // world_size #调整了梯度累积的步数，使其适应分布式训练环境
ddp

False

In [65]:
use_wandb = len(wandb_project) > 0 or (
    "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
) # 是否使用wandb工具Weights & Biases（简称W&B 或 wandb）是一个用于机器学习实验跟踪、可视化和项目管理的软件工具。
# Only overwrite environ if wandb param passed
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
if len(wandb_watch) > 0:
    os.environ["WANDB_WATCH"] = wandb_watch
if len(wandb_log_model) > 0:
    os.environ["WANDB_LOG_MODEL"] = wandb_log_model
use_wandb

False

In [66]:
model = LlamaForCausalLM.from_pretrained( # 用llamaForCausalLM类加载模型
    base_model,
    load_in_8bit=True, # 量化模型8bit
    torch_dtype=torch.float16, # 数据为16浮点精度
    device_map=device_map,
)

tokenizer = LlamaTokenizer.from_pretrained(base_model) # 加载与模型配套的分词器

tokenizer.pad_token_id = ( # 设置分词器的填充（pad）标记的ID为 0，在处理批次（batch）数据时，通常需要将所有序列填充到相同的长度，填充标记用于表示序列的填充部分。
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"  # Allow batched inference填充操作应该在序列的左侧进行

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.29s/it]


In [67]:
model.state_dict

<bound method Module.state_dict of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_at

In [52]:
def tokenize(prompt, add_eos_token=True):
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        # 处理输入的提示（prompt），并根据需要添加结束标记（end-of-sentence token）
        result = tokenizer( # 对提示分词
            prompt,
            truncation=True, # 如果输入超过了最大长度，将截断输入。
            max_length=cutoff_len,  # 截断长度
            padding=False, # 不将序列填充到最大长度。
            return_tensors=None, # 不返回任何特定的张量类型，通常用于框架无关的分词。
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id # 如果结尾不是结束标记
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1) # 1表示有效输入(结束标记)

        result["labels"] = result["input_ids"].copy()

        return result

In [53]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = prompter.generate_prompt( # 生成完整提示
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt) # 分词
    if not train_on_inputs: # 训练时不使用输入（input）部分来计算损失。
        user_prompt = prompter.generate_prompt(
            data_point["instruction"], data_point["input"]
        )
        tokenized_user_prompt = tokenize(
            user_prompt, add_eos_token=add_eos_token
        )
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        if add_eos_token:
            user_prompt_len -= 1

        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]  # could be sped up, probably
    return tokenized_full_prompt

In [54]:
data_point =  {
    "instruction": "保持健康的三个提示。",
    "input": "",
    "output": "以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。"
  }
generate_and_tokenize_prompt(data_point=data_point)

{'input_ids': [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 30982, 31695, 31863, 31577, 30210, 30457, 30502, 31302, 30858, 30267, 13, 13, 2277, 29937, 13291, 29901, 13, 30651, 30557, 30392, 30982, 31695, 31863, 31577, 30210, 30457, 30502, 31302, 30858, 30383, 13, 13, 29896, 29889, 29871, 30982, 31695, 31687, 30988, 31704, 30846, 30267, 31951, 30408, 232, 132, 157, 236, 131, 133, 30948, 30210, 31687, 30988, 31894, 30846, 30214, 30847, 233, 152, 166, 233, 176, 168, 30330, 235, 186, 148, 233, 176, 168, 31391, 233, 187, 187, 233, 182, 182, 30214, 30815, 231, 194, 134, 31174, 30869, 235, 164, 131, 31624, 31863, 31577, 30214, 232, 165, 161, 232, 191, 189, 235, 133, 143, 235, 133, 140, 31074, 31180, 30214, 31666, 30417, 31931, 30909, 232, 138, 146, 31022, 30988, 30908, 30267, 13, 13, 29906, 29889, 29871, 232, 160, 138, 235, 164, 164, 236, 168, 177, 31855, 30267, 31951, 3040

In [55]:


config = LoraConfig(
    r=lora_r,   # r is the rank of the low-rank matrix
    lora_alpha=lora_alpha,  # lora_alpha is the scaling factor for the low-rank matrix
    target_modules=lora_target_modules, #要在哪些模型模块上应用LoRA,如注意力层
    lora_dropout=lora_dropout,  #dropout率
    bias="none",    # bias="none"表示不添加偏置
    task_type="CAUSAL_LM",  #表示因果语言模型（用于生成文本的任务）
)
model = get_peft_model(model, config)   #将模型和配置合并
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [56]:
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
    data = load_dataset("json", data_files=data_path)
else:
    data = load_dataset(data_path)
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 51760
    })
})

In [57]:
if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")
if val_set_size > 0:
    train_val = data["train"].train_test_split( # 从训练集中分割出指定大小的验证集
        test_size=val_set_size, shuffle=True, seed=42   # shuffle随机打乱
    )
    train_data = (
        train_val["train"].shuffle().map(generate_and_tokenize_prompt) #对每个数据项应用generate_and_tokenize_prompt函数
    )
    val_data = (
        train_val["test"].shuffle().map(generate_and_tokenize_prompt)
    )
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

if not ddp and torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True


Map: 100%|██████████| 49760/49760 [01:12<00:00, 682.25 examples/s]
Map: 100%|██████████| 2000/2000 [00:02<00:00, 688.79 examples/s]


In [58]:
print(train_val["train"][0])
print(train_data[0])
print(val_data)

{'instruction': "Write a JavaScript code to display an alert message when the 'Submit' button is pressed.", 'input': '', 'output': 'Here is the JavaScript code you can use to display an alert message when the \'Submit\' button is pressed:\n\n```javascript\ndocument.getElementById(\'submit-button\').addEventListener(\'click\', function() {\n  alert(\'Your form has been submitted!\');\n});\n```\n\nThis code selects the "Submit" button by its `id` attribute using `document.getElementById(\'submit-button\')`. It then adds an event listener to the button that listens for click events, using the `addEventListener` method. When the button is clicked, the callback function is executed, which displays an alert message using the `alert` function. In this case, the alert message says "Your form has been submitted!".\n\nMake sure to assign the proper `id` attribute (in this example, `id="submit-button"`) to the submit button in HTML, so that the element can be properly selected by the script above

In [59]:
trainer = transformers.Trainer( #配置训练器
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps" if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=200 if val_set_size > 0 else None,
        save_steps=200,
        output_dir=output_dir,
        save_total_limit=3,
        load_best_model_at_end=True if val_set_size > 0 else False,
        ddp_find_unused_parameters=False if ddp else None,
        group_by_length=group_by_length,
        report_to="wandb" if use_wandb else None,
        run_name=wandb_run_name if use_wandb else None,
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(  #数据整理
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
model.config.use_cache = False  #禁用了模型的缓存功能

old_state_dict = model.state_dict   #修改模型状态字典
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)    #torch.compile对模型进行编译
model.state_dict




<bound method Module.state_dict of OptimizedModule(
  (_orig_mod): PeftModelForCausalLM(
    (base_model): LoraModel(
      (model): LlamaForCausalLM(
        (model): LlamaModel(
          (embed_tokens): Embedding(32000, 4096, padding_idx=0)
          (layers): ModuleList(
            (0-31): 32 x LlamaDecoderLayer(
              (self_attn): LlamaSdpaAttention(
                (q_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4096, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_

In [None]:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

if not os.path.exists(output_dir):  
    os.makedirs(output_dir)  
model.save_pretrained(output_dir)

print(
    "\n If there's a warning about missing keys above, please disregard :)"
)