以下仅为本人第一次 LoRA 微调所用程序，仍然需要进一步的修改。

1. Libraries

In [None]:
import os
# Setting GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
# Parameter-Efficient-Fine-Tuning
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    TaskType,
    get_peft_model,
    get_peft_model_state_dict
)

2. Setting the finetuning parameters and some constants. 

In [None]:
# Path to the local checkpoint of the base model that will be fine-tuned
MODEL_PATH = "../models/chat/llama-3-chinese-8b-instruct-v2"
# Number of examples per device in a single forward/backward pass
MICRO_BATCH_SIZE = 4
# Effective batch size: number of examples that contribute to one optimizer update
BATCH_SIZE = 128
# Number of micro-batches whose gradients are accumulated before each optimizer update
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
# Number of passes over the training set
EPOCHS = 3
# Peak learning rate for the optimizer
LEARNING_RATE = 3e-4
# Maximum sequence length (in tokens) of each tokenized example
CUTOFF_LEN = 256
# LoRA hyperparameters: rank, scaling numerator and dropout probability
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# Number of examples held out from the dataset for validation
VAL_SET_SIZE = 2000

3. Load the model into the GPU memory.

In [None]:
# Load the tokenizer and the base model from MODEL_PATH, then prepare the
# quantized model for parameter-efficient training.
print(f"Starting to load the model into memory")

# NOTE(review): Llama-3 checkpoints presumably ship only a "fast" tokenizer, in
# which case use_fast=False and add_eos_token=True may be silently ignored --
# confirm that an EOS token really ends up at the end of each tokenized example.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH, 
    use_fast=False, 
    add_eos_token=True
)
# Load the weights in 8-bit and let the library decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, 
    load_in_8bit=True,
    device_map="auto"
)
# PEFT helper that readies a k-bit (here 8-bit) quantized model for training.
model = prepare_model_for_kbit_training(model)

print(f"Successfully loaded the model into memory")

4. Setting LoRA parameters.

In [None]:
# LoRA adapter configuration, driven by the LORA_* constants.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    # Attach adapters to every attention projection (q/k/v/o) and every MLP
    # projection (gate/up/down) of the decoder layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=LORA_DROPOUT,
    # Do not train any bias terms.
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model so that the LoRA adapters are added to the target modules.
model = get_peft_model(model, config)

# llama-3-8b 模型结构
(model): LlamaModel(
  (embed_tokens): Embedding(128256, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)
(lm_head): Linear(in_features=4096, out_features=128256, bias=False)

# r(rank):
• 定义: 低秩矩阵 (A) 和 (B) 的秩。它决定了新的权重矩阵在低秩空间中的维度。
• 作用: 秩 (r) 越大，模型的表达能力越强，但需要调整的参数也越多。较小的 (r) 值能够减少参数数量，从而降低计算成本，但可能会损失部分模型性能。
• 设置建议: 一般选择 1 到 64 之间的值。较小的模型或较小的数据集可以选择较小的 (r) 值，而较大的模型和数据集可以选择较大的 (r) 值。
# lora_alpha:
• 定义: 用于缩放低秩更新 (BA) 的比例因子；实际作用到权重更新上的缩放系数为 lora_alpha / r。
• 作用: 控制低秩矩阵在权重更新中的影响。较大的 lora_alpha 值会增加低秩矩阵对模型权重的影响，反之则减小。
• 设置建议: 通常设置为与 r 相同或较接近的值。比如，如果 r 为 16，那么 lora_alpha 可以设为 16。
# lora_dropout:
• 定义: 对输入到 LoRA 分支（低秩矩阵 A、B）的激活值进行 Dropout 的概率。
• 作用: Dropout 用于正则化，防止模型过拟合。lora_dropout 控制输入在经过低秩矩阵之前被随机置零的概率。
• 设置建议: 一般选择 0 到 0.5 之间的值。可以根据训练过程中模型的表现调整这个值。如果模型过拟合，可以增加 lora_dropout；如果模型欠拟合，可以减少或设置为 0。

In [None]:
# Choose a padding token that never occurs in real text.
# NOTE: the original alpaca-lora recipe used pad_token_id = 0, which is <unk>
# in the LLaMA-1/2 vocabulary. In the Llama-3 vocabulary id 0 is an ordinary
# token, and DataCollatorForLanguageModeling excludes every label equal to the
# pad id from the loss, so real occurrences of that token would never be
# trained on. A reserved special token avoids the collision.
PAD_TOKEN = "<|reserved_special_token_0|>"
pad_token_id = tokenizer.convert_tokens_to_ids(PAD_TOKEN)
if pad_token_id is None or pad_token_id == tokenizer.unk_token_id:
    # Fail loudly instead of silently padding with an unknown/ordinary token.
    raise ValueError(
        f"{PAD_TOKEN!r} is not in the tokenizer vocabulary; "
        "choose another dedicated padding token for this model"
    )
tokenizer.pad_token = PAD_TOKEN

# Load the Alpaca-format instruction dataset from a local JSON file.
data = load_dataset("json", data_files="../data/alpaca_data.json")

# Split the original alpaca dataset into training and validation sets;
# the fixed seed keeps the split reproducible across runs.
train_val = data["train"].train_test_split(
    test_size=VAL_SET_SIZE, shuffle=True, seed=42
)
train_data = train_val["train"]
val_data = train_val["test"]


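# Why does the pad token matter (and why was pad_token_id = 0 used)?
• Padding tokens must be distinguishable from real text so that they can be masked out of attention and of the loss. In the original LLaMA / Llama-2 vocabulary, id 0 is <unk>, which never occurs in normal text, so it was a convenient padding id.
• Caveat: in the Llama-3 vocabulary, id 0 is an ordinary token ("!") rather than <unk>. Padding with id 0 therefore makes the data collator ignore real "!" tokens in the loss; a dedicated special token (e.g. one of the reserved special tokens) is a safer padding token for Llama-3 models.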

In [None]:
# Generate a prompt in the Alpaca instruction format.
def generate_prompt(data_point):
    """Render one dataset record as an Alpaca-style training prompt.

    Args:
        data_point: Mapping with the keys "instruction" and "output", and an
            optional "input" key holding extra context for the instruction.

    Returns:
        The full prompt text (including the expected response). Sections are
        separated by one blank line and every line starts at column 0, so no
        source-code indentation leaks into the training text.
    """
    # A missing or empty "input" selects the shorter, context-free template.
    context = data_point.get("input")
    if context:
        sections = [
            "Below is an instruction that describes a task, paired with an "
            "input that provides further context. Write a response that "
            "appropriately completes the request.",
            f"### Instruction:\n{data_point['instruction']}",
            f"### Input:\n{context}",
        ]
    else:
        sections = [
            "Below is an instruction that describes a task. Write a response "
            "that appropriately completes the request.",
            f"### Instruction:\n{data_point['instruction']}",
        ]
    sections.append(f"### Response:\n{data_point['output']}")
    return "\n\n".join(sections)

In [None]:
def tokenize(prompt):
    """Tokenize *prompt* into a fixed-length example of CUTOFF_LEN tokens.

    The text is truncated/padded to CUTOFF_LEN + 1 positions and the final
    position is then dropped from both returned sequences.

    Returns:
        A dict with the keys "input_ids" and "attention_mask".
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    # Drop the last position so each sequence has exactly CUTOFF_LEN entries.
    return {
        field: encoded[field][:-1]
        for field in ("input_ids", "attention_mask")
    }

# Why trim the last token?
• Sequence Length Management:
    By setting max_length=CUTOFF_LEN + 1, the function ensures that even after trimming, the resulting sequence length is CUTOFF_LEN. This is useful for controlling the sequence length precisely.
• Consistency with Model Input Requirements:
    Some models might require sequences to be of a specific length or have certain constraints on input sizes. Trimming ensures that the tokenized sequences meet these requirements.

In [None]:
# Shuffle each split, then turn every record into a tokenized prompt:
# generate_prompt() renders the text and tokenize() converts it to
# input_ids / attention_mask columns.
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)))
val_data = val_data.shuffle().map(lambda x: tokenize(generate_prompt(x)))

# shuffle() method
• This method shuffles the dataset to randomize the order of the data points. Shuffling is important to prevent the model from learning any spurious patterns due to the order of the data.

# lambda x: tokenize(generate_prompt(x))
lambda is a way to define a small anonymous function in Python.
# Which equals:
def function(x):
    return tokenize(generate_prompt(x))

In [None]:
# Build the Hugging Face Trainer that runs the LoRA fine-tuning loop.
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=TrainingArguments(
        # Effective batch size per optimizer update is
        # MICRO_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS.
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        # Train with fp16 mixed precision.
        fp16=True,
        # Number of steps between logging training metrics.
        logging_steps=20,
        # NOTE(review): newer transformers releases presumably rename this
        # argument to `eval_strategy` -- confirm against the installed version.
        evaluation_strategy="steps",
        save_strategy="steps",
        # Number of steps between evaluations.
        eval_steps=200,
        # Number of steps between saving checkpoints.
        save_steps=200,
        # Directory that receives the checkpoints.
        output_dir="lora_weight/llama-3",
        # The maximum number of checkpoints to keep. Older checkpoints will be deleted to save space.
        save_total_limit=3,
        # Whether to load the best model found during training at the end of the training process.
        # NOTE(review): presumably requires the eval and save schedules to line
        # up (both are every 200 steps here) -- confirm in the TrainingArguments docs.
        load_best_model_at_end=True,
    ),
    # Prepare batches of data; mlm=False selects causal-LM batches rather than
    # masked-LM ones.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Disable caching in the model's configuration during training (saves memory).
model.config.use_cache = False

# NOTE: the early alpaca-lora recipe monkey-patched `model.state_dict` with
# `get_peft_model_state_dict` at this point. That override is intentionally
# not applied: PeftModel.save_pretrained() already extracts the adapter
# weights itself, and running the extraction twice strips the adapter name
# from the parameter keys, so the saved adapter file ends up (almost) empty.
trainer.train()

In [None]:
# Save the trained LoRA adapter together with the tokenizer files into one
# directory.
lora_path = './llama3_lora'
trainer.model.save_pretrained(lora_path)
tokenizer.save_pretrained(lora_path)