# 在 AdvertiseGen 数据集上使用 QLoRA 微调 ChatGLM3-6B

In [1]:
# --- Model and data -------------------------------------------------------
# Hub identifiers handed to from_pretrained() and load_dataset() further down.
model_name_or_path = "THUDM/chatglm3-6b"
train_data_path = "shibing624/AdvertiseGen"
eval_data_path = None  # no evaluation split is configured

# --- Tokenization ---------------------------------------------------------
# Text prepended to every example's "content" field, and the token budgets
# applied to the prompt and the answer respectively.
prompt_text = ""
max_input_length, max_output_length = 512, 1536

# --- LoRA adapter ---------------------------------------------------------
lora_rank, lora_alpha, lora_dropout = 4, 32, 0.05

# --- Run control ----------------------------------------------------------
seed = 8  # used when shuffling the tokenized dataset
resume_from_checkpoint = None  # intended checkpoint to resume from; None = start fresh
# NOTE(review): the quantization config picks its dtype with the literal key
# "bf16", so this value does not currently select the compute dtype - confirm
# which of the two is intended.
compute_dtype = "fp32"

## 数据准备

In [2]:
from datasets import load_dataset
# Load the dataset identified by train_data_path; the result is indexed by
# split name in the following cells (e.g. dataset["train"]).
dataset = load_dataset(train_data_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the first training record to inspect its raw fields.
dataset["train"][0]

{'content': '类型#裤*版型#宽松*风格#性感*图案#线条*裤型#阔腿裤',
 'summary': '宽松的阔腿裤这两年真的吸粉不少，明星时尚达人的心头爱。毕竟好穿时尚，谁都能穿出腿长2米的效果宽松的裤腿，当然是遮肉小能手啊。上身随性自然不拘束，面料亲肤舒适贴身体验感棒棒哒。系带部分增加设计看点，还让单品的设计感更强。腿部线条若隐若现的，性感撩人。颜色敲温柔的，与裤子本身所呈现的风格有点反差萌。'}

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer for the base checkpoint. trust_remote_code allows the
# tokenizer implementation shipped in the model repository to be executed, and
# revision pins the repository snapshot that is fetched.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, revision="b098244", trust_remote_code=True
)

In [5]:
def tokenize_func(example, tokenizer, ignore_label_id=-100):
    """Convert one raw record into `input_ids` and `labels` for training.

    The prompt is `prompt_text + example["content"]`, followed by a newline
    and `example["input"]` when that field is present and not blank. The
    target text is `example["summary"]`.

    Args:
        example: Mapping with "content" and "summary" keys and an optional
            "input" key.
        tokenizer: Object exposing `encode(text=..., add_special_tokens=...)`
            and `build_inputs_with_special_tokens(ids_a, ids_b)`.
        ignore_label_id: Value written into `labels` for every position that
            belongs to the prompt.

    Returns:
        A dict with "input_ids" (the tokenizer's combined sequence) and
        "labels" (prompt positions replaced by `ignore_label_id`, the rest
        copied from `input_ids`).
    """
    # Assemble the prompt text.
    prompt = prompt_text + example["content"]
    extra_input = example.get("input", None)
    if extra_input and extra_input.strip():
        prompt += f'\n{extra_input}'

    # Target text.
    target = example["summary"]

    # Tokenize both sides without special tokens; they are added below.
    prompt_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
    target_ids = tokenizer.encode(text=target, add_special_tokens=False)

    # Enforce the length budgets, reserving 2 positions on the prompt side and
    # 1 on the answer side for the special tokens added afterwards.
    prompt_budget = max_input_length - 2
    target_budget = max_output_length - 1
    if len(prompt_ids) > prompt_budget:
        prompt_ids = prompt_ids[:prompt_budget]
    if len(target_ids) > target_budget:
        target_ids = target_ids[:target_budget]

    # Let the tokenizer lay out the final sequence.
    input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)

    # Everything up to the end of the prompt is excluded from the loss.
    # assumes the tokenizer prepends exactly 2 special tokens - TODO confirm
    masked_span = len(prompt_ids) + 2
    labels = [ignore_label_id] * masked_span + input_ids[masked_span:]

    return {"input_ids": input_ids, "labels": labels}

In [6]:
# Tokenize the training split one record at a time, dropping the original
# columns so only the fields returned by tokenize_func remain.
column_names = dataset["train"].column_names
tokenized_dataset = dataset["train"].map(
    lambda record: tokenize_func(record, tokenizer),
    remove_columns=column_names,
    batched=False,
)

In [7]:
# Shuffle with the configured seed, then flatten the resulting indices mapping.
tokenized_dataset = tokenized_dataset.shuffle(seed=seed).flatten_indices()

## 定义 DataCollatorForChatGLM

In [8]:
import torch
from typing import List,Dict,Optional

class DataCollatorForChatGLM:
    """Collate tokenized examples into right-padded `input_ids` / `labels` tensors.

    Each example is padded up to the longest `input_ids` in the batch. When
    that longest length exceeds `max_length`, every row is cut to `max_length`.
    Rows are emitted longest-first; examples of equal length keep their
    original relative order.
    """

    def __init__(self, pad_token_id: int, max_length: int = 2048, ignore_label_id: int = -100):
        """Store the padding configuration.

        Args:
            pad_token_id: Value appended to `input_ids` as padding.
            max_length: Upper bound on the length of the returned rows.
            ignore_label_id: Value appended to `labels` as padding.
        """
        self.pad_token_id = pad_token_id
        self.max_length = max_length
        self.ignore_label_id = ignore_label_id

    def __call__(self, batch_data: List[Dict[str, List]]) -> Dict[str, torch.Tensor]:
        """Pad, optionally truncate, and stack one batch.

        Args:
            batch_data: Examples, each holding "input_ids" and "labels" lists.

        Returns:
            Dict with stacked LongTensors under "input_ids" and "labels".

        Raises:
            ValueError: If `batch_data` is empty (raised by `max`).
        """
        lengths = [len(example['input_ids']) for example in batch_data]
        longest = max(lengths)
        over_limit = longest > self.max_length

        # Stable sort on the negated length: longest first, ties keep batch order.
        order = sorted(range(len(batch_data)), key=lambda idx: -lengths[idx])

        id_rows, label_rows = [], []
        for idx in order:
            example = batch_data[idx]
            fill = longest - lengths[idx]  # padding needed to reach the longest row
            row_ids = example['input_ids'] + [self.pad_token_id] * fill
            row_labels = example['labels'] + [self.ignore_label_id] * fill
            if over_limit:
                # Clip rows only when the batch is longer than max_length.
                row_ids = row_ids[:self.max_length]
                row_labels = row_labels[:self.max_length]
            id_rows.append(torch.LongTensor(row_ids))
            label_rows.append(torch.LongTensor(row_labels))

        # Stack the per-example rows into 2-D batch tensors.
        return {'input_ids': torch.stack(id_rows), 'labels': torch.stack(label_rows)}
        

In [9]:
# Build the data collator, padding input_ids with the tokenizer's pad token id.
data_collator = DataCollatorForChatGLM(pad_token_id=tokenizer.pad_token_id)

## 训练模型

In [10]:
from transformers import AutoModel, BitsAndBytesConfig

# Short dtype names mapped to the corresponding torch dtypes.
_compute_dtype_map = dict(
    fp32=torch.float32,
    fp16=torch.float16,
    bf16=torch.bfloat16,
)

# Quantization settings used when loading the base model: 4-bit loading,
# "nf4" quantization type, double quantization enabled, bfloat16 compute dtype.
# NOTE(review): the dtype is chosen with the literal key "bf16" rather than
# the `compute_dtype` setting from the configuration cell - confirm intended.
q_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=_compute_dtype_map["bf16"],
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [11]:
# Load the base model with the quantization config defined above.
# device_map="auto" leaves device placement to the library, and revision pins
# the repository snapshot (same value as used for the tokenizer).
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="b098244",
    trust_remote_code=True,
    device_map="auto",
    quantization_config=q_config,
)

Loading checkpoint shards: 100%|█████████████████████████████| 7/7 [00:12<00:00,  1.84s/it]


### 预处理量化后的模型 

In [12]:
from peft import TaskType, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantized model before the
# LoRA adapters are attached.
kbit_model = prepare_model_for_kbit_training(model)

2024-03-26 01:02:17.180981: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-26 01:02:17.212788: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-26 01:02:17.368850: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-26 01:02:17.368886: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-26 01:02:17.400693: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [13]:
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
# Look up the LoRA target modules that PEFT's built-in mapping lists under the
# "chatglm" key.
target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING["chatglm"]

### LoRA 适配器设置

In [14]:
# LoRA adapter definition: rank, alpha and dropout come from the configuration
# cell; no bias terms are trained and the config is set up for training
# (inference_mode=False) on a causal language modelling task.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    target_modules=target_modules,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

In [15]:
# Wrap the prepared quantized model with the LoRA adapters described by lora_config.
qlora_model = get_peft_model(kbit_model,lora_config)

In [16]:
# Report how many parameters are trainable versus the total parameter count.
qlora_model.print_trainable_parameters()

trainable params: 974,848 || all params: 6,244,558,848 || trainable%: 0.01561115883009451


### 训练超参数配置

In [17]:
from transformers import TrainingArguments,Trainer

# Training hyperparameters: 16 samples per device with 4 gradient-accumulation
# steps, 3 epochs, linear schedule with a 0.1 warmup ratio, a log line every
# 10 steps, a checkpoint every 100 steps, and fp16 mixed precision.
training_args = TrainingArguments(
    output_dir=f"./models/{model_name_or_path}",
    # batch size and duration
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    # optimizer and schedule
    optim="adamw_torch",
    learning_rate=1e-3,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # logging and checkpointing
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    # precision
    fp16=True,
)

# Bind the LoRA-wrapped model, the tokenized training set and the collator.
trainer = Trainer(
    args=training_args,
    model=qlora_model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [18]:
# Start fine-tuning. The `resume_from_checkpoint` setting from the
# configuration cell is forwarded so an interrupted run can be continued from
# a saved checkpoint; while it is None this is equivalent to calling train()
# with no arguments.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,5.1727
20,5.09
30,4.8196
40,4.3705
50,4.0315
60,3.8566
70,3.7368
80,3.6434
90,3.5708
100,3.499


Checkpoint destination directory ./models/THUDM/chatglm3-6b/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=5370, training_loss=3.0625738591455214, metrics={'train_runtime': 65619.8668, 'train_samples_per_second': 5.239, 'train_steps_per_second': 0.082, 'total_flos': 2.074797694850519e+18, 'train_loss': 3.0625738591455214, 'epoch': 3.0})

In [19]:
# Persist trainer.model with save_pretrained under ./models/saved/<model_name_or_path>.
trainer.model.save_pretrained(f"./models/saved/{model_name_or_path}")