In [1]:
import os

import torch
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

2023-04-20 11:44:12.600284: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-20 11:44:12.774593: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-20 11:44:13.540069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-04-20 11:44:13.540178: W tensorflow/

## 读取模型和tokenizer

In [2]:
# Use the local copy of the weights when the directory exists next to the
# notebook; otherwise fall back to the Hub identifier.
checkpoint = "./chatglm-6b" if os.path.exists('chatglm-6b') else "THUDM/chatglm-6b"

# trust_remote_code=True executes the modelling/tokenizer code shipped with
# the checkpoint.
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

device = "cuda"
max_src_length = 256  # maximum input (prompt) length, in tokens
max_dst_length = 256  # maximum output (answer) length, in tokens

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


## 模型转为lora

In [3]:
def load_lora_config(model):
    """Wrap ``model`` with LoRA adapters and return the PEFT model.

    The adapters use rank 8, alpha 32 and dropout 0.1, and are attached to
    the modules named ``query_key_value``.

    Args:
        model: The base model handed to ``get_peft_model``.

    Returns:
        Whatever ``get_peft_model`` returns for ``model`` and this config.
    """
    lora_settings = dict(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,  # adapters are created for training
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query_key_value"],
    )
    return get_peft_model(model, LoraConfig(**lora_settings))


# Replace the base model with its LoRA-wrapped version.
# NOTE(review): this rebinds `model`, so re-running the cell passes the
# already-wrapped model back into load_lora_config — restart the kernel
# before re-running to avoid wrapping twice.
model = load_lora_config(model)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()



trainable params: 3670016 || all params: 6176956416 || trainable%: 0.05941463324063059


## 数据处理函数

In [4]:
# Template for the question part of a prompt; `{}` receives the question.
PROMPT_PATTERN = "问：{}"
# Separator placed between the question and the answer.
SEP_PATTERN = "\n答： "


def create_prompt(question):
    """Return ``(prompt_text, separator)`` for ``question``.

    The prompt text is ``PROMPT_PATTERN`` filled with the question; the
    separator is ``SEP_PATTERN`` unchanged.
    """
    prompt_text = PROMPT_PATTERN.format(question)
    return prompt_text, SEP_PATTERN


def create_prompt_ids(tokenizer, question, max_src_length):
    """Tokenize the prompt for ``question`` and append the separator ids.

    Args:
        tokenizer: Object exposing ``encode(text, ...)`` returning a list of ids.
        question: Question text inserted into the prompt template.
        max_src_length: Length budget used to truncate the question part.

    Returns:
        List of token ids: truncated prompt ids followed by the separator ids
        (the separator is encoded with ``add_special_tokens=True``).
    """
    prompt_text, sep_text = create_prompt(question)

    separator_ids = tokenizer.encode(sep_text, add_special_tokens=True)

    # assumes the tokenizer appends exactly two special tokens to the
    # separator — TODO confirm against the tokenizer in use
    n_special = 2
    question_budget = max_src_length - (len(separator_ids) - n_special)

    question_ids = tokenizer.encode(
        prompt_text,
        max_length=question_budget,
        truncation=True,
        add_special_tokens=False,
    )
    return question_ids + separator_ids


def create_inputs_and_labels(tokenizer, question, answer, device):
    """Build the ``input_ids`` and ``labels`` tensors for one QA pair.

    The prompt length is limited by the module-level ``max_src_length`` and
    the answer by ``max_dst_length``. Prompt positions in ``labels`` are set
    to -100; answer tokens and the trailing EOS token keep their ids.

    Args:
        tokenizer: Tokenizer exposing ``encode`` and ``eos_token_id``.
        question: Question text.
        answer: Answer text.
        device: Device on which both tensors are created.

    Returns:
        Tuple ``(inputs, labels)`` of 1-D ``torch.long`` tensors of equal length.
    """
    prompt_ids = create_prompt_ids(tokenizer, question, max_src_length)
    answer_ids = tokenizer.encode(
        answer,
        max_length=max_dst_length,
        truncation=True,
        add_special_tokens=False,
    )

    # Target part shared by inputs and labels: answer followed by EOS.
    target_ids = answer_ids + [tokenizer.eos_token_id]
    ignored_prefix = [-100] * len(prompt_ids)

    input_tensor = torch.tensor(prompt_ids + target_ids,
                                dtype=torch.long,
                                device=device)
    label_tensor = torch.tensor(ignored_prefix + target_ids,
                                dtype=torch.long,
                                device=device)
    return input_tensor, label_tensor


def get_attention_mask(tokenizer, input_ids, device):
    """Build a boolean attention mask for a single 1-D token sequence.

    A lower-triangular 0/1 matrix is created, and every column before the
    first BOS token is set to 1 for all rows. The result is True exactly
    where that matrix is 0 (presumably "masked out" for the model — confirm
    against the model implementation).

    Args:
        tokenizer: Object exposing ``bos_token_id``.
        input_ids: 1-D tensor of token ids containing the BOS token.
        device: Device on which the mask is created.

    Returns:
        Bool tensor of shape ``(1, seq_len, seq_len)``.

    Raises:
        ValueError: If the BOS token id is not present in ``input_ids``.
    """
    tokens = input_ids.tolist()
    prefix_len = tokens.index(tokenizer.bos_token_id)
    total = len(tokens)

    visible = torch.tril(torch.ones((total, total), device=device))
    # Every position may look at the whole prefix before BOS.
    visible[..., :prefix_len] = 1

    return (visible.unsqueeze(0) < 0.5).bool()


def get_position_ids(tokenizer, input_ids, device, position_encoding_2d=True):
    """Build position ids for a single 1-D token sequence.

    The context is everything before the first BOS token. If the sequence
    contains the tokenizer's mask token, all positions from BOS onwards are
    set to the index of that mask token; otherwise the gMASK token is looked
    up and positions stay ``0..seq_len-1``.

    Args:
        tokenizer: Object exposing ``bos_token_id``, ``mask_token_id`` and
            ``gmask_token_id``.
        input_ids: 1-D tensor of token ids containing BOS and a mask/gMASK token.
        device: Device on which the result is created.
        position_encoding_2d: When True, also return block position ids
            (0 over the context, then 1, 2, ... from BOS onwards).

    Returns:
        ``torch.long`` tensor of shape ``(2, seq_len)`` when
        ``position_encoding_2d`` is True, else of shape ``(seq_len,)``.

    Raises:
        ValueError: If BOS, or the selected mask token, is not in ``input_ids``.
    """
    seq = input_ids.tolist()
    context_len = seq.index(tokenizer.bos_token_id)
    seq_len = len(seq)

    mask = tokenizer.mask_token_id
    gmask = tokenizer.gmask_token_id

    # A real boolean: the previous code stored the gMASK token id here, which
    # would be misread as False for a token id of 0.
    use_gmask = mask not in seq
    mask_token = gmask if use_gmask else mask
    mask_position = seq.index(mask_token)

    # Shared by the 1-D and 2-D encodings.
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device)
    if not use_gmask:
        position_ids[context_len:] = mask_position

    if not position_encoding_2d:
        return position_ids

    block_position_ids = torch.cat(
        (torch.zeros(context_len, dtype=torch.long, device=device),
         torch.arange(seq_len - context_len, dtype=torch.long, device=device) + 1))
    return torch.stack((position_ids, block_position_ids), dim=0)

## 测试用的私有数据

In [5]:
# Tiny in-notebook sample set: each record is a dict with a "question" and an
# "answer" string; the keys match the keyword arguments that
# create_inputs_and_labels receives via **item_data in QADataset.
train_data = [
    {
        "question": "为什么 Midjourney 效果远远好于开源的 Stable Diffusion Model?",
        "answer": "因为题主不会用SD,从门外汉角度得出了错误结论。 Midjourney特点是新手友好,但可控性差、细节优化难。下限高，但上限低。张张都精致唬人，但你想要调节细节时，会发现越调越歪，哪哪都不对劲，抽盲盒一样。"
    },
    {
        "question": "核酸检测机构需要什么资质",
        "answer": "市卫健委将审核申报资料，并结合市区新型冠状病毒核酸检测需求进行综合评估。经评估，具备相应资质和条件的，纳入我市开展新型冠状病毒核酸检测机构名单，并通知进行新型冠状病毒实验活动备案。"
    },
]

In [6]:
# Wrap the raw records in a torch Dataset.
class QADataset(Dataset):
    """Map-style dataset that turns QA records into model-ready tensors.

    Each record is a dict whose keys are forwarded as keyword arguments to
    ``create_inputs_and_labels``. Tensors are created on the module-level
    ``device``.
    """

    def __init__(self, data, tokenizer) -> None:
        """Store the record sequence and the tokenizer used to encode it."""
        super().__init__()
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode record ``index`` into ids, labels, mask and position ids."""
        record = self.data[index]
        input_ids, labels = create_inputs_and_labels(self.tokenizer,
                                                     device=device,
                                                     **record)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": get_attention_mask(self.tokenizer, input_ids, device),
            "position_ids": get_position_ids(self.tokenizer, input_ids, device),
        }

# Collate function handed to the trainer as its data collator.
def collate_fn(batch):
    """Stack the per-sample tensors of ``batch`` along a new leading dimension.

    Args:
        batch: Sequence of dicts, each holding ``input_ids``, ``attention_mask``,
            ``labels`` and ``position_ids`` tensors. ``torch.stack`` requires
            the tensors under one key to share a shape.

    Returns:
        Dict with the same four keys, each mapped to the stacked tensor.
    """
    fields = ('input_ids', 'attention_mask', 'labels', 'position_ids')
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in fields
    }

## 模型训练

In [7]:
# Move the model onto the training device.
model.to(device)

# Training hyper-parameters.
training_args = TrainingArguments(
    "output",
    fp16=True,
    save_steps=100,
    save_total_limit=2,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=1,
    learning_rate=1e-4,
    max_steps=300,
    logging_steps=10,
    remove_unused_columns=False,
    seed=114514,
    data_seed=1919810,
    group_by_length=False,
    dataloader_pin_memory=False,
)

# Custom trainer that forwards position_ids to the model.
class ModifiedTrainer(Trainer):
    """Trainer whose loss computation passes all four batch tensors explicitly."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the forward pass and return the model's loss.

        Args:
            model: Model to call.
            inputs: Batch dict with ``input_ids``, ``attention_mask``,
                ``position_ids`` and ``labels``.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                the loss alone, as the base Trainer expects during
                evaluation/prediction.
            **kwargs: Extra keyword arguments some Trainer versions pass
                (e.g. ``num_items_in_batch``); accepted and ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            position_ids=inputs["position_ids"],
            labels=inputs["labels"],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Build the training dataset from the sample records.
train_dataset = QADataset(train_data, tokenizer=tokenizer)

# Build the trainer with the custom collate function.
trainer = ModifiedTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    data_collator=collate_fn,
    tokenizer=tokenizer,
)

In [8]:
# Run fine-tuning. With max_steps=300, batch size 1 and two samples this is
# 150 passes over the data (see 'epoch': 150.0 in the recorded output).
trainer.train()



Step,Training Loss
10,4.8525
20,3.5741
30,2.1685
40,0.9684
50,0.2189
60,0.0223
70,0.0056
80,0.003
90,0.0021
100,0.0016


TrainOutput(global_step=300, training_loss=0.3944643449022745, metrics={'train_runtime': 151.0658, 'train_samples_per_second': 1.986, 'train_steps_per_second': 1.986, 'total_flos': 868352082739200.0, 'train_loss': 0.3944643449022745, 'epoch': 150.0})

## 保存参数

In [11]:
def save_tuned_parameters(model, path):
    """Save only the trainable parameters of ``model`` to ``path``.

    Parameters with ``requires_grad`` set are written as detached CPU tensors,
    keyed by their names from ``model.named_parameters()``. Saving on CPU
    keeps the file loadable on machines without the training device and
    removes any dependency on a module-level ``device`` variable.

    Args:
        model: Module exposing ``named_parameters()``.
        path: Destination file passed to ``torch.save``.
    """
    saved_params = {
        name: param.detach().cpu()
        for name, param in model.named_parameters()
        if param.requires_grad
    }
    torch.save(saved_params, path)

# Directory that receives the tuned LoRA parameters.
model_save_path = 'outputpath'

# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(model_save_path, exist_ok=True)
# Use the variable rather than repeating the directory literal.
save_tuned_parameters(model, os.path.join(
    model_save_path, "chatglm-6b-lora.pt"))