In [1]:
# Model training code built on the Loss_detect work; the focus is on reducing training resource consumption.

In [2]:
class CFG:
    # ---- filesystem locations ----
    # Base model weights read by the tokenizer / model loading cell.
    model_path = '/root/autodl-tmp/weights/chatglm3-6b'
    # JSONL file consumed by the dataset cell.
    data_path = '/root/autodl-tmp/dataset/OESD-GG-zh_cn-1/single_query.jsonl'
    # Directory referenced by the (commented-out) save cell and the `!ls` cell.
    output_dir = '/root/autodl-tmp/checkpoints/glm3'

    # ---- dataset record keys ----
    query_key = 'User'
    # NOTE(review): spelling kept exactly as-is; presumably it matches the key
    # used in the dataset file — verify before "fixing" it.
    answer_key = 'Assisstant'

    # ---- sequence / batch shape ----
    batch_size = 2
    max_tokens = 512   # total padded length of each example
    max_query = 256    # maximum number of query tokens kept per example

    # ---- optimisation schedule ----
    lr = 1e-5
    warm_up_steps = 200
    num_train_epochs = 5
    max_train_steps = None  # None disables the early-stop check in lora_tuning

In [2]:
import torch
import deepspeed
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

import sys
import json
import pandas as pd
from tqdm import tqdm

[2024-03-06 16:47:29,055] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# Render matplotlib figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format='svg'

In [5]:
# Put the local components directory on the import path so the project
# modules imported below can be resolved.
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
# `si` is indexed further down with '[gMASK]', 'sop' and 'eop' to obtain the
# special-token ids; `prompt_dict` and `st` are not referenced in the cells shown here.
from Static import prompt_dict, st, si

In [6]:
# NOTE(review): these two module-level paths differ from CFG.model_path /
# CFG.data_path, and the loading cells shown in this notebook read the CFG
# values instead — confirm whether these assignments are still needed.
data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'
model_path = '/root/autodl-fs/weights/chatglm3-6b'

# LoRA adapter settings, handed to get_peft_model in a later cell.
config = LoraConfig(
    task_type="CAUSAL_LM",
    # Modules that LoRA is attached to. The available options can be found by
    # printing key_list in the library source; note that the module names
    # differ from model to model.
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [7]:
%%time
# Load the tokenizer and the base model from CFG.model_path. The model is moved
# to the GPU and cast to float32 (`.cuda().float()`).
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(CFG.model_path, trust_remote_code=True).cuda().float()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 6.65 s, sys: 20.9 s, total: 27.6 s
Wall time: 12.2 s


In [8]:
#施加peft lora
# Apply PEFT LoRA, printing the parameter profile before and after wrapping.
model_tools.model_profile(model)
print('conducting peft lora ---------------')
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would pass an already-wrapped model to get_peft_model again.
model = get_peft_model(model, config)
model_tools.model_profile(model)

Total Parameters: 6243584000
Trainable Parameters: 6243584000
Percentage of Trainable Parameters: 100.00%
conducting peft lora ---------------
Total Parameters: 6247483392
Trainable Parameters: 3899392
Percentage of Trainable Parameters: 0.06%


In [9]:
class instruction_dataset(Dataset):
    """Instruction-tuning dataset built from a JSONL file of query/answer pairs.

    Each non-blank line of the file is parsed as one JSON object and converted
    into fixed-length lists laid out as::

        query, [gMASK], sop, answer, eop, [PAD]...

    The ids of ``[gMASK]``, ``sop`` and ``eop`` are read from the module-level
    ``si`` mapping. Padding aligns all examples to ``truncate_length`` so they
    can be stacked into a batch.

    Args:
        data_path: path of the JSONL file (read as UTF-8).
        tokenizer: object providing ``encode(text, add_special_tokens=False)``
            returning a list of ids, and a ``pad_token_id`` attribute.
        truncate_length: length of every ``input_ids`` / ``labels`` /
            ``attention_mask`` list that is produced.
        max_query_length: maximum number of query tokens kept per example.
        query_key: JSON key holding the query text.
        answer_key: JSON key holding the answer text.

    Raises:
        ValueError: if ``truncate_length`` is too small to hold
            ``max_query_length`` query tokens plus the three special tokens.
    """

    def __init__(self, data_path:'str', tokenizer, truncate_length, max_query_length, query_key, answer_key):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []

        # Three positions are reserved for [gMASK], sop and eop. The budget does
        # not depend on the sample, so it is computed once.
        max_answer_length = truncate_length - max_query_length - 3
        if max_answer_length < 0:
            raise ValueError(
                'truncate_length must be at least max_query_length + 3 '
                f'(got truncate_length={truncate_length}, max_query_length={max_query_length})'
            )

        # JSONL is read explicitly as UTF-8 instead of the platform default.
        with open(data_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                sample = json.loads(line)
                self.examples.append(
                    self._build_example(
                        sample[query_key],
                        sample[answer_key],
                        truncate_length,
                        max_query_length,
                        max_answer_length,
                    )
                )

    def _build_example(self, query, answer, truncate_length, max_query_length, max_answer_length):
        """Tokenize one query/answer pair into a padded training example."""
        tokenizer = self.tokenizer

        # Truncate on TOKEN counts. The previous check compared the character
        # length of the answer string with the token budget, so an answer with
        # more tokens than characters could overflow truncate_length.
        query_ids = tokenizer.encode(query, add_special_tokens=False)[:max_query_length]
        answer_ids = tokenizer.encode(answer, add_special_tokens=False)[:max_answer_length]

        sequence = query_ids + [si['[gMASK]'], si['sop']] + answer_ids + [si['eop']]
        # Positions are derived from the known layout rather than searched with
        # list.index, so an id that also occurs inside the query/answer (or
        # equals the pad id) cannot shift them.
        sop_index = len(query_ids) + 1
        sequence_length = len(sequence)          # index of eop + 1
        padding_length = truncate_length - sequence_length

        input_ids = sequence + [tokenizer.pad_token_id] * padding_length

        # Only the answer tokens and eop contribute to the loss; the query,
        # [gMASK], sop and the padding are masked with -100. Labels stay aligned
        # with input_ids position by position.
        labels = [-100] * (sop_index + 1) + sequence[sop_index + 1:] + [-100] * padding_length

        # True for every real token up to and including eop, False for padding.
        attention_mask = [True] * sequence_length + [False] * padding_length

        return {
            'query' : query,
            'answer' : answer,
            'input_ids' : input_ids,
            'labels' : labels,
            'attention_mask' : attention_mask,
        }

    def __len__(self):
        """Return the number of examples loaded from the file."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the example dict stored at index ``item``."""
        instance = self.examples[item]
        return instance

In [10]:
def coll_fn(batch:list):
    """Collate a list of example dicts into one dict of stacked CUDA tensors.

    Args:
        batch: list of dicts, each carrying 'input_ids', 'labels' and
            'attention_mask' sequences of equal length.

    Returns:
        dict with the same three keys; every value is the per-example tensors
        stacked along a new leading dimension and moved to the GPU.
    """
    # Dtype used for each field. The original author's note suggests narrower
    # dtypes (int32 for ids, bool for the mask) were considered and then
    # dropped, so these are kept exactly as they were.
    field_dtypes = (
        ('input_ids', torch.long),
        ('labels', torch.long),
        ('attention_mask', torch.float64),
    )

    collated = {}
    for field, dtype in field_dtypes:
        rows = [torch.tensor(example[field], dtype=dtype) for example in batch]
        # A list of tensors has to be combined with torch.stack before it can
        # be used as a single batch tensor.
        collated[field] = torch.stack(rows).cuda()
    return collated

In [11]:
%%time
# Build the dataset from CFG.data_path, then wrap it in a shuffling DataLoader
# that batches examples through coll_fn.
finetuning_instruction = instruction_dataset(CFG.data_path, tokenizer, CFG.max_tokens, CFG.max_query, CFG.query_key, CFG.answer_key)
instruction_loader = DataLoader(finetuning_instruction, batch_size=CFG.batch_size, shuffle=True, collate_fn=coll_fn)

CPU times: user 9.22 s, sys: 201 ms, total: 9.42 s
Wall time: 9.41 s


In [12]:
# 定义参数优化器 & 学习率优化器
# Parameter optimizer and learning-rate scheduler.
# NOTE(review): model.parameters() also yields the frozen base weights (the
# profile output above reports only ~0.06% of parameters as trainable) —
# consider passing only parameters with requires_grad=True.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)

# Warm-up for CFG.warm_up_steps steps; the total step count is one step per
# batch for every epoch.
# NOTE(review): CFG.max_train_steps is not taken into account in the total.
lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=CFG.warm_up_steps,
        num_training_steps=(len(instruction_loader) * CFG.num_train_epochs),
    )

In [None]:
# Hand model, optimizer and scheduler to DeepSpeed, configured from a JSON file
# given as a relative path.
# NOTE(review): this cell has no execution count, and the lora_tuning loops
# below drive `optimizer` through torch.cuda.amp.GradScaler rather than through
# the object returned here — confirm whether this cell is meant to run before
# training, and adapt the loop if it is.
deepspeed_config_path = "ds_config.json"
model, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    config_params=deepspeed_config_path
)

In [None]:
def lora_tuning():
    """Run mixed-precision LoRA fine-tuning for CFG.num_train_epochs epochs.

    Uses the notebook-level ``model``, ``optimizer``, ``lr_scheduler``,
    ``instruction_loader`` and ``CFG``.

    NOTE: a later cell defines ``lora_tuning`` again (with periodic loss
    printing and a ``CFG.max_train_steps`` early stop); when both cells are run
    in order, that later definition is the one that takes effect.

    Returns:
        dict: ``{'loss': [...]}`` holding one detached loss tensor per step.
    """
    # The loss history is local and returned; previously `status` was never
    # defined here, so the first step raised NameError.
    status = {'loss': []}

    # Training phase
    model.train()
    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(CFG.num_train_epochs):
        # tqdm wraps the loader itself (not enumerate) so it can show a total.
        for step, batch in enumerate(tqdm(instruction_loader)):
            # Forward pass and loss computation under autocast
            with torch.cuda.amp.autocast():
                outputs = model(**batch)
                loss = outputs.loss
            # Store a detached tensor so the history does not keep autograd
            # state alive; `.item()` still works on it for plotting.
            status['loss'].append(loss.detach())
            # Backward pass on the scaled loss
            scaler.scale(loss).backward()
            # Optimizer / scaler / scheduler updates
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            lr_scheduler.step()
    return status

In [13]:
def lora_tuning():
    """Run mixed-precision LoRA fine-tuning and record the loss of every step.

    Uses the notebook-level ``model``, ``optimizer``, ``lr_scheduler``,
    ``instruction_loader`` and ``CFG``. Training stops after
    ``CFG.num_train_epochs`` epochs, or earlier once ``CFG.max_train_steps``
    optimizer steps have been taken (when that value is not None).

    Returns:
        dict: ``{'loss': [...]}`` holding one detached loss tensor per
        optimizer step, in execution order.
    """
    status = {'loss': []}

    # Training phase
    model.train()
    scaler = torch.cuda.amp.GradScaler()

    # Optimizer steps taken so far, counted across epochs. The per-epoch `step`
    # index restarts at 0 each epoch, so it cannot enforce a limit larger than
    # one epoch.
    global_step = 0

    for epoch in range(CFG.num_train_epochs):
        # tqdm wraps the loader itself (not enumerate) so it can show a total.
        for step, batch in enumerate(tqdm(instruction_loader)):
            # Check the limit before the forward pass, so no batch is run
            # through the model only to be discarded.
            if CFG.max_train_steps is not None and global_step >= CFG.max_train_steps:
                return status

            # Forward pass and loss computation under autocast
            with torch.cuda.amp.autocast():
                outputs = model(**batch)
                loss = outputs.loss

            # Store a detached tensor so the history does not keep autograd
            # state alive; `.item()` still works on it for plotting.
            status['loss'].append(loss.detach())
            if step % 10 == 0:
                print(loss)

            # Backward pass on the scaled loss
            scaler.scale(loss).backward()
            # Optimizer / scaler / scheduler updates
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            lr_scheduler.step()
            global_step += 1
    return status

In [None]:
#status=lora_tuning()

In [None]:
# Save the model
#model.save_pretrained(CFG.output_dir)

In [None]:
# List the contents of the checkpoint directory (CFG.output_dir) from the shell.
!ls $CFG.output_dir

```
import matplotlib.pyplot as plt
import torch

# Pull a plain Python number out of every recorded loss tensor.
loss = [value.item() for value in status['loss']]

# Scatter plot of loss against step index, one circle per step
# (the figure size can be adjusted here).
plt.figure(figsize=(10, 5))
plt.scatter(range(len(loss)), loss, marker='o')

# Axis labels and title
plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Loss over Steps')

# Turn the grid on, then render the figure
plt.grid(True)
plt.show()
```