In [1]:
#这是第一个完整训练代码

In [2]:
class CFG:
    num_train_epochs = 1
    output_dir = '/root/autodl-fs/weights/chatglm3-6b_checkpoint'

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model

import sys
import json
import pandas as pd
from tqdm import tqdm

In [4]:
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
from Static import prompt_dict, st, si

In [5]:
model_path = '/root/autodl-fs/weights/chatglm3-6b'
data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # lora的目标位置，具体有哪些可选项可打印出源码中的key_list 注意不同的模型中的定义名称不同
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [6]:
%%time
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).cuda().float()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 7.34 s, sys: 28.8 s, total: 36.1 s
Wall time: 1min 11s


In [7]:
#施加peft lora
model_tools.model_profile(model)
print('conducting peft lora ---------------')
model = get_peft_model(model, config)
model_tools.model_profile(model)

Total Parameters: 6243584000
Trainable Parameters: 6243584000
Percentage of Trainable Parameters: 100.00%
conducting peft lora ---------------
Total Parameters: 6247483392
Trainable Parameters: 3899392
Percentage of Trainable Parameters: 0.06%


In [8]:
class instruction_dataset(Dataset):
    def __init__(self, data_path:'str', tokenizer, truncate_length, max_query_length):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []
        
        with open(data_path, 'r') as file:
            for line in file:
                sample=json.loads(line)
                # input_ids的结构应该为：prompt_tokens, src_tokens, [gMASK], <sop>, tgt_tokens, <eop>, [PAD]... 
                # 或者简化一点，即为 query, [gMASK], <sop>, answer, <eop>, [PAD]... 
                # padding的目的是为了对齐各个instance，以组成batch（当然batch_size=1时其实没必要）
                # 总体的input_ids的长度不超过truncate_length，其中query的长度不超过max_query_length，同理可以计算出answer的最大长度
                max_answer_length = truncate_length - max_query_length - 3
                
                # 判断query的长度
                query = sample['question']
                query_ids = tokenizer.encode(query, add_special_tokens=False)
                if len(query_ids) > max_query_length:
                    query_ids = query_ids[:max_query_length]
                
                # 判断answer的长度
                answer = sample['response_j']
                answer_ids = tokenizer.encode(answer, add_special_tokens=False)
                if len(answer) > max_answer_length:
                    answer_ids = answer_ids[:max_answer_length]
                    
                # 合并
                input_ids = query_ids + [si['[gMASK]']] + [si['sop']] + answer_ids + [si['eop']]
                pre_context_length = input_ids.index(si['sop'])
                end_answer_index = input_ids.index(si['eop'])
                
                # padding
                padding_length=truncate_length-len(input_ids)
                input_ids+=padding_length*[tokenizer.pad_token_id]
                
                # 制作labels；其中query部分，pad部分均不参与loss的计算 # 因为需要整体向左移动，所以要少填充一个
                labels = [-100] * (pre_context_length-0) + input_ids[pre_context_length+1: end_answer_index+1]
                labels = labels + [-100] * (truncate_length-len(labels))
                
                # 制作attention_mask
                eop_position = input_ids.index(si['eop'])+1
                attention_mask = [True]*eop_position
                attention_mask += [False]*(truncate_length-len(attention_mask))
                
                self.examples.append({
                    'query' : query,
                    'answer' : answer,
                    'input_ids' : input_ids,
                    'labels' : labels,
                    'attention_mask' : attention_mask,
                })
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        instance = self.examples[item]
        return instance

In [9]:
def coll_fn(batch:list):
    input_labels = []
    labels = []
    attention_mask = []
    for sample in batch:
        # 实际上词表长度只有65024，所以int32就可以了 # attention_mask用bool就行 (我收回我的画，完全是玄学)
        input_labels.append(torch.tensor(sample['input_ids'], dtype=torch.long))
        labels.append(torch.tensor(sample['labels'], dtype=torch.long))
        attention_mask.append(torch.tensor(sample['attention_mask'], dtype=torch.float64)) #, dtype=torch.bool
    batch = {'input_ids':input_labels, 'labels':labels, 'attention_mask': attention_mask}
    batch = {name:torch.stack(item).cuda() for name,item in batch.items()} #对于一个元素不为基本元素的list，需要使用stack方法
    return batch

In [10]:
%%time
finetuning_instruction=instruction_dataset(data_path, tokenizer, 128, 32)
instruction_loader = DataLoader(finetuning_instruction, batch_size=2, shuffle=True, collate_fn=coll_fn)

CPU times: user 4.7 s, sys: 66.1 ms, total: 4.77 s
Wall time: 4.77 s


In [11]:
for item in instruction_loader:
    pass
    print(item)
    break

{'input_ids': tensor([[  307, 30953, 30924, 10346,   356,  7841,   290, 19834, 30928,   291,
           552,  2482, 30930, 64790, 64792,  2871, 14474, 30928,   457,   330,
           260,  2529, 10057,   289,  5559, 30932,   498,   354, 30953, 30917,
          1447,   289,  5550,   267,  6840,  6157,   291,   260,  3544,   293,
          1625, 30941, 11000,  4686,   285,  2313, 30930,  2641, 30953, 30917,
           636,  1811,   289,  5478,   612, 10419,  2326,   293,  1053,   260,
          1377,   289,   833,   344,  5559,   475, 19834, 30928, 30930,   666,
           740,  1860,  5000,  4917,  3462, 30932,  2310,  3500,   356,   475,
          4881, 30932,   400, 10944,  2666,   289,  1378,   475,  1254, 30941,
         29038, 30930, 64793,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [12]:
# 定义参数优化器 & 学习率优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=50,
        num_training_steps=(len(instruction_loader) * CFG.num_train_epochs),
    )

In [None]:
model.train()

for epoch in range(CFG.num_train_epochs):
    for step, batch in tqdm(enumerate(instruction_loader)):
        # 前向传播
        outputs = model(**batch)
        #break
        # 计算loss
        loss = outputs.loss
        if step%10==0:print(loss)
        # 反向传播
        loss.backward()
        # 优化器调度
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

0it [00:00, ?it/s]

tensor(14.2441, device='cuda:0', grad_fn=<NllLossBackward0>)


11it [00:05,  2.78it/s]

tensor(15.5638, device='cuda:0', grad_fn=<NllLossBackward0>)


21it [00:08,  2.84it/s]

tensor(15.5320, device='cuda:0', grad_fn=<NllLossBackward0>)


31it [00:12,  2.83it/s]

tensor(16.0660, device='cuda:0', grad_fn=<NllLossBackward0>)


41it [00:15,  2.84it/s]

tensor(16.8132, device='cuda:0', grad_fn=<NllLossBackward0>)


51it [00:19,  2.84it/s]

tensor(16.7698, device='cuda:0', grad_fn=<NllLossBackward0>)


61it [00:22,  2.83it/s]

tensor(15.8154, device='cuda:0', grad_fn=<NllLossBackward0>)


71it [00:26,  2.84it/s]

tensor(15.6342, device='cuda:0', grad_fn=<NllLossBackward0>)


81it [00:29,  2.83it/s]

tensor(17.2673, device='cuda:0', grad_fn=<NllLossBackward0>)


91it [00:33,  2.81it/s]

tensor(13.6912, device='cuda:0', grad_fn=<NllLossBackward0>)


101it [00:36,  2.82it/s]

tensor(13.9155, device='cuda:0', grad_fn=<NllLossBackward0>)


111it [00:40,  2.81it/s]

tensor(13.0864, device='cuda:0', grad_fn=<NllLossBackward0>)


121it [00:43,  2.82it/s]

tensor(12.2593, device='cuda:0', grad_fn=<NllLossBackward0>)


131it [00:47,  2.81it/s]

tensor(13.5788, device='cuda:0', grad_fn=<NllLossBackward0>)


141it [00:50,  2.80it/s]

tensor(12.0909, device='cuda:0', grad_fn=<NllLossBackward0>)


151it [00:54,  2.81it/s]

tensor(11.6222, device='cuda:0', grad_fn=<NllLossBackward0>)


161it [00:58,  2.80it/s]

tensor(10.8207, device='cuda:0', grad_fn=<NllLossBackward0>)


171it [01:01,  2.80it/s]

tensor(10.3772, device='cuda:0', grad_fn=<NllLossBackward0>)


181it [01:05,  2.80it/s]

tensor(9.6305, device='cuda:0', grad_fn=<NllLossBackward0>)


191it [01:08,  2.80it/s]

tensor(8.4686, device='cuda:0', grad_fn=<NllLossBackward0>)


201it [01:12,  2.80it/s]

tensor(8.5191, device='cuda:0', grad_fn=<NllLossBackward0>)


211it [01:15,  2.80it/s]

tensor(7.7992, device='cuda:0', grad_fn=<NllLossBackward0>)


221it [01:19,  2.81it/s]

tensor(7.7659, device='cuda:0', grad_fn=<NllLossBackward0>)


231it [01:23,  2.81it/s]

tensor(7.4591, device='cuda:0', grad_fn=<NllLossBackward0>)


241it [01:26,  2.81it/s]

tensor(7.4170, device='cuda:0', grad_fn=<NllLossBackward0>)


251it [01:30,  2.81it/s]

tensor(6.8910, device='cuda:0', grad_fn=<NllLossBackward0>)


261it [01:33,  2.80it/s]

tensor(6.8380, device='cuda:0', grad_fn=<NllLossBackward0>)


271it [01:37,  2.80it/s]

tensor(6.8038, device='cuda:0', grad_fn=<NllLossBackward0>)


281it [01:40,  2.80it/s]

tensor(6.3111, device='cuda:0', grad_fn=<NllLossBackward0>)


291it [01:44,  2.81it/s]

tensor(6.1503, device='cuda:0', grad_fn=<NllLossBackward0>)


301it [01:47,  2.81it/s]

tensor(6.5290, device='cuda:0', grad_fn=<NllLossBackward0>)


311it [01:51,  2.80it/s]

tensor(5.9817, device='cuda:0', grad_fn=<NllLossBackward0>)


321it [01:55,  2.80it/s]

tensor(5.7942, device='cuda:0', grad_fn=<NllLossBackward0>)


331it [01:58,  2.80it/s]

tensor(6.1678, device='cuda:0', grad_fn=<NllLossBackward0>)


341it [02:02,  2.79it/s]

tensor(5.9975, device='cuda:0', grad_fn=<NllLossBackward0>)


351it [02:05,  2.80it/s]

tensor(5.8508, device='cuda:0', grad_fn=<NllLossBackward0>)


361it [02:09,  2.79it/s]

tensor(6.3617, device='cuda:0', grad_fn=<NllLossBackward0>)


363it [02:10,  2.79it/s]

In [None]:
#模型的保存
model.save_pretrained(CFG.output_dir)

In [None]:
model.train()

for epoch in range(CFG.num_train_epochs):
    for step, batch in tqdm(enumerate(instruction_loader)):
        # 前向传播
        outputs = model(**batch)

        logits = outputs.logits
        
        max_indices = torch.argmax(logits, dim=-1)
        
        #print(max_indices)
        #print(outputs.logits)
        #print(batch['labels'])
        #break
        # 计算loss
        loss = outputs.loss
        print(loss)
        if step==10:break
        # 反向传播
        loss.backward()
        for name,param in model.named_parameters():
            break
            print(name, param.grad)
        print('***'*30)
        if step==2:break
        # 优化器调度
        optimizer.step()
        for name,param in model.named_parameters():
            break
            print(name, param.grad)
        #lr_scheduler.step()
        optimizer.zero_grad()