In [1]:
#这是初始状态的代码

In [1]:
class CFG:
    num_train_epochs = 3

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model

import sys
import json
import pandas as pd
from tqdm import tqdm

In [3]:
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
from Static import prompt_dict, st, si

In [4]:
model_path = '/root/autodl-fs/weights/chatglm3-6b'
data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # lora的目标位置，具体有哪些可选项可打印出源码中的key_list 注意不同的模型中的定义名称不同
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()

In [None]:
model_tools.model_profile(model)

In [None]:
model = get_peft_model(model, config)

In [None]:
model_tools.model_profile(model)

In [None]:
tokenizer.pad_token_id

In [9]:
tokenizer.decode(si["[gMASK]"])

'[gMASK]'

In [13]:
max_src_len=100
max_tgt_len=20

src_tokens = tokenizer.tokenize("北京是哪国首都")

prompt_tokens = tokenizer.tokenize("接下来将是我的问题。")

if len(src_tokens) > max_src_len - len(prompt_tokens):
    src_tokens = src_tokens[:max_src_len - len(prompt_tokens)]

tgt_tokens = tokenizer.tokenize("中国")

if len(tgt_tokens) > max_tgt_len:
    tgt_tokens = tgt_tokens[:max_tgt_len]

tokens = prompt_tokens + src_tokens + [tokenizer.decode(si["[gMASK]"]), tokenizer.decode(si['sop'])] + tgt_tokens + [tokenizer.decode(si['eop'])]
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
context_length = input_ids.index(si['sop'])
mask_position = context_length - 1
# prompt和问题部分不参与损失值计算
labels = [-100] * context_length + input_ids[mask_position + 1:]
# 根据最大长度进行后填充
pad_len = max_len - len(input_ids)
input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
# 填充部分不参与损失值计算
labels = labels + [-100] * pad_len
# 区分有用的部分和填充的token
attention_mask = []
for input_id in input_ids:
    if input_id != tokenizer.pad_token_id:
        attention_mask.append(True)
    else:
        attention_mask.append(False)

input_ids, labels, attention_mask

['▁', '接下来', '将是', '我的', '问题', '。', '▁北京', '是', '哪', '国', '首都', '[gMASK]', 'sop', '▁中国', 'eop']


NameError: name 'max_len' is not defined

In [None]:
class Seq2SeqDataSet(Dataset):
    """数据处理函数"""
    def __init__(self, data_path, tokenizer, max_len, max_src_len, prompt_text):
        # -3是因为需要拼接三个特殊字符[gMASK]、<sop>、<eop>
        max_tgt_len = max_len - max_src_len - 3
        self.all_data = []
        with open(data_path, "r", encoding="utf-8") as fh:
            for i, line in enumerate(fh):
                sample = json.loads(line.strip())
                # chatglm的token不是中文的字，是词
                # add_special_tokens = True时会在末位添加["[gMASK]", "<sop>"]
                src_tokens = tokenizer.tokenize(sample["text"])
                # print(sample["text"])
                # print(src_tokens)
                prompt_tokens = tokenizer.tokenize(prompt_text)
                # 根据限制的长度对输入进行截断
                if len(src_tokens) > max_src_len - len(prompt_tokens):
                    src_tokens = src_tokens[:max_src_len - len(prompt_tokens)]

                tgt_tokens = tokenizer.tokenize(sample["answer"])
                # 根据限制的长度对输入进行截断
                if len(tgt_tokens) > max_tgt_len:
                    tgt_tokens = tgt_tokens[:max_tgt_len]
                # 问、答之间需要通过特殊字符进行分割，同时需要添加终止符
                # [gMASK]与<sop>作为模型生成结果的起始标记，属于同一个block，
                # 所以这两个token对应的在原始文本中所在的位置是一样的，具体可参考这个issue https://github.com/THUDM/ChatGLM-6B/issues/1313
                # tokens = prompt_tokens + src_tokens + ["[gMASK]", "<sop>"] + tgt_tokens + ["<eop>"]
                tokens = prompt_tokens + src_tokens + [tokenizer.gmask_token, tokenizer.bos_token] + tgt_tokens + [tokenizer.eos_token]
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                context_length = input_ids.index(tokenizer.bos_token_id)
                mask_position = context_length - 1
                # prompt和问题部分不参与损失值计算
                labels = [-100] * context_length + input_ids[mask_position + 1:]
                # 根据最大长度进行后填充
                pad_len = max_len - len(input_ids)
                input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
                # 填充部分不参与损失值计算
                labels = labels + [-100] * pad_len
                # 区分有用的部分和填充的token
                attention_mask = []
                for input_id in input_ids:
                    if input_id != tokenizer.pad_token_id:
                        attention_mask.append(True)
                    else:
                        attention_mask.append(False)
                self.all_data.append(
                    {"text": sample["text"], "answer": sample["answer"], "input_ids": input_ids, "labels": labels, "attention_mask": attention_mask})

    def __len__(self):
        return len(self.all_data)

    def __getitem__(self, item):
        instance = self.all_data[item]
        return instance

In [None]:
with open('/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl', 'r') as file:
    for line in file:
        sample=json.loads(line)
        print(tokenizer.encode(sample['question'], add_special_tokens=False))
        print(tokenizer.tokenize(sample['question']))
        print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample['question'])))
        break

In [None]:
class instruction_dataset(Dataset):
    def __init__(self, data_path, tokenizer, truncate_length, max_query_length):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []
        
        with open(data_path, 'r') as file:
            for line in file:
                sample=json.loads(line)
                # input_ids的结构应该为：prompt_tokens, src_tokens, [gMASK], <sop>, tgt_tokens, <eop>, [PAD]... 
                # 或者简化一点，即为 query, [gMASK], <sop>, answer, <eop>, [PAD]... 
                # padding的目的是为了对齐各个instance，以组成batch（当然batch_size=1时其实没必要）
                # 总体的input_ids的长度不超过truncate_length，其中query的长度不超过max_query_length，同理可以计算出answer的最大长度
                max_answer_length = truncate_length - max_query_length - 3
                
                # 判断query的长度
                query = sample['question']
                query_ids = tokenizer.encode(query, add_special_tokens=False)
                if len(query_ids) > max_query_length:
                    query_ids = query_ids[:max_query_length]
                
                # 判断answer的长度
                answer = sample['response_j']
                answer_ids = tokenizer.encode(answer, add_special_tokens=False)
                if len(answer) > max_answer_length:
                    answer_ids = answer_ids[:max_answer_length]
                    
                # 合并
                input_ids = query_ids + si['gMASK'] + si['sop'] + answer_ids + si['eop']
                pre_context_length = input_ids.index(si['sop'])
                
                # padding
                padding_length=truncate_length-len(input_ids)
                input_ids+=padding_length*[tokenizer.pad_token_id]
                
                # 制作labels；其中query部分，pad部分均不参与loss的计算
                labels = [-100] * pre_context_length + input_ids[pre_context_length+1:]
                labels = labels + [-100]*padding_length
                
                

In [None]:
class instruction_dataset(Dataset):
    #初始化迭代对象
    def __init__(self, data_path, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []
        
        with open(data_path, 'r') as file:
            for line in file:
                self.examples.append(json.loads(line))
                
    def __len__(self):
        return len(self.examples)
    
    #构造迭代单位（以tensor的形式）
    def __getitem__(self, index):
        question=self.examples[index]['question']
        response=self.examples[index]['response_j']
        
        tensor_question=tokenizer.encode(question, return_tensors='pt').cuda()[:,2:]
        tensor_response=tokenizer.encode(response, return_tensors='pt').cuda()[:,2:]
        
        instruction = torch.cat([st['\n'], st['<|user|>'], tensor_question, st['\n'], st['<|assistant|>'], tensor_response], dim=1)
        #decode的label即为下一个位置的token, 最后需要额外拼接一个终止符
        label = torch.cat([instruction.squeeze()[1:].clone().view(1,-1), st['eop']], dim=1)
        
        return instruction, label

In [None]:
instructions = instruction_dataset(data_path, tokenizer)
instruction_loader = DataLoader(instructions, batch_size=1, shuffle=True)

In [None]:
# 定义优化器
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-7)

lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=50,
        num_training_steps=(len(instruction_loader) * CFG.num_train_epochs),
    )

# 定义损失函数
loss_fn = CrossEntropyLoss()

In [None]:
# 开始训练
model.train()
for epoch in range(CFG.num_train_epochs):
    for batch in tqdm(instruction_loader):
        inputs = batch[0].squeeze(0)
        labels = batch[1].squeeze()
        outputs = model(inputs)
        #print(outputs.loss, loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), labels.view(-1)))
        #break
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.size(-1)), labels.view(-1))
        #print(outputs.logits.view(-1, outputs.logits.size(-1)).shape, labels.view(-1))
        print(loss)
        #print()
        loss.backward()
        
        # 在这里检查和/或操作梯度
        for name, param in model.named_parameters():
            if param.grad is not None:
                pass
                #print(f'{name}\n {param.requires_grad}')
                #print(f"{param.grad}\n{param}")
                #print()
                

        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} completed.")