In [1]:
class CFG:
    """Container for run-level configuration constants."""
    # Epoch count; no cell visible in this part of the notebook reads it yet.
    num_train_epochs = 3

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from peft import LoraConfig, get_peft_model

import sys
import json
import pandas as pd
from tqdm import tqdm

In [3]:
# Extend the import path so the modules imported below (expected to live in this
# directory) can be found.
sys.path.append('/root/tuning_space/Components/')
import interact
import model_tools
# `si` is indexed later with the keys '[gMASK]', 'sop' and 'eop' to obtain special-token ids.
from Static import si

In [4]:
# model_path is passed to AutoTokenizer.from_pretrained below; data_path is the
# JSON Lines file that the dataset class reads one record per line.
model_path = '/root/autodl-fs/weights/chatglm3-6b'
data_path = '/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl'

# LoRA adapter settings (not applied to a model in any cell visible here).
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],  # Modules LoRA is attached to. Print key_list in the PEFT source to see the valid names; they differ from model to model.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [5]:
# Load the tokenizer from the local checkpoint directory.
# NOTE(review): trust_remote_code=True is passed; per the transformers documentation this
# allows custom code bundled with the checkpoint to run, so use it with trusted weights only.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [6]:
# Scratch example: assemble a single training sample by hand.
# Variant A - the supervised labels start at the [gMASK] index, so the label stored there is sop.
max_len = 30
max_src_len = 20
# Three positions are reserved for the special tokens [gMASK], sop and eop.
max_tgt_len = max_len - max_src_len - 3

prompt_tokens = tokenizer.tokenize("问题")
# Slicing leaves a list unchanged whenever it already fits the remaining budget.
src_tokens = tokenizer.tokenize("哪国")[:max_src_len - len(prompt_tokens)]
tgt_tokens = tokenizer.tokenize("中国")[:max_tgt_len]

# Layout: prompt, question, [gMASK], sop, answer, eop.
tokens = [
    *prompt_tokens,
    *src_tokens,
    tokenizer.decode(si["[gMASK]"]),
    tokenizer.decode(si['sop']),
    *tgt_tokens,
    tokenizer.decode(si['eop']),
]
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
context_length = input_ids.index(si['sop'])
mask_position = context_length - 1
# Prompt and question positions are excluded from the loss (-100).
labels = [-100] * mask_position
labels.extend(input_ids[mask_position + 1:])
# Right-pad the ids up to max_len.
pad_len = max_len - len(input_ids)
input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
# Padding is excluded from the loss too; labels is one element shorter than the ids, hence +1.
labels.extend([-100] * (pad_len + 1))
# True marks a real token, False marks padding.
attention_mask = []
for input_id in input_ids:
    attention_mask.append(input_id != tokenizer.pad_token_id)

print(f'{input_ids} \n {labels} \n {attention_mask}')
print(f'{len(input_ids)} \n {len(labels)} \n {len(attention_mask)}')

['▁', '问题', '▁', '哪', '国', '[gMASK]', 'sop', '▁中国', 'eop']
[30910, 31639, 30910, 55315, 54543, 64790, 64792, 34106, 64793, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
 [-100, -100, -100, -100, -100, 64792, 34106, 64793, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100] 
 [True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
30 
 30 
 30


In [7]:
# Scratch example: assemble a single training sample by hand.
# Variant B - the supervised labels start at the sop index, so the label stored there is the
# first answer token and sop itself is never a target.
max_len = 30
max_src_len = 20
# Three positions are reserved for the special tokens [gMASK], sop and eop.
max_tgt_len = max_len - max_src_len - 3

prompt_tokens = tokenizer.tokenize("问题")
# Slicing leaves a list unchanged whenever it already fits the remaining budget.
src_tokens = tokenizer.tokenize("哪国")[:max_src_len - len(prompt_tokens)]
tgt_tokens = tokenizer.tokenize("中国")[:max_tgt_len]

# Layout: prompt, question, [gMASK], sop, answer, eop.
tokens = [
    *prompt_tokens,
    *src_tokens,
    tokenizer.decode(si["[gMASK]"]),
    tokenizer.decode(si['sop']),
    *tgt_tokens,
    tokenizer.decode(si['eop']),
]
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
context_length = input_ids.index(si['sop'])
mask_position = context_length - 1
# Prompt, question and [gMASK] positions are excluded from the loss (-100).
labels = [-100] * context_length
labels.extend(input_ids[context_length + 1:])
# Right-pad the ids up to max_len.
pad_len = max_len - len(input_ids)
input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
# Padding is excluded from the loss too; labels is one element shorter than the ids, hence +1.
labels.extend([-100] * (pad_len + 1))
# True marks a real token, False marks padding.
attention_mask = []
for input_id in input_ids:
    attention_mask.append(input_id != tokenizer.pad_token_id)

print(f'{input_ids} \n {labels} \n {attention_mask}')
print(f'{len(input_ids)} \n {len(labels)} \n {len(attention_mask)}')

['▁', '问题', '▁', '哪', '国', '[gMASK]', 'sop', '▁中国', 'eop']
[30910, 31639, 30910, 55315, 54543, 64790, 64792, 34106, 64793, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
 [-100, -100, -100, -100, -100, -100, 34106, 64793, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100] 
 [True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
30 
 30 
 30


In [8]:
class Seq2SeqDataSet(Dataset):
    """Pre-tokenized question/answer samples read from a JSON Lines file.

    Every non-empty line of the file must be a JSON object with the string
    fields ``"text"`` and ``"answer"``. Each sample is laid out as::

        prompt + text + [gMASK] + <sop> + answer + <eop> + padding

    and stored as a dict with the keys ``text``, ``answer``, ``input_ids``,
    ``labels`` and ``attention_mask``. ``labels`` is aligned with
    ``input_ids`` (not shifted): everything before ``<sop>`` and every padding
    position is -100, the remaining positions repeat the input id.

    Args:
        data_path: Path of the UTF-8 encoded JSON Lines file.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            the attributes ``gmask_token``, ``bos_token``, ``eos_token`` and
            ``pad_token_id``.
        max_len: Length every sample is padded to.
        max_src_len: Token budget shared by the prompt and the question text.
        prompt_text: Text placed in front of every question.
    """

    def __init__(self, data_path, tokenizer, max_len, max_src_len, prompt_text):
        # Three positions are reserved for the special tokens [gMASK], <sop> and <eop>.
        # Clamp at zero: a negative slice bound would drop tokens from the end instead
        # of truncating to nothing.
        max_tgt_len = max(0, max_len - max_src_len - 3)
        # The prompt is identical for every sample, so tokenize it once up front.
        prompt_tokens = tokenizer.tokenize(prompt_text)
        src_budget = max(0, max_src_len - len(prompt_tokens))

        self.all_data = []
        with open(data_path, "r", encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
                    continue
                sample = json.loads(line)

                # ChatGLM tokens are words / sub-words rather than single Chinese characters.
                # Truncate question and answer to their respective budgets.
                src_tokens = tokenizer.tokenize(sample["text"])[:src_budget]
                tgt_tokens = tokenizer.tokenize(sample["answer"])[:max_tgt_len]

                # Question and answer are separated by special tokens and closed by a
                # terminator. [gMASK] and <sop> together mark the start of generation and
                # belong to the same block, so they map to the same position in the
                # original text; see https://github.com/THUDM/ChatGLM-6B/issues/1313
                tokens = (
                    prompt_tokens
                    + src_tokens
                    + [tokenizer.gmask_token, tokenizer.bos_token]
                    + tgt_tokens
                    + [tokenizer.eos_token]
                )
                input_ids = tokenizer.convert_tokens_to_ids(tokens)

                # Index of <sop>: computed from the layout rather than searched for, so a
                # stray <sop> id inside the text cannot move the boundary.
                context_length = len(prompt_tokens) + len(src_tokens) + 1
                # Prompt, question and [gMASK] do not contribute to the loss.
                labels = [-100] * context_length + input_ids[context_length:]

                # Right-pad to max_len; padding does not contribute to the loss either.
                real_len = len(input_ids)
                pad_len = max_len - real_len
                input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
                labels = labels + [-100] * pad_len

                # Built from the known lengths instead of comparing ids with pad_token_id,
                # so a real token that shares the pad id is still attended to.
                attention_mask = [True] * real_len + [False] * pad_len

                self.all_data.append({
                    "text": sample["text"],
                    "answer": sample["answer"],
                    "input_ids": input_ids,
                    "labels": labels,
                    "attention_mask": attention_mask,
                })

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.all_data)

    def __getitem__(self, item):
        """Return the sample dict stored at index ``item``."""
        return self.all_data[item]

In [9]:
# Inspect the first record only: compare encode(..., add_special_tokens=False) with
# tokenize() followed by convert_tokens_to_ids() for the 'question' field.
with open('/root/autodl-tmp/dataset/psychology-dataset/data/train.jsonl', 'r') as file:
    for line in file:
        sample=json.loads(line)
        print(tokenizer.encode(sample['question'], add_special_tokens=False))
        print(tokenizer.tokenize(sample['question']))
        print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample['question'])))
        # Stop after the first line.
        break

[307, 30953, 30924, 4035, 1114, 17904, 14330, 293, 307, 883, 30953, 30912, 683, 1624, 30930]
['▁I', "'", 'm', '▁feeling', '▁really', '▁anxious', '▁lately', '▁and', '▁I', '▁don', "'", 't', '▁know', '▁why', '.']
[307, 30953, 30924, 4035, 1114, 17904, 14330, 293, 307, 883, 30953, 30912, 683, 1624, 30930]


In [10]:
class instruction_dataset(Dataset):
    """Instruction-tuning samples read from a JSON Lines file.

    Every non-empty line must be a JSON object with the string fields
    ``"question"`` and ``"response_j"``. Each sample is laid out as::

        query, [gMASK], <sop>, answer, <eop>, [PAD]...

    and padded to exactly ``truncate_length`` ids so that samples can be
    stacked into a batch. Stored fields per sample: ``query`` and ``answer``
    (raw strings), ``input_ids``, ``labels`` and ``attention_mask`` (lists of
    length ``truncate_length``).

    ``labels`` is shifted one position to the left relative to ``input_ids``:
    the label stored at the ``<sop>`` index is the first answer token and the
    last supervised label is ``<eop>``. Query, ``[gMASK]`` and padding
    positions are -100.

    NOTE(review): because the labels are already shifted, feeding them to a
    model that shifts labels internally (as Hugging Face causal-LM heads
    typically do) would leave the targets off by one - confirm against the
    training loop.

    Args:
        data_path: Path of the UTF-8 encoded JSON Lines file.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            returning a list of ids, and the attribute ``pad_token_id``.
        truncate_length: Exact length of every produced sequence.
        max_query_length: Maximum number of query tokens kept.
        special_ids: Optional mapping with the keys ``'[gMASK]'``, ``'sop'``
            and ``'eop'``. Defaults to the module-level ``si`` mapping.

    Raises:
        ValueError: If ``truncate_length`` cannot hold ``max_query_length``
            query tokens plus the three special tokens.
    """

    def __init__(self, data_path: str, tokenizer, truncate_length, max_query_length, special_ids=None):
        super().__init__()
        self.tokenizer = tokenizer
        self.examples = []

        # Fall back to the notebook-level mapping when no explicit ids are supplied.
        if special_ids is None:
            special_ids = si
        gmask_id = special_ids['[gMASK]']
        sop_id = special_ids['sop']
        eop_id = special_ids['eop']

        # Three positions are taken by [gMASK], <sop> and <eop>; the rest is the answer budget.
        # It does not depend on the sample, so compute it once.
        max_answer_length = truncate_length - max_query_length - 3
        if max_answer_length < 0:
            raise ValueError(
                "truncate_length must be at least max_query_length + 3 "
                f"(got truncate_length={truncate_length}, max_query_length={max_query_length})"
            )

        with open(data_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
                    continue
                sample = json.loads(line)
                query = sample['question']
                answer = sample['response_j']

                # Truncate by TOKEN count. Slicing is a no-op when the sequence already fits.
                query_ids = tokenizer.encode(query, add_special_tokens=False)[:max_query_length]
                answer_ids = tokenizer.encode(answer, add_special_tokens=False)[:max_answer_length]

                input_ids = query_ids + [gmask_id, sop_id] + answer_ids + [eop_id]
                sequence_length = len(input_ids)
                # Index of <sop>, derived from the layout instead of searching for the id.
                pre_context_length = len(query_ids) + 1

                # Build the labels from the unpadded pieces so that pad ids never become
                # targets. Shifted left by one: the entry at the <sop> index is answer_ids[0].
                labels = [-100] * pre_context_length + answer_ids + [eop_id]
                labels += [-100] * (truncate_length - len(labels))

                # Right-pad the ids; only the real tokens are attended to.
                padding_length = truncate_length - sequence_length
                input_ids += [tokenizer.pad_token_id] * padding_length
                attention_mask = [True] * sequence_length + [False] * padding_length

                self.examples.append({
                    'query': query,
                    'answer': answer,
                    'input_ids': input_ids,
                    'labels': labels,
                    'attention_mask': attention_mask,
                })

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the sample dict stored at index ``item``."""
        return self.examples[item]

In [22]:
def coll_fn(batch:list):
    """Collate dataset samples into one dict of stacked tensors.

    Args:
        batch: List of sample dicts, each holding equally long lists under the
            keys ``'input_ids'``, ``'labels'`` and ``'attention_mask'``.

    Returns:
        Dict with the same three keys; every value is a tensor of shape
        ``(len(batch), sequence_length)``.
    """
    field_dtypes = {
        # The vocabulary has only 65024 entries, so int32 is wide enough for the ids.
        'input_ids': torch.int32,
        # Class-index targets for cross-entropy must be int64; int32 is rejected by the loss.
        'labels': torch.long,
        # The mask only needs True/False.
        'attention_mask': torch.bool,
    }
    # A list of per-sample tensors has to be combined with torch.stack.
    return {
        name: torch.stack([torch.tensor(sample[name], dtype=dtype) for sample in batch])
        for name, dtype in field_dtypes.items()
    }

In [23]:
# Build the dataset: every sample is padded to 128 ids, of which at most 32 come from the query.
finetuning_instruction = instruction_dataset(
    data_path,
    tokenizer,
    truncate_length=128,
    max_query_length=32,
)

In [24]:
# Batches of two samples drawn in shuffled order; coll_fn stacks each field into one tensor.
# NOTE(review): no random seed is set in the cells visible here, so batch order will vary between runs.
dataloader = DataLoader(finetuning_instruction, batch_size=2, shuffle=True, collate_fn=coll_fn)

In [25]:
# Sanity check: print only the first batch produced by the loader.
for item in dataloader:
    print(item)
    break

{'input_ids': tensor([[  307, 30953, 30924,  1709,  6555,   356,   552,  1934,  2862,   293,
           307,   883, 30953, 30912,  1322,   878,   565,   552,  6518, 30930,
         64790, 64792,  2641, 30953, 30917,  5550,   267,  6840,   290,   475,
          1934,  2862,  2326,   293,   777,   588,   379,   457,   636,   331,
          2310,   475,  6470, 30930,   577, 30953, 30917,  1447,   289,  3170,
           343,  6337,  2114,   291,   479, 13278,   293,  9521, 30930, 64793,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [14]:
# Sanity check on the first sample: print the length of every field in dict order
# (query, answer, input_ids, labels, attention_mask). The first two are raw strings,
# so their lengths are character counts; the other three are the padded lists.
for item in finetuning_instruction:
    print([len(item[it]) for it in item])
    break

[55, 277, 128, 128, 128]
