# 完整的训练流程
1. 数据基于`https://github.com/hikariming/alpaca_chinese_dataset`
2. 部分代码来源于`https://github.com/27182812/ChatGLM-chinese-insturct/blob/main/finetune.py`
3. 基于我之前修改的`model_chatglm.py`做的一整套教程

## 清洗数据

In [1]:
# 如果没有下载这个仓库，可以使用下面命令进行clone

# !git clone https://github.com/hikariming/alpaca_chinese_dataset.git
# 是这个吧：https://github.com/hikariming/chat-dataset-baseline

In [2]:
from glob import glob
import os 
import pandas as pd 
import shutil
from itertools import chain
from tqdm import tqdm
from pathlib import Path
# Set CUDA_VISIBLE_DEVICES to "0" for this process before any CUDA work starts,
# so that only device index 0 is offered to the code run later in the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# Folders of the cloned dataset repository whose JSON files are merged.
target_dir_list = [
    'alpaca_chinese_dataset/history/其他中文问题补充/',
    'alpaca_chinese_dataset/history/翻译后的中文数据/',
    'alpaca_chinese_dataset/history/chatglm问题数据补充/',
    # Currently disabled: 'alpaca_chinese_dataset/history/原始英文数据/'
]

# Collect every *.json path found directly inside each folder into one flat list.
all_json_path = list(
    chain.from_iterable(glob(f"{folder}*.json") for folder in target_dir_list)
)
len(all_json_path), all_json_path[:5]


(20,
 ['alpaca_chinese_dataset/其他中文问题补充\\三国问题.json',
  'alpaca_chinese_dataset/其他中文问题补充\\企业管理问题.json',
  'alpaca_chinese_dataset/其他中文问题补充\\传统诗词及文化常识问题.json',
  'alpaca_chinese_dataset/其他中文问题补充\\党建类数据集.json',
  'alpaca_chinese_dataset/其他中文问题补充\\其他问题.json'])

In [4]:
def read_json(x: str):
    """Load one JSON file into a DataFrame, tolerating unreadable files.

    Args:
        x: Path of the JSON file to load with ``pandas.read_json``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when loading fails.
        A failure is reported through ``warnings.warn`` so that a skipped
        file is visible instead of silently disappearing from the dataset.
    """
    try:
        return pd.read_json(x)
    except Exception as e:
        # Best-effort on purpose: one broken file must not abort the whole
        # merge, but the caller should still learn which file was dropped.
        import warnings

        warnings.warn(f"Skipping {x!r}: could not be read as JSON ({e})")
        return pd.DataFrame()

# Load every discovered JSON file and stack the resulting frames row-wise;
# a file that could not be loaded contributes an empty frame.
alldata = pd.concat(list(map(read_json, all_json_path)))
# alldata

In [5]:
# Output folder for the CSV shards. Any previous copy is removed first so each
# run starts from an empty directory.
genrate_data_dir = Path("data3_0328")

if genrate_data_dir.exists():
    shutil.rmtree(genrate_data_dir, ignore_errors=True)

genrate_data_dir.mkdir(parents=True, exist_ok=True)


In [6]:
# Shuffle all rows (no fixed random_state, so the order differs between runs)
# and renumber the index from zero.
alldata = alldata.sample(frac=1).reset_index(drop=True)

# Number of rows written to each CSV shard.
chunk_size = 666

# tqdm wraps the range itself rather than the enumerate object: a range has a
# length, so the bar can show a total and percentage instead of a bare counter.
for index, start_id in enumerate(tqdm(range(0, alldata.shape[0], chunk_size))):
    temp_data = alldata.iloc[start_id:start_id + chunk_size]
    temp_data.to_csv(genrate_data_dir.joinpath(f"{index}.csv"), index=False)

28it [00:00, 397.31it/s]


## 训练模型

In [7]:
# from thuglm.modeling_chatglm import ChatGLMForConditionalGeneration
# from thuglmcode.model_chatglm import ChatGLMForConditionalGeneration
from transformers import Trainer, TrainingArguments
import random
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from peft import get_peft_model, LoraConfig, TaskType
from typing import Optional
import torch

In [8]:
# Load the tokenizer published under "yuanzhoulvpi/chatglm6b-dddd".
# trust_remote_code=True is passed because the checkpoint ships custom code.
tokenizer = AutoTokenizer.from_pretrained(
    "yuanzhoulvpi/chatglm6b-dddd",
    trust_remote_code=True,
)


Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [9]:
# Load the base checkpoint, then convert it to half precision and move it to
# the CUDA device.
model = AutoModel.from_pretrained(
    "yuanzhoulvpi/chatglm6b-dddd",
    trust_remote_code=True,
)
model = model.half().cuda()

# LoRA settings: adapters are attached to the 'query_key_value' modules only.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # Other candidate targets: 'dense', 'dense_h_to_4h', 'dense_4h_to_h'.
    target_modules=['query_key_value'],
)

# Wrap the base model with the LoRA configuration.
model = get_peft_model(model, peft_config)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]



In [10]:
class MyTrainer(Trainer):
    """Trainer whose ``_save`` writes only the parameters that require gradients."""

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Store the trainable parameters as ``chatglm-lora.pt``.

        Args:
            output_dir: Destination folder; ``self.args.output_dir`` is used
                when this is None. The folder is created if missing.
            state_dict: Accepted for signature compatibility; not used here.
        """
        # If we are executing this function, we are the process zero, so we don't check for that.
        target_dir = self.args.output_dir if output_dir is None else output_dir
        os.makedirs(target_dir, exist_ok=True)

        # Keep only parameters with requires_grad set, copied to the CPU.
        trainable = {}
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                trainable[name] = param.to("cpu")

        torch.save(trainable, os.path.join(target_dir, "chatglm-lora.pt"))




In [11]:
# Fix the seed so the train/validation file split below is repeatable.
random.seed(42)

# glob returns matches in arbitrary order, so the list is sorted first;
# otherwise the seeded sample could pick different files on another machine.
all_file_list = sorted(glob(pathname=str(genrate_data_dir.joinpath("*.csv"))))

# Hold out 25% of the CSV shards (rounded down) for validation.
test_file_list = random.sample(all_file_list, int(len(all_file_list) * 0.25))
_held_out = set(test_file_list)  # set gives constant-time membership checks
train_file_list = [i for i in all_file_list if i not in _held_out]

len(train_file_list), len(test_file_list)

(21, 7)

In [12]:
# Read the CSV shards into a dataset with a 'train' and a 'valid' split;
# prepared data is cached under "cache_data".
dataset = load_dataset(
    "csv",
    data_files=dict(train=train_file_list, valid=test_file_list),
    cache_dir="cache_data",
)
dataset

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Downloading and preparing dataset csv/default to c:/Users/yuanz/PycharmProjects/zero_nlp/simple_thu_chatglm6b/cache_data/csv/default-94af94a7f1da5386/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to c:/Users/yuanz/PycharmProjects/zero_nlp/simple_thu_chatglm6b/cache_data/csv/default-94af94a7f1da5386/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 13986
    })
    valid: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4185
    })
})

In [13]:
def get_masks_and_position_ids(
    seq, seq_len, context_length, device, gmask=False, position_encoding_2d=True
):
    """Build the boolean attention mask and position ids for one padded sample.

    Args:
        seq: Token ids of the sample (not read by this function).
        seq_len: Length of the prompt part of the sample.
        context_length: Padded length of the sample.
        device: Device on which the tensors are created.
        gmask: When False, trailing positions are overwritten with the mask
            position (``seq_len - 2``); when True they keep their own index.
        position_encoding_2d: When True, a second row of block position ids is
            stacked under the position ids.

    Returns:
        ``(attention_mask, position_ids)``. ``attention_mask`` is a bool tensor
        of shape ``(1, context_length, context_length)`` that is True where the
        underlying 0/1 matrix is zero. ``position_ids`` has shape
        ``(2, context_length)`` for 2D encoding, else ``(context_length,)``.
    """
    # Is equal to `seq.index(mask_token)` or `seq.index(150001)`.
    mask_position = seq_len - 2

    # Lower-triangular ones, with the leading columns additionally set to 1.
    visible = torch.tril(
        torch.ones((1, context_length, context_length), device=device)
    )
    visible[..., : mask_position - 1] = 1
    attention_mask = (visible < 0.5).bool()

    position_ids = torch.arange(context_length, dtype=torch.long, device=device)

    if not position_encoding_2d:
        if not gmask:
            position_ids[context_length - 1 :] = mask_position
        return attention_mask, position_ids

    # Is equal to `seq_length = seq.index(150004)`.
    seq_length = seq_len - 1
    if not gmask:
        position_ids[seq_length:] = mask_position

    # Zeros over the first seq_length positions, then 1, 2, 3, ... afterwards.
    leading_zeros = torch.zeros(seq_length, dtype=torch.long, device=device)
    trailing_count = (
        torch.arange(context_length - seq_length, dtype=torch.long, device=device)
        + 1
    )
    block_position_ids = torch.cat((leading_zeros, trailing_count))

    position_ids = torch.stack((position_ids, block_position_ids), dim=0)
    return attention_mask, position_ids

def data_collator(features: list) -> dict:
    """Pad a list of tokenized samples into one batch of tensors.

    Each feature must provide ``"input_ids"`` (a list of token ids) and
    ``"seq_len"`` (the prompt length). Samples are processed from longest to
    shortest, so the batch order can differ from the input order. Every sample
    is padded with ``tokenizer.eop_token_id`` to ``longest sample + 1`` tokens.

    Returns:
        A dict with stacked ``input_ids``, ``labels``, ``attention_mask`` and
        ``position_ids`` tensors. In ``labels`` the first ``seq_len - 1``
        positions and the padding after the first appended eop token are -100.
    """
    lengths = [len(item["input_ids"]) for item in features]
    padded_len = max(lengths) + 1

    batch_ids = []
    batch_masks = []
    batch_positions = []
    batch_labels = []

    # Negated key: longest first; equal lengths keep their input order.
    ordered = sorted(zip(lengths, features), key=lambda pair: -pair[0])
    for cur_len, item in ordered:
        tokens = item["input_ids"]
        prompt_len = item["seq_len"]
        pad_count = padded_len - cur_len

        label_row = (
            [-100] * (prompt_len - 1)
            + tokens[(prompt_len - 1) :]
            + [tokenizer.eop_token_id]
            + [-100] * (pad_count - 1)
        )
        padded_tokens = tokens + [tokenizer.eop_token_id] * pad_count
        token_tensor = torch.LongTensor(padded_tokens)
        mask, positions = get_masks_and_position_ids(
            padded_tokens, prompt_len, padded_len, token_tensor.device, gmask=False
        )

        batch_labels.append(torch.LongTensor(label_row))
        batch_ids.append(token_tensor)
        batch_masks.append(mask)
        batch_positions.append(positions)

    return {
        "input_ids": torch.stack(batch_ids),
        "labels": torch.stack(batch_labels),
        "attention_mask": torch.stack(batch_masks),
        "position_ids": torch.stack(batch_positions),
    }



In [14]:
def format_example(example: dict) -> dict:
    """Attach a prompt (``context``) and an answer (``target``) to a record.

    The prompt is ``"Instruction: ...\\n"``, followed by ``"Input: ...\\n"``
    only when the record has a truthy ``"input"`` value, and ends with
    ``"Answer: "``. The record is modified in place and returned.
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    extra_input = example.get("input")
    if extra_input:
        pieces.append(f"Input: {extra_input}\n")
    pieces.append("Answer: ")

    answer = example["output"]
    example['context'] = "".join(pieces)
    example['target'] = answer
    return example

# Truncation limit applied separately to the prompt and to the target.
max_seq_length = 512


def preprocess(example):
    """Tokenize one formatted record.

    The prompt (``example["context"]``) and the answer (``example["target"]``)
    are encoded separately, each truncated to ``max_seq_length`` tokens; the
    answer is encoded with ``add_special_tokens=False``. Because the limit is
    per part, the combined sequence can be longer than ``max_seq_length``.

    Returns:
        A dict with ``"input_ids"`` (prompt ids + answer ids +
        ``tokenizer.eos_token_id``) and ``"seq_len"`` (number of prompt ids).
    """
    context_text = example["context"]
    answer_text = example["target"]

    context_ids = tokenizer.encode(
        context_text, max_length=max_seq_length, truncation=True
    )
    answer_ids = tokenizer.encode(
        answer_text,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False,
    )

    return {
        "input_ids": context_ids + answer_ids + [tokenizer.eos_token_id],
        "seq_len": len(context_ids),
    }

def filter_nan(example):
    """Return True only when both ``target`` and ``context`` are not None."""
    return not (example['target'] is None or example['context'] is None)


# Step 1: build 'context'/'target' and drop the original CSV columns.
tokenized_datasets = dataset.map(
    function=format_example,
    remove_columns=dataset['train'].column_names,
)
# Step 2: discard records whose context or target is None.
tokenized_datasets = tokenized_datasets.filter(function=filter_nan)
# Step 3: tokenize, adding 'input_ids' and 'seq_len'.
tokenized_datasets = tokenized_datasets.map(function=preprocess)
tokenized_datasets


Map:   0%|          | 0/13986 [00:00<?, ? examples/s]

Map:   0%|          | 0/4185 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13986 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4185 [00:00<?, ? examples/s]



Map:   0%|          | 0/13980 [00:00<?, ? examples/s]

Map:   0%|          | 0/4182 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['context', 'target', 'input_ids', 'seq_len'],
        num_rows: 13980
    })
    valid: Dataset({
        features: ['context', 'target', 'input_ids', 'seq_len'],
        num_rows: 4182
    })
})

In [None]:
# model.gradient_checkpointing

In [None]:
# model.gradient_checkpointing = True
# model._set_gradient_checkpointing(value=True)

In [15]:
from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl



class EmptyCacheCallBack(TrainerCallback):
    """Callback that calls ``torch.cuda.empty_cache()`` during training.

    Intended as a workaround for running out of GPU memory: the cache is
    emptied at the start of every epoch, at the start of every step, and on
    every logging event.
    """

    def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Empty the CUDA cache when an epoch begins."""
        torch.cuda.empty_cache()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Empty the CUDA cache when a training step begins."""
        torch.cuda.empty_cache()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs, **kwargs):
        """Empty the CUDA cache after the last logs have been logged."""
        torch.cuda.empty_cache()


# Instance for the Trainer's ``callbacks`` argument.
eccb = EmptyCacheCallBack()


In [16]:
# Training configuration, grouped by purpose.
# NOTE(review): the tokenized train split shows 13,980 rows; with batch size 2,
# 8 accumulation steps and 1 epoch on a single device that is roughly 874
# optimizer steps, fewer than warmup_steps=1_000 — confirm this is intended.
args = TrainingArguments(
    # Output and checkpointing
    output_dir="test004",
    save_steps=100,
    push_to_hub=False,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # Evaluation and logging cadence
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    # Precision and data handling; the custom collator reads 'seq_len', so
    # dataset columns must not be dropped automatically.
    fp16=True,
    remove_unused_columns=False,
)

trainer = MyTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    # callbacks=[eccb]
)
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
