# ChatGLM LoRA Finetune

此Jupyter内核运行在服务器上，可以直接加载模型到本机的cuda上，使用的显卡为RTX A6000。
使用LoRA的方法对ChatGLM进行监督微调。
参考项目：
> https://github.com/mymusise/ChatGLM-Tuning

## 数据加载
读取数据并tokenize


In [1]:
from transformers import AutoTokenizer
import transformers


model_name = "THUDM/chatglm-6b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True, device_map='auto')

  from .autonotebook import tqdm as notebook_tqdm
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


In [2]:
import pandas as pd

def preprocess_data(tokenizer, config, title, context):
        prompt = f"Instruction: 请根据新闻内容帮我写一个新闻短标题：\n{context}\nAnswer: "
        target = title
        prompt_ids = tokenizer.encode(prompt)
        target_ids = tokenizer.encode(target)
        input_ids = prompt_ids + target_ids + [config.eos_token_id]
        return {"input_ids": input_ids, "seq_len": len(prompt_ids)}
        
    
    


def read_data(path, tokenizer, config,):
        df_trian_data = pd.read_csv(path)
        for row in df_trian_data.iterrows():
                yield preprocess_data(tokenizer, config, title = row[1]['title'], context = row[1]['context'])

In [3]:
import datasets

# 读取数据
dataset = datasets.Dataset.from_generator(
    lambda: read_data("data/train_data.csv",tokenizer, config)
)
dataset.save_to_disk("train_data")

Found cached dataset generator (/home/tmh/.cache/huggingface/datasets/generator/default-5c2623fc586f8070/0.0.0)
                                                                                               

In [4]:
dataset = datasets.Dataset.from_generator(
    lambda: read_data("data/test_data.csv",tokenizer, config)
)
dataset.save_to_disk("test_data")

Found cached dataset generator (/home/tmh/.cache/huggingface/datasets/generator/default-b10cf1583203cada/0.0.0)
                                                                                            

## 模型微调

In [3]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModel
import torch
from peft import get_peft_model, LoraConfig, TaskType
import datasets
import os


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/tmh/anaconda3/envs/glm/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/tmh/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


In [6]:
default_peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# 加载模型与数据
def load_model(peft_config = None):
    # 设置LoRA参数
    if peft_config is None : peft_config = default_peft_config
    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.is_parallelizable = True
    model.model_parallel = True
    model.config.use_cache = (False)
    model = get_peft_model(model, peft_config)
    return model

In [7]:
# 将文本转换成神经网络模型所需的数据格式
def data_collator(features: list) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
    return {
        "input_ids": input_ids,
        "labels": labels,
    }


class ModifiedTrainer(Trainer):
    # def compute_loss(self, model, inputs, return_outputs=False):
    #     return model(
    #         input_ids=inputs["input_ids"],
    #         labels=inputs["labels"],
    #     ).loss

    def save_model(self, output_dir=None, _internal_call=False):
        from transformers.trainer import TRAINING_ARGS_NAME

        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        saved_params = {
            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))


def run_training(model, train_args, train_dataset, eval_dataset, test_mode = False):
    if test_mode:
        train_args = TrainingArguments(
            output_dir='output_test',
            per_device_train_batch_size=3,
            gradient_accumulation_steps=1,
            max_steps=20,
            save_steps=10,
            save_total_limit=2,
            learning_rate=1e-4,
            remove_unused_columns=False,
            logging_steps=5,
            fp16=True
        )
        eval_dataset = None

    trainer = ModifiedTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=train_args,
        data_collator=data_collator,
    )
    # start train
    trainer.train()
    model.save_pretrained(train_args.output_dir)
    del model,trainer
    torch.cuda.empty_cache()


In [8]:
# 先训练20个Step测试一下训练效果
torch.cuda.empty_cache()
model_test = load_model()
run_training(model=model_test, train_args = None,train_dataset=dataset, eval_dataset=None, test_mode=True)



Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.21it/s]


Step,Training Loss
5,6.7357
10,6.7374
15,6.4935
20,6.4188


## 推理

In [8]:
default_peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# 加载模型
def get_infer_model(peft_config,lora_path = 'output_test', test_mode = False):
    
    if test_mode : peft_config = default_peft_config
    torch.set_default_tensor_type(torch.cuda.HalfTensor) # type: ignore
    infer_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    infer_model = get_peft_model(infer_model, peft_config)
    infer_model.load_state_dict(torch.load(lora_path+'/adapter_model.bin'), strict=False)
    torch.set_default_tensor_type(torch.cuda.FloatTensor) # type: ignore
    return infer_model

def generate_finetuned_title(context, model, tokenizer):
    with torch.no_grad():
        prompt = f"Instruction: 请根据新闻内容帮我写一个新闻短标题：\n{context}\nAnswer: "
        response = model.chat(tokenizer,
                                   prompt,
                                   history=[],
                                   max_length=2048,
                                   top_p= 0.7,
                                   temperature= 0.95)[0] # type: ignore
        return response

In [14]:


def format_generate_result(result):
    # 通过测试集数据发现可能存在以下情况：“新闻标题是：\n”、“新闻短标题是：”，或者直接列出多个标题，或者生成一长段话
    # 删除句号及句号之后的内容,处理生成一长段的情况
    result = result.split("。")[0]
    # 处理“新闻标题是：\n”、“新闻短标题是：”的情况
    if "：\n" in result and "标题" in result:
        result = result.split("：\n")[1]
    elif "：" in result and "标题" in result:
        result = result.split("：")[1]
    # 处理多个标题的情况
    result = result.split("\n")[0]    
    return result


from tqdm import tqdm
import pandas as pd
def run_infer(name,generate_method,infer_model,save_path = "",read_path = 'data/test_data.csv',saving_step=20,test_mode=False,append=False):
    df = pd.read_csv(read_path)
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    if not append or name not in df.columns:
        df[name] = pd.Series()
    if test_mode:
        max_iter = 5
        saving_step = 1
    else: 
        max_iter = len(df)
    for i in tqdm(range(max_iter)):
        if df[name].isna()[i] or df[name].iloc[i] == "ChatGLM调用失败" :
            title,context = df.iloc[i,0],df.iloc[i,1]
            # print(title,context)
            result = generate_method(context,infer_model,tokenizer)
            result = format_generate_result(result)
            if test_mode :
                print(f'{title}-----{result}')
            else:
                df[name].iloc[i] = result
                if save_path != "":
                    if i%saving_step==0 or i==max_iter-1:
                        df.to_csv(save_path,index=False)
    del infer_model
    torch.cuda.empty_cache()
    return df

def run_infer_test(name,generate_method,infer_model):
    run_infer(name,generate_method,infer_model=infer_model,test_mode=True)

In [11]:
torch.cuda.empty_cache()
infer_model = get_infer_model('',test_mode=True)
run_infer_test(name='finetuned_title' ,generate_method = generate_finetuned_title,infer_model=infer_model)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.16it/s]
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
  df[name] = pd.Series()
  0%|          | 0/5 [00:00<?, ?it/s]The dtype of attention mask (torch.int64) is not bool
 20%|██        | 1/5 [00:02<00:10,  2.74s/it]

刘烨儿子取名诺一：生个女儿就叫“千金”-----刘烨给儿子取好名字，Angelababy成为《建党伟业》小凤仙


 40%|████      | 2/5 [00:04<00:05,  1.92s/it]

乡村乐组合Sugarland遭前队友索赔1400万美元-----Sugarland组合诉讼前队友 双方庭外和解


 60%|██████    | 3/5 [00:13<00:10,  5.43s/it]

彩民周刊10139期双色球：一区2路号已亟待回补-----下期遗漏值综述：07、10、13、19、33、09、25、15、16、22、27、28、25、02、12、05、06、08、11、14、20、29、31、23、26、30


 80%|████████  | 4/5 [00:15<00:03,  3.97s/it]

女子锤杀出轨丈夫割下其生殖器(图)-----女子锤杀丈夫并割下其生殖器 法院轻判凶手


100%|██████████| 5/5 [00:16<00:00,  3.31s/it]

北大考研记：打开双手世界就在你手中-----考研成功的背后：与北大的情缘





## 生成数据

In [10]:
test_result_path = "result/test_result.csv"
val_result_path = "result/val_result.csv"

In [13]:
# 创建保存结果的csv
def create_result_csv(path,raw_csv_path):
    df_raw = pd.read_csv(raw_csv_path)
    df_result = pd.DataFrame(columns=['title','context'])
    df_result['title'] = df_raw['title']
    df_result['context'] = df_raw['context']
    df_result.to_csv(path,index=False)


create_result_csv("result/test_result.csv","data/test_data.csv")
create_result_csv("result/val_result.csv","data/val_data.csv")

### 训练

In [11]:
train_dataset = datasets.load_from_disk('train_data')
test_dataset = datasets.load_from_disk('test_data')

In [12]:

peft_config_underfit = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
train_args = TrainingArguments(
    output_dir='test001',
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size =2,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    save_steps=200,
    save_total_limit=2,
    learning_rate=1e-4,
    remove_unused_columns=False,
    logging_steps=100,
    fp16=True
)

torch.cuda.empty_cache()
model = load_model(peft_config=peft_config_underfit)
run_training(model=model, train_args = train_args,train_dataset=train_dataset, eval_dataset=test_dataset)
del model
torch.cuda.empty_cache()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.33it/s]


Step,Training Loss,Validation Loss
100,3.8791,2.257464
200,2.2508,2.168503
300,2.268,2.119374
400,2.0854,2.088719
500,2.182,2.067272
600,2.0789,2.059705
700,2.1973,2.039689
800,2.0031,2.028336
900,2.0508,2.01506
1000,2.0274,2.005554


In [9]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
)
train_args = TrainingArguments(
    output_dir="test005",
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=30,
    logging_steps=30,
    gradient_accumulation_steps=8,
    num_train_epochs=6,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=100,
    fp16=True,
    push_to_hub=False,
    remove_unused_columns=False
)

torch.cuda.empty_cache()
model = load_model(peft_config=peft_config)
run_training(model=model, train_args = train_args,train_dataset=train_dataset, eval_dataset=test_dataset)
del model
torch.cuda.empty_cache()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.26it/s]


Step,Training Loss,Validation Loss
30,7.1749,6.815646
60,6.4222,5.667855
90,4.7557,3.275124
120,2.6435,2.32752
150,2.3023,2.17672
180,2.1108,2.10842
210,2.1584,2.068985
240,2.1247,2.055057
270,2.0443,2.026248
300,2.06,2.011586


In [12]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
train_args = TrainingArguments(
    output_dir="test004",
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=3,
    evaluation_strategy="steps",
    eval_steps=15,
    logging_steps=15,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=100,
    fp16=True,
    push_to_hub=False,
    remove_unused_columns=False
)

torch.cuda.empty_cache()
# model = load_model(peft_config=peft_config)
run_training(model=model, train_args = train_args,train_dataset=train_dataset, eval_dataset=test_dataset)



Step,Training Loss,Validation Loss
15,6.2807,5.988694
30,6.0176,5.738894
45,5.6781,5.256823
60,4.9451,4.431996
75,4.0159,3.367702
90,3.1659,2.666406
105,2.4614,2.378569
120,2.3108,2.269515
135,2.2485,2.20542
150,2.28,2.166121


In [16]:
del tokenizer
torch.cuda.empty_cache()

### 推理

In [22]:
peft_config_overfit = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
)
torch.cuda.empty_cache()
infer_model = get_infer_model(peft_config_overfit,lora_path='test005')
run_infer(name='finetuned_overfit' ,generate_method = generate_finetuned_title,infer_model=infer_model,read_path='result/val_result.csv',save_path='result/val_result.csv')
del infer_model
torch.cuda.empty_cache()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.15it/s]
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
  df[name] = pd.Series()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name].iloc[i] = result
100%|██████████| 200/200 [07:02<00:00,  2.11s/it]


In [None]:
peft_config_justfit = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
torch.cuda.empty_cache()
infer_model = get_infer_model(peft_config_justfit,lora_path='test004')
run_infer(name='finetuned_justfit' ,generate_method = generate_finetuned_title,infer_model=infer_model,read_path='result/val_result.csv',save_path='result/val_result.csv')
del infer_model
torch.cuda.empty_cache()

In [16]:
peft_config_underfit = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
torch.cuda.empty_cache()
infer_model = get_infer_model(peft_config_underfit,lora_path='test001')
run_infer(name='finetuned_underfit' ,generate_method = generate_finetuned_title,infer_model=infer_model,read_path='result/val_result.csv',save_path='result/val_result.csv')
del infer_model
torch.cuda.empty_cache()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.34it/s]
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
  df[name] = pd.Series()
  0%|          | 0/200 [00:00<?, ?it/s]The dtype of attention mask (torch.int64) is not bool
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name].iloc[i] = result
100%|██████████| 200/200 [07:48<00:00,  2.34s/it]
