# 处理数据集

In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
tokenizer = AutoTokenizer.from_pretrained('qwen/Qwen2-7B-Instruct')

  from .autonotebook import tqdm as notebook_tqdm
2024-06-23 17:30:06.516442: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-23 17:30:06.568119: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
df = pd.read_json('./dataset/tianji-wishes-chinese-v0.1.json')
ds = Dataset.from_pandas(df)
ds['conversation'][0]

[{'input': '我想送赵老师生日祝福,严肃风格',
  'output': '尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！',
  'system': '你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福'}]

In [3]:
# 转换为 DataFrame 然后转换为 Dataset
df = pd.DataFrame(ds)
df = df.explode('conversation')  # 展平 'conversation' 列
df = pd.json_normalize(df['conversation'])  # 将嵌套的 JSON 展平
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['input', 'output', 'system'],
    num_rows: 2976
})

In [4]:
dataset[0]

{'input': '我想送赵老师生日祝福,严肃风格',
 'output': '尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！',
 'system': '你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福'}

In [5]:
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [6]:
tokenized_id = dataset.map(process_func, remove_columns=dataset.column_names)
tokenized_id

Map: 100%|██████████| 2976/2976 [00:01<00:00, 2706.16 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2976
})

In [7]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福<|im_end|>\n<|im_start|>user\n我想送赵老师生日祝福,严肃风格<|im_end|>\n<|im_start|>assistant\n尊敬的赵老师，值此生辰之际，愿岁月如诗，为您带来无尽的喜悦与美好；愿时光荏苒，为您留下珍贵的回忆与感悟。愿您快快乐乐，事业更上一层楼，教诲之恩桃李满天下。在这充满敬意的时刻，恭祝赵老师生日快乐，幸福安康！<|endoftext|>'

In [8]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"])))

'赵老师，值此春节佳节之际，恭祝您福寿安康，万事如意。在过去的一年里，您的辛勤耕耘为后辈树立了榜样，新春到来，愿您的生活如诗如画，工作更上一层楼，继续以您的智慧和热忱，引领我们前行。岁月静好，愿您享受每一个温馨时刻，幸福安康，喜悦无忧。<|endoftext|>'

# 创建模型

In [9]:
import torch

# 使用 `bitsandbytes` 进行量化
model_name = 'qwen/Qwen2-7B-Instruct'

bnb_config = {
    'load_in_4bit': True,  # 启用 4-bit 量化
    'bnb_4bit_compute_dtype': torch.bfloat16,  # 计算使用 bfloat16
    'bnb_4bit_use_double_quant': True,  # 使用双重量化
    'bnb_4bit_quant_type': 'nf4',  # 量化类型，nf4
}

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config,device_map="auto",torch_dtype=torch.bfloat16)

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.88s/it]


In [10]:
model.enable_input_require_grads()

In [11]:
model.dtype

torch.bfloat16

# Qlora 

In [12]:
from peft import LoraConfig, TaskType, get_peft_model

# QLoRA 配置
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)

# 应用 QLoRA 配置
model = get_peft_model(model, config)

In [13]:
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


# 配置训练参数

In [14]:
args = TrainingArguments(
    output_dir="./output/Qwen2_instruct_Qlora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100, 
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True
)

In [15]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [None]:
trainer.train()

# 合并加载模型

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import bitsandbytes as bnb

# 设置路径
lora_path = './output/Qwen2_instruct_Qlora/checkpoint-500'  # 你的 QLoRA 权重路径
model_name = 'qwen/Qwen2-7B-Instruct'  # 基础模型

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 配置量化
bnb_config = {
    'load_in_4bit': True,  # 启用 4-bit 量化
    'bnb_4bit_compute_dtype': torch.bfloat16,  # 计算使用 bfloat16
    'bnb_4bit_use_double_quant': True,  # 使用双重量化
    'bnb_4bit_quant_type': 'nf4',  # 量化类型，nf4
}

# 加载基础模型并应用量化
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # 自动分配设备
    torch_dtype=torch.bfloat16,  # 使用 bfloat16
    trust_remote_code=True,  # 信任远程代码
    quantization_config=bnb_config  # 应用量化配置
).eval()

# 加载 QLoRA 权重
model = PeftModel.from_pretrained(model, lora_path)

# 模型设置为推理模式
model.eval()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.96s/it]


模型和 QLoRA 权重加载成功！


# 模型推理

In [23]:
prompt = "我想祝福郑同学端午节快乐，严肃风格"
inputs = tokenizer.apply_chat_template([{"role": "user", "content": "你现在是一个送祝福大师，帮我针对不同人和事情、节日送对应的祝福。"},{"role": "user", "content": prompt}],
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       ).to('cuda')


gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

愿端午佳节，艾叶飘香，龙舟竞渡，为你带来吉祥与安康。愿你在学习之余，享受传统节日的韵味，身心愉悦，才思泉涌，前程似锦。祝郑同学端午节快乐，万事如意！
