# 指令微调

In [1]:
import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainerCallback
import datasets


@dataclass
class ModelArguments:
    """Arguments related to the model and its configuration."""
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the LLM to fine-tune or its name on the Hugging Face Hub."
        }
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default torch.dtype and load the model under this dtype."
            ),
            "choices": ["bfloat16", "float16", "float32"],
        },
    )

@dataclass
class DataArguments:
    """Arguments related to the data and preprocessing."""
    dataset_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the fine-tuning dataset."
        }
    )
    max_length: int = field(
        default=1024,
        metadata={
            "help": "Max sequence length for tokenization."
        }
    )

def finetune():
    # TODO Step 1: Define an arguments parser and parse the arguments
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # TODO Step 2: Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_args.model_name_or_path,
        trust_remote_code=True
        )
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_args.model_name_or_path, 
        trust_remote_code=True,
        torch_dtype="auto"
    )

    # TODO Step 3: Load dataset
    if data_args.dataset_path is None:
        raise ValueError("Dataset path is required. Please provide --dataset_path.")
    
    dataset = datasets.load_dataset(data_args.dataset_path, split="train")
    
    def preprocess_function(examples):
        # 构建prompt
        if examples.get("input") and examples["input"]:
            prompt = (
                f"Below is an instruction that describes a task, paired with an input that provides further context. "
                f"Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{examples['instruction']}\n\n"
                f"### Input:\n{examples['input']}\n\n"
            )
        else:
            prompt = (
                f"Below is an instruction that describes a task. "
                f"Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{examples['instruction']}\n\n"
            )

        # 答案部分
        answer = examples['output']

        # 合并成完整文本序列
        full_text = prompt + answer

        # 首先仅对 prompt 部分分词，确定prompt长度，以便后面mask
        prompt_encoded = tokenizer(prompt, add_special_tokens=False)
        prompt_len = len(prompt_encoded["input_ids"])

        # 对完整序列进行分词和编码
        encoded = tokenizer(
            full_text,
            padding="max_length",   
            truncation=True,
            max_length=data_args.max_length,         # 根据实际需求调整最大长度
            return_tensors="np"    
        )

        input_ids = encoded["input_ids"][0]  # 因为是单例，通常需要取第0个元素
        attention_mask = encoded["attention_mask"][0]

        # 创建labels，初始与input_ids相同
        labels = input_ids.copy()

        # 将prompt对应的token位置设为-100（不计算loss）
        labels[:prompt_len] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    # 应用到整个数据集
    processed_dataset = dataset.map(preprocess_function)

    # TODO Step 4: Define the data collator function。  
    def data_collator(batch: List[Dict]):
        return {
            "input_ids": torch.tensor([f["input_ids"] for f in batch], dtype=torch.long),
            "attention_mask": torch.tensor([f["attention_mask"] for f in batch], dtype=torch.long),
            "labels": torch.tensor([f["labels"] for f in batch], dtype=torch.long),
        }

    # TODO Step 5: Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    # Step 6: Train!
    trainer.train()

# 传入适当的参数进行微调
# 注意：这里的参数需要根据实际环境和需求调整
sys.argv = [
    "notebook",
    "--model_name_or_path", "./Qwen2.5-0.5B/",
    "--dataset_path", "./alpaca-cleaned/",
    "--output_dir", "./outputs/models/",
    "--overwrite_output_dir", "True",
    "--num_train_epochs", "3",
    "--per_device_train_batch_size", "2",
    "--logging_steps", "1000",
    "--save_steps", "2000",
    "--save_total_limit", "2"
]

finetune()


  trainer = Trainer(


Step,Training Loss
1000,0.2501
2000,0.2264
3000,0.2303
4000,0.2175
5000,0.2256
6000,0.2111
7000,0.2172
8000,0.2006
9000,0.2128
10000,0.2005


In [2]:
 #模型和分词器路径
model_path = "./outputs/models/checkpoint-77640"

# 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

instruction = "Summarize the following text."
input_text = "Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data."

# 拼接输入
formatted_input = f"Instruction: {instruction}\nInput: {input_text}\nOutput:"

# 将输入文本转为张量
inputs = tokenizer(formatted_input, return_tensors="pt")

# # 生成结果
# outputs = model.generate(**inputs, max_length=1024, num_beams=3, no_repeat_ngram_size=2 ,pad_token_id=tokenizer.eos_token_id)

# # 解码输出
# decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("Generated Output:", decoded_output)

generate_ids = model.generate(inputs.input_ids, max_length=1024)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Instruction: Summarize the following text.\nInput: Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.\nOutput: Machine learning is a branch of AI that uses algorithms to analyze data and improve performance.'

# 评测模型

In [1]:
PLM_MODEL_PATH = "./Qwen2.5-0.5B"
SFT_MODEL_PATH = "./outputs/models/checkpoint-38000"

如果你有多个GPU，可以修改下面的--hf-num-gpus参数来加速评测。

In [2]:
!opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path {PLM_MODEL_PATH} \
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 1 \
    --work-dir "outputs/evals/plm" \
    --debug

Traceback (most recent call last):
  File "/root/miniconda3/bin/opencompass", line 5, in <module>
    from opencompass.cli.main import main
ModuleNotFoundError: No module named 'opencompass.cli'


In [3]:
!opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path {SFT_MODEL_PATH} \
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 1 \
    --work-dir "outputs/evals/sft" \
    --debug

Traceback (most recent call last):
  File "/root/miniconda3/bin/opencompass", line 5, in <module>
    from opencompass.cli.main import main
ModuleNotFoundError: No module named 'opencompass.cli'
