# Load dataset

加载数据集，并将其转换为与 TRL RewardTrainer 兼容的格式。

数据集转换的方式可以参考 Hugging Face 上的教程：https://huggingface.co/docs/trl/main/en/reward_trainer

In [None]:
from datasets import DatasetDict, load_dataset

# Glob pattern for the local parquet files that load_dataset reads below.
dataset_path = "./COIG-P/data/*.parquet"

def make_conversation(example):
    """Build chat-format ``chosen``/``rejected`` conversations from one raw row.

    The ``"value"`` of the first entry in ``example["conversations"]`` is used
    as the user prompt; the ``"value"`` fields of ``example["chosen"]`` and
    ``example["rejected"]`` are the two candidate assistant replies.

    Returns:
        A dict with keys ``"chosen"`` and ``"rejected"``. Each value is a
        two-message list: the user prompt followed by the assistant reply.
    """
    user_text = example.get("conversations")[0]["value"]
    replies = {
        "chosen": example.get("chosen")["value"],
        "rejected": example.get("rejected")["value"],
    }
    # Each conversation gets its own user-message dict, so the two lists
    # never share mutable state.
    return {
        label: [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": reply},
        ]
        for label, reply in replies.items()
    }


In [None]:
# Raw dataset: load the parquet files matched by dataset_path and show the
# first row of the "train" split.
dataset = load_dataset("parquet", data_files=dataset_path)
print(dataset["train"][0])

In [None]:
# Convert the dataset: run make_conversation over it and show the first
# converted row of the "train" split.
dataset = dataset.map(make_conversation)
print(dataset["train"][0])

In [None]:
# Split the dataset.
# First pass: test_size=0.9 sets 90% of the rows aside, so only the remaining
# 10% (the "train" part of this split) is carried forward.
dataset = dataset["train"].train_test_split(test_size=0.9, seed=42)
# Second pass: divide that retained subset 80/20 into train and test.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict(
    {split_name: split_dataset[split_name] for split_name in ("train", "test")}
)
print(dataset["train"][0])
print(dataset["test"][0])

# Training

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from trl import RewardConfig
from peft import LoraConfig, TaskType

# Load the base model with a single-output classification head (num_labels=1)
# together with its tokenizer.
model_path = "./Qwen2.5-0.5B"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Always mirror the tokenizer's pad id into the model config, not only when the
# fallback above fires: the sequence-classification head relies on
# config.pad_token_id to find the last non-padding token and cannot handle
# batches larger than 1 when it is unset. Using the tokenizer's id (rather than
# the model's eos_token_id) keeps it equal to what batches are padded with.
model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration for the reward model.
training_args = RewardConfig(
    output_dir="./Qwen2.5-0.5B-Reward",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=1.0e-3,  # a larger learning rate is workable for LoRA training
    warmup_ratio=0.1,
    logging_first_step=True,
    logging_steps=5,
    logging_strategy="steps",
    save_steps=100,
    eval_strategy="steps",
    eval_steps=1000,
    gradient_checkpointing=True,
    report_to="tensorboard",
    logging_dir="./reward_logs",
    # Enable bf16 only when a CUDA device is present AND it supports bf16;
    # a CUDA device alone does not guarantee bf16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

# LoRA configuration: adapters on the attention and MLP projection layers of a
# sequence-classification model.
# NOTE(review): lora_alpha / r = 16 / 64 = 0.25 — confirm this scaling is intended.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["score"],  # important: ensures the reward head is also saved/trained
)

In [None]:
import torch
from accelerate import logging
from trl import RewardTrainer
import peft

# Build the reward trainer from the model, training arguments, train/test
# splits, tokenizer and LoRA config defined in the earlier cells.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    peft_config=peft_config
)
# Show which device the trainer's arguments resolved to.
print(trainer.args.device)

# Train, then write the model and the tokenizer to the same output directory.
trainer.train()
trainer.save_model("./Qwen2.5-0.5B-Reward")
tokenizer.save_pretrained("./Qwen2.5-0.5B-Reward")