In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import sys
import os
LOCAL_TRL_PARENT = "/root/autodl-tmp/new_self_play_drpo"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)
from trl import RewardTrainer, RewardConfig


In [4]:
# ── 1. Load dataset ──
data_cache = '/root/autodl-tmp/data_cache'
ds = load_dataset('august66/hh_helpful_base', cache_dir=data_cache, split='train')
print(ds)
print(ds[0])

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 46189
})
{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?', 'role': 'user'}, {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.', 'role': 'assistant'}, {'content': 'Okay. What else is needed to play, and what are the rules?', 'role': 'user'}], 'chosen': [{'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.', 'role': 'assistant'}], 'rejected': [{'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.', 'role': 'assistant'}]}


In [5]:
# ── 2. Format dataset for RewardTrainer ──
# RewardTrainer expects columns: "chosen" and "rejected", each as a list of {role, content} messages.
# Our dataset has: prompt (shared conversation), chosen (response), rejected (response).
# We need to merge prompt+chosen and prompt+rejected into full conversations.

def format_for_reward(example):
    return {
        "chosen": example["prompt"] + example["chosen"],
        "rejected": example["prompt"] + example["rejected"],
    }

reward_dataset = ds.map(format_for_reward, remove_columns=["prompt"])
reward_dataset = reward_dataset.shuffle(seed=42)
print(f"Reward dataset: {len(reward_dataset)} rows")
print("Chosen example:", reward_dataset[0]["chosen"][:2])

Map: 100%|██████████| 46189/46189 [00:02<00:00, 19144.38 examples/s]

Reward dataset: 46189 rows
Chosen example: [{'content': 'How do I install solar panels on my roof?', 'role': 'user'}, {'content': 'Well, I don’t know for sure. It really depends on how you want to connect them. Are you going to be getting electricity from the utility company, or from a solar power system of your own?', 'role': 'assistant'}]





In [9]:
reward_dataset[0]

{'chosen': [{'content': 'How do I install solar panels on my roof?',
   'role': 'user'},
  {'content': 'Well, I don’t know for sure. It really depends on how you want to connect them. Are you going to be getting electricity from the utility company, or from a solar power system of your own?',
   'role': 'assistant'},
  {'content': "I'm not sure. I guess with my own power system.",
   'role': 'user'},
  {'content': 'To get your own power system, you need to run some wires from your solar panels to your batteries.  And you also need to figure out what size and type of batteries you want to buy.  Once you know all those details, you can look up how to install your system, and then choose which company to buy it from.',
   'role': 'assistant'},
  {'content': 'How can I make sure the solar panels get enough sun? Do they have to face a certain direction?',
   'role': 'user'},
  {'content': 'That depends on the specific panel you want.  Most panels these days are fixed, and don’t need to be r

In [10]:
# ── 3. Load Qwen2.5-1.5B as reward model ──
model_name = "Qwen/Qwen2.5-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Reward model loaded: {model_name}")
print(f"Model output: {model.config.num_labels} label(s) (scalar reward)")

`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reward model loaded: Qwen/Qwen2.5-1.5B
Model output: 1 label(s) (scalar reward)


In [11]:
# ── 4. Configure & train ──
output_dir = "/root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/output/reward_qwen25_hh_helpful"

training_args = RewardConfig(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    max_length=1024,
    report_to="none",
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=reward_dataset,
    processing_class=tokenizer,
)

print(f"Training reward model for {training_args.num_train_epochs} epochs on {len(reward_dataset)} pairs")
trainer.train()

Map: 100%|██████████| 46189/46189 [00:07<00:00, 5936.16 examples/s]


AFTER APPLYING CHAT TEMPLATE {'chosen': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow do I install solar panels on my roof?<|im_end|>\n<|im_start|>assistant\nWell, I don’t know for sure. It really depends on how you want to connect them. Are you going to be getting electricity from the utility company, or from a solar power system of your own?<|im_end|>\n<|im_start|>user\nI'm not sure. I guess with my own power system.<|im_end|>\n<|im_start|>assistant\nTo get your own power system, you need to run some wires from your solar panels to your batteries.  And you also need to figure out what size and type of batteries you want to buy.  Once you know all those details, you can look up how to install your system, and then choose which company to buy it from.<|im_end|>\n<|im_start|>user\nHow can I make sure the solar panels get enough sun? Do they have to face a certain direction?<|im_end|>\n<|im_start|>assistant\nThat depends on the specific panel you want

Map: 100%|██████████| 46189/46189 [00:29<00:00, 1588.14 examples/s]
Filter: 100%|██████████| 46189/46189 [00:07<00:00, 6112.65 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training reward model for 3 epochs on 46189 pairs


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 94.97 GiB of which 8.56 MiB is free. Process 3268 has 86.69 GiB memory in use. Including non-PyTorch memory, this process has 8.26 GiB memory in use. Of the allocated memory 7.26 GiB is allocated by PyTorch, and 374.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# ── 5. Save ──
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Reward model saved to {output_dir}")