In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import sys
import os
LOCAL_TRL_PARENT = "/root/autodl-tmp/new_self_play_drpo"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)
from trl import RewardTrainer, RewardConfig


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ── 1. Load dataset ──
data_cache = '/root/autodl-tmp/data_cache'
ds = load_dataset('august66/hh_helpful_base', cache_dir=data_cache, split='train')
print(ds)
print(ds[0])

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 46189
})
{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?', 'role': 'user'}, {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.', 'role': 'assistant'}, {'content': 'Okay. What else is needed to play, and what are the rules?', 'role': 'user'}], 'chosen': [{'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.', 'role': 'assistant'}], 'rejected': [{'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.', 'role': 'assistant'}]}


In [3]:
# ── 2. Format dataset for RewardTrainer ──
# RewardTrainer expects columns: "chosen" and "rejected", each as a list of {role, content} messages.
# Our dataset has: prompt (shared conversation), chosen (response), rejected (response).
# We need to merge prompt+chosen and prompt+rejected into full conversations.

def format_for_reward(example):
    return {
        "chosen": example["prompt"] + example["chosen"],
        "rejected": example["prompt"] + example["rejected"],
    }

reward_dataset = ds.map(format_for_reward, remove_columns=["prompt"])
reward_dataset = reward_dataset.shuffle(seed=42)

# Split off a small eval set (5%)
split = reward_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
print(f"Train: {len(train_dataset)} rows, Eval: {len(eval_dataset)} rows")
print("Chosen example:", train_dataset[0]["chosen"][:2])

Train: 43879 rows, Eval: 2310 rows
Chosen example: [{'content': 'I need to know how long it would take to travel from Maryland to Michigan by car.', 'role': 'user'}, {'content': 'Sure, I’ll give you some simple rules that can tell you how long it would take.', 'role': 'assistant'}]


In [4]:
reward_dataset[0]

{'chosen': [{'content': 'How do I install solar panels on my roof?',
   'role': 'user'},
  {'content': 'Well, I don’t know for sure. It really depends on how you want to connect them. Are you going to be getting electricity from the utility company, or from a solar power system of your own?',
   'role': 'assistant'},
  {'content': "I'm not sure. I guess with my own power system.",
   'role': 'user'},
  {'content': 'To get your own power system, you need to run some wires from your solar panels to your batteries.  And you also need to figure out what size and type of batteries you want to buy.  Once you know all those details, you can look up how to install your system, and then choose which company to buy it from.',
   'role': 'assistant'},
  {'content': 'How can I make sure the solar panels get enough sun? Do they have to face a certain direction?',
   'role': 'user'},
  {'content': 'That depends on the specific panel you want.  Most panels these days are fixed, and don’t need to be r

In [5]:
# ── 3. Load Qwen2.5-1.5B as reward model ──
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model_cache = '/root/autodl-tmp/model_cache'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir = model_cache)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir = model_cache
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Reward model loaded: {model_name}")
print(f"Model output: {model.config.num_labels} label(s) (scalar reward)")

`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reward model loaded: Qwen/Qwen2.5-1.5B-Instruct
Model output: 1 label(s) (scalar reward)


In [6]:
# ── 4. Configure & train ──
output_dir = "/root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/output/reward_qwen25_hh_helpful"

training_args = RewardConfig(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    logging_strategy="steps",
    logging_steps=250,
    save_strategy="epoch",
    save_total_limit=1,
    max_length=1024,
    eval_strategy="steps",
    eval_steps = 250,
    report_to="none",
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)


print(f"Training reward model for {training_args.num_train_epochs} epochs on {len(train_dataset)} pairs, eval on {len(eval_dataset)}")
trainer.train()

AFTER APPLYING CHAT TEMPLATE {'chosen': "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nI need to know how long it would take to travel from Maryland to Michigan by car.<|im_end|>\n<|im_start|>assistant\nSure, I’ll give you some simple rules that can tell you how long it would take.<|im_end|>\n<|im_start|>user\nI don't need simple rules I just want to know the average travel time by car.<|im_end|>\n<|im_start|>assistant\nThere are several variables here that could affect the actual time that a trip takes. Some of the variables are that your travel can vary according to weather and traffic conditions, but I could estimate the average based on historical data that shows what the average travel time in an average year is for a car traveling from Maryland to Michigan on a busy highway.<|im_end|>\n<|im_start|>user\nHow long would it take?<|im_end|>\n<|im_start|>assistant\nAbout 6 hours.<|im_end|>\n", 'rejected': "<|im_st

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Training reward model for 1 epochs on 43879 pairs, eval on 2310


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
250,0.6468,0.589638,0.688802
500,0.5698,0.548966,0.711931
750,0.5444,0.545805,0.715401
1000,0.5503,0.534463,0.735652
1250,0.5387,0.531082,0.727549
1500,0.5246,0.529802,0.738075
1750,0.5319,0.524367,0.734835
2000,0.5343,0.521616,0.731856
2250,0.531,0.520708,0.731739
2500,0.5301,0.520854,0.732236






















TrainOutput(global_step=2742, training_loss=0.5492058996733394, metrics={'train_runtime': 2149.4122, 'train_samples_per_second': 20.41, 'train_steps_per_second': 1.276, 'total_flos': 0.0, 'train_loss': 0.5492058996733394, 'epoch': 1.0})

In [None]:
# ── 5. Save ──
output_dir = "/root/autodl-tmp/new_self_play_drpo/output/reward_qwen25_hh_helpful"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Reward model saved to {output_dir}")

Reward model saved to /root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/output/reward_qwen25_hh_helpful


In [7]:
# ── Evaluate pre-trained reward model on eval_dataset ──
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
import torch
eval_model_name = "Kyleyee/Qwen2.5-1.5B-reward-hh-retrain"
model_cache = '/root/autodl-tmp/model_cache'

eval_tokenizer = AutoTokenizer.from_pretrained(eval_model_name, trust_remote_code=True, cache_dir = model_cache)
eval_model = AutoModelForSequenceClassification.from_pretrained(
    eval_model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir = model_cache
)

if eval_tokenizer.pad_token is None:
    eval_tokenizer.pad_token = eval_tokenizer.eos_token
eval_model.config.pad_token_id = eval_tokenizer.pad_token_id

# Set up a RewardTrainer just for evaluation (no training)
eval_args = RewardConfig(
    output_dir="/root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/output/eval_tmp",
    per_device_eval_batch_size=8,
    max_length=1024,
    bf16=True,
    report_to="none",
)

eval_trainer = RewardTrainer(
    model=eval_model,
    args=eval_args,
    train_dataset=eval_dataset,  # required by RewardTrainer init, won't be used
    eval_dataset=eval_dataset,
    processing_class=eval_tokenizer,
)

metrics = eval_trainer.evaluate()
print(f"Model: {eval_model_name}")
print(f"Eval results on {len(eval_dataset)} pairs:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.51it/s]
Map: 100%|██████████| 2310/2310 [00:00<00:00, 5677.56 examples/s]


AFTER APPLYING CHAT TEMPLATE {'chosen': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow much money should I give my 10 year old twins for allowances?<|im_end|>\n<|im_start|>assistant\nOkay, here’s an example situation that’s similar to yours: \n\nI was recently supervising a 10 year old-girl who wanted to start contributing to household expenses and had never before received an allowance.  I asked her to think about what she and her family needed the most from her, and she responded that it was clothing and toys.  I recommended she be given $40/month for clothing and $10/month for toys, and that she be allowed to use some of the $40 for food, if she chooses.  She’s really good about money and a great credit risk, so I’m comfortable with that.  Does that sound like a good example for you?<|im_end|>\n', 'rejected': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow much money should I give my 10 year old twins for allowan

Map: 100%|██████████| 2310/2310 [00:01<00:00, 1552.46 examples/s]
Filter: 100%|██████████| 2310/2310 [00:00<00:00, 6195.43 examples/s]
Filter: 100%|██████████| 2310/2310 [00:00<00:00, 4265.25 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Model: Kyleyee/Qwen2.5-1.5B-reward-hh-retrain
Eval results on 2310 pairs:
  eval_loss: 0.2101
  eval_model_preparation_time: 0.0026
  eval_accuracy: 0.9831
  eval_runtime: 26.9518
  eval_samples_per_second: 85.7080
  eval_steps_per_second: 10.7230




In [6]:
eval_dataset

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 2310
})