In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
# ── 1. Load dataset ──
data_cache = '/root/autodl-tmp/data_cache'
ds = load_dataset('august66/hh_helpful_base', cache_dir=data_cache, split='train')
print(ds)
print(ds[0])

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 46070
})
{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?', 'role': 'user'}, {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.', 'role': 'assistant'}, {'content': 'Okay. What else is needed to play, and what are the rules?', 'role': 'user'}], 'chosen': [{'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.', 'role': 'assistant'}], 'rejected': [{'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.', 'role': 'assistant'}]}


In [2]:
# ── 2. Format dataset for RewardTrainer ──
# RewardTrainer expects columns: "chosen" and "rejected", each as a list of {role, content} messages.
# Our dataset has: prompt (shared conversation), chosen (response), rejected (response).
# We need to merge prompt+chosen and prompt+rejected into full conversations.

def format_for_reward(example):
    return {
        "chosen": example["prompt"] + example["chosen"],
        "rejected": example["prompt"] + example["rejected"],
    }

reward_dataset = ds.map(format_for_reward, remove_columns=["prompt"])
reward_dataset = reward_dataset.shuffle(seed=42)

# Split off a small eval set (5%)
split = reward_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
print(f"Train: {len(train_dataset)} rows, Eval: {len(eval_dataset)} rows")
print("Chosen example:", train_dataset[0]["chosen"][:2])

Train: 43766 rows, Eval: 2304 rows
Chosen example: [{'content': 'Do all turtles share a common ancestor?', 'role': 'user'}, {'content': 'The group of turtles you’re referring to is known as “Testudines”, which means “turtle”.  We can get more specific and look at the family Testudinidae, which means “Turtle family”.', 'role': 'assistant'}]


In [3]:
reward_dataset[0]

{'chosen': [{'content': 'How do you draw a horse?', 'role': 'user'},
  {'content': 'Are you talking about something like this?\n\n[]',
   'role': 'assistant'},
  {'content': "I can't see anything.", 'role': 'user'},
  {'content': 'Did you try using Chrome?', 'role': 'assistant'}],
 'rejected': [{'content': 'How do you draw a horse?', 'role': 'user'},
  {'content': 'Are you talking about something like this?\n\n[]',
   'role': 'assistant'},
  {'content': "I can't see anything.", 'role': 'user'},
  {'content': 'It’s not clear to me.  Which aspect of the drawing is unclear to you?',
   'role': 'assistant'}]}

In [4]:
# ── 3. Load Qwen2.5-1.5B as reward model ──
model_name = "Qwen/Qwen2.5-1.5B"
model_cache = '/root/autodl-tmp/model_cache'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir = model_cache)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir = model_cache
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Reward model loaded: {model_name}")
print(f"Model output: {model.config.num_labels} label(s) (scalar reward)")

`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reward model loaded: Qwen/Qwen2.5-1.5B
Model output: 1 label(s) (scalar reward)


In [5]:
# ── 4. Configure & train (with early stopping + strong regularization) ──
from transformers import EarlyStoppingCallback


training_args = RewardConfig(
    output_dir="/root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/output/eval_tmp",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    bf16=True,
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    max_length=1024,
    eval_strategy="steps",
    eval_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to="none",
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print(f"Training reward model for up to {training_args.num_train_epochs} epoch")
print(f"lr={training_args.learning_rate}, weight_decay={training_args.weight_decay}, label_smoothing={training_args.label_smoothing_factor}")
print(f"Early stopping: patience=3 (stops if accuracy doesn't improve for 3 evals)")
trainer.train()

Filtering train >1024 tokens:   5%|▍         | 2000/43766 [00:00<00:06, 6889.50 examples/s]

Filtering train >1024 tokens: 100%|██████████| 43766/43766 [00:06<00:00, 6964.00 examples/s]
Filtering eval >1024 tokens: 100%|██████████| 2304/2304 [00:00<00:00, 7310.01 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Training reward model for up to 5 epoch
lr=1e-06, weight_decay=0.1, label_smoothing=0.1
Early stopping: patience=3 (stops if accuracy doesn't improve for 3 evals)


Step,Training Loss,Validation Loss
200,0.7077,0.693641
400,0.6904,0.693268
600,0.7011,0.691345
800,0.7015,0.686767
1000,0.6897,0.682025
1200,0.665,0.674176
1400,0.6678,0.666893
1600,0.6629,0.660485
1800,0.6652,0.656429
2000,0.6659,0.651331


TrainOutput(global_step=5200, training_loss=0.6636088631703303, metrics={'train_runtime': 4713.9698, 'train_samples_per_second': 46.414, 'train_steps_per_second': 2.901, 'total_flos': 4.7858559613055386e+17, 'train_loss': 0.6636088631703303})

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os

hub_repo = "august66/Qwen2.5-1.5B-Base-reward-hh-helpful-early-stop"
model = trainer.model
tokenizer = trainer.processing_class
model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)
print(f"Pushed to https://huggingface.co/{hub_repo}")

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_HUB_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HUGGINGFACE_HUB_ENDPOINT"] = "https://hf-mirror.com"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

model = AutoModelForSequenceClassification.from_pretrained(hub_repo, cache_dir = '/root/autodl-tmp/model_cache')
tokenizer = AutoTokenizer.from_pretrained(hub_repo, cache_dir = '/root/autodl-tmp/model_cache')



Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (0 / 1)                :   0%|          |  105kB / 3.09GB,  106kB/s  
[A
Processing Files (0 / 1)                :   0%|          |  123kB / 3.09GB, 87.5kB/s  
Processing Files (0 / 1)                :   0%|          |  171kB / 3.09GB,  107kB/s  
Processing Files (0 / 1)                :   0%|          |  260kB / 3.09GB,  145kB/s  
Processing Files (0 / 1)                :   0%|          |  388kB / 3.09GB,  194kB/s  
Processing Files (0 / 1)                :   0%|          | 1.08MB / 3.09GB,  489kB/s  
[A
[A
Processing Files (0 / 1)                :   0%|          | 1.68MB / 3.09GB,  598kB/s  
[A
Processing Files (0 / 1)                :   0%|          | 2.47MB / 3.09GB,  773kB/s  
Processing Files (0 / 1)                :   0%|          | 6.44MB / 3.09GB, 1.89MB/s  
Processing Files (0 / 1)                :   0%|          | 9.49MB / 3.09GB, 2.64MB/s  
[A
Processing Files (0 / 1

Pushed to https://huggingface.co/august66/Qwen2.5-1.5B-Base-reward-hh-helpful-early-stop


In [None]:
# ── Evaluate pre-trained reward model on eval_dataset ──
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np

eval_model_name = "blakenp/Qwen2.5-1.5B-Reward"

eval_tokenizer = AutoTokenizer.from_pretrained(eval_model_name, trust_remote_code=True, cache_dir='/root/autodl-tmp/model_cache')
eval_model = AutoModelForSequenceClassification.from_pretrained(
    eval_model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir='/root/autodl-tmp/model_cache',
)

if eval_tokenizer.pad_token is None:
    eval_tokenizer.pad_token = eval_tokenizer.eos_token
eval_model.config.pad_token_id = eval_tokenizer.pad_token_id

eval_model.eval()
rc_list, rr_list = [], []
batch_size = 8

for i in range(0, len(eval_dataset), batch_size):
    batch = eval_dataset[i : i + batch_size]
    
    chosen_texts = [eval_tokenizer.apply_chat_template(c, tokenize=False) for c in batch["chosen"]]
    rejected_texts = [eval_tokenizer.apply_chat_template(r, tokenize=False) for r in batch["rejected"]]
    
    tok_chosen = eval_tokenizer(chosen_texts, padding=True, truncation=True, max_length=1024, return_tensors="pt").to(eval_model.device)
    tok_rejected = eval_tokenizer(rejected_texts, padding=True, truncation=True, max_length=1024, return_tensors="pt").to(eval_model.device)
    
    with torch.no_grad():
        rc = eval_model(**tok_chosen)["logits"]
        rr = eval_model(**tok_rejected)["logits"]
    rc_list.append(rc.float().cpu().numpy())
    rr_list.append(rr.float().cpu().numpy())

rewards_chosen = np.concatenate(rc_list).flatten()
rewards_rejected = np.concatenate(rr_list).flatten()
accuracy = np.mean(rewards_chosen > rewards_rejected)

print(f"\nModel: {eval_model_name}")
print(f"Eval on {len(eval_dataset)} pairs:")
print(f"  Pairwise Accuracy: {accuracy:.4f} ({int(np.sum(rewards_chosen > rewards_rejected))}/{len(rewards_chosen)})")
print(f"  Avg Chosen Reward:   {np.mean(rewards_chosen):.4f}")
print(f"  Avg Rejected Reward: {np.mean(rewards_rejected):.4f}")
print(f"  Avg Margin:          {np.mean(rewards_chosen - rewards_rejected):.4f}")

ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating