In [None]:
import os
import sys

# --- Environment configuration (must run BEFORE the Hugging Face imports) ---
# huggingface_hub resolves its endpoint from HF_ENDPOINT when it is first
# imported, and datasets / transformers / trl all import it. Setting the
# variable after those imports leaves the default endpoint in effect, so the
# mirror has to be configured up front.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# NOTE(review): the two names below are kept from the original cell; confirm
# whether any tool in this environment actually reads them.
os.environ["HF_HUB_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HUGGINGFACE_HUB_ENDPOINT"] = "https://hf-mirror.com"
# Debugging aid: makes CUDA kernel launches synchronous so errors surface at
# the failing call. It slows training noticeably; drop it for real runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Make the local TRL checkout take precedence over any installed `trl`.
LOCAL_TRL_PARENT = "/root/autodl-tmp/new_self_play_drpo"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

import torch
from datasets import load_dataset
from datasets import Dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm

libgomp: Invalid value for environment variable OMP_NUM_THREADS




In [2]:
# Directory handed to `load_dataset` as its cache location.
data_cache = "/root/autodl-tmp/data_cache"

# Train split of the preference dataset used to build the SFT corpus below.
ds_hh_helpful = load_dataset(
    "august66/hh_helpful_base",
    split="train",
    cache_dir=data_cache,
)

In [3]:
# Peek at the first record to see the row layout: each row carries a `prompt`,
# a `chosen` and a `rejected` field, all lists of {content, role} messages.
ds_hh_helpful[0]

{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?',
   'role': 'user'},
  {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
   'role': 'assistant'},
  {'content': 'Okay. What else is needed to play, and what are the rules?',
   'role': 'user'}],
 'chosen': [{'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.',
   'role': 'assistant'}],
 'rejected': [{'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.',
   'role': 'assistant'}]}

In [4]:
# ── 1. Build SFT dataset: prompt+chosen and prompt+rejected as separate rows ──
def build_sft_conversations(example):
    """Return both full conversations for one preference example.

    Args:
        example: mapping with "prompt", "chosen" and "rejected" entries, each a
            list of message dicts.

    Returns:
        A dict with a single "conversations" key holding two new lists, in
        order: prompt followed by the chosen reply, then prompt followed by
        the rejected reply. The input lists are not modified.
    """
    history = example["prompt"]
    return {
        "conversations": [
            history + example[reply_key] for reply_key in ("chosen", "rejected")
        ]
    }

# Expand each preference row into two SFT rows (prompt+chosen, prompt+rejected).
# The concatenation is delegated to build_sft_conversations so the logic lives
# in one place instead of being duplicated here. Each conversation is stored
# under "messages", the column of the dataset later handed to SFTTrainer.
all_convos = [
    {"messages": conversation}
    for ex in ds_hh_helpful
    for conversation in build_sft_conversations(ex)["conversations"]
]

# Fixed seed so the row order is reproducible across runs.
sft_dataset = Dataset.from_list(all_convos).shuffle(seed=42)

# Invariant: every original row contributes exactly two conversations.
assert len(sft_dataset) == 2 * len(ds_hh_helpful), "unexpected SFT row count"

print(f"SFT dataset: {len(sft_dataset)} rows (2x {len(ds_hh_helpful)} original)")
print("Example:", sft_dataset[0]["messages"][:2])

SFT dataset: 92378 rows (2x 46189 original)
Example: [{'content': 'How do you void a check?', 'role': 'user'}, {'content': 'If you wish to void a check, first you need to cash it at your bank.  Once the check is cashed, it is no longer valid, and you can claim a refund on your bank’s balance.', 'role': 'assistant'}]


In [6]:
# ── 2. Load Qwen2.5-1.5B base model & tokenizer ──
model_name = "Qwen/Qwen2.5-1.5B"
model_cache = "/root/autodl-tmp/model_cache"

# Tokenizer and weights are fetched from the same repo into the same cache dir.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=model_cache,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=model_cache,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Sanity output: which model was loaded and whether a chat template is present.
print(f"Model loaded: {model_name}")
print(f"Chat template available: {tokenizer.chat_template is not None}")

Model loaded: Qwen/Qwen2.5-1.5B
Chat template available: True


In [8]:
# ── 3. Configure & run SFT training ──
output_dir = "/root/autodl-tmp/outputs"

training_args = SFTConfig(
    # Checkpointing
    output_dir=output_dir,
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Precision and data handling
    bf16=True,
    max_seq_length=1024,
    dataset_kwargs={"skip_prepare_dataset": False},
    # Logging
    logging_steps=50,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=sft_dataset,
    args=training_args,
)

# Summarise the run before starting; the batch figure is per-device batch size
# multiplied by the gradient accumulation steps.
print(f"Training for {training_args.num_train_epochs} epochs on {len(sft_dataset)} examples")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
trainer.train()

Converting train dataset to ChatML:  21%|██        | 19134/92378 [00:01<00:04, 17359.43 examples/s]

Converting train dataset to ChatML: 100%|██████████| 92378/92378 [00:05<00:00, 16217.50 examples/s]
Applying chat template to train dataset: 100%|██████████| 92378/92378 [00:07<00:00, 11827.68 examples/s]


example of input after trainer processing {'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow do you void a check?<|im_end|>\n<|im_start|>assistant\nIf you wish to void a check, first you need to cash it at your bank.  Once the check is cashed, it is no longer valid, and you can claim a refund on your bank’s balance.<|im_end|>\n<|im_start|>user\nSo you have to cash it firtst?<|im_end|>\n<|im_start|>assistant\nYes, you must first cash the check before you can void it.<|im_end|>\n<|im_start|>user\nI always thought it had to be voided before cashing?<|im_end|>\n<|im_start|>assistant\nWhen you void a check, you are asking the bank to void it from your account.  When you “cash” a check, you are taking money from your account.  You cannot do both at the same time, as it is a transaction on two different accounts.  You must first cash the check, before you can void it.<|im_end|>\n'}


Tokenizing train dataset: 100%|██████████| 92378/92378 [00:33<00:00, 2773.51 examples/s]
Truncating train dataset: 100%|██████████| 92378/92378 [00:12<00:00, 7331.11 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.


Training for 3 epochs on 92378 examples
Effective batch size: 16


Step,Training Loss
50,2.9895
100,2.9187
150,2.7818
200,2.5905
250,2.3085
300,2.1898
350,2.1121
400,2.0503
450,1.9534
500,1.8725


TrainOutput(global_step=17322, training_loss=1.4324204988064495, metrics={'train_runtime': 11074.2745, 'train_samples_per_second': 25.025, 'train_steps_per_second': 1.564, 'total_flos': 7.513446293319107e+17, 'train_loss': 1.4324204988064495})

In [9]:
# ── 4. Save the final model & tokenizer ──
# Both artefacts are written to the same `output_dir` that the training
# configuration used, so the directory holds the model and tokenizer together.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to /root/autodl-tmp/outputs
