In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Directory handed to `load_dataset` as its cache location.
data_cache = '/Users/august/Documents/LLM Research/new_self_play_drpo/data_cache'

# Only the 'train' split of the hh_helpful_base dataset is requested.
ds_hh_helpful = load_dataset(
    'august66/hh_helpful_base',
    split='train',
    cache_dir=data_cache,
)

In [9]:
# Show the first record to check its layout ('prompt', 'chosen' and 'rejected' message lists).
ds_hh_helpful[0]

{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?',
   'role': 'user'},
  {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
   'role': 'assistant'},
  {'content': 'Okay. What else is needed to play, and what are the rules?',
   'role': 'user'}],
 'chosen': [{'content': 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.',
   'role': 'assistant'}],
 'rejected': [{'content': 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.',
   'role': 'assistant'}]}

In [12]:
from datasets import Dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
import torch

# ── 1. Build SFT dataset: prompt+chosen and prompt+rejected as separate rows ──
def build_sft_conversations(example):
    """Pair the shared prompt with each candidate reply.

    Args:
        example: Mapping with "prompt", "chosen" and "rejected" entries, each
            a list of chat messages.

    Returns:
        A dict with a single "conversations" key holding two lists, in this
        order: prompt + chosen, then prompt + rejected. The input lists are
        concatenated into new lists and are not modified.

    NOTE(review): no visible cell calls this helper; the expansion code
    builds its rows inline under a "messages" key instead.
    """
    shared_turns = example["prompt"]
    return {
        "conversations": [
            shared_turns + example[reply_key]
            for reply_key in ("chosen", "rejected")
        ]
    }

# Flatten the preference pairs: every source row yields two chat transcripts,
# the prompt followed by the chosen reply, then the prompt followed by the
# rejected reply (same ordering as appending them one after the other).
all_convos = [
    {"messages": row["prompt"] + row[reply_key]}
    for row in ds_hh_helpful
    for reply_key in ("chosen", "rejected")
]

# Build the dataset and shuffle with a fixed seed so the row order is repeatable.
sft_dataset = Dataset.from_list(all_convos).shuffle(seed=42)

print(f"SFT dataset: {len(sft_dataset)} rows (2x {len(ds_hh_helpful)} original)")
first_messages = sft_dataset[0]["messages"]
print("Example:", first_messages[:2])

SFT dataset: 92378 rows (2x 46189 original)
Example: [{'content': 'How do you void a check?', 'role': 'user'}, {'content': 'If you wish to void a check, first you need to cash it at your bank.  Once the check is cashed, it is no longer valid, and you can claim a refund on your bank’s balance.', 'role': 'assistant'}]


In [None]:
# ── 2. Load Qwen2.5-1.5B base model & tokenizer ──
model_name = "Qwen/Qwen2.5-1.5B"

# `trust_remote_code=True` is deliberately not passed: it opts in to running
# Python shipped inside the Hub repository, and Qwen2.5 checkpoints use the
# Qwen2 architecture that is built into transformers (>= 4.37), so no
# repository-supplied code is required to load them.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # weights are loaded directly in bfloat16
    device_map="auto",           # let accelerate choose the device placement
)

# Padding batches requires a pad token; fall back to EOS only when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded: {model_name}")
print(f"Chat template available: {tokenizer.chat_template is not None}")

In [None]:
# ── 3. Configure & run SFT training ──
import dataclasses

output_dir = "/Users/august/Documents/LLM Research/new_self_play_drpo/self_play_drpo_code/SFT_codes/sft_qwen25_hh_helpful"

# TRL renamed SFTConfig's `max_seq_length` to `max_length`; releases that
# dropped the old name reject it with a TypeError. Pick whichever field the
# installed SFTConfig declares so the cell runs on either side of the rename.
sft_config_fields = {field.name for field in dataclasses.fields(SFTConfig)}
max_length_key = "max_length" if "max_length" in sft_config_fields else "max_seq_length"

training_args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    logging_steps=50,
    save_strategy="epoch",   # one checkpoint per epoch ...
    save_total_limit=2,      # ... keeping at most two on disk
    dataset_kwargs={"skip_prepare_dataset": False},
    report_to="none",        # no external experiment tracker
    **{max_length_key: 1024},  # sequence-length cap in tokens, under the name this TRL expects
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=sft_dataset,
    processing_class=tokenizer,
)

# Samples per optimizer step = per-device batch x accumulation steps x number
# of data-parallel processes (world_size is 1 for a single-process run).
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
    * training_args.world_size
)

print(f"Training for {training_args.num_train_epochs} epochs on {len(sft_dataset)} examples")
print(f"Effective batch size: {effective_batch_size}")
trainer.train()

In [None]:
# ── 4. Save the final model & tokenizer ──
# Both artifacts are written into the same directory: the trainer saves the
# model first, then the tokenizer files are stored next to it.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(output_dir)

print(f"Model saved to {output_dir}")

In [14]:
# Show the first shuffled row to check the 'messages' layout (alternating user/assistant turns).
sft_dataset[0]

{'messages': [{'content': 'How do you void a check?', 'role': 'user'},
  {'content': 'If you wish to void a check, first you need to cash it at your bank.  Once the check is cashed, it is no longer valid, and you can claim a refund on your bank’s balance.',
   'role': 'assistant'},
  {'content': 'So you have to cash it firtst?', 'role': 'user'},
  {'content': 'Yes, you must first cash the check before you can void it.',
   'role': 'assistant'},
  {'content': 'I always thought it had to be voided before cashing?',
   'role': 'user'},
  {'content': 'When you void a check, you are asking the bank to void it from your account.  When you “cash” a check, you are taking money from your account.  You cannot do both at the same time, as it is a transaction on two different accounts.  You must first cash the check, before you can void it.',
   'role': 'assistant'}]}

In [15]:
# Quick check: what the tokenizer will produce from your messages
from transformers import AutoTokenizer

# `trust_remote_code=True` is not needed here: the Qwen2.5 tokenizer is
# supported natively by transformers, and the flag would opt in to executing
# Python shipped inside the Hub repository.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")

# Render one conversation through the chat template as plain text (no token
# ids) so the exact string layout can be inspected.
sample = sft_dataset[0]["messages"]
formatted = tokenizer.apply_chat_template(sample, tokenize=False)
print(formatted)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
How do you void a check?<|im_end|>
<|im_start|>assistant
If you wish to void a check, first you need to cash it at your bank.  Once the check is cashed, it is no longer valid, and you can claim a refund on your bank’s balance.<|im_end|>
<|im_start|>user
So you have to cash it firtst?<|im_end|>
<|im_start|>assistant
Yes, you must first cash the check before you can void it.<|im_end|>
<|im_start|>user
I always thought it had to be voided before cashing?<|im_end|>
<|im_start|>assistant
When you void a check, you are asking the bank to void it from your account.  When you “cash” a check, you are taking money from your account.  You cannot do both at the same time, as it is a transaction on two different accounts.  You must first cash the check, before you can void it.<|im_end|>

