In [2]:
# Core dependencies for model loading and generation.
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
# Put the local checkout directory at the front of sys.path BEFORE `trl` is
# imported below, so a `trl` package located under it takes precedence.
# NOTE(review): presumably this is the fork that provides DRPOTrainer/DRPOConfig — confirm.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)
# Load the PairRM ranker once; `blender` is the global used by get_preference().
import llm_blender
blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM") 
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template
from datasets import load_dataset, concatenate_datasets, DatasetDict
# Local cache directory passed to load_dataset for downloaded datasets.
data_cache_path = "/workspace/dataset"
#ultrafeedback_ds = load_dataset('august66/DRPO_data_from_ultrafeed_new_template', split="train", cache_dir=data_cache_path)
# Source prompts: the train split of the HH dataset; its "prompt" column is read by prepare_dataset().
hh_ds = load_dataset('Kyleyee/train_data_hh_for_drpo', split = 'train', cache_dir=data_cache_path)

  from .autonotebook import tqdm as notebook_tqdm


Successfully loaded ranker from  /root/.cache/huggingface/hub/llm-blender/PairRM


In [2]:
# --- Generator model configuration -------------------------------------------
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
cache_path = "/workspace/model_cache"
model_torch_dtype = torch.bfloat16

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_args = ModelConfig(model_name)

# Load the causal LM weights in bf16 and move them onto the selected device.
lm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_torch_dtype,
    cache_dir=cache_path,
    trust_remote_code=True,
)
lm_model = lm_model.to(device)

# Left padding keeps every prompt right-aligned, which is what lets
# generate_text() strip the prompt by slicing at a fixed column index.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_path,
    use_fast=True,
    padding_side="left",
    trust_remote_code=True,
)

# Only install the fallback template when the tokenizer ships without one.
if tokenizer.chat_template is None:
    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE


def generate_text(prompts, tokenizer, model, temperature,
                  max_new_tokens=512, num_return_sequences=2):
    """Generate continuations for a batch of already-rendered prompt strings.

    Args:
        prompts: List of prompt strings (chat template already applied).
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
            It must pad so that all prompts share one length, because the
            prompt is removed from the output by column slicing.
        model: Causal LM exposing ``.device`` and ``.generate``.
        temperature: Sampling temperature. Values > 0 enable sampling with that
            temperature; otherwise ``do_sample`` is False and no temperature
            is forwarded.
        max_new_tokens: Upper bound on generated tokens per sequence
            (default 512, the value previously hard-coded here).
        num_return_sequences: Number of sequences requested per prompt
            (default 2, the value previously hard-coded here).
            NOTE(review): transformers is expected to reject values > 1 when
            sampling is disabled — confirm before calling with temperature <= 0.

    Returns:
        A flat list of decoded strings with the prompt tokens removed and
        special tokens skipped. prepare_dataset() consumes this list as
        consecutive groups of ``num_return_sequences`` items per prompt.
    """
    inputs = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(model.device)

    sampling = temperature > 0
    generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        "do_sample": sampling,
        "num_return_sequences": num_return_sequences,
        'use_cache': True
    }

    # Only forward a temperature when sampling is actually enabled.
    if sampling:
        generate_kwargs["temperature"] = temperature

    outputs = model.generate(
        **inputs,
        **generate_kwargs
    )

    # Every row starts with the (padded) prompt; keep only what was generated.
    prompt_width = inputs["input_ids"].shape[1]
    generated_ids = outputs[:, prompt_width:]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def get_preference(prompts, response_1, response_2):
    """Order each pair of candidate responses as (preferred, rejected).

    Args:
        prompts: List of prompt strings handed to the ranker.
        response_1: List of first candidates, aligned with ``prompts``.
        response_2: List of second candidates, aligned with ``prompts``.

    Returns:
        ``(a1, a2)``: two lists aligned with ``prompts`` where ``a1[i]`` is the
        candidate the ranker preferred for pair ``i`` and ``a2[i]`` is the other.

    Each pair is ordered by its own comparison result. Previously only the
    first result decided the order of the whole batch, which is only correct
    when a single pair is passed.
    """
    # One verdict per pair; truthy means the first candidate won.
    first_is_better = blender.compare(prompts, response_1, response_2)

    a1, a2 = [], []
    for first_wins, cand_1, cand_2 in zip(first_is_better, response_1, response_2):
        if first_wins:
            a1.append(cand_1)
            a2.append(cand_2)
        else:
            a1.append(cand_2)
            a2.append(cand_1)
    return a1, a2

def truncate_human(texts):
    r"""Cut every text at its first "\n\nHuman" marker.

    Returns a new list with one string per input; a text without the marker
    is returned unchanged.
    """
    marker = "\n\nHuman"
    trimmed = []
    for text in texts:
        # partition() yields everything before the first marker occurrence.
        head, _, _ = text.partition(marker)
        trimmed.append(head)
    return trimmed

def extract_dialogue(examples: dict, tokenizer, model, temperature: float) -> dict:
    """Render each chat in ``examples["prompt"]`` and generate responses for it.

    Every chat (a list of ``{"role", "content"}`` messages) is turned into one
    prompt string with the assistant generation prefix appended, all rounds
    kept. The prompts are passed to generate_text() and each generation is then
    cut at the first "Human" turn marker by truncate_human().

    Returns:
        ``{"generated_response": [...]}`` holding the truncated generations in
        the order generate_text() produced them.
    """
    prompt_strings = []
    for conversation in examples["prompt"]:
        prompt_strings.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True,  # end on the assistant prefix
            )
        )

    raw_generations = generate_text(prompt_strings, tokenizer, model, temperature)
    return {"generated_response": truncate_human(raw_generations)}

def prepare_dataset(batch, tokenizer=tokenizer, model=lm_model, temperature=1.0):
    """Build preference-pair columns for one batch of multi-round chats.

    For every chat in ``batch["prompt"]`` two responses are generated, the
    PairRM ranker (``blender``) decides which one is better, and the pair is
    stored as (a1 = preferred, a2 = other).

    Args:
        batch: Batched ``datasets.map`` input; ``batch["prompt"]`` is a list of
            chats, each a list of ``{"role", "content"}`` messages.
        tokenizer: Tokenizer used for chat rendering and generation.
        model: Causal LM used to generate the candidate responses.
        temperature: Sampling temperature forwarded to generation.

    Returns:
        Dict of equal-length columns:
        ``prompt`` (the untouched chats), ``a1`` / ``a2`` (single-message
        assistant turns, preferred first) and ``rank`` (always 1, since a1 is
        by construction the preferred response).

    Raises:
        ValueError: if generation did not yield exactly two responses per chat.
    """
    # full multi-round chats
    chats = batch["prompt"]   # list[list[dict(role, content)]]

    # two generated responses per chat, laid out as consecutive pairs
    responses = extract_dialogue(batch, tokenizer, model, temperature)["generated_response"]
    if len(responses) != 2 * len(chats):
        raise ValueError(
            f"expected 2 responses per prompt ({2 * len(chats)} total), got {len(responses)}"
        )

    # render full chats (without add_generation_prompt) for the preference model
    rendered_prompts = [
        tokenizer.apply_chat_template(chat, add_generation_prompt=False, tokenize=False)
        for chat in chats
    ]

    first_candidates = responses[0::2]
    second_candidates = responses[1::2]

    # Rank the whole batch in one call instead of one ranker call per example.
    first_is_better = blender.compare(rendered_prompts, first_candidates, second_candidates)

    preferred, rejected = [], []
    for first_wins, cand_1, cand_2 in zip(first_is_better, first_candidates, second_candidates):
        if first_wins:
            preferred.append(cand_1)
            rejected.append(cand_2)
        else:
            preferred.append(cand_2)
            rejected.append(cand_1)

    return {
        "prompt": chats,                                          # <-- keep ALL rounds
        "a1":     [[{"role": "assistant", "content": text}] for text in preferred],
        "a2":     [[{"role": "assistant", "content": text}] for text in rejected],
        "rank":   [1] * len(chats),
    }

In [None]:
# Generate and rank two responses for every HH prompt, 128 prompts per call.
# remove_columns drops the source columns so that only the columns returned by
# prepare_dataset (prompt, a1, a2, rank) remain in the result.
hh_ds_qwen = hh_ds.map(prepare_dataset,
                        batched=True,
                        batch_size=128,
                        remove_columns=hh_ds.column_names,
                        fn_kwargs={"tokenizer": tokenizer, "model": lm_model, "temperature": 1.0})

Ranking candidates: 100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00,  7.81it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00,  4.19it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.11it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.35it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 18.21it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 14.31it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 18.12it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.52it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.84it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.21it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.64it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 14.78it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.46it/s]
Ranking ca

In [47]:
# Publish the generated preference dataset to the Hugging Face Hub.
hh_ds_qwen.push_to_hub('august66/drpo_hh_qwen2.5_1.5b')

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1308.68ba/s]
Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (1 / 1)                : 100%|██████████| 16.1kB / 16.1kB, 80.2kB/s  
[A
Processing Files (1 / 1)                : 100%|██████████| 16.1kB / 16.1kB, 40.2kB/s  
New Data Upload                         : 100%|██████████| 16.1kB / 16.1kB, 40.2kB/s  
                                        : 100%|██████████| 16.1kB / 16.1kB            
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.03s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/august66/drpo_hh_qwen2.5_1.5b/commit/fdd4ea7ed6c74514e2e365c5a0f09dd0ad83f38a', commit_message='Upload dataset', commit_description='', oid='fdd4ea7ed6c74514e2e365c5a0f09dd0ad83f38a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/august66/drpo_hh_qwen2.5_1.5b', endpoint='https://huggingface.co', repo_type='dataset', repo_id='august66/drpo_hh_qwen2.5_1.5b'), pr_revision=None, pr_num=None)

In [3]:
# Reload the published preference dataset from the Hub (train split),
# using the same local dataset cache directory as the source data.
ds_path = 'august66/drpo_hh_qwen2.5_1.5b'
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')

In [4]:
# Display the first record to inspect its prompt / a1 / a2 fields.
drpo_train[0]

{'prompt': [{'content': 'Hi, I want to learn to play horseshoes. Can you teach me?',
   'role': 'user'},
  {'content': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
   'role': 'assistant'},
  {'content': 'Okay. What else is needed to play, and what are the rules?',
   'role': 'user'}],
 'a1': [{'content': "To play horseshoe, you'll need:\n\n- At least two people: one player will throw the shoe and the other will catch it.\n- 8 horseshoes (usually cast iron ones) - they come in pairs with a curved end and an open end.\n\nThe goal is for the person throwing the shoe to toss it so that it lands on top of the opposite horse, which must be caught by their opponent before they can try again. The first person to get all four shoes caught wins the round.\n\nAdditional equipment like gloves may help if the shoes tend to stick to your hands. The most important thing is to have fun and enjoy the sport! If you're new to horseshoe

In [None]:
from datasets import load_dataset
from trl.data_utils import apply_chat_template as trl_apply

# --- Model / tokenizer used for the DPO-format conversion ---------------------
# NOTE(review): this repeats the model/tokenizer loading done in an earlier
# cell; running both in one kernel loads the weights twice.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
cache_path = "/workspace/model_cache"
model_torch_dtype = torch.bfloat16

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_args = ModelConfig(model_name)

lm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_torch_dtype,
    cache_dir=cache_path,
    trust_remote_code=True,
)
lm_model = lm_model.to(device)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_path,
    use_fast=True,
    padding_side="left",
    trust_remote_code=True,
)

# Alias read by render_prompt(): the tokenizer of the target model.
tok = tokenizer

def render_prompt(history):
    """Render a chat history (list of role/content messages) to a prompt string.

    The tokenizer's own ``apply_chat_template`` is preferred and is called with
    ``add_generation_prompt=True`` so the text ends on the assistant prefix.

    When the tokenizer has no such method, TRL's helper is used instead. That
    helper works on an example dict and returns a dict, so the history is
    wrapped as ``{"prompt": history}`` and the rendered ``"prompt"`` entry is
    read back. (Previously the bare message list was passed, which the helper
    cannot process.)
    NOTE(review): the helper picks the generation prompt itself from the last
    message's role — confirm against the installed trl version.
    """
    # Prefer tokenizer's own method if available
    if hasattr(tok, "apply_chat_template"):
        return tok.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    return trl_apply({"prompt": history}, tok)["prompt"]

def as_assistant_text(x):
    """Return the plain response text for an a1/a2 value.

    ``x`` is either a list of messages (``[{"role": ..., "content": ...}]``),
    in which case the first message's content is returned, or anything else
    (e.g. a plain string), which is returned unchanged.
    """
    if not isinstance(x, list):
        return x
    first_message = x[0]
    return first_message["content"]

def to_three_cols(ex):
    """Convert one (prompt, a1, a2, rank) record to prompt/chosen/rejected.

    The chat in ``ex["prompt"]`` is rendered to a single string, and a1/a2 are
    reduced to plain text. ``rank == 1`` means a1 is the chosen response; any
    other value makes a2 the chosen one (adjust if your convention differs).
    """
    rendered = render_prompt(ex["prompt"])
    first = as_assistant_text(ex["a1"])
    second = as_assistant_text(ex["a2"])

    a1_is_chosen = ex["rank"] == 1
    return {
        "prompt": rendered,
        "chosen": first if a1_is_chosen else second,
        "rejected": second if a1_is_chosen else first,
    }

# Load the published preference data and convert every record to the
# prompt / chosen / rejected layout. Columns other than those three (a1, a2,
# rank) are dropped after mapping. A stray `dd` that followed the closing
# parenthesis and made this cell a SyntaxError has been removed.
ds = load_dataset("august66/drpo_hh_qwen2.5_1.5b", split="train")
ds = ds.map(
    to_three_cols,
    remove_columns=[
        c for c in ds.column_names if c not in {"prompt", "chosen", "rejected"}
    ],
)




Generating train split: 100%|██████████| 43835/43835 [00:00<00:00, 240452.66 examples/s]
Map: 100%|██████████| 43835/43835 [00:05<00:00, 7673.61 examples/s]


In [4]:
# Load a saved checkpoint from local disk in bf16; device_map="auto" leaves
# device placement to the loader.
# NOTE(review): presumably the output of a DPO training run (path is dpo_out/checkpoint-16440) — confirm.
ckpt = "/workspace/Self_play_DRPO/self_play_drpo_code/dpo_out/checkpoint-16440" 
target_policy_model = AutoModelForCausalLM.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16, device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


In [7]:
# Upload the loaded checkpoint weights to the Hugging Face Hub as a model repo.
target_policy_model.push_to_hub('august66/hh_qwen_1.5b_dpo_model_2')

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (0 / 1)                :   0%|          |  602kB / 3.09GB,  602kB/s  
Processing Files (0 / 1)                :   0%|          | 5.42MB / 3.09GB, 4.52MB/s  
Processing Files (0 / 1)                :   0%|          | 11.4MB / 3.09GB, 8.17MB/s  
Processing Files (0 / 1)                :   1%|          | 22.3MB / 3.09GB, 13.9MB/s  
Processing Files (0 / 1)                :   1%|          | 37.9MB / 3.09GB, 21.1MB/s  
Processing Files (0 / 1)                :   2%|▏         | 57.8MB / 3.09GB, 28.9MB/s  
Processing Files (0 / 1)                :   3%|▎         | 81.9MB / 3.09GB, 37.2MB/s  
Processing Files (0 / 1)                :   3%|▎         |  107MB / 3.09GB, 44.7MB/s  
Processing Files (0 / 1)                :   4%|▍         |  131MB / 3.09GB, 50.5MB/s  
Processing Files (0 / 1)                :   5%|▌         |  155MB / 3.09GB, 55.5MB/s  
Processing Files (0 / 1)                :  

CommitInfo(commit_url='https://huggingface.co/august66/hh_qwen_1.5b_dpo_model_2/commit/28e33a80177fe0bfe11bfd84b4628782a0d15f50', commit_message='Upload Qwen2ForCausalLM', commit_description='', oid='28e33a80177fe0bfe11bfd84b4628782a0d15f50', pr_url=None, repo_url=RepoUrl('https://huggingface.co/august66/hh_qwen_1.5b_dpo_model_2', endpoint='https://huggingface.co', repo_type='model', repo_id='august66/hh_qwen_1.5b_dpo_model_2'), pr_revision=None, pr_num=None)