In [1]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification,DebertaV2ForSequenceClassification, GPTNeoXForCausalLM
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template
from trl.trainer.drpo_utils import PairRMPipeline

def process_split(original, seed = 42):
    swapped = original.map(lambda x: {
        'a1': x['a2'],
        'a2': x['a1'],
        # 'rank': 1 - int(random.random() < x['chosen_preference']),
        'rank': 1 - x['rank'],
    })

    return concatenate_datasets([original, swapped]).shuffle(seed=seed)

def load_model(model_path, task = 'generation', model_type = 'decoder', model_cache_path =  '/workspace/model_cache'):

    model_args = ModelConfig(model_path)
    model_torch_dtype = torch.bfloat16
    model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
    )

    padding_side = 'left' if model_type == 'decoder' else 'right'
    truncation_side = 'left' if model_type == 'decoder' else 'right'

    if task == 'generation':
        model_instance = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )

    elif task == 'reward':
        model_instance = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )
    

    model_tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, 
        padding_side = padding_side, 
        truncation_side = truncation_side,
        use_fast = True,
        trust_remote_code = model_args.trust_remote_code,
        cache_dir = model_cache_path
    )

    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    if getattr(model_instance.config, "pad_token_id", None) is None:
        model_instance.config.pad_token_id = model_tokenizer.pad_token_id

    if model_tokenizer.eos_token is None:
        model_tokenizer.eos_token = model_tokenizer.pad_token  

    if getattr(model_instance.config, "eos_token_id", None) is None:
        model_instance.config.eos_token_id = model_tokenizer.eos_token_id

    return model_instance, model_tokenizer



data_cache_path = "/workspace/dataset"
model_cache_path = '/workspace/model_cache'
ds_path = 'august66/drpo_hh_qwen2.5_1.5b'
ref_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
target_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
dpo_policy_path = 'august66/hh_qwen_1.5b_dpo_model_2'

#load training argument for drpo
with open("/workspace/Self_play_DRPO/self_play_drpo_code/training_config/config_normal_dist.yaml", "r") as f:
    training_args_config = yaml.safe_load(f)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed = 1234
ref_policy_model, ref_policy_tokenizer = load_model(ref_policy_path)
target_policy_model, target_policy_tokenizer = load_model(target_policy_path)
dpo_policy_model, dpo_policy_tokenizer = load_model(dpo_policy_path)
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')
drpo_train = process_split(drpo_train)
drpo_train = drpo_train.shuffle(seed=seed)

training_args = DRPOConfig(
    **training_args_config
)
preference_pipeline = PairRMPipeline(
    model_name_or_path = training_args.preference_model_id,
)

trainer = DRPOTrainer(
    model=target_policy_model,
    ref_model=ref_policy_model,
    dpo_model = dpo_policy_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train,
    processing_class=ref_policy_tokenizer,
    args=training_args
)
trainer.train()

after chat template dataset sample: {'prompt': "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nAny tips on how to revise something you wrote?<|im_end|>\n<|im_start|>assistant\nThat depends on what you mean by revise, I think. Do you mean you’ve already written something and you want to fix mistakes and add detail, or do you want to make some major changes?<|im_end|>\n<|im_start|>user\nCould be either. It's a hypothetical scenario. I'm not actually rewriting or revising anything. I just want your advice on how to do this.<|im_end|>\n<|im_start|>assistant\nOK, that’s fine. If it’s hypothetical, then I’m happy to talk about whatever you have in mind, if it’s a useful topic of conversation.<|im_end|>\n<|im_start|>user\nSo what's your advice for rewrite or revision?<|im_end|>\n<|im_start|>assistant\n", 'a1': "Rewriting or revising can be a challenging task, but there are a few general tips that may help:\n\n  1. Take a b

Tokenizing train dataset: 100%|██████████| 87670/87670 [01:42<00:00, 854.65 examples/s]
[34m[1mwandb[0m: Currently logged in as: [33mooooaugust[0m ([33mooooaugust-london-school-of-economics-and-political-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Allocated   (GB): 11.409146785736084


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Allocated   (GB): 13.402694702148438
Allocated   (GB): 13.40269660949707
Allocated   (GB): 13.402700424194336
{'reward': 0.028594970703125, 'kl_ref_theta_star': 121.0, 'kl_theta_ref': -0.007459861692041159, 'kl_theta_theta_star': 83.57323455810547, 'scale': 3.0}
Allocated   (GB): 13.95131254196167
Allocated   (GB): 16.071962356567383
Allocated   (GB): 16.071961879730225
Allocated   (GB): 16.07196807861328
{'reward': 0.05770111083984375, 'kl_ref_theta_star': 162.0, 'kl_theta_ref': 0.0010937975021079183, 'kl_theta_theta_star': 84.36793518066406, 'scale': 3.0}
Allocated   (GB): 14.871642589569092
Allocated   (GB): 17.227574825286865
Allocated   (GB): 17.227578163146973
Allocated   (GB): 17.22758197784424
{'reward': 0.457000732421875, 'kl_ref_theta_star': 90.5, 'kl_theta_ref': 0.003996164072304964, 'kl_theta_theta_star': 121.81249237060547, 'scale': 3.0}
Allocated   (GB): 13.883245944976807
Allocated   (GB): 15.92759895324707
Allocated   (GB): 15.927598476409912
Allocated   (GB): 15.927605

Step,Training Loss
5,0.0057


Allocated   (GB): 17.476655960083008
Allocated   (GB): 19.952197074890137
Allocated   (GB): 19.95219898223877
Allocated   (GB): 19.952205181121826
{'reward': 1.55731201171875, 'kl_ref_theta_star': 94.5, 'kl_theta_ref': 0.28174740076065063, 'kl_theta_theta_star': 102.5602035522461, 'scale': 3.0}
Allocated   (GB): 19.718765258789062
Allocated   (GB): 22.114320755004883
Allocated   (GB): 22.114319801330566
Allocated   (GB): 22.114327907562256
{'reward': 0.0060272216796875, 'kl_ref_theta_star': 152.0, 'kl_theta_ref': 0.4056096374988556, 'kl_theta_theta_star': 102.4009780883789, 'scale': 3.0}
Allocated   (GB): 20.48171615600586
Allocated   (GB): 23.350520133972168
Allocated   (GB): 23.350520610809326
Allocated   (GB): 23.3505277633667
{'reward': 0.1794891357421875, 'kl_ref_theta_star': 87.0, 'kl_theta_ref': 0.05697329342365265, 'kl_theta_theta_star': 75.77232360839844, 'scale': 3.0}
Allocated   (GB): 20.254658222198486
Allocated   (GB): 22.467305183410645
Allocated   (GB): 22.46730661392212

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x785ec02816d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7860a4131f10, execution_count=2 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 7860a4131590, raw_cell="seed = 1234
ref_policy_model, ref_policy_tokenizer.." transformed_cell="seed = 1234
ref_policy_model, ref_policy_tokenizer.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22413130305f72756e706f64227d/workspace/Self_play_DRPO/self_play_drpo_code/test_codes/drpo_normal_dist.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
target_model, tokenizer = load_model(target_policy_path)
ref_model, tokenizer = load_model(target_policy_path)
target_model.to('cuda')
ref_model.to('cuda')
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')

In [None]:
prompt = '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou will be given a definition of a task first, then some input of the task.\nIn this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.\n\nએક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.\nOutput:<|im_end|>\n<|im_start|>assistant\n'
print (prompt)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
You will be given a definition of a task first, then some input of the task.
In this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.

એક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.
Output:<|im_end|>
<|im_start|>assistant



In [7]:
enc = tokenizer(
    prompt,
    return_tensors = 'pt',
    truncation = True,
    max_length = 1024
).to('cuda')


In [9]:
outputs_1 = ref_model.generate(
    **enc,
    do_sample = False, 
    max_new_tokens = 1024,
    temperature = 0
)
outputs_2 = target_model.generate(
    **enc,
    do_sample = False, 
    max_new_tokens = 1024,
    temperature = 0
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
