In [1]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification,DebertaV2ForSequenceClassification, GPTNeoXForCausalLM
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template
from trl.trainer.drpo_utils import PairRMPipeline

def process_split(original, seed = 42):
    swapped = original.map(lambda x: {
        'a1': x['a2'],
        'a2': x['a1'],
        # 'rank': 1 - int(random.random() < x['chosen_preference']),
        'rank': 1 - x['rank'],
    })

    return concatenate_datasets([original, swapped]).shuffle(seed=seed)

def load_model(model_path, task = 'generation', model_type = 'decoder', model_cache_path =  '/workspace/model_cache'):

    model_args = ModelConfig(model_path)
    model_torch_dtype = torch.bfloat16
    model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
    )

    padding_side = 'left' if model_type == 'decoder' else 'right'
    truncation_side = 'left' if model_type == 'decoder' else 'right'

    if task == 'generation':
        model_instance = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )

    elif task == 'reward':
        model_instance = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )
    

    model_tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, 
        padding_side = padding_side, 
        truncation_side = truncation_side,
        use_fast = True,
        trust_remote_code = model_args.trust_remote_code,
        cache_dir = model_cache_path
    )

    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    if getattr(model_instance.config, "pad_token_id", None) is None:
        model_instance.config.pad_token_id = model_tokenizer.pad_token_id

    if model_tokenizer.eos_token is None:
        model_tokenizer.eos_token = model_tokenizer.pad_token  

    if getattr(model_instance.config, "eos_token_id", None) is None:
        model_instance.config.eos_token_id = model_tokenizer.eos_token_id

    return model_instance, model_tokenizer



data_cache_path = "/workspace/dataset"
model_cache_path = '/workspace/model_cache'
ds_path = 'august66/hh_qwen2.5_1.5b_with_bias'
ref_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
target_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
dpo_policy_path = 'august66/hh_qwen_1.5b_dpo_model_2'
train_split =  'train_chunk_00000'

#load training argument for drpo
with open("/workspace/Self_play_DRPO/self_play_drpo_code/training_config/config_normal_dist.yaml", "r") as f:
    training_args_config = yaml.safe_load(f)


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
training_args = DRPOConfig(
    **training_args_config
)
seed = 1234
ref_policy_model, ref_policy_tokenizer = load_model(ref_policy_path)
target_policy_model, target_policy_tokenizer = load_model(target_policy_path)
dpo_policy_model, dpo_policy_tokenizer = load_model(dpo_policy_path)
preference_model, preference_model_tokenizer = load_model(training_args.preference_model_id, task = 'reward')
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = train_split)


trainer = DRPOTrainer(
    model=target_policy_model,
    ref_model=ref_policy_model,
    dpo_model = dpo_policy_model,
    preference_model=preference_model,
    train_dataset = drpo_train,
    processing_class=ref_policy_tokenizer,
    args=training_args
)

NameError: name 'DRPOConfig' is not defined

In [2]:
training_args = DRPOConfig(
    **training_args_config
)

: 

In [None]:
target_model, tokenizer = load_model(target_policy_path)
ref_model, tokenizer = load_model(target_policy_path)
target_model.to('cuda')
ref_model.to('cuda')
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')

In [None]:
prompt = '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou will be given a definition of a task first, then some input of the task.\nIn this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.\n\nએક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.\nOutput:<|im_end|>\n<|im_start|>assistant\n'
print (prompt)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
You will be given a definition of a task first, then some input of the task.
In this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.

એક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.
Output:<|im_end|>
<|im_start|>assistant



In [7]:
enc = tokenizer(
    prompt,
    return_tensors = 'pt',
    truncation = True,
    max_length = 1024
).to('cuda')


In [9]:
outputs_1 = ref_model.generate(
    **enc,
    do_sample = False, 
    max_new_tokens = 1024,
    temperature = 0
)
outputs_2 = target_model.generate(
    **enc,
    do_sample = False, 
    max_new_tokens = 1024,
    temperature = 0
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
