In [1]:
%env HF_ENDPOINT=https://hf-mirror.com
%env HF_HUB_ENDPOINT=https://hf-mirror.com
%env HUGGINGFACE_HUB_ENDPOINT=https://hf-mirror.com
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification,DebertaV2ForSequenceClassification, GPTNeoXForCausalLM
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

# Directory that is placed at the front of sys.path before `trl` is imported,
# so that the import is resolved from this location first.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
LOCAL_TRL_PARENT = "/root/autodl-tmp/new_self_play_drpo"
if LOCAL_TRL_PARENT not in sys.path:
    # The membership guard keeps re-runs of this cell from adding duplicate entries.
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
    DRPOTrainerParallel
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template
from trl.trainer.drpo_utils import PairRMPipeline, BTwithRewardPipeline

def process_split(original, seed = 42):
    """Augment a pairwise dataset with an answer-swapped copy of every row.

    For each row of ``original`` a mirrored row is produced in which ``a1`` and
    ``a2`` are exchanged and ``rank`` is replaced by ``1 - rank``. The original
    rows and the mirrored rows are concatenated and the result is shuffled.

    Args:
        original: dataset whose rows provide the ``a1``, ``a2`` and ``rank``
            fields and which supports ``map``.
        seed: seed forwarded to ``shuffle``.

    Returns:
        The shuffled concatenation of ``original`` and its mirrored copy.
    """
    def _mirror(row):
        # Exchange the two answers and complement the rank to match.
        return {
            'a1': row['a2'],
            'a2': row['a1'],
            'rank': 1 - row['rank'],
        }

    mirrored = original.map(_mirror)
    combined = concatenate_datasets([original, mirrored])
    return combined.shuffle(seed=seed)

def load_model(model_path, task = 'generation', model_type = 'decoder', model_cache_path =  '/root/autodl-tmp/model_cache'):
    """Load a pretrained model and its tokenizer.

    Args:
        model_path: model name or path; wrapped in a ``ModelConfig`` and then
            handed to the ``from_pretrained`` loaders.
        task: ``'generation'`` loads the model with ``AutoModelForCausalLM``,
            ``'reward'`` loads it with ``AutoModelForSequenceClassification``.
        model_type: ``'decoder'`` selects left padding and left truncation for
            the tokenizer; any other value selects right padding/truncation.
        model_cache_path: directory passed as ``cache_dir`` to both loaders.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded in bfloat16.

    Raises:
        ValueError: if ``task`` is neither ``'generation'`` nor ``'reward'``.
    """
    # Validate up front: previously an unknown task skipped both loader
    # branches and failed later with an UnboundLocalError on the model variable.
    supported_tasks = ('generation', 'reward')
    if task not in supported_tasks:
        raise ValueError(
            f"Unsupported task {task!r}; expected one of {supported_tasks}."
        )

    model_args = ModelConfig(model_path)
    model_torch_dtype = torch.bfloat16
    model_kwargs = dict(
        revision = model_args.model_revision,
        torch_dtype = model_torch_dtype,
        trust_remote_code = model_args.trust_remote_code,
    )

    # The same side is used for padding and for truncation.
    side = 'left' if model_type == 'decoder' else 'right'

    if task == 'generation':
        model_cls = AutoModelForCausalLM
    else:
        model_cls = AutoModelForSequenceClassification

    model_instance = model_cls.from_pretrained(
        model_args.model_name_or_path,
        **model_kwargs,
        cache_dir = model_cache_path,
    )

    model_tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side = side,
        truncation_side = side,
        use_fast = True,
        trust_remote_code = model_args.trust_remote_code,
        cache_dir = model_cache_path
    )

    # Fill in missing pad/eos settings so tokenizer and model config agree.
    # The order below is kept as in the original implementation.
    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    if getattr(model_instance.config, "pad_token_id", None) is None:
        model_instance.config.pad_token_id = model_tokenizer.pad_token_id

    if model_tokenizer.eos_token is None:
        model_tokenizer.eos_token = model_tokenizer.pad_token

    if getattr(model_instance.config, "eos_token_id", None) is None:
        model_instance.config.eos_token_id = model_tokenizer.eos_token_id

    return model_instance, model_tokenizer



# Cache locations, dataset id and model ids used by the cells below.
# NOTE(review): the /root/autodl-tmp/... paths are machine-specific.
data_cache_path = "/root/autodl-tmp/dataset"
model_cache_path = '/root/autodl-tmp/model_cache'
ds_path = 'august66/hh_qwen2.5_1.5b_with_bias'
ref_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
target_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
dpo_policy_path = 'august66/hh_qwen_1.5b_dpo_model_2'

# Load the DRPO training arguments. The encoding is given explicitly so the
# result does not depend on the locale's default encoding.
with open("/root/autodl-tmp/new_self_play_drpo/self_play_drpo_code/training_config/config_normal_dist.yaml", "r", encoding="utf-8") as f:
    training_args_config = yaml.safe_load(f)

# The config is later expanded with `**`, so it must be a mapping. An empty
# YAML file yields None; fail here with a clear message instead.
if not isinstance(training_args_config, dict):
    raise TypeError(
        "Training config YAML must contain a mapping at the top level, "
        f"got {type(training_args_config).__name__}."
    )


env: HF_ENDPOINT=https://hf-mirror.com
env: HF_HUB_ENDPOINT=https://hf-mirror.com
env: HUGGINGFACE_HUB_ENDPOINT=https://hf-mirror.com


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the DRPO trainer: training arguments, the three policies, the
# preference pipeline and the augmented training set.
# NOTE(review): `os` is already imported and HF_ENDPOINT is already set via
# %env in the first cell; the next two lines repeat that.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Training arguments come from the YAML mapping loaded in the first cell.
training_args = DRPOConfig(
    **training_args_config
)
seed = 1234
# All three models use load_model's defaults (task='generation', model_type='decoder').
ref_policy_model, ref_policy_tokenizer = load_model(ref_policy_path)
target_policy_model, target_policy_tokenizer = load_model(target_policy_path)
dpo_policy_model, dpo_policy_tokenizer = load_model(dpo_policy_path)
# NOTE(review): the same preference_model_id is passed for both positional
# arguments; confirm against BTwithRewardPipeline's signature.
preference_pipeline = BTwithRewardPipeline(training_args.preference_model_id, training_args.preference_model_id)
# Concatenate every entry returned by load_dataset into one training set.
ds_dict = load_dataset(ds_path, cache_dir=data_cache_path)
drpo_train = concatenate_datasets(list(ds_dict.values()))   

# process_split appends the answer-swapped rows and shuffles with its own
# default seed (42); the data is then shuffled a second time with `seed`.
# NOTE(review): `seed` is not forwarded to process_split; confirm the double shuffle is intended.
drpo_train = process_split(drpo_train)
drpo_train = drpo_train.shuffle(seed=seed)

# NOTE(review): only ref_policy_tokenizer is used below; target_policy_tokenizer
# and dpo_policy_tokenizer are not referenced in this cell.
# NOTE(review): the saved output of this cell shows
# "ValueError: dpo model cannot be None if use dpo as reward" even though
# dpo_model is passed here; the output is presumably stale, re-run to confirm.
trainer = DRPOTrainerParallel(
    model=target_policy_model,
    ref_model=ref_policy_model,
    dpo_model = dpo_policy_model,
    dpo_as_reward = True,
    preference_model=preference_pipeline,
    train_dataset = drpo_train,
    processing_class=ref_policy_tokenizer,
    args=training_args
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.16it/s]


ValueError: dpo model cannot be None if use dpo as reward

In [None]:
# Run training with the DRPOTrainerParallel instance built in the previous cell.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mooooaugust[0m ([33mooooaugust-london-school-of-economics-and-political-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Allocated   (GB): 13.303843975067139
Allocated   (GB): 15.670618534088135
Allocated   (GB): 15.6706223487854
Allocated   (GB): 15.67062520980835
[42mprompt_astar:[0m system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
What are the pros and cons of Mac Computers?
assistant
Gosh, there are so many pros and cons that I can’t possibly list them all, but some I’ll mention are: Mac Computers are popular among people who are into fashion and design. Mac Computers are very difficult to use, compared to computers using Windows, which many people use. Mac Computers have more features and options for customization and controlling the computer. Mac Computers can be very different from each other, and from the computers people use with Windows. Mac Computers are made by Apple, and Apple is a company that has a long history. Mac Computers are made for people who want to buy from Apple and experience the Apple “WOW” that is very important to the company. Mac Computers a

Step,Training Loss
10,259.4525
20,331.9639


Allocated   (GB): 18.046083450317383
Allocated   (GB): 18.859281539916992
Allocated   (GB): 18.859283924102783
Allocated   (GB): 18.85928440093994
[42mprompt_astar:[0m system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
user
Which bands are similar to Eluveitie?
assistant
I can’t help you with this one, I’m not a music expert. Do you want to try and think of any names or bands for me, and I can check on some things about them?
user
They are a folk metal band. Can you find some similar bands like this?
assistant
Hmm, I think I do know some. Try checking out Finnish folk metal group Korpiklaani, the Finnish power metal band Apocalyptica, the Finnish folk metal band Finntroll, the Finnish black metal band Arkona, the Finnish folk metal band Lemurian, the Finnish folk metal band Turisas, or the Finnish folk metal band Taake.  How does that sound?
user
Those seem like some great recommendations.
assistant
I'm glad I could help! I hope you enjoy listening to some of

In [5]:
# Display the training dataset summary (feature names and row count).
drpo_train

Dataset({
    features: ['prompt', 'a1', 'a2', 'rank', 'bias'],
    num_rows: 3000
})

In [None]:
# Load separate target and reference models for a side-by-side generation check.
target_model, tokenizer = load_model(target_policy_path)
# The reference model is loaded from ref_policy_path (it previously reused
# target_policy_path). Its tokenizer goes into a separate name so `tokenizer`
# is not silently overwritten.
ref_model, _ref_tokenizer = load_model(ref_policy_path)
target_model.to('cuda')
ref_model.to('cuda')
# NOTE(review): this rebinds drpo_train to the raw 'train' split, replacing the
# augmented dataset built for the trainer earlier in the notebook.
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')

In [None]:
# Hand-written prompt using <|im_start|>/<|im_end|> chat markers: a system turn,
# one user turn, and an open assistant turn for the model to complete.
prompt = '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou will be given a definition of a task first, then some input of the task.\nIn this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.\n\nએક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.\nOutput:<|im_end|>\n<|im_start|>assistant\n'
# Print so the escaped newlines are rendered as line breaks.
print (prompt)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
You will be given a definition of a task first, then some input of the task.
In this task, you are given a sentence which is either in the Gujarati language or English language. You task is to identify the language of input sentence. Input sentence can be in Gujarari or English language only and also it cannot have two languages at a time.

એક ટ્રેન એન્જિન કે જે રેલરોડ ટ્રેક પર મુસાફરી કરેલા ધૂમ્રપાનની સાથે જોડાયેલી છે તેમાં ઘણી પેસેન્જર કાર જોડાયેલી છે.
Output:<|im_end|>
<|im_start|>assistant



In [7]:
# Tokenize the single prompt into PyTorch tensors (truncated to at most 1024
# tokens) and move the resulting batch to the GPU.
enc = tokenizer(prompt, truncation = True, max_length = 1024, return_tensors = 'pt').to('cuda')


In [9]:
# Greedy decoding settings shared by both models.
# `temperature = 0` was removed: it is not a valid sampling temperature and is
# ignored when do_sample=False, which produced the "generation flags are not
# valid" warning. The sampling-only flags are set to None so that defaults from
# the model's generation config are unset as well.
greedy_generation_kwargs = dict(
    do_sample = False,
    max_new_tokens = 1024,
    temperature = None,
    top_p = None,
    top_k = None,
)
outputs_1 = ref_model.generate(**enc, **greedy_generation_kwargs)
outputs_2 = target_model.generate(**enc, **greedy_generation_kwargs)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
