In [None]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

# Put the directory that contains the local `trl` checkout at the front of
# sys.path (once), so the `from trl import ...` that follows resolves to that
# copy ahead of any other entry on the path.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)

from trl.trainer.drpo_utils import GPMwithRewardNetwork, estDPOStylePipeline, BTRewardNetwork, PairRMPipeline

def strip_prompt(prompt: str, text: str) -> str:
    """
    Remove a leading copy of `prompt` from `text`.

    Both strings are compared with surrounding whitespace ignored: `prompt`
    is stripped, and leading whitespace of `text` is skipped before the
    comparison.  When `text` begins with the prompt, that prefix is cut off.
    In every case the result has its leading whitespace removed.
    """
    needle = prompt.strip()
    body = text.lstrip()
    # The comparison is literal, so punctuation in the prompt needs no escaping.
    if body.startswith(needle):
        body = body[len(needle):]
    return body.lstrip()

# Seed passed to every dataset shuffle in this cell.
seed = 42
# Row counts of the first and second slices cut from the shuffled data below.
FIRST = 500
SECOND = 100
data_cache_path = "/workspace/dataset"
# Load the "train" split of the preference dataset, caching files under data_cache_path.
drpo_train = load_dataset("august66/DRPO_data_from_ultrafeed_new_template", split="train", cache_dir=data_cache_path)


def process_split(original):
    """
    Double a pairwise dataset by appending a mirrored copy of every row.

    In the mirrored copy the `a1` and `a2` fields trade places and `rank`
    becomes `1 - rank` (presumably `rank` is a 0/1 label -- confirm against
    the dataset).  The original and mirrored rows are concatenated and the
    result is shuffled with the module-level `seed`.

    `original` must offer `.map(...)` and be accepted by
    `concatenate_datasets`.
    """
    def _mirror(row):
        # Swap the two answers and flip the label to match the new order.
        return {
            'a1': row['a2'],
            'a2': row['a1'],
            'rank': 1 - row['rank'],
        }

    mirrored = original.map(_mirror)
    combined = concatenate_datasets([original, mirrored])
    return combined.shuffle(seed=seed)
# Augment with mirrored pairs, then shuffle again.  process_split has already
# shuffled with the same seed, so this is a second shuffle of the same data.
drpo_train = process_split(drpo_train)
drpo_train_reshuffle = drpo_train.shuffle(seed=seed)
# Three non-overlapping index ranges: the first FIRST rows, the next SECOND
# rows, and everything that remains.
# NOTE(review): a row and its mirrored copy can presumably land in different
# slices -- confirm that is acceptable for how the slices are used.
drpo_train_split_1 = drpo_train_reshuffle.select(range(FIRST))
drpo_train_split_2 = drpo_train_reshuffle.select(range(FIRST, FIRST + SECOND))
drpo_train_split_3 = drpo_train_reshuffle.select(range(FIRST + SECOND, len(drpo_train_reshuffle)))

# Device name; not referenced again in the visible part of this cell.
device = 'cuda'
model_name = "Qwen/Qwen2.5-1.5B-Instruct"   # 1.5B instruct checkpoint (an older note here said 0.5B)
cache_path = "/workspace/model_cache"
# presumably the positional argument fills model_name_or_path, which is read below -- confirm
model_args = ModelConfig(model_name)
model_torch_dtype = torch.bfloat16
model_args.trust_remote_code = True
# Keyword arguments shared by both model loads below.
model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
)
# First load of the checkpoint; passed to the trainer as `model`.
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Second, separate load of the same checkpoint with the same arguments;
# passed to the trainer as `ref_model`.
ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Tokenizer for the same checkpoint, configured to pad on the left.
lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, 
    padding_side = 'left', 
    use_fast = True,
    trust_remote_code = model_args.trust_remote_code,
    cache_dir = cache_path
)

# When the tokenizer has no pad token, reuse the EOS token for padding.
if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token


# Read the training settings from YAML and expand them into a DRPOConfig.
# The file encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default text encoding.
with open(
    "/workspace/Self_play_DRPO/DRPO_scripts/hh/train_configs/config_gpm.yaml",
    "r",
    encoding="utf-8",
) as f:
    training_args_config = yaml.safe_load(f)


# Every top-level key of the YAML document becomes a keyword argument.
training_args = DRPOConfig(
    **training_args_config
)


# Assigned after the config object is built, so this value replaces whatever
# the field held after construction.
training_args.preference_model_id = 'llm-blender/PairRM-hf'

# Preference model wrapper built from that id; handed to the trainer below.
preference_pipeline = PairRMPipeline(
    model_name_or_path = training_args.preference_model_id,
)


# Wire the two model loads, the preference pipeline, the tokenizer and the
# config into the trainer.  Only the first slice (drpo_train_split_1) is used
# as training data here.
trainer = DRPOTrainer(
    model=lm_model_instance,
    ref_model=ref_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train_split_1,
    processing_class=lm_model_tokenizer,
    args=training_args
)

# Start training; the dict-style lines in the cell output come from this call.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


after chat template dataset sample: {'prompt': '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou will be given a definition of a task first, then some input of the task.\nIn this task, you are given a hateful post in Bengali that expresses hate or encourages violence towards a person or a group based on the protected characteristics such as race, religion, sex, and sexual orientation. You are expected to classify the post into two classes: political or non-political depending on the topic.\n\n‡¶ï‡ßÄ ‡¶Ü‡¶∞ ‡¶¨‡¶≤‡¶¨ ‡¶Æ‡¶æ‡¶Æ‡¶æ‡¶®‡¶Æ‡¶æ‡¶∞‡ßá ‡¶Æ‡ßÅ‡¶õ‡ßÅ‡¶≤‡¶Æ‡¶æ‡¶® ‡¶Æ‡¶æ‡¶∞‡¶õ‡ßá ‡¶Ü‡¶∞ ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶≠‡¶æ‡¶∞‡¶§‡ßá‡¶∞ ‡¶ü‡ßá‡¶∞‡ßá‡¶® ‡¶π‡¶æ‡¶§‡¶õ‡¶æ‡•§‡¶¶‡ßÅ‡¶É‡¶ñ ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∂ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá‡¶®\nOutput:<|im_end|>\n<|im_start|>assistant\n', 'a1': 'Looking at the post, it appears like a political post with multiple expressions of hate towards different groups of people. Additionally,

[34m[1mwandb[0m: Currently logged in as: [33mooooaugust[0m ([33mooooaugust-london-school-of-economics-and-political-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss1': -0.8134000301361084, 'loss2': 0.19874586164951324, 'preference_score': 0.01480590458959341, 'preference_score_star': 0.015502894297242165, 'mean_kl': 0.00015787780284881592, 'logps': -141.16986083984375, 'logps_star': -59.18394470214844}
{'loss1': -500.8337097167969, 'loss2': 2.1064183712005615, 'preference_score': 0.47285574674606323, 'preference_score_star': 0.2973212003707886, 'mean_kl': 0.00013533234596252441, 'logps': -516.300537109375, 'logps_star': -51.05919647216797}
{'loss1': 121.74647521972656, 'loss2': 11.176969528198242, 'preference_score': 0.019213782623410225, 'preference_score_star': 0.13052856922149658, 'mean_kl': 0.0001474320888519287, 'logps': -682.0867309570312, 'logps_star': -65.59727478027344}
{'loss1': -0.015438009053468704, 'loss2': 55.53526306152344, 'preference_score': 0.5000272393226624, 'preference_score_star': 0.7440705895423889, 'mean_kl': 0.00021645426750183105, 'logps': -712.7546997070312, 'logps_star': -63.60350799560547}
{'loss1': 65.411598205

In [None]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml



# Put the directory that contains the local `trl` checkout at the front of
# sys.path (once), so the `from trl import ...` that follows resolves to that
# copy ahead of any other entry on the path.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)


from trl.trainer.drpo_utils import GPMwithRewardNetwork, estDPOStylePipeline, BTRewardNetwork, PairRMPipeline

def strip_prompt(prompt: str, text: str) -> str:
    """
    Drop `prompt` from the front of `text` when it is literally there.

    Whitespace around the prompt, and whitespace before and after it in
    `text`, is ignored for the purpose of the match.  Whether or not the
    prefix was found, the returned string has no leading whitespace.
    """
    # re.escape keeps punctuation in the prompt from acting as regex syntax.
    prefix_re = re.compile(r"^\s*" + re.escape(prompt.strip()) + r"\s*")
    hit = prefix_re.match(text)
    remainder = text if hit is None else text[hit.end():]
    return remainder.lstrip()

# Seed passed to every dataset shuffle in this cell.
seed = 42
# Row counts of the first and second slices cut from the shuffled data below.
FIRST = 20
SECOND = 20
data_cache_path = "/workspace/dataset"
# Load the "train" split of the preference dataset, caching files under data_cache_path.
drpo_train = load_dataset("august66/DRPO_data_from_ultrafeed", split="train", cache_dir=data_cache_path)


def process_split(original):
    """
    Return `original` plus an answer-swapped copy of each row, shuffled.

    The swapped copy exchanges the `a1` and `a2` fields and replaces `rank`
    with `1 - rank` (presumably a 0/1 label -- confirm against the dataset).
    Both sets of rows are concatenated and shuffled using the module-level
    `seed`.

    `original` must offer `.map(...)` and be accepted by
    `concatenate_datasets`.
    """
    def _flip(example):
        # Exchange the two answers and invert the label accordingly.
        flipped = {}
        flipped['a1'] = example['a2']
        flipped['a2'] = example['a1']
        flipped['rank'] = 1 - example['rank']
        return flipped

    both_orders = concatenate_datasets([original, original.map(_flip)])
    return both_orders.shuffle(seed=seed)
# Augment with mirrored pairs, then shuffle again.  process_split has already
# shuffled with the same seed, so this is a second shuffle of the same data.
drpo_train = process_split(drpo_train)
drpo_train_reshuffle = drpo_train.shuffle(seed=seed)
# Three non-overlapping index ranges: the first FIRST rows, the next SECOND
# rows, and everything that remains.
# NOTE(review): a row and its mirrored copy can presumably land in different
# slices -- confirm that is acceptable for how the slices are used.
drpo_train_split_1 = drpo_train_reshuffle.select(range(FIRST))
drpo_train_split_2 = drpo_train_reshuffle.select(range(FIRST, FIRST + SECOND))
drpo_train_split_3 = drpo_train_reshuffle.select(range(FIRST + SECOND, len(drpo_train_reshuffle)))



# Read the training settings from YAML and expand them into a DRPOConfig.
# The file encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default text encoding.
with open(
    "/workspace/Self_play_DRPO/DRPO_scripts/hh/train_configs/config_gpm_original.yaml",
    "r",
    encoding="utf-8",
) as f:
    training_args_config = yaml.safe_load(f)

# Every top-level key of the YAML document becomes a keyword argument.
training_args = DRPOConfig(
    **training_args_config
)


# Device name; not referenced again in the visible part of this cell.
device = 'cuda'
model_name = "cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr"   # checkpoint id names a pythia-1b SFT model (an older note here said 0.5B)
cache_path = "/workspace/model_cache"
# presumably the positional argument fills model_name_or_path, which is read below -- confirm
model_args = ModelConfig(model_name)
# "auto" and None are passed through unchanged; any other value is treated as
# the name of an attribute on the torch module and looked up there.
torch_dtype = (
    model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
)
# May be None; the device_map below is only set when it is not.
quantization_config = get_quantization_config(model_args)


# Keyword arguments shared by both model loads below.
model_kwargs = dict(
    revision=model_args.model_revision,
    attn_implementation=model_args.attn_implementation,
    torch_dtype=torch_dtype,
    # The cache is switched off whenever gradient checkpointing is enabled in the config.
    use_cache=False if training_args.gradient_checkpointing else True,
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

# First load of the checkpoint; passed to the trainer as `model`.
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Second, separate load of the same checkpoint with the same arguments;
# passed to the trainer as `ref_model`.
ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Tokenizer for the same checkpoint, configured to pad on the left.
lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, 
    padding_side = 'left', 
    use_fast = True,
    trust_remote_code = model_args.trust_remote_code,
    cache_dir = cache_path
)

# When the tokenizer has no pad token, reuse the EOS token (and its id) for
# padding.  The id is what gets forwarded to BTRewardNetwork further down.
if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token
    lm_model_tokenizer.pad_token_id = lm_model_tokenizer.eos_token_id


# Choose the preference model wrapper from the config:
#   * is_bt_model false                        -> GPMwithRewardNetwork
#   * is_bt_model true, preference id is dict  -> estDPOStylePipeline
#   * is_bt_model true, anything else          -> BTRewardNetwork
if not training_args.is_bt_model:
    print("\033[33m++++++++++++++++++ using GPM ++++++++++++++++++\033[0m")
    preference_pipeline = GPMwithRewardNetwork(training_args.preference_model_id)
elif isinstance(training_args.preference_model_id, dict):
    preference_pipeline = estDPOStylePipeline(training_args.preference_model_id)
else:
    # This wrapper additionally receives the tokenizer's pad token id.
    preference_pipeline = BTRewardNetwork(
        training_args.preference_model_id,
        pad_token_id=lm_model_tokenizer.pad_token_id,
    )

# Wire the two model loads, the preference pipeline, the tokenizer and the
# config into the trainer.  This cell trains on the second slice
# (drpo_train_split_2).
trainer = DRPOTrainer(
    model=lm_model_instance,
    ref_model=ref_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train_split_2,
    processing_class=lm_model_tokenizer,
    args=training_args,
)

# NOTE(review): the output recorded for this cell ends with
# "TypeError: GPMwithRewardNetwork.forward() missing 1 required positional
# argument: 'attention_mask'".  Presumably it is raised somewhere inside this
# run, where the preference model is invoked -- confirm, and fix the call in
# the local trl copy.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


[33m++++++++++++++++++ using GPM ++++++++++++++++++[0m


Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 74400.07it/s]


GPTNeoXConfig {
  "architectures": [
    "CustomRewardModel"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 8,
  "num_hidden_layers": 16,
  "partial_rotary_factor": 0.25,
  "rope_scaling": null,
  "rope_theta": 10000,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

after chat template dataset sample: {'prompt': 'If I am using JavaMailSender email client with a Postfix email server what could be the cause of duplicate To: headers being added to the email?', 'a1': 'It is possible that the em

TypeError: GPMwithRewardNetwork.forward() missing 1 required positional argument: 'attention_mask'