In [None]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

# Put the parent directory of the local checkout at the front of sys.path so
# the `from trl import ...` statements that follow pick up that copy first.
# The membership test keeps a re-run of this cell from inserting the same
# entry a second time.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)

from trl.trainer.drpo_utils import GPMwithRewardNetwork, estDPOStylePipeline, BTRewardNetwork, PairRMPipeline

def strip_prompt(prompt: str, text: str) -> str:
    """
    Remove a leading copy of `prompt` from `text`.

    Both strings are compared with surrounding whitespace ignored: `prompt`
    is stripped on both sides and `text` has its leading whitespace dropped
    before the comparison. When `text` then begins with the prompt, that
    prefix is cut off. In every case the returned string has no leading
    whitespace; when there is no prefix match the rest of `text` is
    returned unchanged.
    """
    needle = prompt.strip()
    remainder = text.lstrip()
    # Plain literal prefix comparison; only the first occurrence at the very
    # start of the text is ever removed.
    if remainder.startswith(needle):
        remainder = remainder[len(needle):]
    return remainder.lstrip()

# Seed passed to every Dataset.shuffle call in this cell.
seed = 42
# Row counts of the first and second slices taken from the shuffled data below.
FIRST = 100
SECOND = 100
# Directory handed to load_dataset as cache_dir.
data_cache_path = "/workspace/dataset"
# Only the "train" split of the hub dataset is loaded.
drpo_train = load_dataset("august66/DRPO_data_from_ultrafeed", split="train", cache_dir=data_cache_path)


def process_split(original):
    """
    Augment a preference dataset with a mirrored copy of every row.

    For each row a twin is built in which the two answers trade places
    ('a1' <-> 'a2') and 'rank' becomes `1 - rank`. The original rows and the
    twins are concatenated and shuffled with the module-level `seed`.

    Args:
        original: dataset whose rows expose 'a1', 'a2' and 'rank'
            (assumes 'rank' is 0 or 1 so that `1 - rank` flips it — TODO confirm).

    Returns:
        The shuffled dataset containing both the original and mirrored rows.
    """
    def _mirror(example):
        return {
            'a1': example['a2'],
            'a2': example['a1'],
            # Disabled alternative kept for reference:
            # 'rank': 1 - int(random.random() < example['chosen_preference']),
            'rank': 1 - example['rank'],
        }

    mirrored = original.map(_mirror)
    combined = concatenate_datasets([original, mirrored])
    return combined.shuffle(seed=seed)
# Double the data with mirrored pairs (process_split already shuffles once),
# shuffle again with the same seed, then slice three disjoint index ranges:
#   split_1 -> rows [0, FIRST)
#   split_2 -> rows [FIRST, FIRST + SECOND)
#   split_3 -> every remaining row
# NOTE(review): the mirrored rows are mixed with the originals before slicing,
# so a pair and its mirror can end up in different splits.
drpo_train = process_split(drpo_train)
drpo_train_reshuffle = drpo_train.shuffle(seed=seed)
drpo_train_split_1 = drpo_train_reshuffle.select(range(FIRST))
drpo_train_split_2 = drpo_train_reshuffle.select(range(FIRST, FIRST + SECOND))
drpo_train_split_3 = drpo_train_reshuffle.select(range(FIRST + SECOND, len(drpo_train_reshuffle)))

# NOTE(review): `device` is assigned here but not referenced again in this cell.
device = 'cuda'
model_name = "Qwen/Qwen2.5-0.5B-Instruct"   # use 0.5B model to test for now
cache_path = "/workspace/model_cache"
# The checkpoint name is passed positionally and read back below as
# model_args.model_name_or_path.
model_args = ModelConfig(model_name)
model_torch_dtype = torch.bfloat16
model_args.trust_remote_code = True
# Keyword arguments shared by the two from_pretrained calls that follow.
model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
)
# Model handed to the trainer as `model`.
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Second instance loaded from the same checkpoint with the same arguments;
# handed to the trainer as `ref_model`.
ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Tokenizer for the same checkpoint, configured to pad on the left.
lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, 
    padding_side = 'left', 
    use_fast = True,
    trust_remote_code = model_args.trust_remote_code,
    cache_dir = cache_path
)

# Fall back to the EOS token when the tokenizer has no (or an empty) pad token.
if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token


# Training hyper-parameters come from a YAML file; its top-level keys are
# passed to DRPOConfig as keyword arguments.
with open("/workspace/Self_play_DRPO/DRPO_scripts/hh/train_configs/config_gpm.yaml", "r") as f:
    training_args_config = yaml.safe_load(f)


training_args = DRPOConfig(
    **training_args_config
)


# Overrides whatever preference_model_id the YAML supplied: this cell always
# builds a PairRMPipeline from the PairRM checkpoint below.
training_args.preference_model_id = 'llm-blender/PairRM-hf'

preference_pipeline = PairRMPipeline(
    model_name_or_path = training_args.preference_model_id,
)


# Only drpo_train_split_2 is used for training in this cell; split_1 and
# split_3 are not referenced here.
trainer = DRPOTrainer(
    model=lm_model_instance,
    ref_model=ref_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train_split_2,
    processing_class=lm_model_tokenizer,
    args=training_args
)

trainer.train()


after chat template dataset sample: {'prompt': 'Write a comprehensive, informative, and well-structured article that provides practical solutions to the problem of overheating associated with the use of heated car seat cushions, taking into consideration the different factors that contribute to the problem, the health risks associated with prolonged exposure to heat, and the best practices for safe and comfortable use of heated car seat cushions. Your article should be written in a clear, concise, and engaging style, highlighting the key points and providing relevant examples, tips, and recommendations to help readers overcome the challenges of using heated car seat cushions.', 'a1': 'Title: Practical Solutions to Keep You Safe and Comfortable While Using Heated Car Seat Cushions\n\nIntroduction\n\nHeated car seat cushions have become increasingly popular in recent years, providing warmth and comfort during cold winter months. While these cushions offer numerous benefits, many people s

In [None]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml



# Prepend the local checkout's parent directory to sys.path (once) so the
# `from trl import ...` statements that follow find that copy first.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)


from trl.trainer.drpo_utils import GPMwithRewardNetwork, estDPOStylePipeline, BTRewardNetwork, PairRMPipeline

def strip_prompt(prompt: str, text: str) -> str:
    """
    Drop `prompt` from the front of `text` when it is echoed there.

    Whitespace around `prompt` and in front of `text` is ignored for the
    comparison. The result never starts with whitespace; if `text` does not
    begin with the prompt, only that leading whitespace is removed.
    """
    head = prompt.strip()
    body = text.lstrip()
    head_len = len(head)
    # Literal comparison of the first `head_len` characters; nothing past the
    # start of the text is ever removed.
    tail = body[head_len:] if body[:head_len] == head else body
    return tail.lstrip()

# Seed passed to every Dataset.shuffle call in this cell.
seed = 42
# Row counts of the first and second slices taken from the shuffled data below.
FIRST = 20
SECOND = 20
# Directory handed to load_dataset as cache_dir.
data_cache_path = "/workspace/dataset"
# Only the "train" split of the hub dataset is loaded.
drpo_train = load_dataset("august66/DRPO_data_from_ultrafeed", split="train", cache_dir=data_cache_path)


def process_split(original):
    """
    Return `original` plus an answer-swapped copy of itself, shuffled.

    Every row contributes a second row whose 'a1' and 'a2' are exchanged and
    whose 'rank' is `1 - rank` (assumes a 0/1 rank — TODO confirm). The two
    sets of rows are concatenated and shuffled using the module-level `seed`.
    """
    # Disabled alternative for the flipped rank, kept for reference:
    #   1 - int(random.random() < x['chosen_preference'])
    flipped = original.map(
        lambda x: dict([('a1', x['a2']), ('a2', x['a1']), ('rank', 1 - x['rank'])])
    )
    both_orders = [original, flipped]
    merged = concatenate_datasets(both_orders)
    return merged.shuffle(seed=seed)
# Add the mirrored pairs (process_split shuffles once), shuffle again with the
# same seed, then take three disjoint index ranges:
#   split_1 -> rows [0, FIRST)
#   split_2 -> rows [FIRST, FIRST + SECOND)
#   split_3 -> every remaining row
# NOTE(review): mirrored rows are mixed with the originals before slicing, so
# a pair and its mirror can end up in different splits.
drpo_train = process_split(drpo_train)
drpo_train_reshuffle = drpo_train.shuffle(seed=seed)
drpo_train_split_1 = drpo_train_reshuffle.select(range(FIRST))
drpo_train_split_2 = drpo_train_reshuffle.select(range(FIRST, FIRST + SECOND))
drpo_train_split_3 = drpo_train_reshuffle.select(range(FIRST + SECOND, len(drpo_train_reshuffle)))



# Training hyper-parameters come from a YAML file; its top-level keys are
# passed to DRPOConfig as keyword arguments. The config is built before the
# models because model_kwargs below reads training_args.gradient_checkpointing.
with open("/workspace/Self_play_DRPO/DRPO_scripts/hh/train_configs/config_gpm_original.yaml", "r") as f:
    training_args_config = yaml.safe_load(f)
    
training_args = DRPOConfig(
    **training_args_config
)


# NOTE(review): `device` is assigned here but not referenced again in this cell.
device = 'cuda'
# Checkpoint name indicates a 1B Pythia SFT model (the earlier "0.5B" remark no
# longer applied to this checkpoint).
model_name = "cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr"
cache_path = "/workspace/model_cache"
# The checkpoint name is passed positionally and read back below as
# model_args.model_name_or_path.
model_args = ModelConfig(model_name)
# "auto" and None are forwarded unchanged; any other value is looked up as an
# attribute of the torch module via getattr.
torch_dtype = (
    model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
)
quantization_config = get_quantization_config(model_args)


# Keyword arguments shared by the two from_pretrained calls that follow.
# NOTE(review): trust_remote_code is passed to the tokenizer below but is not
# part of these model kwargs.
model_kwargs = dict(
    revision=model_args.model_revision,
    attn_implementation=model_args.attn_implementation,
    torch_dtype=torch_dtype,
    # use_cache is turned off whenever gradient checkpointing is enabled.
    use_cache=False if training_args.gradient_checkpointing else True,
    # A device map is only requested when a quantization config was produced.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

# Model handed to the trainer as `model`.
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Second instance loaded from the same checkpoint with the same arguments;
# handed to the trainer as `ref_model`.
ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

# Tokenizer for the same checkpoint, configured to pad on the left.
lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, 
    padding_side = 'left', 
    use_fast = True,
    trust_remote_code = model_args.trust_remote_code,
    cache_dir = cache_path
)

# Fall back to the EOS token (and its id) when the tokenizer has no pad token.
if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token
    lm_model_tokenizer.pad_token_id = lm_model_tokenizer.eos_token_id


# Choose the preference model wrapper from the training config:
#   is_bt_model and preference_model_id is a dict -> estDPOStylePipeline
#   is_bt_model otherwise                         -> BTRewardNetwork
#   not is_bt_model                               -> GPMwithRewardNetwork
if training_args.is_bt_model:
    if isinstance(training_args.preference_model_id, dict):
        preference_pipeline = estDPOStylePipeline(training_args.preference_model_id)
    else: 
        preference_pipeline = BTRewardNetwork(training_args.preference_model_id, pad_token_id=lm_model_tokenizer.pad_token_id)
else:
    print("\033[33m++++++++++++++++++ using GPM ++++++++++++++++++\033[0m")
    preference_pipeline = GPMwithRewardNetwork(training_args.preference_model_id)

# Only drpo_train_split_2 is used for training in this cell.
trainer = DRPOTrainer(
    model=lm_model_instance,
    ref_model=ref_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train_split_2,
    processing_class=lm_model_tokenizer,
    args=training_args,
)

# NOTE(review): the recorded output of this cell ends with
# "TypeError: GPMwithRewardNetwork.forward() missing 1 required positional
# argument: 'attention_mask'". The failing call site is not visible in this
# notebook; check how the GPM preference model is invoked — TODO confirm.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


[33m++++++++++++++++++ using GPM ++++++++++++++++++[0m


Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 74400.07it/s]


GPTNeoXConfig {
  "architectures": [
    "CustomRewardModel"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 8,
  "num_hidden_layers": 16,
  "partial_rotary_factor": 0.25,
  "rope_scaling": null,
  "rope_theta": 10000,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

after chat template dataset sample: {'prompt': 'If I am using JavaMailSender email client with a Postfix email server what could be the cause of duplicate To: headers being added to the email?', 'a1': 'It is possible that the em

TypeError: GPMwithRewardNetwork.forward() missing 1 required positional argument: 'attention_mask'

In [5]:
# Bare expression: the notebook renders the model's repr (its module tree).
lm_model_instance

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0, inplace=False)
    (layers): ModuleList(
      (0-15): 16 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0, inplace=False)
        (post_mlp_dropout): Dropout(p=0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elemen