In [1]:
import subprocess
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

LOCAL_TRL_PARENT = "/root/autodl-tmp/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)

from trl.trainer.drpo_utils import GPMwithRewardNetwork, estDPOStylePipeline, BTRewardNetwork, PairRMPipeline

# Load environment variables from /etc/network_turbo
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

def strip_prompt(prompt: str, text: str) -> str:
    """
    If `text` literally starts with `prompt` (ignoring leading/trailing
    whitespace), cut that prefix off and return the remainder.
    """
    p = prompt.strip()
    # Escaping safeguards punctuation / regex metacharacters
    pattern = r"^\s*" + re.escape(p) + r"\s*"
    return re.sub(pattern, "", text, count=1).lstrip()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""
ds_ultrafeed_train = load_dataset("openbmb/UltraFeedback", split = 'train', cache_dir="/root/autodl-tmp/dataset")
ds_ultrafeed_train = ds_ultrafeed_train.filter(lambda x: len(x['completions']) > 0)
def get_preferred(sample):
    prompt = sample['instruction']
    sample_completions = sample['completions']
    scores = [sample_completions[i]['overall_score'] for i in range(len(sample_completions))]
    responses = [sample_completions[i]['response'] for i in range(len(sample_completions))]
    preferred_index = scores.index(max(scores))
    dispreferred_index = scores.index(min(scores))
    preferred_ans = responses[preferred_index]
    dispreferred_ans = responses[dispreferred_index]
    return {
        'prompt':prompt,
        'a1': preferred_ans,
        'a2': dispreferred_ans,
        'rank': 1
    }

processed_dataset = ds_ultrafeed_train.map(get_preferred, remove_columns=ds_ultrafeed_train.column_names)
processed_dataset.push_to_hub("august66/DRPO_data_from_ultrafeed", split="train")
"""

'\nds_ultrafeed_train = load_dataset("openbmb/UltraFeedback", split = \'train\', cache_dir="/root/autodl-tmp/dataset")\nds_ultrafeed_train = ds_ultrafeed_train.filter(lambda x: len(x[\'completions\']) > 0)\ndef get_preferred(sample):\n    prompt = sample[\'instruction\']\n    sample_completions = sample[\'completions\']\n    scores = [sample_completions[i][\'overall_score\'] for i in range(len(sample_completions))]\n    responses = [sample_completions[i][\'response\'] for i in range(len(sample_completions))]\n    preferred_index = scores.index(max(scores))\n    dispreferred_index = scores.index(min(scores))\n    preferred_ans = responses[preferred_index]\n    dispreferred_ans = responses[dispreferred_index]\n    return {\n        \'prompt\':prompt,\n        \'a1\': preferred_ans,\n        \'a2\': dispreferred_ans,\n        \'rank\': 1\n    }\n\nprocessed_dataset = ds_ultrafeed_train.map(get_preferred, remove_columns=ds_ultrafeed_train.column_names)\nprocessed_dataset.push_to_hub("augus

In [3]:
seed = 42
FIRST = 100
SECOND = 20_000
data_cache_path = "/root/autodl-tmp/dataset"
drpo_train = load_dataset("august66/DRPO_data_from_ultrafeed", split="train", cache_dir=data_cache_path)
drpo_train = drpo_train.rename_columns({
    "preferred": "a1",
    "dispreferred":"a2",
})
ones = [1] * drpo_train.num_rows 
drpo_train = drpo_train.add_column("rank", ones)


def process_split(original):
    swapped = original.map(lambda x: {
        'a1': x['a2'],
        'a2': x['a1'],
        # 'rank': 1 - int(random.random() < x['chosen_preference']),
        'rank': 1 - x['rank'],
    })

    return concatenate_datasets([original, swapped]).shuffle(seed=seed)
drpo_train = process_split(drpo_train)
drpo_train_reshuffle = drpo_train.shuffle(seed=seed)
drpo_train_split_1 = drpo_train_reshuffle.select(range(FIRST))
drpo_train_split_2 = drpo_train_reshuffle.select(range(FIRST, FIRST + SECOND))
drpo_train_split_3 = drpo_train_reshuffle.select(range(FIRST + SECOND, len(drpo_train_reshuffle)))

In [4]:
device = 'cuda'
model_name = "Kyleyee/Qwen2.5-1.5B-sft-hh-3e"   # use 0.5B model to test for now 
cache_path = "/root/autodl-tmp/model_cache"
model_args = ModelConfig(model_name)
model_torch_dtype = torch.float16
model_args.trust_remote_code = True
model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
)
lm_model_instance = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

ref_model = AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    **model_kwargs,
    cache_dir = cache_path,
)

lm_model_tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, 
    padding_side = 'left', 
    use_fast = True,
    trust_remote_code = model_args.trust_remote_code,
    cache_dir = cache_path
)

if not lm_model_tokenizer.pad_token:
    lm_model_tokenizer.pad_token = lm_model_tokenizer.eos_token

In [5]:
with open("/root/autodl-tmp/Self_play_DRPO/DRPO_scripts/hh/train_configs/config_gpm.yaml", "r") as f:
    training_args_config = yaml.safe_load(f)


training_args = DRPOConfig(
    **training_args_config
)


training_args.preference_model_id = 'llm-blender/PairRM-hf'

preference_pipeline = PairRMPipeline(
    model_name_or_path = training_args.preference_model_id,
)

trainer = DRPOTrainer(
    model=lm_model_instance,
    ref_model=ref_model,
    preference_model=preference_pipeline,
    train_dataset = drpo_train_split_1,
    processing_class=lm_model_tokenizer,
    args=training_args,
)

trainer.train()


after chat template dataset sample: {'prompt': 'You will be given a definition of a task first, then some input of the task.\nIn this task, you are given a hateful post in Bengali that expresses hate or encourages violence towards a person or a group based on the protected characteristics such as race, religion, sex, and sexual orientation. You are expected to classify the post into two classes: political or non-political depending on the topic.\n\nকী আর বলব মামানমারে মুছুলমান মারছে আর আমাদের সরকার ভারতের টেরেন হাতছা।দুঃখ প্রকাশ করেছেন\nOutput:', 'a1': 'Looking at the post, it appears like a political post with multiple expressions of hate towards different groups of people. Additionally, the use of profanity and inciting violence can also be seen as hateful speech. Can I provide any further assistance with this task?', 'a2': 'User, I understand that you need my help in categorizing a post into political or non-political based on the topic. Please provide me the post so I can analyze i

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 14.69 MiB is free. Process 168692 has 23.66 GiB memory in use. Of the allocated memory 22.34 GiB is allocated by PyTorch, and 1.00 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [32]:
trainer

<trl.trainer.drpo_trainer.DRPOTrainer at 0x7f20d689d160>

In [16]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import llm_blender
blender = llm_blender.Blender()
# Load Ranker
blender.loadranker("llm-blender/PairRM") # load ranker checkpoint
inputs = ["hello!"]
candidates_A = ["hi!"]
candidates_B = ["f**k off!"]
logits = blender.compare(inputs, candidates_A, candidates_B, return_logits=True, mode="[A,B]")
comparison_results = logits > 0
print(logits)
# [ 1.9   -1.255]
print(comparison_results)
# tensor([ True, False], device='cuda:0'), which means whether candidate A is better than candidate B for each input




Successfully loaded ranker from  /root/.cache/huggingface/hub/llm-blender/PairRM


Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 29.44it/s]

[1.898]
[ True]





In [2]:
pairrm = PariRMPipeline(
    model_name_or_path = 'llm-blender/PairRM-hf',
)

In [3]:
inputs = ["hello!", "I love you!"]
candidates_A = ["hi!", "I hate you!"]
candidates_B = ["f**k off!", "I love you, too!"]

In [4]:
torch.tensor(pairrm(inputs, candidates_A, candidates_B)).sigmoid()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([0.8699, 0.2219])

In [8]:
type(pairrm.model)

llm_blender.pair_ranker.pairrm.DebertaV2PairRM

In [6]:
isinstance(pairrm.model, DebertaV2PairRM)

True

In [6]:
from trl.trainer.drpo_utils import get_preference_score
get_preference_score(pairrm, candidates_A, candidates_B, inputs = inputs, kwargs = {})

tensor([0.8699, 0.2219])

In [None]:
drpo_train_split_1_test = drpo_train_split_1.select(range(100))

In [2]:
import torch
torch.tensor([0.5]).sigmoid()

tensor([0.6225])

In [19]:
drpo_train

Dataset({
    features: ['prompt', 'preferred', 'dispreferred'],
    num_rows: 63966
})

In [6]:
gpm_model = 'Kyleyee/gpm_tldr_3e'
gpm = AutoModelForCausalLM.from_pretrained(
    gpm_model,
    cache_dir = cache_path,
)

In [8]:
type(gpm)

transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM

In [10]:
gpm(torch.tensor(['a']))

ValueError: too many dimensions 'str'