In [1]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
if LOCAL_TRL_PARENT not in sys.path:
    sys.path.insert(0, LOCAL_TRL_PARENT)

    
# now the import will use your local copy:
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template

data_cache_path = "/workspace/dataset"
model_cache_path = '/workspace/model_cache'
ds_path = 'august66/drpo_ultrafeedback_qwen2.5-1.5b_first_iter_20k'
ref_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
target_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(model_path, task = 'generation', model_type = 'decoder', model_cache_path =  '/workspace/model_cache'):

    model_args = ModelConfig(model_path)
    model_torch_dtype = (model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype))
    model_kwargs = dict(
    revision = model_args.model_revision,
    torch_dtype = model_torch_dtype, 
    trust_remote_code = model_args.trust_remote_code,
    )

    padding_side = 'left' if model_type == 'decoder' else 'right'
    truncation_side = 'left' if model_type == 'decoder' else 'right'

    if task == 'generation':
        model_instance = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )

    elif task == 'reward':
        model_instance = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )
    

    model_tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, 
        padding_side = padding_side, 
        truncation_side = truncation_side,
        use_fast = True,
        trust_remote_code = model_args.trust_remote_code,
        cache_dir = model_cache_path
    )

    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    if getattr(model_instance.config, "pad_token_id", None) is None:
        model_instance.config.pad_token_id = model_tokenizer.pad_token_id

    if model_tokenizer.eos_token is None:
        model_tokenizer.eos_token = model_tokenizer.pad_token  

    if getattr(model_instance.config, "eos_token_id", None) is None:
        model_instance.config.eos_token_id = model_tokenizer.eos_token_id

    return model_instance, model_tokenizer


In [3]:
seed = 1234
ref_policy_model, ref_policy_tokenizer = load_model(ref_policy_path)
target_policy_model, target_policy_tokenizer = load_model(target_policy_path)
drpo_train = load_dataset(ds_path, cache_dir=data_cache_path, split = 'train')
drpo_train = drpo_train.remove_columns(['rank'])
mapping = {'a1':'chosen', 'a2':'rejected'}
drpo_train  = drpo_train.rename_columns(mapping)

In [None]:
config = DPOConfig(
    beta=0.1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    report_to="wandb",          
    run_name="my-dpo-run",
    logging_steps=20,
    max_prompt_length = 1024,
    max_completion_length = 1024,
    max_length = 2048,
    output_dir = 'dpo_out',
    save_strategy = 'steps',
    save_steps = 500,
    save_total_limit = 1,
    push_to_hub = True,
    hub_model_id = 'august66/ultrafeedback_20k_qwen_1.5b_dpo_model',
    hub_strategy = 'every_save'
)
trainer = DPOTrainer(
    model=target_policy_model,
    ref_model=ref_policy_model,     
    args=config,
    train_dataset=drpo_train,
    processing_class=ref_policy_tokenizer
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mooooaugust[0m ([33mooooaugust-london-school-of-economics-and-political-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x788624b5e4d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 788800395450, execution_count=4 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 788625adbc10, raw_cell="config = DPOConfig(
    beta=0.1,
    per_device_t.." transformed_cell="config = DPOConfig(
    beta=0.1,
    per_device_t.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22413130305f72756e706f64227d/workspace/Self_play_DRPO/self_play_drpo_code/ultrafeedback_dpo_train.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [8]:
drpo_train['chosen'][0]['content']

TypeError: list indices must be integers or slices, not str