In [1]:
import os

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from peft import (
    PeftModelForSequenceClassification,
    TaskType, 
    get_peft_model
)

from trl import (
    ModelConfig, 
    PPOConfig, 
    PPOTrainer, 
    ScriptArguments,
    get_peft_config,
    get_quantization_config,
)

from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

from accelerate import PartialState


# Limit the GPUs visible to this process. The device IDs are hard-coded for the
# machine the notebook was written on.
# NOTE(review): presumably this has to run before anything initialises CUDA;
# confirm it still takes effect when placed after the library imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "6, 7"})

In [2]:
# =============================================================================
# Configs
# =============================================================================

# Neither 8-bit nor 4-bit loading is requested for any model in this notebook.
NO_QUANTIZATION = {"load_in_8bit": False, "load_in_4bit": False}

# Dataset
# =============================================================================

script_args = ScriptArguments(
    dataset_name="trl-internal-testing/descriptiveness-sentiment-trl-style",
    dataset_train_split="descriptiveness",
)

# Model configs
# =============================================================================

# Policy: base checkpoint plus the LoRA settings for the causal-LM adapter.
policy_config = ModelConfig(
    model_name_or_path="EleutherAI/pythia-70m-deduped",
    use_peft=True,
    lora_task_type=TaskType.CAUSAL_LM,
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    lora_target_modules=None,
    **NO_QUANTIZATION,
)

# Value function: only adapter settings are given here; the checkpoint is
# chosen where the value model is built.
# NOTE(review): the model-loading output in this notebook reports the newly
# initialised head as `score.weight`; confirm that "classifier" is the module
# name intended for `lora_modules_to_save`.
vf_config = ModelConfig(
    use_peft=True,
    lora_task_type=TaskType.SEQ_CLS,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.01,
    lora_target_modules=None,
    lora_modules_to_save=["classifier"],
    **NO_QUANTIZATION,
)

# Reward model: loaded from the hub repository below.
reward_config = ModelConfig(
    model_name_or_path="RLHF-And-Friends/Pythia-70M-Reward",
    use_peft=True,
    **NO_QUANTIZATION,
)

# PPO config
# =============================================================================

ppo_config = PPOConfig(
    # Common
    # -------------------------------------------------------------------------
    run_name=f"peft_ppo_test_{1}",
    output_dir=f"./ppo_{policy_config.model_name_or_path}",
    dataset_num_proc=8,
    num_mini_batches=1,
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    push_to_hub=False,
    # On-policy params
    # -------------------------------------------------------------------------
    missing_eos_penalty=1.0,
    local_rollout_forward_batch_size=1,
    # PPO params
    # -------------------------------------------------------------------------
    reward_model_path=reward_config.model_name_or_path,
    num_ppo_epochs=1,
    whiten_rewards=False,
    kl_coef=0.05,
    cliprange=0.2,
    vf_coef=0.1,
    cliprange_value=0.2,
    gamma=1.0,
    lam=0.95,
)


In [3]:
# =============================================================================
# Tokenizer
# =============================================================================

# The tokenizer comes from the policy checkpoint and pads on the left.
tokenizer = AutoTokenizer.from_pretrained(
    policy_config.model_name_or_path,
    trust_remote_code=policy_config.trust_remote_code,
    padding_side="left",
)

# Fill in a pad token when the checkpoint does not define one.
# NOTE(review): this adds a token to the tokenizer only; confirm the models'
# embedding matrices already cover the new token id.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Likewise fall back to TRL's simple chat template when none is defined.
chat_template_missing = tokenizer.chat_template is None
if chat_template_missing:
    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

In [4]:
# =============================================================================
# Models
# =============================================================================

# SFT model
# -----------------------------------------------------------------------------
# Kept untouched so it can serve as the reference policy.

sft_policy = AutoModelForCausalLM.from_pretrained(
    policy_config.model_name_or_path,
    quantization_config=get_quantization_config(policy_config),
)

# Trainable policy
# -----------------------------------------------------------------------------
# The trainable policy is built from its OWN copy of the base checkpoint.
# `get_peft_model` injects the adapter layers into the model object it is
# given, so wrapping `sft_policy` directly would leave `policy` and
# `sft_policy` sharing the same (adapted) modules, and the reference policy
# would silently follow the trained policy.

policy = AutoModelForCausalLM.from_pretrained(
    policy_config.model_name_or_path,
    quantization_config=get_quantization_config(policy_config),
)
if policy_config.use_peft:
    policy = get_peft_model(policy, get_peft_config(policy_config))

# Value model
# -----------------------------------------------------------------------------
# Single-output sequence classifier on top of the policy's base checkpoint,
# optionally wrapped with the value-function LoRA adapter.

value_model = AutoModelForSequenceClassification.from_pretrained(
    policy_config.model_name_or_path,
    num_labels=1,
    quantization_config=get_quantization_config(vf_config),
)
if vf_config.use_peft:
    value_model = get_peft_model(value_model, get_peft_config(vf_config))

# Reward model
# -----------------------------------------------------------------------------
# The reward adapter is loaded onto its OWN base model. Loading it on top of
# `value_model` would make both models share modules, so the adapter load
# would land inside the value model and value training would affect rewards.

if reward_config.use_peft:
    # NOTE(review): assumes the reward adapter was trained on the same base
    # checkpoint as the policy (the base the adapter was previously attached
    # to via `value_model`) -- confirm against the adapter's config.
    reward_base_model = AutoModelForSequenceClassification.from_pretrained(
        policy_config.model_name_or_path,
        num_labels=1,
        quantization_config=get_quantization_config(reward_config),
    )
    reward_model = PeftModelForSequenceClassification.from_pretrained(
        reward_base_model,
        reward_config.model_name_or_path,
    )
else:
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        reward_config.model_name_or_path,
        num_labels=1,
        quantization_config=get_quantization_config(reward_config),
    )

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# =============================================================================
#  Dataset
# =============================================================================

dataset = load_dataset(
    script_args.dataset_name,
    split=script_args.dataset_train_split,
)

# The last `eval_samples` rows are held out for evaluation; everything before
# them is used for training.
eval_samples = 100
num_rows = len(dataset)
split_point = num_rows - eval_samples
train_dataset = dataset.select(range(split_point))
eval_dataset = dataset.select(range(split_point, num_rows))

# Name of the column that `prepare_dataset` tokenizes.
dataset_text_field = "prompt"

def prepare_dataset(dataset, tokenizer):
    """
    Tokenize the text column ahead of training so that only collation is left
    to do at training time.

    Args:
        dataset: object exposing `column_names` and `map(...)`.
        tokenizer: callable applied to a batch of texts with `padding=False`;
            its result must provide an "input_ids" entry.

    Returns:
        Whatever `dataset.map` returns: the original columns are removed and
        only "input_ids" is produced by the mapped function.

    Reads the module-level `dataset_text_field` to pick the text column.
    """

    def _to_input_ids(batch):
        # No padding here: sequences keep their own lengths.
        encoded = tokenizer(batch[dataset_text_field], padding=False)
        return {"input_ids": encoded["input_ids"]}

    original_columns = dataset.column_names
    return dataset.map(
        _to_input_ids,
        batched=True,
        remove_columns=original_columns,
    )

# Tokenize both splits inside `local_main_process_first()`; per the original
# note, this is done on the main process for faster data processing.
# see: https://github.com/huggingface/trl/pull/1255
main_process_first = PartialState().local_main_process_first()
with main_process_first:
    train_dataset = prepare_dataset(train_dataset, tokenizer)
    eval_dataset = prepare_dataset(eval_dataset, tokenizer)


In [6]:
# =============================================================================
# Training
# =============================================================================

# Hand the PPO trainer the config, tokenizer, the four models and the two
# tokenized splits, then start training.
trainer = PPOTrainer(
    config=ppo_config,
    processing_class=tokenizer,
    policy=policy,
    ref_policy=sft_policy,
    reward_model=reward_model,
    value_model=value_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

===training policy===


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Step,Training Loss


KeyboardInterrupt: 