In [1]:
import os

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from peft import (
    PeftModelForSequenceClassification,
    TaskType, 
    get_peft_model
)

from trl import (
    ModelConfig, 
    PPOConfig, 
    PPOTrainer, 
    ScriptArguments,
    get_peft_config,
    get_quantization_config,
)

from accelerate import PartialState


os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ["WANDB_PROJECT"] = "Llama-3.2-1B-Instruct-PPO"
os.environ["WANDB_ENTITY"] = "RADFAN"

In [None]:
# =============================================================================
# Configs
# =============================================================================

# Model configs
# =============================================================================

# Policy
# -----------------------------------------------------------------------------

policy_config = ModelConfig(
    model_name_or_path   = "meta-llama/Llama-3.2-1B-Instruct",
    # LoRA
    # -------------------------------------------------------------------------
    use_peft             = True,
    lora_r               = 16,
    lora_alpha           = 32,
    lora_dropout         = 0.05,
    lora_task_type       = TaskType.CAUSAL_LM,
    lora_target_modules  = None,
    lora_modules_to_save = None,
    # Quantization
    # -------------------------------------------------------------------------
    load_in_8bit         = False,
    load_in_4bit         = False,
    torch_dtype          = "bfloat16",
)

# Value model
# -----------------------------------------------------------------------------

vf_config = ModelConfig(
    # LoRA
    # -------------------------------------------------------------------------
    use_peft            = True,
    lora_r              = 8,
    lora_alpha          = 16,
    lora_dropout        = 0.01,
    lora_target_modules = None,
    lora_task_type      = TaskType.SEQ_CLS,
    # Quantization
    # -------------------------------------------------------------------------
    load_in_8bit        = False,
    load_in_4bit        = False,
    torch_dtype         = "bfloat16",
)

# Reward model
# -----------------------------------------------------------------------------

reward_config = ModelConfig(
    model_name_or_path  = "RLHF-And-Friends/Llama-3.2-1B-Instruct-Reward",
    use_peft            = True,
    load_in_8bit        = False,
    load_in_4bit        = False,
)

# PPO config
# =============================================================================

ppo_config = PPOConfig(
    # Common
    # -------------------------------------------------------------------------
    run_name            = f"peft_ppo_test_{1}",
    output_dir          = f"./ppo_{policy_config.model_name_or_path}",
    dataset_num_proc    = 16,
    num_mini_batches    = 1,
    learning_rate       = 1e-5,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    num_train_epochs    = 1,
    # Push to hub after training
    # -------------------------------------------------------------------------
    push_to_hub         = True,
    hub_model_id        = "RLHF-And-Friends/Llama-3.2-1B-Instruct-PPO",

    # On-policy params
    # -------------------------------------------------------------------------
    missing_eos_penalty = 1.0,
    local_rollout_forward_batch_size = 1,

    # PPO params
    # -------------------------------------------------------------------------
    reward_model_path   = reward_config.model_name_or_path,
    num_ppo_epochs      = 1,
    whiten_rewards      = False,
    kl_coef             = 0.05,
    cliprange           = 0.2,
    vf_coef             = 0.1,
    cliprange_value     = 0.2,
    gamma               = 1.0,
    lam                 = 0.95,
)

# Dataset
# =============================================================================

script_args = ScriptArguments(
    dataset_name = "trl-internal-testing/descriptiveness-sentiment-trl-style",
    dataset_train_split = "descriptiveness",
)

In [3]:
# =============================================================================
# Tokenizer
# =============================================================================

tokenizer = AutoTokenizer.from_pretrained(
    policy_config.model_name_or_path,
    use_fast = True,
    padding_side="left",
)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# =============================================================================
# Models
# =============================================================================

# SFT model
# -----------------------------------------------------------------------------

sft_policy = AutoModelForCausalLM.from_pretrained(
    policy_config.model_name_or_path,
    quantization_config = get_quantization_config(policy_config)
)
if sft_policy.config.pad_token_id is None:
    sft_policy.config.pad_token_id = tokenizer.pad_token_id

# Trainable policy
# -----------------------------------------------------------------------------

if policy_config.use_peft:
    policy = get_peft_model(sft_policy, get_peft_config(policy_config))
else:
    policy = AutoModelForCausalLM.from_pretrained(
        policy_config.model_name_or_path
    )

# Base model for Value and Reward models
# -----------------------------------------------------------------------------

base_value_head_model = AutoModelForSequenceClassification.from_pretrained(
    policy_config.model_name_or_path,
    num_labels = 1,
    quantization_config = get_quantization_config(vf_config)
)
if base_value_head_model.config.pad_token_id is None:
    base_value_head_model.config.pad_token_id = tokenizer.pad_token_id

# Value model with LoRA
# -----------------------------------------------------------------------------

if vf_config.use_peft:
    value_model = get_peft_model(
        base_value_head_model, get_peft_config(vf_config))
else:
    value_model = base_value_head_model

# Reward model
# -----------------------------------------------------------------------------

if reward_config.use_peft:
    reward_model = PeftModelForSequenceClassification.from_pretrained(
        base_value_head_model,
        reward_config.model_name_or_path,
        num_labels = 1,
        quantization_config = get_quantization_config(reward_config)
    )
else:
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        reward_config.model_name_or_path,
        num_labels = 1,
        quantization_config = get_quantization_config(reward_config)
    )

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# =============================================================================
#  Dataset
# =============================================================================

dataset = load_dataset(
    script_args.dataset_name,
    split=script_args.dataset_train_split
)

eval_samples = 100
train_dataset = dataset.select(range(len(dataset) - eval_samples))
eval_dataset = dataset.select(range(len(dataset) - eval_samples, len(dataset)))
dataset_text_field = "prompt"

def prepare_dataset(dataset, tokenizer):
    """
    pre-tokenize the dataset before training; only collate during training
    """

    def tokenize(element):
        outputs = tokenizer(
            element[dataset_text_field],
            padding=False,
        )
        return {"input_ids": outputs["input_ids"]}

    return dataset.map(
        tokenize,
        batched=True,
        remove_columns=dataset.column_names
    )

# Compute that only on the main process for faster data processing.
# see: https://github.com/huggingface/trl/pull/1255
with PartialState().local_main_process_first():
    train_dataset = prepare_dataset(train_dataset, tokenizer)
    eval_dataset = prepare_dataset(eval_dataset, tokenizer)


In [6]:
# =============================================================================
# Training
# =============================================================================

trainer = PPOTrainer(
    args            = ppo_config,
    processing_class  = tokenizer,
    model             = policy,
    ref_model         = sft_policy,
    reward_model      = reward_model,
    value_model       = value_model,
    train_dataset     = train_dataset,
    eval_dataset      = eval_dataset,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


===training policy===


[34m[1mwandb[0m: Currently logged in as: [33mevgurovv[0m ([33mRADFAN[0m). Use [1m`wandb login --relogin`[0m to force relogin


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Step,Training Loss


adapter_model.safetensors:   0%|          | 0.00/6.82M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/6.26k [00:00<?, ?B/s]

In [7]:
trainer.save_model(ppo_config.output_dir)
if ppo_config.push_to_hub:
    trainer.push_to_hub(dataset_name=script_args.dataset_name)

adapter_model.safetensors:   0%|          | 0.00/6.82M [00:00<?, ?B/s]

AttributeError: 'LlamaForCausalLM' object has no attribute 'policy'