In [None]:
import os

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from accelerate import PartialState

from trl import ModelConfig, PPOConfig, PPOTrainer, ScriptArguments
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

from utils import PolicyMixture


# Restrict this process to two GPUs. The indices are specific to the host this
# notebook was run on -- adjust them for your machine. The value is written as
# a plain comma-separated list (no whitespace) so that every entry is parsed
# as a device index.
# NOTE(review): this is set after the torch-based imports above; it only takes
# effect if nothing has initialized CUDA yet -- confirm on your setup.
os.environ["CUDA_VISIBLE_DEVICES"] = "6,7"

In [3]:
###############################################################################
# Configs
###############################################################################

# Dataset to train on
# =============================================================================

script_args = ScriptArguments(
    dataset_name="trl-internal-testing/descriptiveness-sentiment-trl-style",
    dataset_train_split="descriptiveness",
)

# Model to use for policies
# =============================================================================

model_config = ModelConfig(
    model_name_or_path="EleutherAI/pythia-70m-deduped",
    trust_remote_code=False,
)

# PPO trainers' configs
# =============================================================================

# Both trainers share every setting except the directory they write to, so the
# common keyword arguments are declared once and unpacked into each config.
shared_ppo_kwargs = dict(
    reward_model_path="EleutherAI/pythia-70m-deduped",
    dataset_num_proc=1,
    num_train_epochs=0.01,
    num_ppo_epochs=1,
    num_mini_batches=1,
    learning_rate=3e-6,
    # total_episodes=10000,
    missing_eos_penalty=1.0,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    local_rollout_forward_batch_size=1,
    push_to_hub=False,
)

ppo_config_1 = PPOConfig(output_dir="./policy_1", **shared_ppo_kwargs)

# -----------------------------------------------------------------------------

ppo_config_2 = PPOConfig(output_dir="./policy_2", **shared_ppo_kwargs)


In [4]:
###############################################################################
#  Tokenizer
###############################################################################

# Load the tokenizer that matches the policy checkpoint, padding on the left.
tokenizer = AutoTokenizer.from_pretrained(
    model_config.model_name_or_path,
    trust_remote_code=model_config.trust_remote_code,
    padding_side="left",
)

# Register "[PAD]" as the tokenizer's padding token.
tokenizer.add_special_tokens(dict(pad_token="[PAD]"))

# Only install TRL's SIMPLE_CHAT_TEMPLATE when the tokenizer ships without one.
if tokenizer.chat_template is None:
    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE

In [5]:
###############################################################################
#  Models
###############################################################################

# Every policy / value model starts from the same base checkpoint.
base_checkpoint = model_config.model_name_or_path

# Policy 1
# -----------------------------------------------------------------------------
# Two causal-LM copies of the base checkpoint (`policy_1` is handed to the
# trainer, `sft_policy_1` goes into the reference mixture) plus two
# single-label sequence-classification models used as value and reward
# models. The load-time warnings show `score.weight` is newly initialized for
# both classification models.

policy_1 = AutoModelForCausalLM.from_pretrained(base_checkpoint)
sft_policy_1 = AutoModelForCausalLM.from_pretrained(base_checkpoint)
value_model_1 = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=1,
)
reward_model_1 = AutoModelForSequenceClassification.from_pretrained(
    ppo_config_1.reward_model_path,
    num_labels=1,
)

# Policy 2
# -----------------------------------------------------------------------------

policy_2 = AutoModelForCausalLM.from_pretrained(base_checkpoint)

# Ref policy 1
# -----------------------------------------------------------------------------
# Mixture over `sft_policy_1` and `policy_2`, built with the coefficient list
# [0.8, 0.2] (same order as the models).

mixture_members = [sft_policy_1, policy_2]
ref_policy_1 = PolicyMixture(mixture_members, [0.8, 0.2])

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-70m-deduped and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
###############################################################################
#  Dataset
###############################################################################

dataset = load_dataset(script_args.dataset_name, split=script_args.dataset_train_split)

# Hold out the last `eval_samples` rows for evaluation; train on everything
# that precedes them.
eval_samples = 100
n_rows = len(dataset)
first_eval_row = n_rows - eval_samples
train_dataset = dataset.select(range(first_eval_row))
eval_dataset = dataset.select(range(first_eval_row, n_rows))

# Column that `prepare_dataset` tokenizes.
dataset_text_field = "prompt"

def prepare_dataset(dataset, tokenizer, text_field=None):
    """
    Pre-tokenize the dataset before training; only collate during training.

    Args:
        dataset: Object exposing ``column_names`` and a ``map`` method that
            accepts the ``batched`` and ``remove_columns`` keywords.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=False)``
            whose result can be indexed with ``"input_ids"``.
        text_field: Name of the column that holds the text to tokenize. When
            ``None`` (the default) the module-level ``dataset_text_field`` is
            used, which matches the previous behaviour.

    Returns:
        The result of ``dataset.map``: each mapped batch contributes only an
        ``"input_ids"`` entry, and every original column is passed in
        ``remove_columns``.
    """
    # Fall back to the notebook-level setting so existing two-argument calls
    # keep working unchanged.
    column = dataset_text_field if text_field is None else text_field

    def tokenize(element):
        # No padding here: sequences keep their own length.
        outputs = tokenizer(
            element[column],
            padding=False,
        )
        return {"input_ids": outputs["input_ids"]}

    return dataset.map(
        tokenize,
        batched=True,
        remove_columns=dataset.column_names
    )

# Compute that only on the main process for faster data processing.
# see: https://github.com/huggingface/trl/pull/1255
# Both splits are tokenized inside the same guard, train first, then eval.
with PartialState().local_main_process_first():
    train_dataset, eval_dataset = (
        prepare_dataset(split, tokenizer)
        for split in (train_dataset, eval_dataset)
    )


In [7]:
###############################################################################
#  Training
###############################################################################

# Number of train / refresh-reference rounds to run.
NUM_EPOCHS = 2

trainer = PPOTrainer(
    config=ppo_config_1,
    processing_class=tokenizer,
    policy=policy_1,
    ref_policy=ref_policy_1,
    reward_model=reward_model_1,
    value_model=value_model_1,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Each round: run a full `trainer.train()`, then rebuild the reference mixture
# and hand the new object to the trainer for the next round.
# NOTE(review): nothing visible in this notebook updates `sft_policy_1` or
# `policy_2` between rounds, so every rebuilt mixture wraps the same two model
# objects -- confirm that is intended.
for round_index in range(NUM_EPOCHS):
    trainer.train()
    trainer.ref_policy = ref_policy_1 = PolicyMixture(
        [sft_policy_1, policy_2],
        coefs=[0.8, 0.2],
    )

===training policy===


From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


Step,Training Loss


===training policy===


Step,Training Loss
