In [1]:
import os

import torch
from torch.optim import AdamW

from datasets import load_dataset

from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer
)

from peft import get_peft_model, TaskType

from trl import (
    ModelConfig,
    RewardConfig,
    RewardTrainer,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)

from fed_ppo.callbacks import WeightChangeCallback
from fed_ppo.utils import (
    custom_optimizer, 
    apply_chat_template,
    tokenize,
    OptimizerConfig,
)


### Devices

In [2]:
# Visible devices
# -------------------------------------------------------------------------------------------------
VISIBLE_DEVICES = "4"
# -------------------------------------------------------------------------------------------------

# Enumerate GPUs based on their PCI bus IDs
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

os.environ["CUDA_VISIBLE_DEVICES"] = f"{VISIBLE_DEVICES}"

### Model and dataset

In [3]:
# Model path
# -------------------------------------------------------------------------------------------------
MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct"
# -------------------------------------------------------------------------------------------------
MODEL_NAME = MODEL_PATH.split('/')[1]

# Dataset path
# -------------------------------------------------------------------------------------------------
DATASET_PATH        = "trl-lib/ultrafeedback_binarized"
DATASET_TRAIN_SPLIT = "train"
DATASET_VAL_SPLIT   = "test"
# -------------------------------------------------------------------------------------------------
DATASET_NAME        = DATASET_PATH.split('/')[1]

### WandB settings

In [4]:
os.environ["WANDB_PROJECT"] = f"{MODEL_NAME}-Reward-{DATASET_NAME}"
os.environ["WANDB_ENTITY"]  = "RADFAN"

### Other constants

### Configs

In [5]:
# Datasets will be filtered according to max length
MAX_LENGTH = 512

# Model config
# =================================================================================================

model_config = ModelConfig(
    model_name_or_path   = MODEL_PATH,
    # LoRA
    # ---------------------------------------------------------------------------------------------
    use_peft             = True,
    lora_task_type       = TaskType.SEQ_CLS,
    lora_r               = 8,
    lora_alpha           = 16,
    lora_dropout         = 0.0,
    lora_target_modules  = ["q_proj", "k_proj", "v_proj", "o_proj"],
    # Head will require grad automatically
    lora_modules_to_save = None,
    # Quantization
    # ---------------------------------------------------------------------------------------------
    load_in_8bit         = False,
    load_in_4bit         = False,
    torch_dtype          = "bfloat16",
)

# Reward trainer config
# =================================================================================================

training_args = RewardConfig(
    # Reward trainer params
    # ---------------------------------------------------------------------------------------------
    max_length                  = None,
    dataset_num_proc            = 16,
    center_rewards_coefficient  = None,
    # Common
    # ---------------------------------------------------------------------------------------------
    run_name                    = f"LoRA-{model_config.lora_r}r-max_length-{MAX_LENGTH}",
    output_dir                  = f"{os.environ['WANDB_PROJECT']}-LoRA-{model_config.lora_r}r",
    per_device_train_batch_size = 4,
    num_train_epochs            = 2,
    gradient_checkpointing      = False,
    gradient_accumulation_steps = 4,

    # Frequency of logs
    # ---------------------------------------------------------------------------------------------
    logging_steps               = 20,

    # Evaluation
    # ---------------------------------------------------------------------------------------------
    eval_strategy               = "steps",
    eval_steps                  = 100,

    # Push to hub after training
    # ---------------------------------------------------------------------------------------------
    push_to_hub                 = True,
    hub_model_id                = f"RLHF-And-Friends/{MODEL_NAME}-Reward-{DATASET_NAME}-max_length-{MAX_LENGTH}"
                                  f"-LoRA-{model_config.lora_r}r"
)

# Optimizer config
# =================================================================================================

optimizer_config = OptimizerConfig(
    optimizer_type = AdamW,
    layer_lr       = {
        "lora":  1e-5, # LoRA adapters
        "score": 1e-4, # Head
    }
)

### Tokenizer & Model

In [6]:
# Model
# =================================================================================================

quantization_config = get_quantization_config(model_config)
lora_config = get_peft_config(model_config)
device_map = (
    get_kbit_device_map() if quantization_config is not None else "auto"
)
use_cache = False if training_args.gradient_checkpointing else True
torch_dtype = (
    getattr(
        torch, model_config.torch_dtype
    ) if model_config.torch_dtype is not None 
    else None
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path, 
    num_labels = 1,
    quantization_config = quantization_config,
    device_map = device_map,
    use_cache = use_cache,
    torch_dtype = torch_dtype
)

model = get_peft_model(model, lora_config)

# Tokenizer
# =================================================================================================

tokenizer = AutoTokenizer.from_pretrained(
    model_config.model_name_or_path, 
    use_fast=True
)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Sync padding tokens
# =================================================================================================

model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model.print_trainable_parameters()

trainable params: 1,705,984 || all params: 1,237,524,480 || trainable%: 0.1379


### Dataset

In [8]:
dataset = load_dataset(DATASET_PATH)

train_dataset = dataset[DATASET_TRAIN_SPLIT]
eval_dataset = (
    dataset[DATASET_VAL_SPLIT] if training_args.eval_strategy != "no"
    else None
)

# Apply chat tamplate and tokenize beforehand to avoid doing it inside 
# the 'RewardTrainer'
# -------------------------------------------------------------------------------------------------

train_dataset = train_dataset.map(
    apply_chat_template, 
    fn_kwargs={"tokenizer": tokenizer}, 
    load_from_cache_file=False
)
eval_dataset = eval_dataset.map(
    apply_chat_template, 
    fn_kwargs={"tokenizer": tokenizer},
    load_from_cache_file = False
)

train_dataset = train_dataset.map(
    tokenize, 
    fn_kwargs={"tokenizer": tokenizer}, 
    load_from_cache_file = False
)
eval_dataset = eval_dataset.map(
    tokenize, 
    fn_kwargs={"tokenizer": tokenizer},
    load_from_cache_file = False
)

# Filter datasets by length (keep only examples which are no longer then 
# `max_length` tokens)
# -------------------------------------------------------------------------------------------------

length_filter = (
    lambda x: len(x["input_ids_chosen"]) <= MAX_LENGTH
              and len(x["input_ids_rejected"]) <= MAX_LENGTH
)

train_dataset = train_dataset.filter(
    length_filter,
    num_proc=training_args.dataset_num_proc,
)

eval_dataset = eval_dataset.filter(
    length_filter,
    num_proc=training_args.dataset_num_proc,
)

Map:   0%|          | 0/62135 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/62135 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/62135 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/1000 [00:00<?, ? examples/s]

### Training

In [10]:
optimizer = custom_optimizer(model, optimizer_config)

trainer = RewardTrainer(
    model            = model,
    processing_class = tokenizer,
    args             = training_args,
    train_dataset    = train_dataset,
    eval_dataset     = eval_dataset,
    callbacks        = [
        WeightChangeCallback(
            ["base_model.model.score.modules_to_save.default.weight"]
        )
    ],
    optimizers       = (optimizer, None)
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mevgurovv[0m ([33mRADFAN[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
100,0.7703,0.694622,0.596429
200,0.6323,0.66859,0.620504
300,0.636,0.656222,0.629234
400,0.6212,0.646919,0.635714
500,0.602,0.638472,0.639432
600,0.6167,0.628741,0.645276
700,0.6128,0.627333,0.652406
800,0.6104,0.621149,0.662478
900,0.5666,0.617188,0.667851
1000,0.5293,0.613792,0.658887






















































































'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/96/b6/96b60a8d041826551d778d539ec90378e19271c0b3df7d9969a66d68012eb61e/8b1dbc8ab9b01f4b39c3e0ca26640d2a7fd96a7355400e6d509e6444faf072e7?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250105%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250105T164419Z&X-Amz-Expires=86400&X-Amz-Signature=678f17c1619446b1c6b47afdcb2be7c8d647021bed67b81d42c190f5db423b0f&X-Amz-SignedHeaders=host&partNumber=1&uploadId=iN1jnrq.jDqlie9Zadk9Sk2BmnK1xrY1MjPZA7E1SA3niXZyjxUb5OwqUe05tnCeWz3bFklcJhYOzvWpcNNmfqQYcKpefcAFhzYb7YKu_wOyqtTUA6pqErTmb0ufKfPH&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2426)')))"), '(Request ID: dfbc7b36-b19d-4727-922c-35882a060cef)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/96/b6/9



TrainOutput(global_step=4400, training_loss=0.5809103393554688, metrics={'train_runtime': 5197.4401, 'train_samples_per_second': 13.548, 'train_steps_per_second': 0.847, 'total_flos': 0.0, 'train_loss': 0.5809103393554688, 'epoch': 1.9993183367416496})

### Save model and push to Hub

In [11]:
trainer.save_model(training_args.output_dir)
if training_args.push_to_hub:
    trainer.push_to_hub(dataset_name=DATASET_PATH)

