In [1]:
import os

import torch

from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from peft import get_peft_model, TaskType

from trl import (
    ModelConfig,
    RewardConfig,
    RewardTrainer,
    ScriptArguments,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)

# Restrict this process to GPUs 0 and 1. The list is written without
# whitespace ("0,1"): the variable is a plain comma-separated list and
# tolerance for embedded spaces is not guaranteed by every parser.
# NOTE: this only takes effect if it runs before CUDA is first initialized,
# so keep it ahead of any cell that touches the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Project / entity names picked up by the Weights & Biases logging integration.
os.environ["WANDB_PROJECT"] = "Llama-3.2-1B-Instruct-Reward"
os.environ["WANDB_ENTITY"] = "RADFAN"

In [2]:
# =============================================================================
# Configs
# =============================================================================

# Model config
# =============================================================================

# LoRA adapter settings: rank-8 adapters for a sequence-classification task.
lora_settings = dict(
    use_peft=True,
    lora_task_type=TaskType.SEQ_CLS,
    lora_r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    lora_target_modules=None,
    lora_modules_to_save=None,
)

# Quantization / precision settings: no k-bit loading, dtype left unset.
precision_settings = dict(
    load_in_8bit=False,
    load_in_4bit=False,
    torch_dtype=None,
)

model_config = ModelConfig(
    model_name_or_path="meta-llama/Llama-3.2-1B-Instruct",
    **lora_settings,
    **precision_settings,
)

# Reward trainer config
# =============================================================================

# The base model id is reused for both the output directory and the run name.
base_model_id = model_config.model_name_or_path

# Options specific to the reward trainer.
reward_settings = dict(
    max_length=None,
    dataset_num_proc=16,
    center_rewards_coefficient=None,
)

# Optimisation schedule.
optimisation_settings = dict(
    per_device_train_batch_size=4,
    num_train_epochs=0.1,
    gradient_checkpointing=False,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
)

# How often to log and to evaluate (both counted in steps).
monitoring_settings = dict(
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=125,
)

# Push to hub after training.
hub_settings = dict(
    push_to_hub=True,
    hub_model_id="RLHF-And-Friends/Llama-3.2-1B-Instruct-Reward",
)

training_args = RewardConfig(
    output_dir=f"{base_model_id}-reward",
    run_name=f"{base_model_id}-{model_config.lora_r}",
    **reward_settings,
    **optimisation_settings,
    **monitoring_settings,
    **hub_settings,
)

# Script config
# =============================================================================

# Preference dataset on the Hub and the names of the splits to use.
dataset_settings = {
    "dataset_name": "trl-lib/ultrafeedback_binarized",
    "dataset_train_split": "train",
    "dataset_test_split": "test",
}

script_args = ScriptArguments(**dataset_settings)

In [3]:
# =============================================================================
# Model & Tokenizer
# =============================================================================

# Tokenizer
# -----------------------------------------------------------------------------

tokenizer = AutoTokenizer.from_pretrained(
    model_config.model_name_or_path, use_fast=True
)
# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Model
# -----------------------------------------------------------------------------

quantization_config = get_quantization_config(model_config)
lora_config = get_peft_config(model_config)

# Use the k-bit device map only when a quantization config is present;
# otherwise let the loader place the weights ("auto").
device_map = (
    get_kbit_device_map() if quantization_config is not None else "auto"
)

# The KV cache is disabled when gradient checkpointing is requested.
use_cache = not training_args.gradient_checkpointing

# `torch_dtype` can be None, the literal "auto", or the name of a torch dtype
# (e.g. "bfloat16"). None and "auto" are forwarded untouched; only real dtype
# names are resolved on the `torch` module ("auto" is not a torch attribute).
if model_config.torch_dtype in ("auto", None):
    torch_dtype = model_config.torch_dtype
else:
    torch_dtype = getattr(torch, model_config.torch_dtype)

# A single output label: the model emits one scalar score per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path,
    num_labels=1,
    quantization_config=quantization_config,
    device_map=device_map,
    use_cache=use_cache,
    torch_dtype=torch_dtype,
)

# Always align the model's padding id with the tokenizer's. The tokenizer's
# pad token was overridden above, so a pad id already stored in the checkpoint
# config could otherwise disagree with what the tokenizer actually emits.
model.config.pad_token_id = tokenizer.pad_token_id

# `get_peft_config` yields no config when PEFT is switched off in
# `model_config`; in that case keep the plain model instead of wrapping it.
if lora_config is not None:
    model = get_peft_model(model, lora_config)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Display the module tree of the wrapped model to inspect where the LoRA
# layers were attached.
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
     

In [5]:
# =============================================================================
# Load dataset
# =============================================================================

dataset = load_dataset(script_args.dataset_name)

train_dataset = dataset[script_args.dataset_train_split]

# The held-out split is only looked up when evaluation is enabled.
eval_dataset = None
if training_args.eval_strategy != "no":
    eval_dataset = dataset[script_args.dataset_test_split]

In [None]:
# =============================================================================
# Training
# =============================================================================

# Everything the trainer needs, gathered in one place before construction.
trainer_inputs = {
    "model": model,
    "processing_class": tokenizer,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = RewardTrainer(**trainer_inputs)

# Kept as the last expression so the training result is shown as cell output.
trainer.train()

In [7]:
# =============================================================================
# Save model and push to Hub
# =============================================================================

# Write the trained model to the configured output directory first.
trainer.save_model(training_args.output_dir)

# Upload only when the training config asks for it; the dataset name is
# forwarded to the upload call.
if training_args.push_to_hub:
    hub_kwargs = {"dataset_name": script_args.dataset_name}
    trainer.push_to_hub(**hub_kwargs)

In [None]:
# =============================================================================
# Evaluate Model
# =============================================================================

# Skipped entirely when evaluation is disabled in the training config.
if training_args.eval_strategy != "no":
    metrics = trainer.evaluate()
    # Log the metrics first, then persist them, both under the "eval" split.
    for record_metrics in (trainer.log_metrics, trainer.save_metrics):
        record_metrics("eval", metrics)