<a href="https://colab.research.google.com/github/Realmbird/Recreation-Consitutional-AI/blob/main/Copy_of_Reward_model_preference_model_helpful_antrophic_dataset_Qwen_0_6B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U datasets transformers # needed to prevent invalid pattern error

In [None]:
!pip install trl bitsandbytes accelerate torch

In [None]:
from datasets import load_dataset

In [None]:
# Load the "helpful-base" data directory of the Anthropic/hh-rlhf dataset.
dataset = load_dataset(
    "Anthropic/hh-rlhf",
    data_dir="helpful-base",
)

In [None]:
# Number of examples in the training split.
len(dataset["train"])

In [None]:
# Inspect the first training example to see the record layout.
print(dataset["train"][0])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
import torch

In [None]:
# Base checkpoint for the reward model; the matching tokenizer is loaded from the same id.
model_name = "Qwen/Qwen3-0.6B" # Or any other suitable model
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Quantize to 8-bit with bitsandbytes at load time. Per the original note, the
# pre-quantized FP8 / Qwen/Qwen3-0.6B-FP4 checkpoint does not work with
# AutoModelForSequenceClassification.
bnb_config = BitsAndBytesConfig(
    llm_int8_threshold=6.0,  # recommended 8-bit setting for handling outliers
    load_in_8bit=True,
)

In [None]:
# Decoder-only tokenizers may ship without a pad token; reuse EOS so batches
# can be padded. EOS is already in the vocabulary, so this adds no new token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# num_labels=1 gives the classification head a single output, used as the
# scalar reward score (not separate "chosen"/"rejected" classes).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    pad_token_id=tokenizer.pad_token_id,
    quantization_config=bnb_config,
)

# Only grow the embedding matrix when the tokenizer really has more entries
# than the model. Resizing unconditionally can shrink the checkpoint's
# embedding matrix for no benefit.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
from peft import LoraConfig, TaskType

In [None]:
# LoRA adapter settings for sequence classification; the adapter stacks on top
# of the BitsAndBytes-quantized base model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,  # training, not inference
)

In [None]:
from trl.trainer.utils import disable_dropout_in_model

In [None]:
from trl import RewardTrainer, RewardConfig
from transformers import TrainingArguments

# Reward-model training setup. Troubleshooting references from the original run:
#   1. https://discuss.huggingface.co/t/rewardtrainer-problem/138214/7
#   2. https://github.com/tatsu-lab/stanford_alpaca/issues/133
training_args = RewardConfig(
    output_dir="Qwen3.0-1.7B-Reward",
    report_to="none",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # try reducing to 1 on a weaker GPU
    gradient_accumulation_steps=10,  # compensates for the small per-device batch (4 * 10 = 40 per step)
    bf16=True,  # bf16 preferred over fp16; per the original note, fp16 was only forced by a T4 GPU
)

# Wire the quantized model, tokenizer, data splits and LoRA config together.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    peft_config=peft_config,
)

In [None]:
# Run training with the arguments configured on the trainer.
trainer.train()

In [None]:
# Save the trained model to a local directory.
trainer.save_model("./reward_model_helpful_qwen")

In [None]:
# Save the trained model to a local directory.
# NOTE(review): the notebook also saves to "./reward_model_helpful_qwen"; confirm both copies are needed.
trainer.save_model("./final_reward_model_helpful_base")

In [None]:
from huggingface_hub import notebook_login, HfApi

In [None]:
# Authenticate with the Hugging Face Hub before pushing.
notebook_login()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): the training config sets no hub_model_id, so the target repo
# presumably derives from output_dir ("Qwen3.0-1.7B-Reward") - confirm it
# matches the repo the tokenizer is pushed to.
trainer.push_to_hub()

In [None]:
# Hub repository id used when pushing the tokenizer.
hub_model_id="Realmbird/helpfulness-preference-model-qwen-0.6B"

In [None]:
# Publish the tokenizer to the Hub repository named by hub_model_id.
tokenizer.push_to_hub(repo_id=hub_model_id)

In [None]:
pip install nb-clean