<a href="https://colab.research.google.com/github/Realmbird/Recreation-Consitutional-AI/blob/main/Helpful_RLHF_Qwen_0_6B_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
!pip install datasets==4.0.0 transformers==4.54.1 trl==0.20.0 bitsandbytes==0.46.1 accelerate==1.9.0 peft==0.16.0 huggingface-hub==0.34.1



In [35]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModelForCausalLM, pipeline

import torch


model_name = "Qwen/Qwen3-0.6B" # Or any other suitable model

mname = model_name

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [36]:
from datasets import load_dataset


dataset = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")


def get_prompts(dataset):

    prompts = []

    for example in dataset:

        chosen_text = example['chosen']

        # Find the end of the human's prompt (before "Assistant:")

        prompt_end_idx = chosen_text.find("\n\nAssistant:")

        if prompt_end_idx != -1:

            prompts.append(chosen_text[:prompt_end_idx].strip())

    return prompts


# prompts had to formated of just queries

test_prompts_list = get_prompts(dataset["train"]) # example just used train not test for promptlist

eval_prompts_list = get_prompts(dataset["test"])



#dataset that ppotrainer accepts is tokens

def tokenize(sample):

    sample["input_ids"] = tokenizer.encode(sample)

    return sample



test_dataset = dataset.map(tokenize, batched=False)

eval_dataset = dataset.map(tokenize, batched=False)



In [37]:
bnb_config = BitsAndBytesConfig(

    load_in_4bit=True,   # ✅ Use 4-bit instead
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",

)


# Important: Add a pad token if the tokenizer doesn't have one, especially for decoder models.

if tokenizer.pad_token is None:

    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})


model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, pad_token_id=tokenizer.pad_token_id, quantization_config=bnb_config) # accepted and rejceted label

# Resize token embeddings if you added a new pad token

model.resize_token_embeddings(len(tokenizer))


model = AutoModelForCausalLM.from_pretrained(

    model_name,

    quantization_config=bnb_config,

    device_map="auto"

)

ref_model = AutoModelForCausalLM.from_pretrained(

    model_name,

    quantization_config=bnb_config,

    device_map="auto"

)



Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from peft import LoraConfig, TaskType, PeftModel, get_peft_model


# stacks on top of BitsAndBytes

peft_config = LoraConfig(

    task_type=TaskType.SEQ_CLS,

    inference_mode=False,

    r=8,

    lora_alpha=32,

    lora_dropout=0.1,

)

In [39]:
from trl.trainer.utils import disable_dropout_in_model

from trl import PPOTrainer, PPOConfig

from transformers import TrainingArguments


# Load the reward model directly

reward_model_name = "Realmbird/helpfulness-preference-model-qwen-0.6B-v2"

reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_name, num_labels=1, device_map="auto", torch_dtype=torch.float16) # Added device_map="auto"



In [40]:
value_model = AutoModelForSequenceClassification.from_pretrained(

    model_name,

    num_labels=1,

    quantization_config=bnb_config,

    device_map="auto"

)

value_model.resize_token_embeddings(len(tokenizer))


# Apply PEFT to the value model as well

peft_config_value = LoraConfig(

    task_type=TaskType.SEQ_CLS,

    inference_mode=False,

    r=8,

    lora_alpha=32,

    lora_dropout=0.1,

)

value_model = get_peft_model(value_model, peft_config_value)

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:


ppo_config = PPOConfig(

    output_dir= "./rlhf_helpful_Qwen3_0.6B",

    learning_rate=1.41e-5,

    batch_size=4,

    mini_batch_size=1,

    gradient_accumulation_steps=4,

)

trainer = PPOTrainer(

    args=ppo_config,

    model=model,

    ref_model = ref_model,

    processing_class=tokenizer,

    train_dataset=test_dataset, # Use the training split

    eval_dataset=eval_dataset,

    peft_config=peft_config,

    reward_model=reward_model,

    value_model= value_model

)

In [None]:
import trl

print(trl.__version__)