<a href="https://colab.research.google.com/github/Realmbird/Recreation-Consitutional-AI/blob/main/Helpful_RLHF_Qwen_0_6B_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets==4.0.0 transformers==4.54.1 trl==0.20.0 bitsandbytes==0.46.1 accelerate==1.9.0 peft==0.16.0 huggingface-hub==0.34.1

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModelForCausalLM, pipeline

import torch


# Hugging Face checkpoint id of the base model. The same id is reused later in
# the notebook when loading the policy, reference and value models.
model_name = "Qwen/Qwen3-0.6B" # Or any other suitable model

# Alias of model_name; no other use of it is visible in this notebook.
mname = model_name

# Tokenizer matching the base checkpoint; it is shared by every model below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Important: Add a pad token if the tokenizer doesn't have one, especially for decoder models.
# The EOS token is reused as the pad token, so no new string is invented here.

if tokenizer.pad_token is None:

    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

In [None]:
def get_prompts(dataset):
    """Collect the text that precedes the first assistant turn of each example.

    Every item of ``dataset`` must be indexable by the key ``'chosen'`` and
    yield a conversation transcript string. The transcript is cut at the first
    occurrence of ``"\\n\\nAssistant:"``; whatever comes before that marker is
    whitespace-stripped and kept. Transcripts without the marker are skipped,
    so the result can be shorter than the input.

    Returns:
        A list of prompt strings, in the order the examples were iterated.
    """
    assistant_marker = "\n\nAssistant:"
    extracted = []

    for row in dataset:
        # partition() splits on the first occurrence only; an empty separator
        # in the result means the marker was not present at all.
        head, found, _ = row['chosen'].partition(assistant_marker)
        if not found:
            continue
        extracted.append(head.strip())

    return extracted

In [None]:
from datasets import load_dataset, Dataset


# Load the "helpful-base" subdirectory of the Anthropic/hh-rlhf dataset.
# Later cells index the result by the "train" and "test" splits.
dataset = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")



In [None]:
def tokenize_prompts(prompts_list, tokenizer):
    """Tokenize prompts one at a time and wrap the results in a ``Dataset``.

    Args:
        prompts_list: Iterable of prompt strings.
        tokenizer: Callable tokenizer; it is invoked once per prompt with
            ``return_tensors="pt"``, ``padding=True`` and ``truncation=True``.

    Returns:
        A ``datasets.Dataset`` with one row per prompt and the columns
        ``input_ids`` and ``attention_mask`` (the query format the PPO
        trainer is fed in this notebook).
    """

    def _encode(text):
        # Each call encodes a single prompt; squeeze(0) drops the leading
        # axis of the returned tensors so every row holds a flat sequence.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

    rows = [_encode(text) for text in prompts_list]
    # Turn the list of per-prompt dicts into a datasets.Dataset.
    return Dataset.from_list(rows)

In [None]:
# The trainer is fed queries only, so each split is first reduced to its
# prompt strings and then tokenized.

# Training split.
train_prompts_list = get_prompts(dataset["train"])
train_dataset = tokenize_prompts(train_prompts_list, tokenizer)

# Evaluation split (hh-rlhf names it "test").
eval_prompts_list = get_prompts(dataset["test"])
eval_dataset = tokenize_prompts(eval_prompts_list, tokenizer)

In [None]:
# 4-bit NF4 quantization with double quantization; matmuls are computed in
# float16. The same config object is reused for every quantized load below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Loader arguments shared by the two causal-LM copies of the base checkpoint.
_quantized_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}

# Policy model that PPO will train.
model = AutoModelForCausalLM.from_pretrained(model_name, **_quantized_load_kwargs)

# Second, independent copy of the same checkpoint, bound to `ref_model`.
# NOTE(review): the visible PPOTrainer call does not pass this object as its
# reference model, so this copy may only be costing GPU memory - confirm.
ref_model = AutoModelForCausalLM.from_pretrained(model_name, **_quantized_load_kwargs)



In [None]:
from peft import LoraConfig, TaskType, PeftModel, get_peft_model


# The PEFT (LoRA) adapters configured below are stacked on top of the BitsAndBytes-quantized models.

In [None]:
from trl.trainer.utils import disable_dropout_in_model

from trl import PPOTrainer, PPOConfig

from transformers import TrainingArguments


# Hub id of the already-trained helpfulness preference model used as the
# PPO reward signal.
reward_model_name = "Realmbird/helpfulness-preference-model-qwen-0.6B-merged"

# Loaded as a single-logit sequence classifier in float16 (no 4-bit
# quantization here); device placement is left to device_map="auto".
reward_model = AutoModelForSequenceClassification.from_pretrained(
    reward_model_name,
    num_labels=1,
    device_map="auto",
    torch_dtype=torch.float16,
)



In [None]:
# Bare expression: as the last line of a notebook cell this displays the
# reward model's repr so its architecture can be inspected.
reward_model

In [None]:
# LoRA settings for the value model (sequence-classification task type).
peft_config_value = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# The value model starts from the same base checkpoint as the policy, loaded
# 4-bit quantized with a single-output classification head.
value_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    quantization_config=bnb_config,
    device_map="auto",
)

# Make the embedding table size equal to the tokenizer's vocabulary size.
# NOTE(review): the policy model is not resized the same way - confirm the
# asymmetry is intended.
value_model.resize_token_embeddings(len(tokenizer))

# Wrap the quantized value model with trainable LoRA adapters.
value_model = get_peft_model(value_model, peft_config_value)

In [None]:
# Every model's config must know which token id is padding, so copy the
# tokenizer's pad token id onto all four of them.
for _net in (model, ref_model, value_model, reward_model):
    _net.config.pad_token_id = tokenizer.pad_token_id

In [None]:

# LoRA adapter configuration for the policy, i.e. the causal-LM generator.
peft_config_policy = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # must be CAUSAL_LM for the generator
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# PPO hyper-parameters; checkpoints go to output_dir and external experiment
# reporting is disabled.
# NOTE(review): in the pinned TRL release PPOTrainer appears to recompute
# `batch_size` / `mini_batch_size` from per_device_train_batch_size,
# gradient_accumulation_steps and num_mini_batches - confirm that the two
# values below actually take effect.
ppo_config = PPOConfig(
    output_dir="./rlhf_helpful_Qwen3_0.6B",
    learning_rate=1.41e-5,
    num_train_epochs=1,
    batch_size=4,
    mini_batch_size=1,
    gradient_accumulation_steps=32,
    report_to="none",
)

trainer = PPOTrainer(
    args=ppo_config,
    model=model,
    # No separate reference network is supplied: because `peft_config` is
    # given, the policy becomes a PEFT model and the trainer can use the
    # adapter-free base weights as the reference. `None` is the documented
    # "not provided" value (the parameter is an optional module, not a bool).
    ref_model=None,
    processing_class=tokenizer,
    train_dataset=train_dataset,  # tokenized queries from the training split
    eval_dataset=eval_dataset,
    peft_config=peft_config_policy,
    reward_model=reward_model,
    value_model=value_model,
)

In [None]:
# Run the whole PPO training loop under CUDA automatic mixed precision.
# NOTE(review): autocast uses bfloat16 here while bnb_config sets
# bnb_4bit_compute_dtype=torch.float16 and the reward model is loaded in
# float16 - confirm that mixing the two half-precision dtypes is intended.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16): # torch.float16 is the other CUDA autocast dtype
    trainer.train()