In [None]:
import torch
import pandas as pd
from datasets import Dataset

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [None]:
# Token budget per sequence: the tokenizer truncates to this length and the same
# value is handed to the reward trainer.
max_length = 1024

In [None]:
# Hugging Face Hub checkpoint used as the backbone of the reward model.
model_name = "Qwen/Qwen2-0.5B-Instruct"

# Load the backbone with a single-output classification head (num_labels=1) so the
# model produces one scalar score per sequence. The head (`score.weight`) is not in
# the checkpoint and is freshly initialised, so it has to be trained.
# `trust_remote_code` is deliberately not enabled: the checkpoint resolves to the
# built-in Qwen2ForSequenceClassification class, so no Hub-hosted code has to run.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map={"": 0},  # put the whole model on CUDA device 0
    num_labels=1,
)
# Turn off the generation KV cache for training (gradient checkpointing is enabled
# in the training arguments).
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments
from peft import LoraConfig
from trl import RewardTrainer

In [None]:
# LoRA adapter configuration for the sequence-classification reward model.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    # Attach adapters to every attention and MLP projection.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # The randomly initialised classification head must be trained in full and
    # saved next to the adapter. Its module name is `score` (the load message
    # reports `score.weight` as newly initialised); the name has to match exactly.
    modules_to_save=["score"],
    inference_mode=False,
)

In [None]:
# Text template for one example; the two `{}` slots receive the question and the
# answer, in that order.
prompt = (
    "### Question:\n{}\n"
    "### Answer:\n{}"
)

In [None]:
# Load both preference files. The first file calls its answer column "answers",
# so it is renamed to match the second file before the frames are stacked.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; otherwise each
# file keeps its own row labels and the combined index contains repeated labels.
raw_pairs = pd.concat(
    [
        pd.read_json('1k9_rlhf.json').rename(columns={'answers': 'answer'}),
        pd.read_json('4k_rlhf.json'),
    ],
    ignore_index=True,
)


def _render_example(row, answer_column):
    """Fill the prompt template with the row's question and the chosen answer column, then append EOS."""
    return prompt.format(row['question'], row[answer_column]) + tokenizer.eos_token


# One row per preference pair: `chosen` is built from the `answer` column and
# `rejected` from the `negative_answer` column.
data = pd.DataFrame({
    'chosen': raw_pairs.apply(lambda row: _render_example(row, 'answer'), axis=1),
    'rejected': raw_pairs.apply(lambda row: _render_example(row, 'negative_answer'), axis=1),
})

In [None]:
# Convert the pair DataFrame into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset; by default any
# index other than a plain RangeIndex is stored as an extra `__index_level_0__`
# column.
data = Dataset.from_pandas(data, preserve_index=False)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of preference pairs for reward-model training.

    Args:
        examples: Batch dict (as supplied by ``Dataset.map(batched=True)``) whose
            ``"chosen"`` and ``"rejected"`` entries are lists of text.

    Returns:
        Dict with the keys ``input_ids_chosen``, ``attention_mask_chosen``,
        ``input_ids_rejected`` and ``attention_mask_rejected``. Each value holds
        one token list per example, truncated to ``max_length``.

    Sequences are intentionally left unpadded. The reward trainer's data collator
    pads each batch to its longest member, so padding every example to
    ``max_length`` here would only inflate the stored dataset and make every
    training step process full-length tensors.
    """
    # The tokenizer accepts a list of texts directly, so one call per column
    # replaces the per-example Python loop.
    chosen_encoding = tokenizer(examples["chosen"], truncation=True, max_length=max_length)
    rejected_encoding = tokenizer(examples["rejected"], truncation=True, max_length=max_length)

    return {
        "input_ids_chosen": chosen_encoding["input_ids"],
        "attention_mask_chosen": chosen_encoding["attention_mask"],
        "input_ids_rejected": rejected_encoding["input_ids"],
        "attention_mask_rejected": rejected_encoding["attention_mask"],
    }

In [None]:
# Tokenize every pair. batched=True passes preprocess_function a dict of column
# lists; num_proc=8 spreads the work over eight worker processes.
dataset = data.map(preprocess_function, num_proc=8, batched=True)

Map (num_proc=8):   0%|          | 0/5929 [00:00<?, ? examples/s]

In [None]:
# Hold out 15% of the pairs for evaluation. The split is seeded so the same rows
# land in each partition on every run; without a seed the shuffle changes between
# runs, which makes the evaluation metrics and the best-checkpoint choice
# non-reproducible.
dataset = dataset.train_test_split(test_size=0.15, seed=3407)

train_data = dataset['train']
eval_data = dataset['test']

In [None]:
# The sequence-classification head reads its score at the last non-padding token,
# which it locates through config.pad_token_id. That id therefore has to be the one
# the tokenizer actually pads with. Using the EOS id instead mis-locates the end of
# the sequence whenever pad and EOS are different tokens, because every training
# text also ends with an explicit EOS.
# NOTE(review): the Qwen2 instruct tokenizer is expected to define a dedicated pad
# token distinct from EOS — confirm with tokenizer.pad_token / tokenizer.eos_token.
if tokenizer.pad_token_id is None:
    # Fallback for tokenizers that ship without any pad token.
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimisation, evaluation and checkpointing settings for the reward-model run.
training_args = TrainingArguments(
    output_dir="output_reward",
    seed=3407,
    # Batching: 8 pairs per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    bf16=is_bfloat16_supported(),
    # Optimiser and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    num_train_epochs=3,
    # Log every step; evaluate and checkpoint once per epoch.
    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    # When training ends, reload the checkpoint with the highest accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # use accuracy as the evaluation metric
    greater_is_better=True,
    # Do not let the Trainer drop dataset columns that are not model.forward
    # arguments (the *_chosen / *_rejected columns are consumed by the collator).
    remove_unused_columns=False,
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
    max_length=max_length,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




In [None]:
# Run the training loop configured on `trainer`; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,Accuracy
0,0.001,0.001212,0.998876
2,0.0,0.000599,1.0


TrainOutput(global_step=471, training_loss=0.016141187356307957, metrics={'train_runtime': 677.3002, 'train_samples_per_second': 22.319, 'train_steps_per_second': 0.695, 'total_flos': 0.0, 'train_loss': 0.016141187356307957, 'epoch': 2.9904761904761905})

In [None]:
# Evaluate on the evaluation dataset given to the trainer; the returned metrics
# dict is displayed as the cell result.
trainer.evaluate()

{'eval_loss': 0.0006576794548891485,
 'eval_accuracy': 1.0,
 'eval_runtime': 29.8257,
 'eval_samples_per_second': 29.84,
 'eval_steps_per_second': 3.755,
 'epoch': 2.9904761904761905}

In [None]:
# Local saving: write the trained model and the tokenizer files into the same
# directory so they can be reloaded together.
reward_model_dir = "reward_model"
trainer.model.save_pretrained(reward_model_dir)
tokenizer.save_pretrained(reward_model_dir)

('reward_model/tokenizer_config.json',
 'reward_model/special_tokens_map.json',
 'reward_model/vocab.json',
 'reward_model/merges.txt',
 'reward_model/added_tokens.json',
 'reward_model/tokenizer.json')

In [None]:
#@title Show current memory stats
# Report the GPU name, its total memory, and the peak memory reserved by PyTorch
# so far, all expressed in GB (1024**3 bytes) and rounded to three decimals.
BYTES_PER_GB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.433 GB.
3.441 GB of memory reserved.
