### Clone and install the repository

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# This triggers Colab's interactive authorization prompt on first run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the OpenRLHF sources into ./OpenRLHF (relative to the current working directory).
!git clone https://github.com/OpenRLHF/OpenRLHF.git

Cloning into 'OpenRLHF'...
remote: Enumerating objects: 11798, done.[K
remote: Counting objects: 100% (427/427), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 11798 (delta 351), reused 293 (delta 293), pack-reused 11371 (from 3)[K
Receiving objects: 100% (11798/11798), 2.93 MiB | 27.78 MiB/s, done.
Resolving deltas: 100% (8684/8684), done.


In [None]:
%cd OpenRLHF

/content/OpenRLHF


In [None]:
!pip install openrlhf[vllm_latest]
!pip install flash-attn==2.8.0.post2
!pip install datasets==2.14.5

Collecting flash-attn==2.8.0.post2 (from openrlhf[vllm_latest])
  Using cached flash_attn-2.8.0.post2-cp311-cp311-linux_x86_64.whl
Installing collected packages: flash-attn
  Attempting uninstall: flash-attn
    Found existing installation: flash_attn 2.8.1
    Uninstalling flash_attn-2.8.1:
      Successfully uninstalled flash_attn-2.8.1
Successfully installed flash-attn-2.8.0.post2


### Train Reward Model

In [None]:
# Train the reward model via OpenRLHF's `train_rm` entry point, launched with DeepSpeed.
# - Starts from the SFT checkpoint passed to --pretrain and runs one epoch over the
#   preference dataset (columns `chosen` / `rejected`) with the chat template applied.
# - The value head is named `score` (--value_head_prefix); later cells access it as
#   `model.score`.
# - LoRA is enabled (rank 16, alpha 32), so the output in
#   ./checkpoint/Llama-3.2-1B-rm-dpo is presumably an adapter rather than full weights
#   (the merge cell below relies on that) -- TODO confirm against the OpenRLHF docs.
# - bf16, ZeRO stage 2, gradient checkpointing and Adam offload are switched on;
#   global batch size is 16 with a micro batch size of 2.
!deepspeed --module openrlhf.cli.train_rm \
   --save_path ./checkpoint/Llama-3.2-1B-rm-dpo \
   --save_steps -1 \
   --logging_steps 1 \
   --eval_steps -1 \
   --train_batch_size 16 \
   --micro_train_batch_size 2 \
   --pretrain DATVO110/Llama-3.2-1B-Instruct-Chat-sft \
   --value_head_prefix score \
   --bf16 \
   --max_epochs 1 \
   --max_len 2048 \
   --zero_stage 2 \
   --learning_rate 5e-6 \
   --dataset thuanan/Vi-Alpaca-Preference \
   --apply_chat_template \
   --chosen_key chosen \
   --rejected_key rejected \
   --flash_attn \
   --load_checkpoint \
   --packing_samples \
   --gradient_checkpointing \
   --adam_offload \
   --lora_rank 16 \
   --lora_alpha 32

[2025-07-11 23:50:13,735] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-07-11 23:50:17,793] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
2025-07-11 23:50:18.986159: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752277819.006482   22254 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752277819.012872   22254 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-11 23:50:19.033514: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel

# Every location used by the merge, named once so load and save paths cannot drift apart.
BASE_MODEL_ID = "DATVO110/Llama-3.2-1B-Instruct-Chat-sft"
LORA_ADAPTER_DIR = "/content/OpenRLHF/checkpoint/Llama-3.2-1B-rm-dpo"
MERGED_MODEL_DIR = "/content/OpenRLHF/checkpoint/Llama-3.2-1B-rm-dpo-combined"

# 1. Load the base model *with a `score` head*.
#    The adapter was trained as a reward model (`--value_head_prefix score`) and the
#    merged checkpoint is reloaded later as a sequence-classification model. A
#    causal-LM base has no `score` module, so any adapter weights for the head would be
#    dropped and the head re-initialised at random on reload. `num_labels=1` gives the
#    scalar reward output.
#    NOTE(review): confirm that the saved adapter also carries the trained base weight
#    of the value head; if it does not, use OpenRLHF's own
#    `openrlhf.cli.lora_combiner --is_rm` instead of this manual merge.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_ID,
    num_labels=1,
    device_map="auto",
    torch_dtype="auto",
)

# 2. Attach the LoRA adapter from the local training output directory.
model = PeftModel.from_pretrained(
    model,
    LORA_ADAPTER_DIR,
    is_local=True,  # kept from the original, which marked it as important to avoid an error -- TODO confirm PEFT actually reads it
    is_trainable=False,
)

# 3. Fold the adapter weights into the base weights and drop the PEFT wrappers.
model = model.merge_and_unload()

# 4. Write the merged model to a new directory ...
model.save_pretrained(MERGED_MODEL_DIR)

# ... and copy the base model's tokenizer next to it so the directory is self-contained.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.save_pretrained(MERGED_MODEL_DIR)

### Merge LoRA Adapter Weights

In [None]:
# Disabled alternative: merge the adapter with OpenRLHF's own `lora_combiner` CLI
# (`--is_rm` selects the reward-model variant).
# NOTE(review): the paths below name the 8B model, whereas the rest of this notebook
# trains and merges the 1B model -- adjust them before re-enabling this cell.
# !python -m openrlhf.cli.lora_combiner \
#     --model_path DATVO110/Llama-3.2-8B-Instruct-Chat-sft \
#     --lora_path ./checkpoint/Llama-3.2-8B-rm-dpo \
#     --output_path ./checkpoint/Llama-3.2-8B-rm-dpo-combined \
#     --is_rm \
#     --bf16

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Push to Hugging Face Hub

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The merge cell writes to ./checkpoint/... while the working directory is
# /content/OpenRLHF, so the merged checkpoint lives under /content/OpenRLHF/checkpoint,
# not /content/checkpoint.
ckpt_path = "/content/OpenRLHF/checkpoint/Llama-3.2-1B-rm-dpo-combined"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)

# A reward model emits a single scalar per sequence, hence one label. Without this the
# classification head would default to two outputs when the saved config does not
# specify a label count.
model = AutoModelForSequenceClassification.from_pretrained(ckpt_path, num_labels=1)

In [None]:
# Show which concrete class AutoModelForSequenceClassification resolved to.
type(model)

In [None]:
# Display the model's repr to inspect its module layout before uploading.
model

In [None]:
from huggingface_hub import login

# Authenticate before uploading. Calling `login()` without arguments prompts for the
# access token interactively, so no secret is stored in the notebook.
login()



In [None]:
# Upload the merged reward-model weights to the Hub repository.
hub_repo_id = "DATVO110/Llama-3.2-1B-RM-DPO"
model.push_to_hub(hub_repo_id, commit_message="Add model ckpt")


In [None]:
# Upload the tokenizer files to the same Hub repository as the model weights.
hub_repo_id = "DATVO110/Llama-3.2-1B-RM-DPO"
tokenizer.push_to_hub(hub_repo_id, commit_message="Add tokenizer")

### Test Reward Model

In [None]:
# Encode one Vietnamese sample prompt as PyTorch tensors, truncated to 2048 tokens.
sample_prompt = "Tại sao bạn lại thích học lập trình?"
inputs = tokenizer(sample_prompt, return_tensors="pt", truncation=True, max_length=2048)

In [None]:
# Display the model's repr again; the following cells call its `model` and `score` submodules.
model

In [None]:
# Run only the transformer backbone (no head). At this point `reward` holds the
# per-token hidden states, not yet a reward; the next cell applies the `score` head.
backbone_output = model.model(**inputs)
reward = backbone_output.last_hidden_state
reward


In [None]:
import torch

# Compute the reward from `inputs` instead of overwriting `reward` with a function of
# itself: the self-referential form breaks on re-run, because the second pass feeds the
# already-projected reward back into the head. Gradients are not needed for scoring.
with torch.no_grad():
    hidden_states = model.model(**inputs).last_hidden_state
    # Apply the `score` head to every position and keep the value at the last token.
    reward = model.score(hidden_states)[:, -1]
reward