# 1. Setup

Keywords: Direct Preference Optimization (DPO), Proximal Policy Optimization (PPO), RLHF, Reward Model

The general pipeline:
+ 1. Pre-training: low-quality data + language modeling objective => optimized for text completion
+ 2. Finetuning: high-quality data + supervised training -> dialogue-like generation
+ 3. RLHF: comparison data -> scalar score -> reward model ==> Prompt + RL -> Final model

In [None]:
%%capture
!pip install -U datasets
!pip install -U trl
!pip install -U transformers
!pip install -U accelerate
!pip install -U bitsandbytes
!pip install -U sentencepiece
!pip install -U peft
!pip install -U huggingface_hub
!pip install -U warnings
!pip install -U wandb

In [None]:
import random

# SFT
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig

# DPO
from trl import DPOTrainer, DPOConfig

# diagnostics
import warnings
from huggingface_hub import login
import wandb


# Local directory handed to `cache_dir=` when downloading the dataset, tokenizer and model
CACHE_DIR = "./cache"
# Hub id of the checkpoint loaded as tokenizer and base model
BASE_MODEL_ID = "thainq107/Llama-3.2-1B-Instruct-sft"
# Trainer output directory for the SFT stage; also the path the DPO stage loads the adapter from
SFT_OUTPUT_DIR = "output-SFT"
# Trainer output directory and final save location for the DPO stage
DPO_FULL_OUTPUT_DIR = "output-DPO-final"

In [None]:
# Silence Python warnings globally so the training logs stay readable
warnings.filterwarnings("ignore")

# `os` is only needed here, to read credentials from the environment.
import os

# Hugging Face Hub
# Access tokens must never be stored in the notebook: anything committed in
# plain text should be treated as leaked and revoked. Export HF_TOKEN before
# starting the kernel; when it is unset, `login` falls back to its interactive
# prompt.
API_KEY = os.environ.get("HF_TOKEN")
login(token=API_KEY)

# Weights & Biases
# Same rule: the key comes from WANDB_API_KEY. With `key=None`, wandb uses its
# own configured credentials or prompts for one.
wb_token = os.environ.get("WANDB_API_KEY")
wandb.login(key=wb_token)
wandb.init(project="Finetuning Llama 3.2 1B Alpaca", name="defaul_run", reinit=True)  # could comment out

In [None]:
# Fetch the preference dataset from the Hub, reusing anything already stored in CACHE_DIR
dataset = load_dataset(
    "thainq107/Vi-Alpaca-Preference",
    cache_dir=CACHE_DIR,
)
# Printing the object shows its splits, columns and row counts
print(dataset)

In [None]:
# Inspect one random training record.
# The index range is taken from the split itself: a hard-coded upper bound can
# run past the end of the data, and starting at 1 would never show record 0.
idx = random.randrange(len(dataset['train']))
dataset['train'][idx]

# 2. Supervised Finetuning (SFT)

In [None]:
# tokenizer
# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
)

# The trainers need a padding token for batching; when the checkpoint does not
# define one, reuse the end-of-sequence token (both the string and its id).
pad_is_missing = tokenizer.pad_token is None
if pad_is_missing:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# model
# LoRA adapter: rank-16 updates (alpha 16, dropout 0.05, no bias terms)
# attached to each of the projection layers listed below.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
# NOTE(review): the shared training hyperparameters enable fp16 (bf16 is
# commented out) while compute/model dtype here is bfloat16 — confirm this mix
# is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized base model, placed on the available device(s) automatically
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    cache_dir=CACHE_DIR,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)
# Turn the KV cache off for training (gradient checkpointing is enabled in the
# hyperparameters used by the trainers).
base_model.config.use_cache = False

In [None]:
# hyperparameters
# Training arguments shared by the SFT and DPO stages; each stage unpacks this
# mapping into its own config object and adds its stage-specific fields.
hyperparameters = dict(
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # optimisation schedule
    learning_rate=3e-5,
    logging_steps=500,
    max_steps=5000,
    save_strategy="no",
    overwrite_output_dir=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_steps=500,
    # mixed precision: fp16 is enabled, bf16 is intentionally left unset
    fp16=True,
    disable_tqdm=False,
    # evaluation cadence and data loading
    eval_strategy="steps",
    eval_steps=500,
    dataloader_num_workers=8,
)

# Token-length limit passed to both the SFT and the DPO config
MAX_LENGTH = 512

In [None]:
# testing 
# Smoke test: render a dummy three-turn conversation through the tokenizer's
# chat template (as text, without a trailing generation prompt) to see the
# exact string format the model is trained on.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "This is the prompt."},
    {"role": "assistant", "content": "This is the chosen."},
]

print(
    tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
)

In [None]:
def format_prompt(ex):
    """Render one dataset record as a chat-formatted training string.

    The record's ``question`` becomes the user turn and its ``chosen`` answer
    the assistant turn, preceded by a fixed system message. The conversation is
    rendered to text with the module-level ``tokenizer``'s chat template, with
    no generation prompt appended.
    """
    roles = ("system", "user", "assistant")
    contents = ("You are a helpful assistant.", ex["question"], ex["chosen"])
    messages = [
        {"role": role, "content": content}
        for role, content in zip(roles, contents)
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

# SFT configuration: the shared hyperparameters plus stage-specific fields.
# `hub_model_id` names the Hub repository that `push_to_hub()` uploads to.
# NOTE(review): newer TRL releases appear to rename `max_seq_length` to
# `max_length` — confirm against the installed TRL version.
sft_config = SFTConfig(
    **{
        **hyperparameters,
        "output_dir": SFT_OUTPUT_DIR,
        "max_seq_length": MAX_LENGTH,
        "hub_model_id": "Savoxism/Llama-3.2-1B-Instruct-Alpaca-SFT",
    }
)

# Passing `peft_config` makes the trainer attach the LoRA adapter to the
# quantized base model; `format_prompt` turns each record into chat text.
sft_trainer = SFTTrainer(
    model=base_model,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    formatting_func=format_prompt,
)

sft_trainer.train()
# Save to SFT_OUTPUT_DIR, the same path the DPO stage later loads the adapter
# from (a differently-cased literal is a different directory on case-sensitive
# filesystems).
sft_trainer.save_model(SFT_OUTPUT_DIR)
# The first positional argument of `Trainer.push_to_hub` is the commit message,
# not the repository id; the target repo is taken from `hub_model_id` above.
sft_trainer.push_to_hub()

# 3. Direct Preference Optimization (DPO)

In [None]:
# chat template
def convert_to_conversational_preference_format(example):
    """Reshape one flat preference record into conversational form.

    Reads ``id``, ``question``, ``chosen`` and ``rejected`` from ``example``
    and returns a dict with:

    * ``id``: copied unchanged,
    * ``prompt``: a fixed system message followed by the question as the
      user turn,
    * ``chosen`` / ``rejected``: each a single-element list holding the
      respective answer as an assistant turn.
    """
    def as_assistant_turn(text):
        # One-message conversation carrying an assistant reply
        return [{"role": "assistant", "content": text}]

    record_id = example["id"]
    prompt_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": example["question"]},
    ]
    preferred = as_assistant_turn(example["chosen"])
    dispreferred = as_assistant_turn(example["rejected"])

    return {
        "id": record_id,
        "prompt": prompt_messages,
        "chosen": preferred,
        "rejected": dispreferred,
    }

# Apply the conversion to every record of every split
dpo_dataset = dataset.map(
    function=convert_to_conversational_preference_format,
)

In [None]:
# PeftModel is only needed by this cell, so it is imported here.
from peft import PeftModel

# `load_adapter` loads weights for its side effect and does not return the
# model, so its result cannot be handed to the trainer. Wrap the base model
# explicitly instead: the SFT adapter is loaded as the trainable policy
# adapter.
# NOTE(review): `base_model` already carries the LoRA layers injected by the
# SFT run in this session; reloading a clean base model first may be cleaner —
# confirm.
dpo_full_model = PeftModel.from_pretrained(
    base_model,
    SFT_OUTPUT_DIR,
    is_trainable=True,
    adapter_name="dpo_full_adapter",
)
# Load the same SFT weights a second time under another name; this copy stays
# frozen and serves as the DPO reference policy.
dpo_full_model.load_adapter(SFT_OUTPUT_DIR, adapter_name="reference")

# DPO configuration: shared hyperparameters plus stage-specific fields.
# The two adapter names tell the trainer which adapter to optimise and which
# one to use for reference log-probabilities; `hub_model_id` names the Hub
# repository that `push_to_hub()` uploads to.
dpo_full_args = DPOConfig(
    **{
        **hyperparameters,
        "output_dir": DPO_FULL_OUTPUT_DIR,
        "max_length": MAX_LENGTH,
        "model_adapter_name": "dpo_full_adapter",
        "ref_adapter_name": "reference",
        "hub_model_id": "Savoxism/Llama-3.2-1B-Instruct-Alpaca-DPO-full",
    }
)

# No `peft_config` here: the model already is a PEFT model with named
# adapters, and the trainer must keep them rather than create a fresh one.
dpo_full_trainer = DPOTrainer(
    dpo_full_model,
    args=dpo_full_args,
    train_dataset=dpo_dataset['train'],
    eval_dataset=dpo_dataset['test'],
    processing_class=tokenizer,
)
dpo_full_trainer.train()
dpo_full_trainer.save_model(DPO_FULL_OUTPUT_DIR)
# The first positional argument of `Trainer.push_to_hub` is the commit message,
# not the repository id; the target repo is taken from `hub_model_id` above.
dpo_full_trainer.push_to_hub()

# 4. Inference & Deployment

In [None]:
!pip install -q gradio

import gradio as gr

In [None]:
# helper functions
def get_model_response(model, tokenizer, instruction: str, max_new_tokens: int = 1000) -> str:
    """Generate the assistant's reply to a single user instruction.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode`` and
            ``pad_token_id``.
        instruction: The user's message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (prompt tokens stripped, special tokens
        skipped).
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": instruction},
    ]
    # Ask for tensors plus an attention mask so `generate` does not have to
    # guess which positions are padding (ambiguous when pad == eos).
    encoded = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Inference mode: disables dropout and skips gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            # The training setup switches the KV cache off in the model
            # config; re-enable it for generation so decoding is not
            # recomputed from scratch at every step.
            use_cache=True,
        )

    # `generate` returns prompt + completion; keep only the newly produced part.
    prompt_length = input_ids.shape[-1]
    completion_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

def respond_fn(instruction: str) -> str:
    """Gradio callback: answer one instruction with the DPO-trained model.

    No global named ``model`` is defined in this notebook, so the policy held
    by ``dpo_full_trainer`` (the final model of the pipeline) is used together
    with the module-level ``tokenizer``.
    """
    return get_model_response(dpo_full_trainer.model, tokenizer, instruction)

In [None]:
# interface
# Single-turn UI: a two-line question box in, a labelled answer box out
question_box = gr.Textbox(lines=2, placeholder="Nhập câu hỏi của bạn...")
answer_box = gr.Textbox(label="Phản hồi")

iface = gr.Interface(
    fn=respond_fn,
    inputs=question_box,
    outputs=answer_box,
    title="Chatbot LLaMA-3.2-1B",
    description="Giao diện đơn giản cho LLaMA fine-tuned với SFT/DPO",
)
# share=True additionally requests a public share link for the app
iface.launch(share=True)