# 1. Setup

The pipeline follows like this: Setup environment -> Load base model -> Configure QLoRA -> Prepare dataset -> Fine-tune LLM -> Save checkpoints -> Run inference

In [1]:
!nvidia-smi

Sun Apr 27 18:15:00 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:01:00.0 Off |                  Off |
|  0%   39C    P5             57W /  450W |       2MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!pip install -q -U wandb
!pip install -q -U bitsandbytes
!pip install -q -U datasets
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U peft
!pip install -q -U huggingface_hub
!pip install -q -U torch
!pip install -q -U scikit-learn
!pip install -q -U tqdm

In [2]:
import os
import bitsandbytes as bnb
import torch

from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import wandb
wb_token = "79126da44d32381139323a9fc5fc6ba0e32b99c4"
wandb.login(key=wb_token)
run = wandb.init(
    project='Finetuning Selection LLM', 
    job_type="training", 
    anonymous="allow"
)

from huggingface_hub import login
API_KEY = "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"
login(token=API_KEY)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msavoxism[0m ([33msavoxism-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
SEED = 42
TRAIN_PATH = "selection_train.json"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "v_proj",
    ],
    lora_dropout=0.05,
    bias='none',
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)

# 3. Dataset

In [6]:
dataset = load_dataset("json", data_files=TRAIN_PATH, split="train")
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=SEED)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'target'],
        num_rows: 866
    })
    test: Dataset({
        features: ['prompt', 'target'],
        num_rows: 97
    })
})

In [8]:
QWEN25_SYSTEM_PROMPT = """You are the Selection module in the Selection-Inference framework.
When given input consisting of numbered sentences:
sent 0: …
sent 1: …
…

followed by a question, you must output **only** the selection in exactly this format:

sent <X>. We know that sent <Y> [and sent <Z>]. Therefore, <conclusion>.

Do **not** output any other text, explanation, or formatting."""

def preprocess(batch):
    # build chat-style strings
    chats = []
    for src, tgt in zip(batch["prompt"], batch["target"]):
        user = src.strip() + "\nSelection:"
        # apply_chat_template sẽ nối sẵn system + user + assistant
        chat = tokenizer.apply_chat_template(
            [
                {"role": "system", "content": QWEN25_SYSTEM_PROMPT},
                {"role": "user",   "content": user},
                {"role": "assistant", "content": tgt.strip()},
            ],
            tokenize=False,
            add_generation_prompt=False,
        )
        chats.append(chat)
    tok = tokenizer(
        chats,
        padding="max_length",
        truncation=True,
        max_length=512, # 1024
    )
    tok["labels"] = tok["input_ids"].copy()
    return tok

In [9]:
dataset = dataset.map(
    preprocess,
    batched=True,
    num_proc=4,
    remove_columns=["prompt", "target"],
)

train_ds = dataset["train"]
eval_ds  = dataset["test"]

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

# 3. Modeling

In [13]:
training_args = TrainingArguments(
    # Saving model
    output_dir="./qwen_2.5-7b-instruct-selection-llm",
    overwrite_output_dir=True,
    seed=SEED,

    # Training & Evaluation
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",           # run evaluation at end of each epoch
    save_strategy="epoch", 
    logging_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=3,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Batch & Gradient accumulation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,

    # hyperparameters
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    lr_scheduler_type="cosine",  # hoặc "linear", "polynomial", "cosine_with_restarts",...

    # ---- precision & performance ----
    optim="paged_adamw_8bit",
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    dataloader_drop_last=True,
)

model.config.use_cache = False
model.enable_input_require_grads()

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,0.6005,0.551389
2,0.5944,0.539096
3,0.5787,0.520919


TrainOutput(global_step=162, training_loss=0.5912075631412459, metrics={'train_runtime': 730.0654, 'train_samples_per_second': 3.559, 'train_steps_per_second': 0.222, 'total_flos': 5.632077241044173e+16, 'train_loss': 0.5912075631412459, 'epoch': 3.0})

In [17]:
model.save_pretrained("./qwen_2.5-selection-llm")
PEFT_MODEL = "Savoxism/InstructionTuning-Qwwn2.5-7B-Selection-LLM"

model.push_to_hub(
    PEFT_MODEL,
)

adapter_model.safetensors:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Savoxism/InstructionTuning-Qwwn2.5-7B-Selection-LLM/commit/dbaa33ab507ad428c5fd6bc60f5cb5ff8d840135', commit_message='Upload model', commit_description='', oid='dbaa33ab507ad428c5fd6bc60f5cb5ff8d840135', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Savoxism/InstructionTuning-Qwwn2.5-7B-Selection-LLM', endpoint='https://huggingface.co', repo_type='model', repo_id='Savoxism/InstructionTuning-Qwwn2.5-7B-Selection-LLM'), pr_revision=None, pr_num=None)

# 4. Inference