# QLoRA with Bitsandbytes on Intel GPU

## 1. Sanity check PyTorch version and XPU devices

In [1]:
import torch

# Report the installed PyTorch build, then one line per XPU device PyTorch can see.
print(torch.__version__)
for device_index in range(torch.xpu.device_count()):
    print(f'[{device_index}]: {torch.xpu.get_device_properties(device_index)}')

2.6.0+cu124


## 2. Import necessary packages

In [None]:
import torch
import os

# Set before `transformers` is imported below. Presumably this turns off
# Weights & Biases logging in the trainer -- confirm against the installed
# transformers version.
os.environ["WANDB_DISABLED"] = "true"
import transformers
from transformers import AutoTokenizer
from peft import LoraConfig
from transformers import AutoModelForCausalLM
from peft import get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

## 3. Load the model

- Load the tokenizer, the dataset (tokenized with the chat template), and the model

In [None]:
# Hub id of the base checkpoint; the tokenizer and the model are both loaded from it.
model_path = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# "everyday-conversations" configuration of the smoltalk dataset.
ds = load_dataset(
    "HuggingFaceTB/smoltalk",
    "everyday-conversations",
)

def process_dataset(sample):
    """Tokenize one dataset row with the tokenizer's chat template.

    Args:
        sample: A dataset row; only its ``"messages"`` entry is read.

    Returns:
        The result of ``tokenizer.apply_chat_template`` called with
        ``tokenize=True`` and ``return_dict=True`` on the row's messages.
    """
    return tokenizer.apply_chat_template(
        sample["messages"],
        tokenize=True,
        return_dict=True,
    )

# Apply the chat-template tokenization to every row of every split.
ds = ds.map(process_dataset)

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

## 4. Setup LoRA config

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
    # Modules listed here are trained and saved in full alongside the adapter.
    # Names are matched against the model's module names, so the embedding
    # layer must be spelled "embed_tokens"; "embed_token" matches nothing.
    modules_to_save=["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM",
)

## 5. Run the SFTTrainer

In [None]:
# Output directory for checkpoints written during training.
finetuned_model = "Qwen2.5-0.5B-SFT"

# Release cached XPU memory before training starts.
if torch.xpu.is_available():
    torch.xpu.empty_cache()

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

training_args = SFTConfig(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_steps=20,
    max_steps=200,
    learning_rate=2e-5,
    save_steps=100,
    bf16=True,  # bf16 is more stable in training
    logging_steps=20,
    output_dir=finetuned_model,
    optim="adamw_torch",  # paged_adamw_8bit is not supported yet
    # The string "none" disables all logging integrations; passing None
    # falls back to the library default instead of disabling reporting.
    report_to="none",
    gradient_checkpointing=True,  # can further reduce memory but slower
)

trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    # Without this the LoRA configuration defined earlier is never applied
    # and the trainer updates every weight of the base model.
    peft_config=peft_config,
)
result = trainer.train()
print(result)

## 6. Inference finetuned model

* Run cell #2 first to import the necessary packages.

In [None]:
# Repeated here so this cell runs after a kernel restart with only the
# imports cell re-executed; must match the training output directory.
finetuned_model = "Qwen2.5-0.5B-SFT"
# Training saves every 100 steps and stops at step 200, so this is the last checkpoint.
finetuned_model_path = f"{finetuned_model}/checkpoint-200"

tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
loaded_model = AutoModelForCausalLM.from_pretrained(finetuned_model_path, device_map="auto")

prompt = "Once upon a time, a little girl"
# Move the inputs to wherever device_map="auto" put the model instead of
# hard-coding "xpu", and keep the attention mask together with the input ids.
inputs = tokenizer(prompt, return_tensors="pt").to(loaded_model.device)
output = loaded_model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    use_cache=True,  # training turned the KV cache off; turn it back on for generation
)
print(tokenizer.batch_decode(output, skip_special_tokens=True))