# QLoRA SFT for Instruction-style NER on Qwen2.5-0.5B

This notebook fine-tunes the locally downloaded `Qwen2.5-0.5B` with PEFT LoRA in 4-bit (QLoRA) for instruction-style NER via causal language modeling. The dataset is the standardized JSONL with `instruction` and `response` fields; training text is `<instruction>\n<response>`. The LoRA adapter is saved separately for later merging or on-device inference.


In [None]:
%pip install --upgrade -q pip wheel setuptools
%pip install -q torch --index-url https://download.pytorch.org/whl/cu124
%pip install -q transformers==4.45.2 datasets==3.0.1 peft==0.13.2 accelerate==1.0.1 trl==0.11.4 evaluate==0.4.3 scipy==1.13.1

import torch
print('Torch:', torch.__version__, 'CUDA:', torch.version.cuda, 'is_available:', torch.cuda.is_available())


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.


Note: you may need to restart the kernel to use updated packages.
Torch: 2.5.1+cu121 CUDA: 12.1 is_available: True


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_dir = "models/Qwen2.5-0.5B"  # local path

tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,
)

model.gradient_checkpointing_enable()
model.config.use_cache = False
model.resize_token_embeddings(len(tokenizer))

print("Loaded model and tokenizer from", model_dir)


Loaded model and tokenizer from models/Qwen2.5-0.5B


In [3]:
from peft import LoraConfig, get_peft_model, TaskType

lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_targets,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 8,798,208 || all params: 502,588,160 || trainable%: 1.7506


In [4]:
from datasets import load_dataset, DatasetDict

train_path = "outputs/standardized_gretel_pii_masking_en_train.jsonl"
val_path = "outputs/standardized_gretel_pii_masking_en_validation.jsonl"
test_path = "outputs/standardized_gretel_pii_masking_en_test.jsonl"

datasets = DatasetDict({
    "train": load_dataset("json", data_files=train_path, split="train"),
    "validation": load_dataset("json", data_files=val_path, split="train"),
    "test": load_dataset("json", data_files=test_path, split="train"),
})

concat_key = "text"
for split in ["train", "validation", "test"]:
    datasets[split] = datasets[split].map(
        lambda ex: {concat_key: ex["instruction"] + "\n" + ex["response"]},
        remove_columns=datasets[split].column_names,
    )

print(datasets)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 20
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10
    })
    test: Dataset({
        features: ['text'],
        num_rows: 10
    })
})


In [5]:
# DEBUG PREVIEW — raw instruction/response and concatenated text
import json
from itertools import islice

print("Train file:", train_path)

def peek_jsonl(path, n=2):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in islice(f, n):
            try:
                rows.append(json.loads(line))
            except Exception as e:
                print("Parse error:", e)
    return rows

samples = peek_jsonl(train_path, n=2)
print(f"Loaded {len(samples)} sample(s) from JSONL for preview\n")

for i, ex in enumerate(samples):
    instr = ex.get("instruction", "")
    resp = ex.get("response", "")
    concat_text = instr + "\n" + resp
    print(f"--- Sample {i} ---")
    print("instruction (first 200 chars):\n", instr[:200].replace("\n", "\\n"))
    print("\nresponse (first 200 chars):\n", resp[:200].replace("\n", "\\n"))
    print("\nconcat length:", len(concat_text))
    print()



Train file: outputs/standardized_gretel_pii_masking_en_train.jsonl
Loaded 2 sample(s) from JSONL for preview

--- Sample 0 ---
instruction (first 200 chars):
 You will label tokens with BIO tags. Output only the tag after each token. One token per line.\n\nText:\n**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoptio

response (first 200 chars):
 O\nO\nO\nO\nO\nO\nO\nO\nO\nB-COUNTRY\nI-COUNTRY\nI-COUNTRY\nO\nO\nO\nO\nO\nO\nO\nO\nO\nB-NAME\nI-NAME\nI-NAME\nI-NAME\nI-NAME\nO\nO\nO\nO\nB-DATE_OF_BIRTH\nI-DATE_OF_BIRTH\nI-DATE_OF_BIRTH\nI-DATE_OF_BIRTH\nI-DATE_OF_BIRTH\nI-DATE_OF_B

concat length: 1134

--- Sample 1 ---
instruction (first 200 chars):
 You will label tokens with BIO tags. Output only the tag after each token. One token per line.\n\nText:\n**Account Closure Form**\n\n**Account Holder:**\n- **Name:** Zashil Tripathi\n- **Date of Birth:** 200

response (first 200 chars):
 O\nO\nO\nO\nO\nO\nO\nO\nO\nO\nO\nO\nO\nO\nB-NAME\nI-NAME\nI-N

In [6]:
# DEBUG PREVIEW — tokenization and collator mini-batch
from transformers import DataCollatorForLanguageModeling

# Use local defaults if globals not defined yet
dbg_max_length = max_length if 'max_length' in globals() else 1024

# Take first 2 concatenated examples from datasets["train"] before tokenization
raw_preview = datasets["train"].select(range(min(2, len(datasets["train"]))))
print("Raw concatenated examples (first 2):\n")
for i in range(len(raw_preview)):
    txt = raw_preview[i]["text"]
    print(f"--- Example {i} length={len(txt)} ---")
    print(txt[:300].replace("\n", "\\n"))
    print()

# Tokenize those two directly to inspect ids/tokens
_tok = tokenizer(list(raw_preview["text"]), truncation=True, max_length=dbg_max_length, padding=False, return_attention_mask=True)
print("Tokenized preview keys:", list(_tok.keys()))
print("input_ids lens:", [len(x) for x in _tok["input_ids"]])

# Show first 30 tokens and ids for example 0
ids0 = _tok["input_ids"][0]
print("\nFirst 30 token ids (ex0):", ids0[:30])
print("First 30 tokens (ex0):", tokenizer.convert_ids_to_tokens(ids0[:30]))

# Build a mini-batch using a local collator (pads and creates labels)
collator_local = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

batch_features = []
for i in range(len(_tok["input_ids"])):
    batch_features.append({
        "input_ids": _tok["input_ids"][i],
        "attention_mask": _tok["attention_mask"][i],
    })

batch = collator_local(batch_features)
print("\nCollated batch tensors:")
for k, v in batch.items():
    try:
        print(k, v.shape, v.dtype)
    except Exception:
        print(k, type(v))

# Show a short slice of labels to confirm LM objective alignment
print("\nlabels slice ex0:", batch["labels"][0][:30].tolist())



Raw concatenated examples (first 2):

--- Example 0 length=1134 ---
You will label tokens with BIO tags. Output only the tag after each token. One token per line.\n\nText:\n**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-

--- Example 1 length=1303 ---
You will label tokens with BIO tags. Output only the tag after each token. One token per line.\n\nText:\n**Account Closure Form**\n\n**Account Holder:**\n- **Name:** Zashil Tripathi\n- **Date of Birth:** 2008-11-18\n- **SSN:** 819-46-1268\n\n**Account Details:**\n- **Account Number:** G42778036994\n\n**Reason fo

Tokenized preview keys: ['input_ids', 'attention_mask']
input_ids lens: [515, 652]

First 30 token ids (ex0): [2610, 686, 2383, 11211, 448, 72066, 9492, 13, 9258, 1172, 279, 4772, 1283, 1817, 3950, 13, 3776, 3950, 817, 1555, 382, 1178, 510, 334, 28814, 6578, 62357, 82023, 1019, 28

In [7]:
from transformers import DataCollatorForLanguageModeling

max_length = 1024

tokenizer.padding_side = "right"

def tokenize_function(batch):
    out = tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
        padding=False,
        return_attention_mask=True,
    )
    return out

with torch.autocast("cuda", dtype=torch.bfloat16):
    tokenized = {}
    for split in ["train", "validation", "test"]:
        tokenized[split] = datasets[split].map(
            tokenize_function,
            batched=True,
            remove_columns=datasets[split].column_names,
            desc=f"Tokenizing {split}",
        )

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)


Tokenizing train:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing validation:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenizing test:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
import math, os

run_name = "qwen25-0.5b-qlora-ner"
output_dir = os.path.join("outputs", run_name)

per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
num_train_epochs = 3
learning_rate = 2e-4
warmup_ratio = 0.03
weight_decay = 0.0
logging_steps = 10
eval_steps = 200
save_steps = 200
max_grad_norm = 0.3

training_args = TrainingArguments(
    output_dir=output_dir,
    run_name=run_name,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    evaluation_strategy="steps",
    logging_steps=logging_steps,
    eval_steps=eval_steps,
    save_steps=save_steps,
    save_total_limit=2,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    lr_scheduler_type="cosine",
    fp16=False,
    bf16=True,
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    max_grad_norm=max_grad_norm,
    torch_compile=False,
    report_to=["none"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=collator,
)

print("Train examples:", len(tokenized["train"]))
print("Eval examples:", len(tokenized["validation"]))


Train examples: 20
Eval examples: 10




In [9]:
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print("Training complete.")


  0%|          | 0/3 [00:00<?, ?it/s]



{'train_runtime': 9.81, 'train_samples_per_second': 6.116, 'train_steps_per_second': 0.306, 'train_loss': 1.1724990208943684, 'epoch': 2.4}
***** train metrics *****
  epoch                    =        2.4
  total_flos               =    73963GF
  train_loss               =     1.1725
  train_runtime            = 0:00:09.81
  train_samples_per_second =      6.116
  train_steps_per_second   =      0.306
Training complete.


In [10]:
from peft import PeftModel

adapter_dir = os.path.join(output_dir, "lora_adapter")
os.makedirs(adapter_dir, exist_ok=True)

model.save_pretrained(adapter_dir)

tokenizer.save_pretrained(os.path.join(adapter_dir, "tokenizer"))

print("Saved LoRA adapter to:", adapter_dir)

# Example: how to load later for inference without altering base model
# base_model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True, quantization_config=bnb_config)
# peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
# peft_model.eval()




Saved LoRA adapter to: outputs\qwen25-0.5b-qlora-ner\lora_adapter
