# Evaluate QLoRA NER on Test Set (BIO)

This notebook loads the base `Qwen2.5-0.5B` with the trained LoRA adapter, generates BIO tag sequences for test instructions, and computes per-entity TP/FP/FN and precision/recall/F1, plus a seqeval classification report.


In [1]:
%pip install -q evaluate==0.4.3 seqeval==1.2.2
import torch, os, json, re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

print('Torch:', torch.__version__, 'CUDA:', torch.version.cuda, 'is_available:', torch.cuda.is_available())


Note: you may need to restart the kernel to use updated packages.
Torch: 2.5.1+cu121 CUDA: 12.1 is_available: True


In [2]:
model_dir = "models/Qwen2.5-0.5B"
adapter_dir = os.path.join("outputs", "qwen25-0.5b-qlora-ner", "lora_adapter")

tok_path = os.path.join(adapter_dir, "tokenizer")
if os.path.isdir(tok_path):
    tokenizer = AutoTokenizer.from_pretrained(tok_path, use_fast=True, trust_remote_code=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

device_map = "auto"

base_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

try:
    base_model.resize_token_embeddings(len(tokenizer))
except Exception as e:
    print("resize_token_embeddings skipped:", e)

model = PeftModel.from_pretrained(base_model, adapter_dir, is_trainable=False, ignore_mismatched_sizes=True)
model.eval()
print("Loaded base + LoRA from:", model_dir, adapter_dir)


Loaded base + LoRA from: models/Qwen2.5-0.5B outputs\qwen25-0.5b-qlora-ner\lora_adapter


In [3]:
test_path = "outputs/standardized_gretel_pii_masking_en_test.jsonl"

ds = load_dataset("json", data_files=test_path, split="train")

print(ds[0].keys())

texts = [ex["instruction"] for ex in ds]
# Parse per-line BIO tags (one label per line)
labels_true = [re.findall(r"^(?:B|I)-[A-Za-z0-9_]+|^O$", ex["response"], flags=re.M) for ex in ds]

print("Loaded", len(texts), "test examples")


dict_keys(['id', 'source', 'text', 'tokens', 'labels', 'instruction', 'response'])
Loaded 10 test examples


In [4]:
from tqdm.auto import tqdm

gen_kwargs = dict(
    max_new_tokens=512,  # allow enough lines
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

pred_sequences = []

for text in tqdm(texts):
    prompt = text
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
        outputs = model.generate(**inputs, **gen_kwargs)
    gen_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    # Parse one tag per line after "Labels:"; still robust to stray prose
    tags = re.findall(r"^(?:B|I)-[A-Za-z0-9_]+|^O$", gen_text, flags=re.M)
    pred_sequences.append(tags)

print("Example prediction:")
print(pred_sequences[0][:20])


  0%|          | 0/10 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Example prediction:
[]


In [5]:
# Align lengths by truncating/padding with 'O'
true_aligned, pred_aligned = [], []
for ex, (instr, t, p) in enumerate(zip(texts, labels_true, pred_sequences)):
    # Infer the intended token count from the instruction (count lines after 'Tokens:')
    tokens_match = re.split(r"\bTokens:\s*\n", instr)
    expected = None
    if len(tokens_match) > 1:
        # Up to 'Labels:' marker
        token_block = tokens_match[1].split("\n\nLabels:", 1)[0]
        expected = len([ln for ln in token_block.splitlines() if ln.strip() != ""])
    L = expected or max(len(t), len(p))
    t_adj = (t + ["O"] * (L - len(t)))[:L]
    p_adj = (p + ["O"] * (L - len(p)))[:L]
    true_aligned.append(t_adj)
    pred_aligned.append(p_adj)

print("Aligned sequences prepared.")
print("Example true len, pred len:", len(true_aligned[0]), len(pred_aligned[0]))


Aligned sequences prepared.
Example true len, pred len: 81 81


In [6]:
%pip install -q nervaluate pandas
import pandas as pd
from nervaluate import Evaluator

valid = re.compile(r'^(?:(?:B|I)-[A-Za-z0-9_]+|O)$')
true_clean = [[t if valid.match(t) else 'O' for t in seq] for seq in true_aligned]
pred_clean = [[t if valid.match(t) else 'O' for t in seq] for seq in pred_aligned]

entity_types = sorted({t.split('-', 1)[1] for seq in (true_clean + pred_clean) for t in seq if '-' in t})

# Some versions of nervaluate return 3 values: (results, results_by_tag, results_per_doc)
_eval_out = Evaluator(true_clean, pred_clean, tags=entity_types).evaluate()
if isinstance(_eval_out, tuple) and len(_eval_out) >= 2:
    results_overall, results_by_tag = _eval_out[0], _eval_out[1]
else:
    results_overall, results_by_tag = _eval_out, {}

rows = []
for et in entity_types:
    m = results_by_tag.get(et, {}).get('exact', {})
    rows.append({
        'entity': et,
        'TP': m.get('tp', 0),
        'FP': m.get('fp', 0),
        'FN': m.get('fn', 0),
        'precision': m.get('precision', 0.0),
        'recall': m.get('recall', 0.0),
        'f1': m.get('f1', 0.0),
        'mode': 'exact'
    })

agg = results_overall.get('exact', {})
rows.append({'entity': 'OVERALL', 'TP': agg.get('tp', 0), 'FP': agg.get('fp', 0), 'FN': agg.get('fn', 0), 'precision': agg.get('precision', 0.0), 'recall': agg.get('recall', 0.0), 'f1': agg.get('f1', 0.0), 'mode': 'exact'})

out_csv = os.path.join('outputs', 'ner_eval_per_type_exact.csv')
os.makedirs('outputs', exist_ok=True)
pd.DataFrame(rows).to_csv(out_csv, index=False)
print('Saved:', out_csv)


Note: you may need to restart the kernel to use updated packages.
Saved: outputs\ner_eval_per_type_exact.csv


In [7]:
# Sample inference output
idx = 0  # change to inspect another sample

prompt = texts[idx]
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
    outputs = model.generate(**inputs, **gen_kwargs)

gen_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
pred_tokens = re.findall(r"^(?:B|I)-[A-Za-z0-9_]+|^O$", gen_text, flags=re.M)

print("Instruction:\n", prompt)
print("\nGround truth tags:\n", "\n".join(labels_true[idx]))
print("\nModel raw output:\n", gen_text)
print("\nExtracted BIO tags:\n", "\n".join(pred_tokens))

Instruction:
 You will label tokens with BIO tags. Output only the tag after each token. One token per line.

Text:
Transaction details: gasLimit set to 1000000 units by tw_brian740, gasPrice set to 10 Gwei by veronicawood@example.org, contactable at +1-869-341-9301x7005, located at Suite 378, Yolanda Mountain, Burkeberg.

Tokens:
Transaction
Ġdetails
:
Ġgas
Limit
Ġset
Ġto
Ġ
1
0
0
0
0
0
0
Ġunits
Ġby
Ġtw
_b
rian
7
4
0
,
Ġgas
Price
Ġset
Ġto
Ġ
1
0
ĠG
wei
Ġby
Ġver
onic
aw
ood
@example
.org
,
Ġcontact
able
Ġat
Ġ+
1
-
8
6
9
-
3
4
1
-
9
3
0
1
x
7
0
0
5
,
Ġlocated
Ġat
ĠSuite
Ġ
3
7
8
,
ĠY
ol
anda
ĠMountain
,
ĠBurke
berg
.

Labels:

Ground truth tags:
 O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
B-USER_NAME
I-USER_NAME
I-USER_NAME
I-USER_NAME
I-USER_NAME
I-USER_NAME
O
O
O
O
O
O
O
O
O
O
O
B-EMAIL
I-EMAIL
I-EMAIL
I-EMAIL
I-EMAIL
I-EMAIL
O
O
O
O
B-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
I-PHONE_NUMBER
