# Evaluate QLoRA NER on Test Set (BIO)

This notebook loads the base `Qwen2.5-0.5B` with the trained LoRA adapter, generates BIO tag sequences for test instructions, and computes per-entity TP/FP/FN and precision/recall/F1, plus a seqeval classification report.


In [1]:
# Install pinned metric packages; %pip (rather than !pip) installs into the running kernel's environment.
%pip install -q evaluate==0.4.3 seqeval==1.2.2
import torch, os, json, re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Report the torch build, the CUDA version it was built against, and whether CUDA is usable right now.
print('Torch:', torch.__version__, 'CUDA:', torch.version.cuda, 'is_available:', torch.cuda.is_available())


Note: you may need to restart the kernel to use updated packages.
Torch: 2.5.1+cu121 CUDA: 12.1 is_available: True


In [2]:
# Locations of the base checkpoint and of the LoRA adapter to evaluate.
model_dir = "models/Qwen2.5-0.5B"
adapter_dir = os.path.join("outputs", "qwen25-0.5b-qlora-ner", "lora_adapter")

# Prefer a tokenizer stored next to the adapter; otherwise use the base model's tokenizer.
tok_path = os.path.join(adapter_dir, "tokenizer")
tokenizer_source = tok_path if os.path.isdir(tok_path) else model_dir
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True, trust_remote_code=True)

# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Right padding; the original author states this matches training (not verifiable from this notebook).
try:
    tokenizer.padding_side = "right"
except Exception:
    pass

device_map = "auto"
base_model = AutoModelForCausalLM.from_pretrained(model_dir, device_map=device_map, trust_remote_code=True)

# Best effort: make the embedding matrix match the tokenizer size; a failure is reported, not fatal.
try:
    base_model.resize_token_embeddings(len(tokenizer))
except Exception as resize_error:
    print("resize_token_embeddings skipped:", resize_error)

# Attach the adapter for inference only and switch the wrapped model to eval mode.
model = PeftModel.from_pretrained(
    base_model,
    adapter_dir,
    is_trainable=False,
    ignore_mismatched_sizes=True,
)
model.eval()
print("Loaded base + LoRA from:", model_dir, adapter_dir)


Loaded base + LoRA from: models/Qwen2.5-0.5B outputs\qwen25-0.5b-qlora-ner\lora_adapter


In [3]:
def _parse_bio_lines(response):
    """Return, in order, every BIO tag that starts a line of ``response``."""
    return re.findall(r"^(?:B|I)-[A-Za-z0-9_]+|^O$", response, flags=re.M)


test_path = "outputs/standardized_gretel_pii_masking_en_test.jsonl"

ds = load_dataset("json", data_files=test_path, split="train")

print(ds[0].keys())

# Prompts come from 'instruction'; gold tags are parsed from 'response' (one label per line).
texts = [ex["instruction"] for ex in ds]
labels_true = [_parse_bio_lines(ex["response"]) for ex in ds]

print("Loaded", len(texts), "test examples")

# The allowed BIO tag set is taken from the TRAIN ground truth; 'O' is always included.
train_path = "outputs/standardized_gretel_pii_masking_en_train.jsonl"
train_ds = load_dataset("json", data_files=train_path, split="train")
labels_train = [_parse_bio_lines(ex["response"]) for ex in train_ds]
allowed_tags = sorted({tag for seq in labels_train for tag in seq} | {"O"})
# Entity type = the part after the first '-' (so 'O' contributes nothing).
allowed_types = sorted({tag.split('-', 1)[1] for tag in allowed_tags if '-' in tag})

# Minimal few-shot example to anchor line-by-line BIO output.
# The example labels use NAME only when the tag set contains a B-NAME* tag.
_example_labels = (
    "B-NAME\nI-NAME\n" if any(t.startswith("B-NAME") for t in allowed_tags) else "O\nO\n"
)
few_shot_prefix = (
    "Output only one BIO tag per line. Allowed tags: " + ", ".join(allowed_tags) + "\n\n"
    "Example:\n"
    "Tokens:\nJohn\nDoe\n\nLabels:\n"
    + _example_labels
    + "\n"
)


dict_keys(['id', 'source', 'text', 'tokens', 'labels', 'instruction', 'response'])
Loaded 1 test examples


In [4]:
# Instruction header listing the allowed tags (described by the author as the training header).
_allowed_tags_csv = ", ".join(allowed_tags)
tag_header = (
    "You will label tokens with BIO tags.\n"
    "Output only one tag per line (no extra text).\n"
    "Allowed tags: " + _allowed_tags_csv + "\n\n"
)
# Later cells read `few_shot_prefix`, so this assignment replaces any prefix built earlier.
few_shot_prefix = tag_header


In [5]:
# Constrained per-line tag projection (argmax over allowed tags)
from functools import lru_cache
import torch

def build_allowed_tag_texts(allowed_types_list):
    """Return every candidate tag line: ``"O\\n"``, then all ``B-<type>\\n``, then all ``I-<type>\\n``.

    Each entry ends with a newline so it can be scored as one complete output line.
    """
    begin_tags = [f"B-{entity_type}\n" for entity_type in allowed_types_list]
    inside_tags = [f"I-{entity_type}\n" for entity_type in allowed_types_list]
    return ["O\n", *begin_tags, *inside_tags]

# Candidate tag lines (each newline-terminated) that choose_tags_argmax scores for every output line.
allowed_tag_texts = build_allowed_tag_texts(allowed_types)

@lru_cache(maxsize=None)
def _cand_ids_cached(text: str):
    """Tokenize ``text`` without special tokens and return its ``input_ids``.

    Results are memoized per distinct string, so the returned object is shared
    between calls and should be treated as read-only.
    """
    encoding = tokenizer(text, add_special_tokens=False)
    return encoding.input_ids

@torch.inference_mode()
def choose_tags_argmax(prompt_text: str, num_lines: int):
    """Greedily pick, line by line, the allowed tag with the highest log-probability.

    For each of ``num_lines`` output lines, every entry of ``allowed_tag_texts`` is
    appended to the prompt plus the tags chosen so far, the model is run once, and
    the candidate is scored by the sum of the log-probabilities of its own tokens.
    The best candidate is committed before moving on to the next line.

    Args:
        prompt_text: Full prompt; it must tokenize to at least one token.
        num_lines: Number of tag lines to produce.

    Returns:
        A list of ``num_lines`` tag strings with the trailing newline stripped.

    Raises:
        ValueError: If the prompt tokenizes to zero tokens (there would be no
            position from which to predict the first candidate token).
        RuntimeError: If no candidate yields a usable score for some line.
    """
    prompt_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].to(model.device)
    if prompt_ids.numel() == 0:
        raise ValueError("prompt_text tokenized to zero tokens; cannot score candidates")

    chosen_ids = torch.tensor([], dtype=prompt_ids.dtype, device=model.device)
    chosen_tags = []

    # Candidate token ids do not depend on the line being decoded: build them once.
    candidates = [
        (cand, torch.tensor(_cand_ids_cached(cand), dtype=prompt_ids.dtype, device=model.device))
        for cand in allowed_tag_texts
    ]

    for _ in range(num_lines):
        context_ids = torch.cat([prompt_ids, chosen_ids], dim=0)
        best_score = float("-inf")
        best_tag = None
        best_ids = None

        # One forward pass per candidate.
        for cand, cand_ids in candidates:
            n_cand = cand_ids.shape[0]
            if n_cand == 0:
                continue
            input_ids = torch.cat([context_ids, cand_ids], dim=0).unsqueeze(0)
            logits = model(input_ids=input_ids).logits[0]

            # The token at position p is predicted by the logits at position p - 1,
            # so only the n_cand rows starting at (start - 1) are needed. Slicing
            # before log_softmax avoids normalising the whole (seq_len, vocab) matrix.
            start = input_ids.shape[-1] - n_cand
            step_logits = logits[start - 1 : start - 1 + n_cand].float()
            step_log_probs = torch.log_softmax(step_logits, dim=-1)
            score = step_log_probs.gather(1, cand_ids.unsqueeze(1)).sum().item()

            if score > best_score:
                best_score = score
                best_tag = cand
                best_ids = cand_ids

        if best_ids is None:
            raise RuntimeError("no candidate tag produced a finite score")

        # Commit the winning tag so later lines are conditioned on it.
        chosen_ids = torch.cat([chosen_ids, best_ids], dim=0)
        chosen_tags.append(best_tag.strip())

    return chosen_tags



In [6]:
# Helpers for robust BIO tag normalization and extraction
import string

def _levenshtein(a: str, b: str) -> int:
    la, lb = len(a), len(b)
    if la == 0:
        return lb
    if lb == 0:
        return la
    dp = list(range(lb + 1))
    for i in range(1, la + 1):
        prev = dp[0]
        dp[0] = i
        ca = a[i - 1]
        for j in range(1, lb + 1):
            temp = dp[j]
            cost = 0 if ca == b[j - 1] else 1
            dp[j] = min(
                dp[j] + 1,       # deletion
                dp[j - 1] + 1,   # insertion
                prev + cost      # substitution
            )
            prev = temp
    return dp[lb]

def _normalize_type_token(s: str) -> str:
    s = s.strip().replace(" ", "_").replace("-", "_").upper()
    s = re.sub(r"[^A-Z0-9_]", "", s)
    s = re.sub(r"_+", "_", s)
    return s.strip("_")

def _best_allowed_type(candidate: str, allowed_types_list) -> str | None:
    if not candidate:
        return None
    cand = _normalize_type_token(candidate)
    if cand in allowed_types_list:
        return cand
    # quick substring heuristic
    cand_flat = cand.replace("_", "")
    for t in allowed_types_list:
        tf = t.replace("_", "")
        if cand_flat in tf or tf in cand_flat:
            return t
    # Levenshtein-based fallback
    best_t, best_d = None, 10**9
    for t in allowed_types_list:
        d = _levenshtein(cand_flat, t.replace("_", ""))
        if d < best_d:
            best_d, best_t = d, t
    # accept if reasonably close
    if best_t is not None and best_d <= max(2, len(best_t)//3):
        return best_t
    return None

def extract_bio_tags(text: str, allowed_types_list):
    """Parse generated text into one BIO tag per non-empty line.

    A line of the form ``B``/``I`` + optional separators + type (case-insensitive)
    is resolved through ``_best_allowed_type``; if no allowed type matches, the
    line becomes ``"O"``. Every other non-empty line (a literal ``O`` or anything
    unrecognised) also becomes ``"O"``. Blank lines are skipped.
    """
    # Tolerates whitespace, '-', '_' or ':' between the prefix and the type.
    prefixed_tag = re.compile(r"^(B|I)[\s\-_:]*([A-Za-z0-9_]+)$", flags=re.IGNORECASE)

    extracted = []
    for raw_line in text.splitlines():
        line = raw_line.strip().strip(",;:\u202f \t\r\n")
        if not line:
            continue

        match = prefixed_tag.match(line)
        if match is None:
            # Either an explicit "O" or unrecognised text: both are tagged as outside.
            extracted.append("O")
            continue

        prefix = match.group(1).upper()
        entity_type = _best_allowed_type(match.group(2), allowed_types_list)
        if entity_type:
            extracted.append(f"{prefix}-{entity_type}")
        else:
            extracted.append("O")
    return extracted



In [7]:
from tqdm.auto import tqdm

# Greedy decoding settings shared by every generate() call in this notebook.
gen_kwargs = dict(
    max_new_tokens=1024,  # allow enough lines
    min_new_tokens=4,     # enforce at least a few tokens
    do_sample=False,
    num_beams=1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# torch.cpu.amp.autocast is deprecated; torch.amp.autocast takes the device type
# as its first argument, so one call form covers both CUDA and CPU.
autocast_device = "cuda" if torch.cuda.is_available() else "cpu"

pred_sequences = []

for text in tqdm(texts):
    # A newline is appended only when the instruction does NOT already end with 'Labels:'.
    prompt = text if text.rstrip().endswith('Labels:') else text + '\n'
    prompt = few_shot_prefix + prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    autocast_ctx = torch.amp.autocast(autocast_device, dtype=torch.bfloat16)
    with torch.no_grad(), autocast_ctx:
        outputs = model.generate(**inputs, **gen_kwargs)
    # Decode only the newly generated continuation, not the echoed prompt.
    gen_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()
    tags = extract_bio_tags(gen_text, allowed_types)
    pred_sequences.append(tags)

# Keep the example print below from failing when there were no test examples.
if len(pred_sequences) == 0:
    pred_sequences = [[]]

print("Example prediction:")
print(pred_sequences[0][:20])


  0%|          | 0/1 [00:00<?, ?it/s]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Example prediction:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [8]:
# Align gold and predicted tag sequences to a common length by padding with 'O' or truncating.
true_aligned, pred_aligned = [], []
for instr, t, p in zip(texts, labels_true, pred_sequences):
    # Infer the intended token count from the instruction (count lines after 'Tokens:')
    tokens_match = re.split(r"\bTokens:\s*\n", instr)
    expected = None
    if len(tokens_match) > 1:
        # Up to 'Labels:' marker
        token_block = tokens_match[1].split("\n\nLabels:", 1)[0]
        expected = len([ln for ln in token_block.splitlines() if ln.strip() != ""])
    # Fall back to the longer of the two sequences when no (non-zero) count could be parsed.
    target_len = expected or max(len(t), len(p))
    t_adj = (t + ["O"] * (target_len - len(t)))[:target_len]
    p_adj = (p + ["O"] * (target_len - len(p)))[:target_len]
    true_aligned.append(t_adj)
    pred_aligned.append(p_adj)

print("Aligned sequences prepared.")
# Guarded so an empty test set does not raise IndexError here.
if true_aligned:
    print("Example true len, pred len:", len(true_aligned[0]), len(pred_aligned[0]))


Aligned sequences prepared.
Example true len, pred len: 81 81


In [9]:
%pip install -q nervaluate pandas
import pandas as pd
from nervaluate import Evaluator

# Keep only well-formed BIO tags; anything else is treated as outside ('O').
valid = re.compile(r'^(?:(?:B|I)-[A-Za-z0-9_]+|O)$')
true_clean = [[t if valid.match(t) else 'O' for t in seq] for seq in true_aligned]
pred_clean = [[t if valid.match(t) else 'O' for t in seq] for seq in pred_aligned]

# Entity types seen in either the gold or the predicted sequences.
entity_types = sorted({t.split('-', 1)[1] for seq in (true_clean + pred_clean) for t in seq if '-' in t})


def _metric(result, key, default=0):
    """Read ``key`` from a nervaluate result, whether it is a mapping or an attribute-style object."""
    if isinstance(result, dict):
        return result.get(key, default)
    return getattr(result, key, default)


def _exact_row(entity, schemes):
    """Build one report row from the 'exact' scheme of a nervaluate result.

    nervaluate reports ``correct`` / ``actual`` / ``possible`` rather than
    tp/fp/fn, where precision = correct / actual and recall = correct / possible.
    Hence TP = correct, FP = actual - correct, FN = possible - correct.
    """
    exact = _metric(schemes, 'exact', None)
    if exact is None:
        exact = {}
    correct = _metric(exact, 'correct', 0)
    actual = _metric(exact, 'actual', 0)
    possible = _metric(exact, 'possible', 0)
    return {
        'entity': entity,
        'TP': correct,
        'FP': max(actual - correct, 0),
        'FN': max(possible - correct, 0),
        'precision': _metric(exact, 'precision', 0.0),
        'recall': _metric(exact, 'recall', 0.0),
        'f1': _metric(exact, 'f1', 0.0),
        'mode': 'exact',
    }


# The inputs are nested lists of BIO strings, which nervaluate reads with loader="list".
_eval_out = Evaluator(true_clean, pred_clean, tags=entity_types, loader="list").evaluate()
if isinstance(_eval_out, tuple) and len(_eval_out) >= 2:
    # Tuple-returning versions: (results, results_by_tag, ...)
    results_overall, results_by_tag = _eval_out[0], _eval_out[1]
elif isinstance(_eval_out, dict) and 'overall' in _eval_out:
    # NOTE(review): assumed layout for dict-returning versions ('overall' / 'entities') - confirm against the installed release.
    results_overall = _eval_out['overall']
    results_by_tag = _eval_out.get('entities', {})
else:
    results_overall, results_by_tag = _eval_out, {}

# Fail loudly rather than silently writing an all-zero report.
if _metric(results_overall, 'exact', None) is None:
    raise RuntimeError("Unrecognised nervaluate result layout: no 'exact' scheme in the overall results")

rows = [_exact_row(et, _metric(results_by_tag, et, {})) for et in entity_types]
rows.append(_exact_row('OVERALL', results_overall))

out_csv = os.path.join('outputs', 'ner_eval_per_type_exact.csv')
os.makedirs('outputs', exist_ok=True)
pd.DataFrame(rows).to_csv(out_csv, index=False)
print('Saved:', out_csv)


Note: you may need to restart the kernel to use updated packages.
Saved: outputs\ner_eval_per_type_exact.csv


In [None]:
# Sample inference output
idx = 0  # change to inspect another sample

instruction = texts[idx]
# Same prompt rule as the evaluation loop: a newline is appended only when the
# instruction does NOT already end with the 'Labels:' marker.
prompt = instruction if instruction.rstrip().endswith('Labels:') else instruction + '\n'
prompt = few_shot_prefix + prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# torch.cpu.amp.autocast is deprecated; torch.amp.autocast takes the device type directly.
autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
autocast_ctx = torch.amp.autocast(autocast_device, dtype=torch.bfloat16)
with torch.no_grad(), autocast_ctx:
    outputs = model.generate(**inputs, **gen_kwargs)

gen_ids = outputs[0][inputs.input_ids.shape[-1]:]
gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
if len(gen_text) == 0:
    # Greedy decoding produced nothing visible: retry once with sampling (not seeded, so not reproducible).
    with torch.no_grad(), autocast_ctx:
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    gen_ids = outputs[0][inputs.input_ids.shape[-1]:]
    gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Try projected tags first.
# The token count is parsed from the raw instruction rather than the prefixed
# prompt, so a prefix that itself contains a 'Tokens:' block cannot be counted by mistake.
proj_L = 0
m_tokens = re.split(r"\bTokens:\s*\n", instruction)
if len(m_tokens) > 1:
    token_block = m_tokens[1].split("\n\nLabels:", 1)[0]
    proj_L = len([ln for ln in token_block.splitlines() if ln.strip() != ""])
if proj_L > 0:
    # Constrained decoding: one allowed tag per input token.
    pred_tokens = choose_tags_argmax(prompt, proj_L)
else:
    # No token block found: fall back to parsing the free-form generation.
    pred_tokens = extract_bio_tags(gen_text, allowed_types)

# print("Instruction:\n", prompt)
# print("\nGround truth tags:\n", "\n".join(labels_true[idx]))
print("\nModel raw output (len=", len(gen_text), "):\n", gen_text)
print("\nGenerated token ids (first 20):", gen_ids[:20].tolist())
print("\nExtracted BIO tags (n=", len(pred_tokens), "):\n", "\n".join(pred_tokens))