# Gretel PII Masking (English) — Load local Parquet dataset

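This notebook loads the Gretel PII Masking English dataset from local Parquet files in the `dataset` folder, shows the dataset structure, and displays one example for quick understanding. It then normalizes the entity annotations, locates each entity as a character span in the text, aligns those spans to BIO labels over Qwen tokenizer tokens, and writes one instruction/response JSONL file per split to the `outputs` folder.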


In [1]:
from datasets import load_dataset

# Local Parquet shards, one per split (train/validation/test),
# all read from the `dataset/` folder next to this notebook.
data_files = dict(
    train="dataset/train-00000-of-00001.parquet",
    validation="dataset/validation-00000-of-00001.parquet",
    test="dataset/test-00000-of-00001.parquet",
)

print("Loading local Parquet dataset...")
ds = load_dataset("parquet", data_files=data_files)

# Report the loaded splits with their row counts and column names.
print("Loaded splits:", list(ds.keys()))
for name in ds:
    split = ds[name]
    print(f"- {name}: {split.num_rows} rows; columns={split.column_names}")

# Sanity check on the first training record: truncate both fields and
# flatten newlines in the text so the preview stays on one line.
train_sample = ds["train"][0]
sample_text = train_sample.get("text") or ""
sample_entities = str(train_sample.get("entities"))
print("\n[Sample train] text[:200]:", sample_text[:200].replace("\n", " "))
print("[Sample train] entities (raw) snippet:", sample_entities[:120])

# Keep the dataset object as the cell's last expression so its summary is displayed.
ds


Loading local Parquet dataset...
Loaded splits: ['train', 'validation', 'test']
- train: 50000 rows; columns=['uid', 'domain', 'document_type', 'document_description', 'entities', 'text']
- validation: 5000 rows; columns=['uid', 'domain', 'document_type', 'document_description', 'entities', 'text']
- test: 5000 rows; columns=['uid', 'domain', 'document_type', 'document_description', 'entities', 'text']

[Sample train] text[:200]: **ADOPTION CERTIFICATE** Issued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12
[Sample train] entities (raw) snippet: [{'entity': 'Guernsey', 'types': ['country']}, {'entity': 'Urvashi Jaggi', 'types': ['name']}, {'entity': '2015-07-26', 


DatasetDict({
    train: Dataset({
        features: ['uid', 'domain', 'document_type', 'document_description', 'entities', 'text'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['uid', 'domain', 'document_type', 'document_description', 'entities', 'text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['uid', 'domain', 'document_type', 'document_description', 'entities', 'text'],
        num_rows: 5000
    })
})

In [2]:
# Show one example (pretty): uid, a one-line text preview, and the raw entities field.
example = ds["train"][0]
text_preview = (example.get("text") or "")[:200].replace("\n", " ")
entities_preview = str(example.get("entities"))[:200]
for caption, shown in (
    ("UID:", example.get("uid")),
    ("Text preview:", text_preview),
    ("Entities (raw):", entities_preview),
):
    print(caption, shown)


UID: 24bb7570700746458ede975349077acb
Text preview: **ADOPTION CERTIFICATE** Issued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12
Entities (raw): [{'entity': 'Guernsey', 'types': ['country']}, {'entity': 'Urvashi Jaggi', 'types': ['name']}, {'entity': '2015-07-26', 'types': ['date_of_birth']}, {'entity': 'UID-PRWBO4TB', 'types': ['unique_identi


In [3]:
# STEP 2 — Normalize entity structure (simple)
# - Expect `text` and `entities` per row
# - If `entities` is a JSON string, parse to Python list[dict]
# - Keep only {"entity": <text>, "label": <UPPER>}

import json
import ast


def _first_non_empty(seq):
    for item in seq:
        if item is not None and str(item).strip() != "":
            return item
    return None


def parse_entities(entities):
    """Normalize a raw ``entities`` value into ``[{"entity": str, "label": STR}, ...]``.

    Accepts ``None``, a string (decoded as JSON, then as a Python literal),
    a single dict, or a list. Anything else, or a string that cannot be
    decoded, yields an empty list. Within the list, only dict items are kept,
    and only when both an entity text and a label are present and truthy.
    The label is upper-cased; a list/tuple label is reduced to its first
    non-empty element via ``_first_non_empty``.
    """
    if entities is None:
        return []

    if isinstance(entities, str):
        # Best-effort decoding: strict JSON first, then Python-literal syntax
        # (e.g. single-quoted reprs). Give up quietly if neither works.
        for decode in (json.loads, ast.literal_eval):
            try:
                entities = decode(entities)
                break
            except Exception:
                continue
        else:
            return []

    if isinstance(entities, dict):
        entities = [entities]
    elif not isinstance(entities, list):
        return []

    # Candidate keys, checked in order; the first truthy value wins.
    text_keys = ("entity", "text", "span", "value")
    label_keys = ("label", "type", "types", "tag", "category")

    cleaned = []
    for record in entities:
        if not isinstance(record, dict):
            continue
        ent_text = next((found for found in map(record.get, text_keys) if found), None)
        raw_label = next((found for found in map(record.get, label_keys) if found), None)
        if isinstance(raw_label, (list, tuple)):
            raw_label = _first_non_empty(raw_label)
        if ent_text and raw_label:
            cleaned.append({"entity": str(ent_text), "label": str(raw_label).upper()})
    return cleaned


# Build, for every split, a plain Python list of {"text", "entities"} rows
# with entities normalized by parse_entities.
normalized = {}
for split_name, split in ds.items():
    print(f"Normalizing entities for split: {split_name} ...")
    # Iterate the split directly instead of indexing it with range(num_rows):
    # each iteration yields one row dict, so no per-index lookup is needed.
    rows = [
        {
            "text": row.get("text") or "",  # missing/None text becomes ""
            "entities": parse_entities(row.get("entities")),
        }
        for row in split
    ]
    normalized[split_name] = rows
    has_ents = sum(1 for r in rows if r["entities"])
    print(f"- {split_name}: {len(rows)} rows; with_entities={has_ents}; example:", rows[0] if rows else None)

# Quick peek of one normalized example (None when the train split is absent or empty)
train_rows = normalized.get("train")
print("\nNormalized train example:", train_rows[0] if train_rows else None)


Normalizing entities for split: train ...
- train: 50000 rows; with_entities=50000; example: {'text': '**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12-15. Unique identifier: UID-PRWBO4TB.', 'entities': [{'entity': 'Guernsey', 'label': 'COUNTRY'}, {'entity': 'Urvashi Jaggi', 'label': 'NAME'}, {'entity': '2015-07-26', 'label': 'DATE_OF_BIRTH'}, {'entity': 'UID-PRWBO4TB', 'label': 'UNIQUE_IDENTIFIER'}]}
Normalizing entities for split: validation ...
- validation: 5000 rows; with_entities=5000; example: {'text': 'Patient Name: Alec Stafford, DOB: 1942-08-10, Medical Record Number: P4076208', 'entities': [{'entity': 'Alec', 'label': 'FIRST_NAME'}, {'entity': 'Stafford', 'label': 'LAST_NAME'}, {'entity': '1942-08-10', 'label': 'DATE_OF_BIRTH'}, {'entity': 'P4076208', 'label': 'MEDICAL_RECORD_NUMBER'}]}
Normalizing entities for split: test .

In [4]:
# STEP 3 — Build simple character spans by substring match (case-insensitive)
# - Replace newlines for stable indices
# - Skip entities we can't find

import re
import uuid

# For every normalized row, locate each entity's first case-insensitive
# occurrence in the text and record it as a {"start", "end", "label"} span.
spanned = {}
for split_name, rows in normalized.items():
    print(f"Building spans for split: {split_name} ...")
    out_rows = []
    for row in rows:
        original_text = row.get("text") or ""
        # Each "\n" / "\r" is swapped for exactly one space, so the text length
        # and every character index are unchanged by this cleanup.
        clean_text = original_text.replace("\n", " ").replace("\r", " ")

        spans = []
        for ent in row.get("entities", []):
            # Trim whitespace and surrounding punctuation from the entity value.
            value = (ent.get("entity") or "").strip().strip(".,;:!?()[]{}")
            label = str(ent.get("label") or "").upper()
            if not value or not label:
                continue
            # Search clean_text itself rather than a lowercased copy:
            # str.lower() can change the length of some Unicode strings, which
            # would make offsets found in the lowered text point at the wrong
            # characters of clean_text. A case-insensitive regex on the escaped
            # value returns offsets that are valid for clean_text by construction.
            match = re.search(re.escape(value), clean_text, flags=re.IGNORECASE)
            if match is None:
                # Entity text not present in this row: skip it.
                continue
            spans.append({"start": match.start(), "end": match.end(), "label": label})

        out_rows.append({
            "id": str(uuid.uuid4()),
            "source": "gretel_pii_masking_en",
            "text": original_text,
            "clean_text": clean_text,
            "spans": spans,
        })
    spanned[split_name] = out_rows
    print(f"- {split_name}: {len(out_rows)} rows; example spans count:", len(out_rows[0]["spans"]) if out_rows else 0)

# Peek one example
print("\nSpanned train example:", spanned.get("train", [])[0] if spanned.get("train") else None)


Building spans for split: train ...
- train: 50000 rows; example spans count: 4
Building spans for split: validation ...
- validation: 5000 rows; example spans count: 4
Building spans for split: test ...
- test: 5000 rows; example spans count: 4

Spanned train example: {'id': '6e85c290-ad9b-4c35-bd80-afb1d80c3e78', 'source': 'gretel_pii_masking_en', 'text': '**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12-15. Unique identifier: UID-PRWBO4TB.', 'clean_text': '**ADOPTION CERTIFICATE** Issued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12-15. Unique identifier: UID-PRWBO4TB.', 'spans': [{'start': 35, 'end': 43, 'label': 'COUNTRY'}, {'start': 103, 'end': 116, 'label': 'NAME'}, {'start': 126, 'end': 136, 'label': 'DATE_

In [5]:
# STEP 4 — Tokenize and align spans to BIO labels
# - Use the local Qwen tokenizer for consistency with later training
# - Align BIO tags from character spans via offset mapping

from transformers import AutoTokenizer

tokenizer_path = "models/Qwen2.5-0.5B"  # or use remote: "Qwen/Qwen2.5-0.5B"
print("Loading tokenizer from:", tokenizer_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

# Tokenize each row's clean_text and turn its character spans into one BIO
# label per token, using the tokenizer's per-token character offsets.
aligned = {}
for split_name, rows in spanned.items():
    print(f"Tokenizing and aligning split: {split_name} ...")
    aligned_rows = []
    for row in rows:
        text = row.get("text") or ""
        # Spans index into clean_text, so that is the string that gets tokenized.
        clean_text = row.get("clean_text", text)
        enc = tokenizer(
            clean_text,
            return_offsets_mapping=True,
            add_special_tokens=False,
        )
        offsets = enc["offset_mapping"]
        tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])

        labels = ["O"] * len(tokens)
        # Flat BIO tags cannot represent overlapping spans. Letting a later span
        # overwrite tokens of an earlier one can leave an I- tag stranded after
        # a B- tag of a different label. Resolve overlaps up front: handle the
        # longest spans first (sorted() is stable, so ties keep their original
        # order) and drop any span that touches an already-labelled token.
        ordered_spans = sorted(row.get("spans", []), key=lambda sp: sp["start"] - sp["end"])
        for span in ordered_spans:
            s, e, lab = span["start"], span["end"], span["label"]
            covered = []
            for ti, (ts, te) in enumerate(offsets):
                if te <= s:
                    # Token ends before the span starts.
                    continue
                if ts >= e:
                    # Token starts after the span ends; later tokens do too.
                    break
                covered.append(ti)
            if not covered or any(labels[ti] != "O" for ti in covered):
                continue
            labels[covered[0]] = f"B-{lab}"
            for ti in covered[1:]:
                labels[ti] = f"I-{lab}"

        aligned_rows.append({
            "id": row["id"],
            "source": row["source"],
            "text": text,
            "tokens": tokens,
            "labels": labels,
        })
    aligned[split_name] = aligned_rows
    print(f"- {split_name}: {len(aligned_rows)} rows; example tokens/labels:", (aligned_rows[0]["tokens"][:10], aligned_rows[0]["labels"][:10]) if aligned_rows else None)

# Peek one aligned example
print("\nAligned train example:", aligned.get("train", [])[0] if aligned.get("train") else None)


Loading tokenizer from: models/Qwen2.5-0.5B
Tokenizing and aligning split: train ...
- train: 50000 rows; example tokens/labels: (['**', 'ADO', 'PTION', 'ĠCERT', 'IFICATE', '**', 'ĠIss', 'ued', 'Ġby', 'ĠGu'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-COUNTRY'])
Tokenizing and aligning split: validation ...
- validation: 5000 rows; example tokens/labels: (['Patient', 'ĠName', ':', 'ĠAlec', 'ĠStafford', ',', 'ĠDO', 'B', ':', 'Ġ'], ['O', 'O', 'O', 'B-FIRST_NAME', 'B-LAST_NAME', 'O', 'O', 'O', 'O', 'O'])
Tokenizing and aligning split: test ...
- test: 5000 rows; example tokens/labels: (['Transaction', 'Ġdetails', ':', 'Ġgas', 'Limit', 'Ġset', 'Ġto', 'Ġ', '1', '0'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

Aligned train example: {'id': '6e85c290-ad9b-4c35-bd80-afb1d80c3e78', 'source': 'gretel_pii_masking_en', 'text': '**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive p

In [6]:
# STEP 5 — Build instruction/response in Token-per-line (CoNLL-lite) format
# - Instruction: explains task, shows text and token list (one per line), ends with "Labels:" line
# - Response: exactly one BIO tag per line (same number as tokens)

# For every aligned row, add an "instruction" prompt (task statement, text,
# tokens one per line, trailing "Labels:" cue) and a "response" holding the
# BIO labels one per line.
instr_aligned = {}
for split_name, rows in aligned.items():
    print(f"Building instruction/response for split: {split_name} ...")
    out_rows = []
    for row in rows:
        tokens, labels = row["tokens"], row["labels"]
        prompt_parts = [
            "You will label tokens with BIO tags. Output only the tag after each token. One token per line.\n\n",
            "Text:\n", row["text"], "\n\n",
            "Tokens:\n", "\n".join(tokens), "\n\n",
            "Labels:",
        ]
        instruction = "".join(prompt_parts)
        response = "\n".join(labels)
        # Key order is kept as-is because it determines the field order when serialized.
        out_rows.append(dict(
            id=row["id"],
            source=row["source"],
            text=row["text"],
            tokens=tokens,
            labels=labels,
            instruction=instruction,
            response=response,
        ))
    instr_aligned[split_name] = out_rows

    # Token and label counts of the first row should match (one tag per token).
    if out_rows:
        first_row = out_rows[0]
        example_counts = (len(first_row["tokens"]), len(first_row["labels"]))
    else:
        example_counts = None
    print(f"- {split_name}: {len(out_rows)} rows; example token/label count:", example_counts)

train_examples = instr_aligned.get("train")
print("\nInstruction/response train example:", train_examples[0] if train_examples else None)


Building instruction/response for split: train ...
- train: 50000 rows; example token/label count: (75, 75)
Building instruction/response for split: validation ...
- validation: 5000 rows; example token/label count: (33, 33)
Building instruction/response for split: test ...
- test: 5000 rows; example token/label count: (81, 81)

Instruction/response train example: {'id': '6e85c290-ad9b-4c35-bd80-afb1d80c3e78', 'source': 'gretel_pii_masking_en', 'text': '**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12-15. Unique identifier: UID-PRWBO4TB.', 'tokens': ['**', 'ADO', 'PTION', 'ĠCERT', 'IFICATE', '**', 'ĠIss', 'ued', 'Ġby', 'ĠGu', 'ern', 'sey', 'ĠAdoption', 'ĠAgency', ',', 'Ġthis', 'Ġcertificate', 'Ġconfirms', 'Ġthe', 'Ġadoption', 'Ġof', 'ĠU', 'rv', 'ashi', 'ĠJag', 'gi', ',', 'Ġborn', 'Ġon', 'Ġ', '2', '0', '1', '5', '-', '0', '7', '-', '2',

In [7]:
# STEP 6 — Save to JSONL per split
# - Compact JSON lines for easy streaming use later

import os
import json

os.makedirs("outputs", exist_ok=True)

# One JSONL file per split: each row is a compact JSON object on its own line,
# written as UTF-8 with non-ASCII characters left unescaped.
for split_name, rows in instr_aligned.items():
    out_path = os.path.join("outputs", f"standardized_gretel_pii_masking_en_{split_name}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    print(f"Wrote {len(rows)} rows ->", out_path)

# Preview first 2 lines of train file (if present), each capped at 2000 characters.
preview_path = os.path.join("outputs", "standardized_gretel_pii_masking_en_train.jsonl")
if os.path.exists(preview_path):
    print("\nPreview (first 2 lines of train):")
    with open(preview_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f):
            if line_no >= 2:
                break
            line = raw_line.strip()
            if not line:
                # Stop at the first blank line.
                break
            print(line[:2000])


Wrote 50000 rows -> outputs\standardized_gretel_pii_masking_en_train.jsonl
Wrote 5000 rows -> outputs\standardized_gretel_pii_masking_en_validation.jsonl
Wrote 5000 rows -> outputs\standardized_gretel_pii_masking_en_test.jsonl

Preview (first 2 lines of train):
{"id": "6e85c290-ad9b-4c35-bd80-afb1d80c3e78", "source": "gretel_pii_masking_en", "text": "**ADOPTION CERTIFICATE**\nIssued by Guernsey Adoption Agency, this certificate confirms the adoption of Urvashi Jaggi, born on 2015-07-26, by the adoptive parents. The adoption was finalized on 2022-12-15. Unique identifier: UID-PRWBO4TB.", "tokens": ["**", "ADO", "PTION", "ĠCERT", "IFICATE", "**", "ĠIss", "ued", "Ġby", "ĠGu", "ern", "sey", "ĠAdoption", "ĠAgency", ",", "Ġthis", "Ġcertificate", "Ġconfirms", "Ġthe", "Ġadoption", "Ġof", "ĠU", "rv", "ashi", "ĠJag", "gi", ",", "Ġborn", "Ġon", "Ġ", "2", "0", "1", "5", "-", "0", "7", "-", "2", "6", ",", "Ġby", "Ġthe", "Ġadopt", "ive", "Ġparents", ".", "ĠThe", "Ġadoption", "Ġwas", "Ġfinalized", "Ġ