In [4]:
from dataclasses import dataclass
from typing import Dict, List, Tuple
from googletrans import Translator

# ---------------------------
# Data structures
# ---------------------------
@dataclass(frozen=True)
class Ent:
    """Immutable entity span built from one ``entities_pred`` record."""

    start: int  # offset in the raw text where the span begins
    end: int    # offset where the span stops (exclusive, used as text[end:])
    typ: str    # entity type label; becomes part of the placeholder id
    text: str   # surface string that is restored when demasking


# ---------------------------
# 1) Mask using start/end + per-type indexing
# ---------------------------
def mask_indexed(text: str, entities_pred: List[dict]) -> Tuple[str, Dict[str, str]]:
    """
    Replace entity spans in raw text with protected placeholders.

    Placeholders are XML-like tags of the form ``<x id="TYPE_N"/>`` where
    ``N`` counts entities of the same type in order of appearance (1-based).

    Args:
        text: The raw text the entity offsets refer to.
        entities_pred: Records with the keys ``"start"``, ``"end"``,
            ``"type"`` and ``"text"``; any other keys are ignored.

    Returns:
        A ``(masked_text, mapping)`` pair, where ``mapping`` maps each
        placeholder to the ``"text"`` value of the entity it replaced.

    Raises:
        KeyError: If a record lacks one of the required keys.
        ValueError: If a span has a negative start or ends before it starts.

    Spans that overlap an already-masked span are skipped (the earliest
    start wins; on equal starts the longest span wins). Splicing overlapping
    spans into the text would cut into a previously inserted placeholder.
    """
    # Earliest start first; for equal starts prefer the longest span so the
    # widest candidate is the one that gets masked.
    ents = sorted(
        (Ent(e["start"], e["end"], e["type"], e["text"]) for e in entities_pred),
        key=lambda ent: (ent.start, -ent.end),
    )

    counters: Dict[str, int] = {}
    mapping: Dict[str, str] = {}
    pieces: List[str] = []
    cursor = 0  # end offset of the last span that was masked

    for ent in ents:
        if ent.start < 0 or ent.end < ent.start:
            raise ValueError(
                f"invalid entity span [{ent.start}, {ent.end}) for type {ent.typ!r}"
            )
        if ent.start < cursor:
            # Overlaps the previously masked span: skip it, and do not
            # consume a per-type index for it.
            continue

        counters[ent.typ] = counters.get(ent.typ, 0) + 1
        ph = f"<x id=\"{ent.typ}_{counters[ent.typ]}\"/>"
        mapping[ph] = ent.text

        # Copy the untouched text before the span, then the placeholder.
        pieces.append(text[cursor:ent.start])
        pieces.append(ph)
        cursor = ent.end

    # Trailing text after the last masked span (empty if end ran past the text).
    pieces.append(text[cursor:])
    return "".join(pieces), mapping

# ---------------------------
# 2) Demask by ID
# ---------------------------
def demask_by_id(text: str, mapping: Dict[str, str]) -> str:
    """
    Restore the original entity text for every placeholder found in ``text``.

    Args:
        text: Text that may contain placeholders (e.g. translated output).
        mapping: Placeholder -> replacement text. Empty keys are ignored.

    Returns:
        ``text`` with every recognised placeholder replaced.

    Keys of the canonical form ``<x id="ID"/>`` are matched tolerantly:
    extra whitespace inside the tag, single or typographic quotes, missing
    quotes and a missing self-closing slash are all accepted, since a
    translation service may re-space or re-quote tags. Any other key is
    matched literally.

    All placeholders are replaced in one pass, so restored text is never
    scanned again for further placeholders.
    """
    import re  # local import keeps this function self-contained

    canonical = re.compile(r'<x id="([^"]*)"/>')
    quote = "[\"'\u201c\u201d]?"  # straight or curly quote, or none at all

    alternatives: List[str] = []
    replacements: Dict[str, str] = {}
    # Longest keys first so a longer placeholder is tried before a shorter
    # one that might match at the same position.
    ordered = sorted((ph for ph in mapping if ph), key=len, reverse=True)
    for idx, ph in enumerate(ordered):
        found = canonical.fullmatch(ph)
        if found is None:
            body = re.escape(ph)
        else:
            ident = re.escape(found.group(1))
            body = rf"<\s*x\s+id\s*=\s*{quote}\s*{ident}\s*{quote}\s*/?\s*>"
        group = f"p{idx}"
        alternatives.append(f"(?P<{group}>{body})")
        replacements[group] = mapping[ph]

    if not alternatives:
        return text

    pattern = re.compile("|".join(alternatives))
    # A callable replacement inserts the value verbatim (no backslash or
    # group-reference processing); lastgroup names the alternative that hit.
    return pattern.sub(lambda hit: replacements[hit.lastgroup], text)

# ---------------------------
# 3) Machine translation (Google Translate via googletrans) to show reordering doesn't break IDs
# ---------------------------
def toy_translate(masked_text: str, target_lang: str = "es") -> str:
    """
    Translate ``masked_text`` with googletrans and return the translated text.

    Args:
        masked_text: Text to translate, normally containing placeholders.
        target_lang: Destination language code passed as ``dest``.

    Returns:
        The ``.text`` attribute of the translation result.

    NOTE(review): nothing here enforces that placeholders survive the
    translation; that depends on the remote service.
    """
    # A fresh client is created for every call, exactly one request per call.
    result = Translator().translate(masked_text, dest=target_lang)
    return result.text

# ---------------------------
# 4) Run on your examples
# ---------------------------
def run(doc: dict) -> None:
    """
    Mask, translate and demask one document, printing every stage.

    Args:
        doc: Mapping with the keys ``"text"``, ``"entities_pred"`` and
            ``"doc_id"``.
    """
    source_text = doc["text"]
    masked_text, placeholder_map = mask_indexed(source_text, doc["entities_pred"])
    translated = toy_translate(masked_text)
    restored = demask_by_id(translated, placeholder_map)

    print("\n---", doc["doc_id"], "---")
    # Each stage is printed as "<heading> <value>", in pipeline order.
    stages = (
        ("ORIGINAL:\n", source_text),
        ("\nMASKED:\n", masked_text),
        ("\nMAPPING:\n", placeholder_map),
        ("\nMT OUT (masked):\n", translated),
        ("\nFINAL (demasked):\n", restored),
    )
    for heading, value in stages:
        print(heading, value)

# ---------------------------
# Example docs (paste yours here)
# ---------------------------
# Example document: an id, the raw text, and predicted entity spans.
# Each entity's "start"/"end" are offsets into "text" consumed by
# mask_indexed, together with "type" and "text"; "score" and "method" are
# not read by the code in this cell.
doc_000000 = {
    "doc_id": "doc_000000",
    "text": "Welcome to Krasnapolsky Amsterdam, a comfortable stay with canal-view rooms, a winter-garden breakfast, and 24/7 reception. Find us at Damrak 96, 1012 LP Amsterdam, Amsterdam. Just minutes from Dam Square and popular local cafés, shops, and evening spots. Enjoy a quiet lobby with tea and coffee available most afternoons. If you’re arriving early, luggage storage is available until your room is ready. Transport tips: buses run frequently nearby, and rideshare pickup is easiest at the main entrance.",
    "entities_pred": [
        {"start": 135, "end": 163, "type": "ADDRESS", "text": "Damrak 96, 1012 LP Amsterdam", "score": 0.85, "method": "faiss_e5_top1"},
        {"start": 194, "end": 204, "type": "LANDMARK_POI", "text": "Dam Square", "score": 0.85, "method": "faiss_e5_top1"},
    ],
}

# Second example document, same layout; it carries three entity types
# (HOTEL_NAME, ADDRESS, LANDMARK_POI).
doc_000001 = {
    "doc_id": "doc_000001",
    "text": "Welcome to Rembrandt Square Hotel, a comfortable stay with bike rentals, blackout curtains, and a small cocktail bar. Find us at Amstelstraat 20, 1017 DA Amsterdam, Amsterdam. Just minutes from Rembrandtplein and popular local cafés, shops, and evening spots. Enjoy a quiet lobby with tea and coffee available most afternoons. If you’re arriving early, luggage storage is available until your room is ready. Transport tips: the nearest metro is a short walk away, and taxis can be requested at reception.",
    "entities_pred": [
        {"start": 11, "end": 33, "type": "HOTEL_NAME", "text": "Rembrandt Square Hotel", "score": 0.86, "method": "faiss_e5_top1"},
        {"start": 129, "end": 163, "type": "ADDRESS", "text": "Amstelstraat 20, 1017 DA Amsterdam", "score": 0.86, "method": "faiss_e5_top1"},
        {"start": 194, "end": 208, "type": "LANDMARK_POI", "text": "Rembrandtplein", "score": 0.86, "method": "faiss_e5_top1"},
    ],
}

if __name__ == "__main__":
    # Demo entry point: process the bundled example documents in order.
    for example_doc in (doc_000000, doc_000001):
        run(example_doc)



--- doc_000000 ---
ORIGINAL:
 Welcome to Krasnapolsky Amsterdam, a comfortable stay with canal-view rooms, a winter-garden breakfast, and 24/7 reception. Find us at Damrak 96, 1012 LP Amsterdam, Amsterdam. Just minutes from Dam Square and popular local cafés, shops, and evening spots. Enjoy a quiet lobby with tea and coffee available most afternoons. If you’re arriving early, luggage storage is available until your room is ready. Transport tips: buses run frequently nearby, and rideshare pickup is easiest at the main entrance.

MASKED:
 Welcome to Krasnapolsky Amsterdam, a comfortable stay with canal-view rooms, a winter-garden breakfast, and 24/7 reception. Find us at <x id="ADDRESS_1"/>, Amsterdam. Just minutes from <x id="LANDMARK_POI_1"/> and popular local cafés, shops, and evening spots. Enjoy a quiet lobby with tea and coffee available most afternoons. If you’re arriving early, luggage storage is available until your room is ready. Transport tips: buses run frequently nearby, an