# In [5]:  (notebook cell prompt; the code below is the cell's input)
import re

# Compile once, reuse many times.
# Group 1: a run of letters/hyphens starting at a word boundary (as short as
# possible); group 2: the 1-3 digits glued to its end, which must themselves
# end at a word boundary. E.g. "neurogenesis52", "patterned-ectoderm25".
_WORD_THEN_DIGITS = re.compile(r"\b([A-Za-z\-]+?)(\d{1,3})\b")


def remove_trailing_numbers(text: str) -> str:
    """Drop 1-3 trailing digits that are glued to a long word.

    Every occurrence of letters/hyphens immediately followed by one to three
    digits is inspected. The digits are removed only when the letters/hyphens
    part is longer than 8 characters; shorter tokens (e.g. "IL6") are
    returned unchanged.

    Args:
        text: The text to clean.

    Returns:
        The text with qualifying trailing digits removed.
    """
    def _keep_or_strip(hit):
        stem, number = hit.groups()
        # Short stems are left intact; only long stems lose their digits.
        if len(stem) > 8:
            return stem
        return stem + number

    return _WORD_THEN_DIGITS.sub(_keep_or_strip, text)

# Regex fragments shared by the non-parenthetical figure/table patterns.
# One numbered item, optionally with panel letters: "4", "4d", "10a,b", "2a-c".
# The trailing \b keeps the match from swallowing the first letter of the
# word that follows the reference. Panel-letter lists must be written without
# spaces ("a,b"), so that ", a result" after a figure number is left alone.
_REF_ITEM = r"\d+(?:[a-z](?:[,\-–][a-z])*)?\b"
# One or more items joined by commas or dashes: "2-4", "2, 4", "4d, 5a,b".
_REF_LIST = _REF_ITEM + r"(?:\s*[,\-–]\s*" + _REF_ITEM + r")*"

# Citation/reference patterns, compiled once at import time. Each entry is
# meant to be applied in order with ``pattern.sub("", text)``.
PATTERNS = [
    # 1) Parenthetical references to figures/tables/supplementary and "ref(s)".
    #    The (?![a-z]) guard requires the keyword to end there, so ordinary
    #    parentheticals such as "(fight ...)" or "(tablet ...)" are not removed.
    re.compile(
        r"\((?:see\s+)?(?:extended\s+data\s+)?fig(?:ures?|s)?(?![a-z])[^)]*\)",
        re.I,
    ),
    re.compile(r"\((?:see\s+)?tables?(?![a-z])[^)]*\)", re.I),
    re.compile(r"\((?:supplementary|supp\.)[^)]*\)", re.I),
    re.compile(r"\((?:ref\.?|refs\.?)\s*[\d,\-\–\s]+[a-z]?\)", re.I),

    # 2) Bracketed numeric citations like [1], [2–4,6]
    re.compile(r"\[\s*\d+(?:\s*[\-–,]\s*\d+)*\s*\]"),

    # 3) Standalone inline "ref./refs." without parentheses (rarer, but appears)
    re.compile(r"\b(?:ref\.?|refs\.?)\s*\d+(?:\s*[\-–,]\s*\d+)*\b", re.I),

    # 4) Non-parenthetical mentions of Fig./Figs./Figure(s)/Extended Data Fig./
    #    Table, optionally with panel letters.
    #    Examples: "Fig. 4d", "Figs. 2–4", "Extended Data Fig. 10a,b", "Table 3"
    re.compile(
        r"\b(?:extended\s+data\s+)?fig(?:ures?|s)?\.?\s*" + _REF_LIST,
        re.I,
    ),
    re.compile(r"\btables?\.?\s*" + _REF_LIST, re.I),

    # 5) Trailing superscript-style numbers stuck to a word (e.g., "neurogenesis52")
    #    Keep this conservative to avoid eating gene symbols like "IL6".
    #    Heuristic: the 6 characters right before the digits are all letters,
    #    and the 1–3 digits end at a word boundary. Python's `re` only supports
    #    fixed-width look-behind, so the check is on exactly 6 characters
    #    rather than on "6 or more"; hyphenated names such as "interleukin-6"
    #    are therefore left untouched.
    re.compile(r"(?<=[A-Za-z]{6})\d{1,3}\b"),
]

# Optional: normalize fancy dashes to ASCII to simplify ranges.
DASHES = re.compile(r"[–—‒−]+")

# Tidy-up patterns used after the citation removals, compiled once.
_EMPTY_PARENS = re.compile(r"\(\s*\)")
_EMPTY_BRACKETS = re.compile(r"\[\s*\]")
_WHITESPACE_RUN = re.compile(r"\s{2,}")
_SPACE_BEFORE_PUNCT = re.compile(r"\s+([,\.\;\:\)])")
# Sentence punctuation directly followed by a capital letter. The negative
# look-behind skips punctuation that follows a single-letter token, so dotted
# abbreviations such as "U.S.A." or "p.Arg" are not split apart.
_MISSING_SENTENCE_SPACE = re.compile(r"(?<!\b[A-Za-z])([\.!\?])([A-Z])")


def clean_citations(text: str) -> str:
    """Remove citation and figure/table references from *text* and tidy up.

    Steps, in order:
      0. Replace every run of en/em/figure dashes and minus signs with a
         single ASCII hyphen. This applies to the whole text, not only to
         citation ranges.
      1. Delete every match of each pattern in ``PATTERNS``, in list order.
      2. Clean up what the deletions leave behind: empty ``()`` / ``[]``,
         runs of two or more whitespace characters (collapsed to one space),
         whitespace before ``, . ; : )``, and a missing space between
         sentence-ending punctuation and a following capital letter.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # 0) Normalize dashes so range patterns are easier to match.
    text = DASHES.sub("-", text)

    # 1) Remove targeted patterns
    for pat in PATTERNS:
        text = pat.sub("", text)

    # 2) Tidy up leftover punctuation/spaces from removals.
    # Empty parentheses/brackets go first so the gaps they leave are
    # collapsed by the whitespace step that follows.
    text = _EMPTY_PARENS.sub("", text)
    text = _EMPTY_BRACKETS.sub("", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    text = _SPACE_BEFORE_PUNCT.sub(r"\1", text)
    text = _MISSING_SENTENCE_SPACE.sub(r"\1 \2", text)
    return text.strip()


# --- Example ---
# Demo paragraph containing parenthetical figure/table references and a
# superscript-style citation number ("neurogenesis52").
sample = """Beginning as early as the 16-somite stage, most neuronal diversity derives from direct neurogenesis (Fig. 4d), including motor neurons, cerebellar Purkinje cells, Cajal–Retzius cells and many other subtypes (CNS neurons sub-panel of Extended Data Fig. 3). Indirect neurogenesis52 has a later start, with intermediate neuronal progenitors first detected at E10.25, later giving rise to deep-layer neurons, upper-layer neurons, subplate neurons, and cortical interneurons (Fig. 4d and Extended Data Fig. 10a,b). Although many subtypes deriving from direct neurogenesis are easily distinguished, the majority (55%) of these 2.1 million cells could initially only be coarsely annotated as glutamatergic or GABAergic (γ-aminobutyric acid-producing) neurons or dorsal or ventral spinal cord progenitors. To leverage the greater heterogeneity evident at early stages as these trajectories ‘launch’ from the patterned neuroectoderm, we re-analysed the pre-E13 subset. This facilitated much more granular annotation, while also highlighting sources of heterogeneity—for example, anterior versus posterior or inhibitory versus excitatory (Fig. 4e, Extended Data Fig. 10c,d and Supplementary Table 12)"""

# Print the cleaned version of the demo paragraph.
print(clean_citations(text=sample))

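# Recorded output when this cell was run as originally written:
#   re.error: look-behind requires fixed-width pattern
# Python's `re` module only accepts look-behind assertions of a fixed length,
# so a look-behind containing an open-ended quantifier such as `{5,}` fails
# at compile time.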