In [1]:
import re
from pathlib import Path

In [None]:
text =r"""\newglossaryentry{pseudoinverse}
{name={pseudoinverse},
 description={The \index{pseudoinverse}Moore–Penrose {pseudoinverse} formula},
 text={pseudoinverse}}
"""
  

In [12]:
text = r"""\newglossaryentry{pseudoinverse}
{name={pseudoinverse},
	description={The \index{pseudoinverse}Moore–Penrose pseudoinverse $\mA^{+}$ 
		of a \gls{matrix} $\featuremtx \in \mathbb{R}^{\samplesize \times \nrfeatures}$ 
		generalizes the notion of an \gls{inverse} \cite{GolubVanLoanBook}. 
		The pseudoinverse arises naturally in \gls{ridgeregression} for a 
		\gls{dataset} with \gls{featuremtx} $\featuremtx$ and \gls{labelvec} 
		$\labelvec$ \cite[Ch.\ 3]{hastie01statisticallearning}. 
		The \glspl{modelparam} learned by \gls{ridgeregression} 
		are given by
		\[
		\widehat{\weights}^{(\regparam)}  = \big(\featuremtx^{T} \featuremtx + \regparam \mI \big)^{-1} \featuremtx^{T} \vy, \quad \regparam > 0.
		\]
		We can then define the pseudoinverse $\featuremtx^{+} \in \mathbb{R}^{\nrfeatures \times \samplesize}$ via 
		the limit \cite[Ch. 3]{benisrael2003generalized}
		\[
		\lim_{\regparam \to 0^+} \widehat{\weights}^{(\regparam)} = \featuremtx^+ \vy.
		\]
		\\
		See also: \gls{matrix}, \gls{inverse}, \gls{ridgeregression}. },
	first={pseudoinverse},
	text={pseudoinverse}
}

\newglossaryentry{randomexperiment}
{name={random experiment},
	description={A random experiment\index{random experiment} is a pseudoinverse \gls{pseudoinverse} physical (or abstract) process 
		that produces an outcome $\outcome$ from a set $\samplespace$ of possibilities. 
		This set of all possible outcomes is referred to as the \gls{samplespace} of 
		the experiment. The key characteristic of a random experiment is that its 
		outcome is unpredictable (or uncertain). Any measurement or observation 
		of the outcome is a \gls{rv}, i.e., a \gls{function} of the outcome $\outcome \in \samplespace$. 
		\Gls{probability} theory uses a \gls{probspace} as a mathematical structure for the study of 
		random experiments. A key conceptual property of a random experiment is that it can 
        \gls{pseudoinverse}, pseudoinverse, 
		be repeated under identical conditions. Strictly speaking, repeating a random experiment 
		a given number of $\samplesize$ times defines a new random experiment. The outcomes 
		of this new experiment are length-$\samplesize$ sequences of outcomes 
		from the original experiment (see Fig. \ref{fig_randomexperiment_dict}). While the outcome of a single experiment is 
		uncertain, the long-run behaviour of the outcomes of repeated experiments 
		tends to become increasingly predictable. This informal claim can be made 
		precise via fundamental results of \gls{probability} theory, such as the \gls{lln} 
		and the \gls{clt}.
		\begin{figure}[H]
			\begin{center}
				\begin{tikzpicture}[>=Stealth, node distance=1.5cm and 2cm, every node/.style={font=\small}]
					\node (experiment) [draw, rectangle, rounded corners, minimum width=2.6cm, align=center] {random\\experiment};
					\node (omega) [right=of experiment] {$\outcome \in \samplespace$};
					\coordinate (rightpad) at ($(omega.east) + (0.2,0)$);
					\draw[->] (experiment) -- (omega);
					\node (sequence) [below=of experiment, yshift=-0.5cm] {$(\outcome^{(1)}, \,\outcome^{(2)}, \,\dots, \,\outcome^{(\samplesize)})$};
					\node (sequence1) [below=of sequence, yshift=-0.5cm] {$(\datapoint^{(1)}, \,\datapoint^{(2)}, \,\dots, \,\datapoint^{(\samplesize)})$};
					\draw[->, thick] (experiment.south) -- node[midway, right, xshift=3pt] {repeat $\samplesize$ times} (sequence.north);
					\draw[->, thick] (sequence.south) -- node[midway, right, xshift=3pt] {\glspl{rv}} (sequence1.north);
					% Anchor node ~60% along the repeat arrow
					\path (experiment.south) -- (sequence.north) coordinate[pos=0.6] (repeatpoint);
					% Dotted rounded box enclosing experiment and part of repeat arrow
					\node[draw=black, rounded corners, dotted, fit={(experiment) (repeatpoint) (rightpad)}, inner sep=8pt, label=above:{new random experiment with $\samplespace' = \samplespace \times \ldots \times \samplespace$}] {};
				\end{tikzpicture}
			\end{center}
			\caption{A random experiment produces an outcome $\outcome \in \samplespace$ from a set 
				of possibilities (i.e., a \gls{samplespace}) 
				$\samplespace$. Repeating the experiment $\samplesize$ times yields another random 
				experiment, whose outcomes are sequences 
				$(\outcome^{(1)}, \,\outcome^{(2)}, \,\dots, \,\outcome^{(\samplesize)}) \in \samplespace\times\ldots\times \samplespace$. 
				One example of a random experiment arising in many \gls{ml} applications is the gathering 
				of a \gls{trainset} $\datapoint^{(1)},\,\ldots,\,\datapoint^{(\samplesize)}$. \label{fig_randomexperiment_dict}}
		\end{figure} 
		Examples for random experiments arising in \gls{ml} applications include the following: 
		\begin{itemize} 
			\item \Gls{data} collection: The \glspl{datapoint} collected in \gls{erm}-based methods 
			can be interpreted as \glspl{rv}, i.e., as \glspl{function} of the outcome $\outcome \in \samplespace$ 
			of a random experiment. 
			\item \Gls{stochGD} uses a random experiment at each iteration to select a subset of 
			the \gls{trainset}. 
			\item \Gls{privprot} methods use random experiments to perturb  
			the outputs of an \gls{ml} method to ensure \gls{diffpriv}. 
		\end{itemize} 
		See also: \gls{samplespace}, \gls{rv}, \gls{probability}, \gls{probspace}.},
	firstplural={random experiments},
	plural={random experiments},
	first={random experiment},
	text={random experiment}
}
"""


In [6]:
# helper to test whether index is inside any (start,end) list
def inside_any(index, spans):
    return any(s <= index < e for s, e in spans)

def extract_glossary_terms(tex_text):
    """Extract glossary entries and their names."""
    entries = re.findall(
        r'\\newglossaryentry\{(.*?)\}\s*\{.*?name=\{(.*?)\}',
        tex_text,
        re.DOTALL
    )
    return {term.strip(): name.strip() for term, name in entries}

def replace_terms_with_gls(tex_text, glossary):
    """
    Replace plain term occurrences with \\gls{<term>} except:
      - inside existing \\gls{...} commands (never replace),
      - inside the glossary definition of the same term (do not replace occurrences of 'term' inside its own definition),
      - but allow replacing the term when it appears inside OTHER glossary definitions.
    Returns updated_text, changes (list of (name, start_index) for replacements).
    """

    updated_text = tex_text
    changes = []

    # 1) Find spans of every \newglossaryentry{key}{...} and record by key
    glossary_defs = {}   # key -> (start, end)
    for m in re.finditer(r'\\newglossaryentry\{(.*?)\}\s*\{.*?\}(?=\s*\\newglossaryentry|$)', tex_text, re.DOTALL):
        print(m)
        key = m.group(1)
        glossary_defs[key] = (m.start(), m.end())

    # 2) Find spans of existing \gls{...} (never replace inside these)
    gls_spans = [ (m.start(), m.end()) for m in re.finditer(r'\\gls\{.*?\}', tex_text) ]

    # 3) Collect candidate replacements from the ORIGINAL text (not updated_text)
    #    This avoids index-shift problems when applying multiple replacements.
    replacements = []  # list of (start, end, replacement_text, name)

    for term, name in glossary.items():
        # compile pattern for whole-word, case-insensitive matching of the displayed name
        # re.escape(name) - Escapes any special regex characters inside `name`.
        # For example, if name = "C++", it becomes "C\+\+"
        # so '+' is treated literally, not as a regex operator.
        pattern = re.compile(rf'\b{re.escape(name)}\b', re.IGNORECASE)

        for m in pattern.finditer(tex_text):
            start, end = m.start(), m.end()

            # If inside any \gls{...}, skip
            if inside_any(start, gls_spans):
                continue

            # If inside this term's own glossary definition, skip
            # Glossary keys (term) should match the key used in \newglossaryentry{<key>}
            # Only protect if this key exists in glossary_defs
            def_span = glossary_defs.get(term)
            if def_span and def_span[0] <= start < def_span[1]:
                # match is inside its own definition -> skip replacing
                continue

            # Otherwise, this occurrence is OK to replace
            replacement_text = f'\\gls{{{term}}}'
            replacements.append((start, end, replacement_text, name))

    # 4) Sort replacements by start index descending, apply them to updated_text
    #    Applying from right to left prevents earlier replacements from shifting later spans.
    replacements.sort(key=lambda x: x[0], reverse=True)

    occupied = set()  # optional: track replaced positions to avoid overlapping replacements
    for start, end, repl, name in replacements:
        # Avoid overlapping replacements: if any position already replaced, skip this one
        # (This is a defensive measure — with whole-word patterns overlaps are unlikely.)
        if any(i in occupied for i in range(start, end)):
            continue

        updated_text = updated_text[:start] + repl + updated_text[end:]
        # mark occupied indices (relative to the original coordinates)
        # Note: indices in 'occupied' refer to original positions; because we apply right→left, they remain valid.
        for i in range(start, end):
            occupied.add(i)
        changes.append((name, start))

    # We returned changes in the order we applied (right->left). If you prefer chronological order,
    # you can sort changes by the index ascending here.
    changes.sort(key=lambda x: x[1])
    return updated_text, changes


In [13]:
# # def main():
# input_file = Path("test.tex")
# output_file = Path("out.tex")

# tex_text = input_file.read_text(encoding="utf-8")

glossary = extract_glossary_terms(text)
print(f"Found {len(glossary)} glossary terms. \n", glossary)

Found 2 glossary terms. 
 {'pseudoinverse': 'pseudoinverse', 'randomexperiment': 'random experiment'}


In [15]:
updated_text, changes = replace_terms_with_gls(text, glossary)

print(f"Applied {len(changes)} replacements.")
for name, pos in changes[:10]:  # show sample
    print(f"  - {name} at position {pos}")

output_file.write_text(updated_text, encoding="utf-8")
print(f"Updated file written to {output_file}")

# if __name__ == "__main__":
#     main()

<re.Match object; span=(0, 1083), match='\\newglossaryentry{pseudoinverse}\n{name={pseudoi>
<re.Match object; span=(1085, 5373), match="\\newglossaryentry{randomexperiment}\n{name={rand>
Applied 2 replacements.
  - pseudoinverse at position 1212
  - pseudoinverse at position 1905
Updated file written to out.tex


In [16]:
print(updated_text)

\newglossaryentry{pseudoinverse}
{name={pseudoinverse},
	description={The \index{pseudoinverse}Moore–Penrose pseudoinverse $\mA^{+}$ 
		of a \gls{matrix} $\featuremtx \in \mathbb{R}^{\samplesize \times \nrfeatures}$ 
		generalizes the notion of an \gls{inverse} \cite{GolubVanLoanBook}. 
		The pseudoinverse arises naturally in \gls{ridgeregression} for a 
		\gls{dataset} with \gls{featuremtx} $\featuremtx$ and \gls{labelvec} 
		$\labelvec$ \cite[Ch.\ 3]{hastie01statisticallearning}. 
		The \glspl{modelparam} learned by \gls{ridgeregression} 
		are given by
		\[
		\widehat{\weights}^{(\regparam)}  = \big(\featuremtx^{T} \featuremtx + \regparam \mI \big)^{-1} \featuremtx^{T} \vy, \quad \regparam > 0.
		\]
		We can then define the pseudoinverse $\featuremtx^{+} \in \mathbb{R}^{\nrfeatures \times \samplesize}$ via 
		the limit \cite[Ch. 3]{benisrael2003generalized}
		\[
		\lim_{\regparam \to 0^+} \widehat{\weights}^{(\regparam)} = \featuremtx^+ \vy.
		\]
		\\
		See also: \gls{matrix}, \gls{

In [10]:
# matches only if ONE char between tags
text = "<tag>first</tag>"
matches = re.findall(r"<tag>(.)</tag>", text)
print(matches)

[]


In [7]:
# matches only if ONE char 
text = "<tag>f</tag>"
matches = re.findall(r"<tag>(.)</tag>", text)
print(matches)


['f']


In [8]:
# matches all chars

text = "<tag>first</tag>"
matches = re.findall(r"<tag>(.*)</tag>", text)
print(matches)


['first']


In [11]:
# greedy
text = "<tag>first</tag><tag>second</tag>"
matches = re.findall(r"<tag>(.*)</tag>", text)
print(matches)

['first</tag><tag>second']


In [12]:
# non greedy
text = "<tag>first</tag><tag>second</tag>"
matches = re.findall(r"<tag>(.*?)</tag>", text)
print(matches)

['first', 'second']


In [None]:
# The re.finditer() works exactly the same as the re.findall() method 
# except it returns an iterator yielding match objects matching

# Protect glossary definitions and \gls{...} commands
protected = []
for match in re.finditer(r'\\newglossaryentry\{.*?\}\s*\{.*?\}', text, re.DOTALL):
    protected.append((match.start(), match.end()))
# for match in re.finditer(r'\\gls\{.*?\}', text):
#     protected.append((match.start(), match.end()))

In [104]:
matches = re.findall(r'\\newglossaryentry\{(.*?)\}\s*\{.*?name=\{(.*?)\}', text, re.DOTALL)
print(matches)

[('pseudoinverse', 'pseudoinverse'), ('randomexperiment', 'random experiment')]


In [31]:
matches = re.findall(r'\\gls\{.*?\}', text, re.DOTALL)
print(matches)

['\\gls{matrix}', '\\gls{inverse}', '\\gls{ridgeregression}', '\\gls{dataset}', '\\gls{featuremtx}', '\\gls{labelvec}', '\\gls{ridgeregression}', '\\gls{matrix}', '\\gls{inverse}', '\\gls{ridgeregression}', '\\gls{samplespace}', '\\gls{rv}', '\\gls{function}', '\\gls{probspace}', '\\gls{pseudoinverse}', '\\gls{probability}', '\\gls{lln}', '\\gls{clt}', '\\gls{samplespace}', '\\gls{ml}', '\\gls{trainset}', '\\gls{ml}', '\\gls{erm}', '\\gls{trainset}', '\\gls{ml}', '\\gls{diffpriv}', '\\gls{samplespace}', '\\gls{rv}', '\\gls{probability}', '\\gls{probspace}']


In [36]:
glossary = extract_glossary_terms(text)
print(f"Found {len(glossary)} glossary terms. \n", glossary)

Found 2 glossary terms. 
 {'pseudoinverse': 'pseudoinverse', 'randomexperiment': 'random experiment'}


In [None]:
updated_text, changes = replace_terms_with_gls(text, glossary)

print(f"Applied {len(changes)} replacements.")
for name, pos in changes[:10]:  # show sample
    print(f"  - {name} at position {pos}")

output_file.write_text(updated_text, encoding="utf-8")
print(f"Updated file written to {output_file}")


In [80]:
def is_protected(index):
    # If starting index is in protected parts --> True
    # When later replacing plain glossary terms with \gls{term},
    # we must ensure we’re not:
    #       - modifying terms *inside* glossary definitions, or
    #       - double-wrapping already existing \gls{...} commands.
    return any(start <= index < end for start, end in protected)
    
def replace_terms_with_gls(tex_text, glossary):
    """Replace plain term occurrences outside glossary definitions."""
    updated_text = tex_text
    changes = []

    # Protect glossary definitions and \gls{...} commands
    protected = []
    for match in re.finditer(r'\\newglossaryentry\{.*?\}\s*\{.*\}', tex_text, re.DOTALL):
        protected.append((match.start(), match.end()))
    for match in re.finditer(r'\\gls\{.*?\}', tex_text):
        protected.append((match.start(), match.end()))

    for term, name in glossary.items():
        # Regex to match term name in text (whole words only)
        pattern = re.compile(rf'\b{name}\b', re.IGNORECASE)
        for match in pattern.finditer(updated_text):
            if not is_protected(match.start()):
                start, end = match.span()
                replacement = f'\\gls{{{term}}}'
                updated_text = updated_text[:start] + replacement + updated_text[end:]
                changes.append((name, start))
    return updated_text, changes

In [None]:
# -----------------------------------------------------------
# Explanation: Positive Lookahead (?=...)
# -----------------------------------------------------------
# A *lookahead* in regex lets you "peek" ahead in the text
# without actually consuming the matched characters.
#
# Syntax:
#     (?=pattern)
#
# Meaning:
#   → "At this point in the string, the next characters
#      must match `pattern`, but don't include them in the match."
#
# Example:
#     re.search(r"apple(?=\spie)", "apple pie")
#     → Matches only "apple" because "(?=\spie)" checks that
#       " pie" follows, but does not consume it.
#
# Why it matters:
#   - Lookaheads are useful when you want to stop matching
#     *before* a certain pattern appears, while keeping that
#     pattern available for the next match.
#
# In our glossary example:
#     (?=\s*\\newglossaryentry|$)
#
#   → Ensures the current match ends right before either:
#       1. The next "\newglossaryentry", or
#       2. The end of the string ($)
#
#   This lets re.finditer() capture one glossary entry at a time,
#   without "eating" the next entry.
# -----------------------------------------------------------


In [114]:
for m in re.finditer(pattern, text, re.DOTALL):
    print(m.group(0), "\n")  # the entire matched string
    print(m.group(1))  # the first capturing group (the key)

\newglossaryentry{pseudoinverse}
{name={pseudoinverse},
	description={The \index{pseudoinverse}Moore–Penrose pseudoinverse $\mA^{+}$ 
		of a \gls{matrix} $\featuremtx \in \mathbb{R}^{\samplesize \times \nrfeatures}$ 
		generalizes the notion of an \gls{inverse} \cite{GolubVanLoanBook}. 
		The pseudoinverse arises naturally in \gls{ridgeregression} for a 
		\gls{dataset} with \gls{featuremtx} $\featuremtx$ and \gls{labelvec} 
		$\labelvec$ \cite[Ch.\ 3]{hastie01statisticallearning}. 
		The \glspl{modelparam} learned by \gls{ridgeregression} 
		are given by
		\[
		\widehat{\weights}^{(\regparam)}  = \big(\featuremtx^{T} \featuremtx + \regparam \mI \big)^{-1} \featuremtx^{T} \vy, \quad \regparam > 0.
		\]
		We can then define the pseudoinverse $\featuremtx^{+} \in \mathbb{R}^{\nrfeatures \times \samplesize}$ via 
		the limit \cite[Ch. 3]{benisrael2003generalized}
		\[
		\lim_{\regparam \to 0^+} \widehat{\weights}^{(\regparam)} = \featuremtx^+ \vy.
		\]
		\\
		See also: \gls{matrix}, \gls{

In [112]:
pattern = r'\\newglossaryentry\{(.*?)\}\s*\{.*?\}(?=\s*\\newglossaryentry|$)'
matches = re.findall(pattern, text, re.DOTALL)
print(matches[0])

pseudoinverse


In [68]:
# Protect glossary definitions and \gls{...} commands
protected = []
for match in re.finditer(r'\\newglossaryentry\{.*?\}\s*\{.*?\}(?=\s*\\newglossaryentry|$)', text, re.DOTALL):
    protected.append((match.start(), match.end()))
for match in re.finditer(r'\\gls\{.*?\}', text):
    protected.append((match.start(), match.end()))

protected[:5]

[(0, 1083), (1085, 5339), (141, 153), (248, 261), (328, 349)]

In [71]:
print(text[:1083]), print(text[141:153])

\newglossaryentry{pseudoinverse}
{name={pseudoinverse},
	description={The \index{pseudoinverse}Moore–Penrose pseudoinverse $\mA^{+}$ 
		of a \gls{matrix} $\featuremtx \in \mathbb{R}^{\samplesize \times \nrfeatures}$ 
		generalizes the notion of an \gls{inverse} \cite{GolubVanLoanBook}. 
		The pseudoinverse arises naturally in \gls{ridgeregression} for a 
		\gls{dataset} with \gls{featuremtx} $\featuremtx$ and \gls{labelvec} 
		$\labelvec$ \cite[Ch.\ 3]{hastie01statisticallearning}. 
		The \glspl{modelparam} learned by \gls{ridgeregression} 
		are given by
		\[
		\widehat{\weights}^{(\regparam)}  = \big(\featuremtx^{T} \featuremtx + \regparam \mI \big)^{-1} \featuremtx^{T} \vy, \quad \regparam > 0.
		\]
		We can then define the pseudoinverse $\featuremtx^{+} \in \mathbb{R}^{\nrfeatures \times \samplesize}$ via 
		the limit \cite[Ch. 3]{benisrael2003generalized}
		\[
		\lim_{\regparam \to 0^+} \widehat{\weights}^{(\regparam)} = \featuremtx^+ \vy.
		\]
		\\
		See also: \gls{matrix}, \gls{

(None, None)

In [None]:
# -----------------------------------------------------------
# Explanation: Building a regex pattern for glossary terms
# -----------------------------------------------------------
# For each (term, name) pair in the glossary dictionary:
#     glossary = {
#         "pseudoinverse": "pseudoinverse",
#         "randomexperiment": "random experiment",
#         ...
#     }
#
# We build a regular expression pattern to find occurrences
# of each glossary term (the 'name') inside a text.
#
# rf'\b{name}\b' → combines two ideas:
#   • 'r'  → raw string: backslashes (\b) are treated literally
#   • 'f'  → f-string: {name} is replaced with the variable value
#
# Example:
#     name = "pseudoinverse"
#     rf'\b{name}\b'  →  r'\bpseudoinverse\b'
#
# '\b' marks a *word boundary* in regex, so this pattern
# matches only the *whole word* "pseudoinverse", not parts
# of longer words like "pseudoinverses".
#
# re.IGNORECASE makes the match case-insensitive:
#   → "Pseudoinverse", "pseudoinverse", "PSEUDOINVERSE" all match.
#
# Compiling the pattern with re.compile() improves performance
# when matching repeatedly.
# -----------------------------------------------------------


In [73]:
glossary

{'pseudoinverse': 'pseudoinverse', 'randomexperiment': 'random experiment'}

In [84]:
print(text[1855:1868]), print(text[141:153])

pseudoinverse
\gls{matrix}


(None, None)

In [83]:
updated_text = text
for term, name in glossary.items():
    # Regex to match term name in text (whole words only)
    pattern = re.compile(rf'\b{name}\b', re.IGNORECASE)
    for match in pattern.finditer(updated_text):
        print(match)
        if not is_protected(match.start()):
            print(match)
            # start, end = match.span()
            # replacement = f'\\gls{{{term}}}'
            # updated_text = updated_text[:start] + replacement + updated_text[end:]
            # changes.append((name, start))


<re.Match object; span=(18, 31), match='pseudoinverse'>
<re.Match object; span=(40, 53), match='pseudoinverse'>
<re.Match object; span=(81, 94), match='pseudoinverse'>
<re.Match object; span=(109, 122), match='pseudoinverse'>
<re.Match object; span=(294, 307), match='pseudoinverse'>
<re.Match object; span=(737, 750), match='pseudoinverse'>
<re.Match object; span=(1044, 1057), match='pseudoinverse'>
<re.Match object; span=(1067, 1080), match='pseudoinverse'>
<re.Match object; span=(1855, 1868), match='pseudoinverse'>
<re.Match object; span=(1871, 1884), match='pseudoinverse'>
<re.Match object; span=(1128, 1145), match='random experiment'>
<re.Match object; span=(1164, 1181), match='random experiment'>
<re.Match object; span=(1188, 1205), match='random experiment'>
<re.Match object; span=(1453, 1470), match='random experiment'>
<re.Match object; span=(1808, 1825), match='random experiment'>
<re.Match object; span=(1960, 1977), match='random experiment'>
<re.Match object; span=(2033, 2050

In [77]:
replace_terms_with_gls(text, glossary)

("\\newglossaryentry{pseudoinverse}\n{name={pseudoinverse},\n\tdescription={The \\index{pseudoinverse}Moore–Penrose pseudoinverse $\\mA^{+}$ \n\t\tof a \\gls{matrix} $\\featuremtx \\in \\mathbb{R}^{\\samplesize \\times \\nrfeatures}$ \n\t\tgeneralizes the notion of an \\gls{inverse} \\cite{GolubVanLoanBook}. \n\t\tThe pseudoinverse arises naturally in \\gls{ridgeregression} for a \n\t\t\\gls{dataset} with \\gls{featuremtx} $\\featuremtx$ and \\gls{labelvec} \n\t\t$\\labelvec$ \\cite[Ch.\\ 3]{hastie01statisticallearning}. \n\t\tThe \\glspl{modelparam} learned by \\gls{ridgeregression} \n\t\tare given by\n\t\t\\[\n\t\t\\widehat{\\weights}^{(\\regparam)}  = \\big(\\featuremtx^{T} \\featuremtx + \\regparam \\mI \\big)^{-1} \\featuremtx^{T} \\vy, \\quad \\regparam > 0.\n\t\t\\]\n\t\tWe can then define the pseudoinverse $\\featuremtx^{+} \\in \\mathbb{R}^{\\nrfeatures \\times \\samplesize}$ via \n\t\tthe limit \\cite[Ch. 3]{benisrael2003generalized}\n\t\t\\[\n\t\t\\lim_{\\regparam \\to 0^+} 

In [88]:
# helper to test whether index is inside any (start,end) list
def inside_any(index, spans):
    return any(s <= index < e for s, e in spans)

In [115]:
def replace_terms_with_gls(tex_text, glossary):
    """
    Replace plain term occurrences with \\gls{<term>} except:
      - inside existing \\gls{...} commands (never replace),
      - inside the glossary definition of the same term (do not replace occurrences of 'term' inside its own definition),
      - but allow replacing the term when it appears inside OTHER glossary definitions.
    Returns updated_text, changes (list of (name, start_index) for replacements).
    """

    updated_text = tex_text
    changes = []

    # 1) Find spans of every \newglossaryentry{key}{...} and record by key
    glossary_defs = {}   # key -> (start, end)
    for m in re.finditer(r'\\newglossaryentry\{(.*?)\}\s*\{.*?\}(?=\s*\\newglossaryentry|$)', tex_text, re.DOTALL):
        print(m)
        key = m.group(1)
        glossary_defs[key] = (m.start(), m.end())

    # 2) Find spans of existing \gls{...} (never replace inside these)
    gls_spans = [ (m.start(), m.end()) for m in re.finditer(r'\\gls\{.*?\}', tex_text) ]

    # 3) Collect candidate replacements from the ORIGINAL text (not updated_text)
    #    This avoids index-shift problems when applying multiple replacements.
    replacements = []  # list of (start, end, replacement_text, name)

    for term, name in glossary.items():
        # compile pattern for whole-word, case-insensitive matching of the displayed name
        # re.escape(name) - Escapes any special regex characters inside `name`.
        # For example, if name = "C++", it becomes "C\+\+"
        # so '+' is treated literally, not as a regex operator.
        pattern = re.compile(rf'\b{re.escape(name)}\b', re.IGNORECASE)

        for m in pattern.finditer(tex_text):
            start, end = m.start(), m.end()

            # If inside any \gls{...}, skip
            if inside_any(start, gls_spans):
                continue

            # If inside this term's own glossary definition, skip
            # Glossary keys (term) should match the key used in \newglossaryentry{<key>}
            # Only protect if this key exists in glossary_defs
            def_span = glossary_defs.get(term)
            if def_span and def_span[0] <= start < def_span[1]:
                # match is inside its own definition -> skip replacing
                continue

            # Otherwise, this occurrence is OK to replace
            replacement_text = f'\\gls{{{term}}}'
            replacements.append((start, end, replacement_text, name))

    # 4) Sort replacements by start index descending, apply them to updated_text
    #    Applying from right to left prevents earlier replacements from shifting later spans.
    replacements.sort(key=lambda x: x[0], reverse=True)

    occupied = set()  # optional: track replaced positions to avoid overlapping replacements
    for start, end, repl, name in replacements:
        # Avoid overlapping replacements: if any position already replaced, skip this one
        # (This is a defensive measure — with whole-word patterns overlaps are unlikely.)
        if any(i in occupied for i in range(start, end)):
            continue

        updated_text = updated_text[:start] + repl + updated_text[end:]
        # mark occupied indices (relative to the original coordinates)
        # Note: indices in 'occupied' refer to original positions; because we apply right→left, they remain valid.
        for i in range(start, end):
            occupied.add(i)
        changes.append((name, start))

    # We returned changes in the order we applied (right->left). If you prefer chronological order,
    # you can sort changes by the index ascending here.
    changes.sort(key=lambda x: x[1])
    return updated_text, changes


In [116]:
replace_terms_with_gls(text, glossary)

<re.Match object; span=(0, 1083), match='\\newglossaryentry{pseudoinverse}\n{name={pseudoi>
<re.Match object; span=(1085, 5339), match="\\newglossaryentry{randomexperiment}\n{name={rand>


("\\newglossaryentry{pseudoinverse}\n{name={pseudoinverse},\n\tdescription={The \\index{pseudoinverse}Moore–Penrose pseudoinverse $\\mA^{+}$ \n\t\tof a \\gls{matrix} $\\featuremtx \\in \\mathbb{R}^{\\samplesize \\times \\nrfeatures}$ \n\t\tgeneralizes the notion of an \\gls{inverse} \\cite{GolubVanLoanBook}. \n\t\tThe pseudoinverse arises naturally in \\gls{ridgeregression} for a \n\t\t\\gls{dataset} with \\gls{featuremtx} $\\featuremtx$ and \\gls{labelvec} \n\t\t$\\labelvec$ \\cite[Ch.\\ 3]{hastie01statisticallearning}. \n\t\tThe \\glspl{modelparam} learned by \\gls{ridgeregression} \n\t\tare given by\n\t\t\\[\n\t\t\\widehat{\\weights}^{(\\regparam)}  = \\big(\\featuremtx^{T} \\featuremtx + \\regparam \\mI \\big)^{-1} \\featuremtx^{T} \\vy, \\quad \\regparam > 0.\n\t\t\\]\n\t\tWe can then define the pseudoinverse $\\featuremtx^{+} \\in \\mathbb{R}^{\\nrfeatures \\times \\samplesize}$ via \n\t\tthe limit \\cite[Ch. 3]{benisrael2003generalized}\n\t\t\\[\n\t\t\\lim_{\\regparam \\to 0^+} 

In [None]:
import re
from pathlib import Path

# helper to test whether index is inside any (start,end) list
def inside_any(index, spans):
    return any(s <= index < e for s, e in spans)

def extract_glossary_terms(tex_text):
    """Extract glossary entries and their names."""
    entries = re.findall(
        r'\\newglossaryentry\{(.*?)\}\s*\{.*?name=\{(.*?)\}',
        tex_text,
        re.DOTALL
    )
    return {term.strip(): name.strip() for term, name in entries}

def replace_terms_with_gls(tex_text, glossary):
    """
    Replace plain term occurrences with \\gls{<term>} except:
      - inside existing \\gls{...} commands (never replace),
      - inside the glossary definition of the same term (do not replace occurrences of 'term' inside its own definition),
      - but allow replacing the term when it appears inside OTHER glossary definitions.
    Returns updated_text, changes (list of (name, start_index) for replacements).
    """

    updated_text = tex_text
    changes = []

    # 1) Find spans of every \newglossaryentry{key}{...} and record by key
    glossary_defs = {}   # key -> (start, end)
    for m in re.finditer(r'\\newglossaryentry\{(.*?)\}\s*\{.*?\}(?=\s*\\newglossaryentry|$)', tex_text, re.DOTALL):
        print(m)
        key = m.group(1)
        glossary_defs[key] = (m.start(), m.end())

    # 2) Find spans of existing \gls{...} (never replace inside these)
    gls_spans = [ (m.start(), m.end()) for m in re.finditer(r'\\gls\{.*?\}', tex_text) ]

    # 3) Collect candidate replacements from the ORIGINAL text (not updated_text)
    #    This avoids index-shift problems when applying multiple replacements.
    replacements = []  # list of (start, end, replacement_text, name)

    for term, name in glossary.items():
        # compile pattern for whole-word, case-insensitive matching of the displayed name
        # re.escape(name) - Escapes any special regex characters inside `name`.
        # For example, if name = "C++", it becomes "C\+\+"
        # so '+' is treated literally, not as a regex operator.
        pattern = re.compile(rf'\b{re.escape(name)}\b', re.IGNORECASE)

        for m in pattern.finditer(tex_text):
            start, end = m.start(), m.end()

            # If inside any \gls{...}, skip
            if inside_any(start, gls_spans):
                continue

            # If inside this term's own glossary definition, skip
            # Glossary keys (term) should match the key used in \newglossaryentry{<key>}
            # Only protect if this key exists in glossary_defs
            def_span = glossary_defs.get(term)
            if def_span and def_span[0] <= start < def_span[1]:
                # match is inside its own definition -> skip replacing
                continue

            # Otherwise, this occurrence is OK to replace
            replacement_text = f'\\gls{{{term}}}'
            replacements.append((start, end, replacement_text, name))

    # 4) Sort replacements by start index descending, apply them to updated_text
    #    Applying from right to left prevents earlier replacements from shifting later spans.
    replacements.sort(key=lambda x: x[0], reverse=True)

    occupied = set()  # optional: track replaced positions to avoid overlapping replacements
    for start, end, repl, name in replacements:
        # Avoid overlapping replacements: if any position already replaced, skip this one
        # (This is a defensive measure — with whole-word patterns overlaps are unlikely.)
        if any(i in occupied for i in range(start, end)):
            continue

        updated_text = updated_text[:start] + repl + updated_text[end:]
        # mark occupied indices (relative to the original coordinates)
        # Note: indices in 'occupied' refer to original positions; because we apply right→left, they remain valid.
        for i in range(start, end):
            occupied.add(i)
        changes.append((name, start))

    # We returned changes in the order we applied (right->left). If you prefer chronological order,
    # you can sort changes by the index ascending here.
    changes.sort(key=lambda x: x[1])
    return updated_text, changes

def main():
    input_file = Path("test.tex")
    output_file = Path("out.tex")

    tex_text = input_file.read_text(encoding="utf-8")

    glossary = extract_glossary_terms(tex_text)
    print(f"Found {len(glossary)} glossary terms.")

    updated_text, changes = replace_terms_with_gls(tex_text, glossary)

    print(f"Applied {len(changes)} replacements.")
    for name, pos in changes[:10]:  # show sample
        print(f"  - {name} at position {pos}")

    output_file.write_text(updated_text, encoding="utf-8")
    print(f"Updated file written to {output_file}")

if __name__ == "__main__":
    main()
