In [None]:
import re
from pathlib import Path

class LatexIndexer:
    r"""Tag every occurrence of the listed index terms in a LaTeX file.

    Workflow: ``parse_index_list`` -> ``process_latex_file`` -> ``save_modified_file``.
    Every occurrence of a term between ``\begin{document}`` and ``\end{document}``
    gets an ``\index{term}`` command appended, and the makeidx boilerplate
    (``\usepackage{makeidx}``, ``\makeindex``, ``\printindex``) is added when missing.

    Attributes:
        index_entries: dict mapping term -> list of page strings (e.g. ``['3', '18-20']``).
        modified_content: list of output lines produced by ``process_latex_file``.
        similarity_threshold, max_pages: stored but not used by any method of this class.
        debug: when true, progress information is printed.
    """

    # One index-list line: "<term>, <page>[, <page>...]". A page may be a range such
    # as "18-20" (hyphen or en dash); trailing backslashes (LaTeX line breaks) are
    # ignored. The lazy term group makes the page list as long as possible.
    _ENTRY_RE = re.compile(
        r'(.*?),\s*'
        r'(\d+(?:\s*[-\u2013]\s*\d+)?(?:\s*,\s*\d+(?:\s*[-\u2013]\s*\d+)?)*)'
        r'\s*\\*\s*$'
    )

    def __init__(self, similarity_threshold=0.8, max_pages=10):
        self.index_entries = {}
        self.modified_content = []
        self.similarity_threshold = similarity_threshold
        self.max_pages = max_pages
        self.debug = True

    def parse_index_list(self, index_text):
        """Parse the index list into ``self.index_entries``.

        Args:
            index_text: text with one ``term, page, page, ...`` entry per line.
                Blank lines, lines starting with ``\\begin``/``\\end`` and lines
                that do not end in a page list are skipped.
        """
        for raw_line in index_text.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('\\begin') or line.startswith('\\end'):
                continue

            match = self._ENTRY_RE.match(line)
            if not match:
                continue

            term, pages = match.groups()
            term = term.strip()
            if not term:
                # An empty term would match at every position in the document.
                continue
            self.index_entries[term] = [p.strip() for p in pages.split(',')]

        if self.debug:
            print(f"Parsed {len(self.index_entries)} index terms")

    def _read_text(self, input_file):
        """Return the file's text, trying UTF-8 first and then the fallback encodings.

        Raises:
            ValueError: if the file cannot be decoded with any encoding.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            print("Trying with different encodings...")

        # Same fallback order as before; note latin-1 accepts any byte sequence.
        encodings = ['latin-1', 'utf-16', 'cp1252']
        for encoding in encodings:
            try:
                with open(input_file, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeError:
                continue
            print(f"Successfully read file with {encoding} encoding")
            return content

        raise ValueError(f"Could not read file with any of these encodings: utf-8, {', '.join(encodings)}")

    def _tag_terms(self, body):
        r"""Return ``body`` with ``\index{term}`` appended to every term occurrence.

        All occurrences are located in the untouched text first and the commands
        are spliced in afterwards, so a command inserted for one term can neither
        hide nor be matched by another term.
        """
        insertions = []  # (offset in body, term order, command)
        for order, term in enumerate(self.index_entries):
            if not term:
                continue
            # Look-arounds behave like \b for ordinary words but also work for
            # terms that begin or end with punctuation, e.g. "And operator (&&)".
            pattern = re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)')
            command = f'\\index{{{term}}}'
            for occurrence in pattern.finditer(body):
                insertions.append((occurrence.end(), order, command))

        pieces = []
        cursor = 0
        for offset, _, command in sorted(insertions):
            pieces.append(body[cursor:offset])
            pieces.append(command)
            cursor = offset
        pieces.append(body[cursor:])
        return ''.join(pieces)

    def process_latex_file(self, input_file):
        r"""Read the LaTeX file, tag the index terms and store the result.

        The result is stored as a list of lines in ``self.modified_content``.

        Args:
            input_file: path of the LaTeX source.

        Raises:
            ValueError: if the file cannot be decoded, or if ``\begin{document}``
                or ``\end{document}`` is missing.
        """
        content = self._read_text(input_file)

        if self.debug:
            print(f"File length: {len(content)} characters")
            print("First 100 characters:", content[:100])
            print("Contains \\begin{document}:", '\\begin{document}' in content)

        # Normalize line endings regardless of the debug flag.
        content = content.replace('\r\n', '\n').replace('\r', '\n')

        # Find document begin and end positions
        doc_begin = content.find('\\begin{document}')
        doc_end = content.find('\\end{document}')

        if doc_begin == -1:
            raise ValueError("Could not find \\begin{document}. Document content:\n" + content[:500] + "...")

        if doc_end == -1:
            raise ValueError("Could not find \\end{document}")

        # Split into preamble and body
        preamble = content[:doc_begin]
        body = content[doc_begin:doc_end]
        ending = content[doc_end:]

        # Add required packages to preamble if not present
        if '\\usepackage{makeidx}' not in preamble:
            preamble = preamble.rstrip() + '\n\\usepackage{makeidx}\n'

        if '\\makeindex' not in preamble:
            preamble = preamble.rstrip() + '\n\\makeindex\n'

        modified_body = self._tag_terms(body)

        # Add \printindex before \end{document}
        if '\\printindex' not in content:
            ending = '\n\\printindex\n' + ending

        final_content = preamble + modified_body + ending
        self.modified_content = final_content.split('\n')

        if self.debug:
            # Counts every \index command in the result, including pre-existing ones.
            index_count = len(re.findall(r'\\index{[^}]*}', final_content))
            print(f"Added {index_count} index tags")

    def save_modified_file(self, output_file):
        """Write ``self.modified_content`` to ``output_file`` (UTF-8).

        With ``debug`` set, also prints the pdflatex/makeindex command sequence.
        """
        content = '\n'.join(self.modified_content)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        if self.debug:
            print(f"Saved modified content to {output_file}")
            print("\nTo generate the index, run these commands in order:")
            print(f"1. pdflatex {output_file}")
            print(f"2. makeindex {output_file.replace('.tex', '.idx')}")
            print(f"3. pdflatex {output_file}")

def process_files(tex_file_path, index_file_path, output_file_path=None):
    """Run the parse -> process -> save pipeline for one LaTeX file.

    Args:
        tex_file_path: path of the LaTeX source to tag.
        index_file_path: path of the UTF-8 text file holding the index list.
        output_file_path: destination path; when omitted it is derived from
            ``tex_file_path`` by replacing '.tex' with '_indexed.tex'.

    Returns:
        The path the tagged document was written to.
    """
    output_file_path = (
        tex_file_path.replace('.tex', '_indexed.tex')
        if output_file_path is None
        else output_file_path
    )

    for message in (
        f"Processing LaTeX file: {tex_file_path}",
        f"Using index file: {index_file_path}",
        f"Output will be saved to: {output_file_path}",
    ):
        print(message)

    with open(index_file_path, 'r', encoding='utf-8') as index_file:
        index_text = index_file.read()

    latex_indexer = LatexIndexer()
    latex_indexer.parse_index_list(index_text)
    latex_indexer.process_latex_file(tex_file_path)
    latex_indexer.save_modified_file(output_file_path)

    return output_file_path

In [None]:
# Run the indexing pipeline; no output path is given, so the result is written to
# the input name with '.tex' replaced by '_indexed.tex'.
# NOTE(review): the recorded output below names different input files than this call,
# so it appears to come from an earlier run -- re-execute to refresh it.
process_files('output-gpt-processed-2.tex', 'sample-indices-3.txt')

Processing LaTeX file: output-gpt-processed.tex
Using index file: sample-indices-2.txt
Output will be saved to: output-gpt-processed_indexed.tex
Parsed 65 index terms
File length: 207983 characters
First 100 characters: \documentclass{book} \usepackage{graphicx} \usepackage{amsmath} \usepackage{amssymb} \usepackage[dvi
Contains \begin{document}: True
Added 77 index tags
Saved modified content to output-gpt-processed_indexed.tex

To generate the index, run these commands in order:
1. pdflatex output-gpt-processed_indexed.tex
2. makeindex output-gpt-processed_indexed.idx
3. pdflatex output-gpt-processed_indexed.tex


'output-gpt-processed_indexed.tex'

In [None]:
import re
from pathlib import Path
from typing import Dict, List, Tuple, Set

class PageAwareLatexIndexer:
    r"""Tag index terms in a LaTeX file, restricted to the pages each term is listed on.

    Pages are delimited in the source by ``% Page N`` comment markers. A term is
    tagged with ``\index{term}`` only inside the text that follows the marker of a
    page listed for that term in the index list.

    Attributes:
        index_entries: dict mapping term -> set of page numbers (ints).
        modified_content: list of output lines produced by ``process_latex_file``.
        debug: when true, progress information is printed.
    """

    # One index-list line: "<term>, <page>[, <page>...]". A page may be a range such
    # as "18-20" (hyphen or en dash); trailing backslashes are ignored. The lazy term
    # group makes the page list as long as possible, so "add, 29, 30" yields the
    # term "add" with two pages rather than the term "add, 29" with one.
    _ENTRY_RE = re.compile(
        r'^(?P<term>.*?),\s*'
        r'(?P<pages>\d+(?:\s*[-\u2013]\s*\d+)?(?:\s*,\s*\d+(?:\s*[-\u2013]\s*\d+)?)*)'
        r'\s*\\*\s*$'
    )
    _PAGE_MARKER_RE = re.compile(r'% Page (\d+)')
    _INDEX_CMD_RE = re.compile(r'\\index\{[^}]*\}')

    def __init__(self, debug=True):
        self.index_entries: Dict[str, Set[int]] = {}  # term -> set of pages
        self.modified_content: List[str] = []
        self.debug = debug

    def parse_index_list(self, index_text: str) -> None:
        """Parse the index list into ``self.index_entries``.

        Each line has the form ``term, page, page, ...`` where a page may be a
        range (``18-20``), which is expanded to every page it covers. Pages from
        repeated terms are merged. Lines without a trailing page list are skipped.
        """
        for raw_line in index_text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            entry = self._ENTRY_RE.match(line)
            if entry is None:
                continue

            term = entry.group('term').strip()
            if not term:
                continue

            pages = self.index_entries.setdefault(term, set())
            for page_item in entry.group('pages').split(','):
                # The regex guarantees digits on both sides of a range separator.
                bounds = [int(part) for part in re.split(r'[-\u2013]', page_item)]
                pages.update(range(min(bounds), max(bounds) + 1))

        if self.debug:
            print(f"Parsed {len(self.index_entries)} index terms")
            for term, pages in self.index_entries.items():
                print(f"Term: {term}, Pages: {sorted(pages)}")

    def extract_page_content(self, content: str) -> Dict[int, Tuple[int, int]]:
        """Locate page boundaries using the ``% Page X`` markers.

        Returns:
            Dict mapping page number -> ``(start, end)`` character offsets, where
            ``start`` is the offset of the marker and ``end`` is the offset of the
            next marker in the file (or ``len(content)`` for the last one). If a
            page number occurs more than once, the last occurrence wins.
        """
        markers = [(m.start(), int(m.group(1))) for m in self._PAGE_MARKER_RE.finditer(content)]

        page_ranges = {}
        for i, (start, page_num) in enumerate(markers):
            # The end is the next marker in file order, not the next page number,
            # so out-of-order page markers still give well-formed ranges.
            end = markers[i + 1][0] if i + 1 < len(markers) else len(content)
            page_ranges[page_num] = (start, end)

        if self.debug:
            print(f"Found {len(page_ranges)} pages: {sorted(page_ranges.keys())}")

        return page_ranges

    def _term_end_positions(self, segment: str, term: str) -> List[int]:
        r"""Return the end offsets of taggable occurrences of ``term`` in ``segment``.

        Occurrences directly followed by ``}`` or a backslash, and occurrences that
        lie inside an existing ``\index{...}`` argument, are not reported.
        """
        if not term:
            return []
        # Look-arounds behave like \b for ordinary words but also work for terms
        # that begin or end with punctuation (e.g. ". asm", "And operator (&&)").
        pattern = re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)(?![}\\])')
        protected = [m.span() for m in self._INDEX_CMD_RE.finditer(segment)]
        return [
            m.end()
            for m in pattern.finditer(segment)
            if not any(lo <= m.start() < hi for lo, hi in protected)
        ]

    @staticmethod
    def _apply_insertions(text: str, insertions) -> str:
        """Splice ``(offset, order, snippet)`` insertions into ``text``.

        Offsets refer to the unmodified ``text``; insertions at the same offset
        are emitted in ascending ``order``.
        """
        pieces = []
        cursor = 0
        for offset, _, snippet in sorted(insertions):
            pieces.append(text[cursor:offset])
            pieces.append(snippet)
            cursor = offset
        pieces.append(text[cursor:])
        return ''.join(pieces)

    def add_index_to_content(self, content: str, term: str, start_pos: int, end_pos: int) -> str:
        r"""Return ``content`` with ``\index{term}`` appended to each occurrence of
        ``term`` inside ``content[start_pos:end_pos]``; text outside is untouched."""
        segment = content[start_pos:end_pos]
        command = f'\\index{{{term}}}'
        insertions = [(end, 0, command) for end in self._term_end_positions(segment, term)]
        return content[:start_pos] + self._apply_insertions(segment, insertions) + content[end_pos:]

    def process_latex_file(self, input_file: str) -> None:
        r"""Read the LaTeX file and tag each term on its listed pages.

        The result is stored as a list of lines in ``self.modified_content``.
        ``\usepackage{makeidx}`` and ``\makeindex`` are added before
        ``\begin{document}`` and ``\printindex`` before ``\end{document}`` when
        they are not already present.

        Raises:
            ValueError: if ``\begin{document}`` is missing. Any exception is
                reported on stdout and re-raised.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Normalize line endings
            content = content.replace('\r\n', '\n').replace('\r', '\n')

            page_ranges = self.extract_page_content(content)

            doc_begin = content.find('\\begin{document}')
            if doc_begin == -1:
                raise ValueError("Could not find \\begin{document}")

            # Every offset below refers to the unmodified `content`; all edits are
            # applied in one pass at the end so no insertion can shift another.
            insertions = []
            for order, (term, pages) in enumerate(self.index_entries.items()):
                if self.debug:
                    print(f"\nProcessing term: {term}")

                command = f'\\index{{{term}}}'
                for page in sorted(pages):
                    if page not in page_ranges:
                        continue
                    start_pos, end_pos = page_ranges[page]
                    segment = content[start_pos:end_pos]
                    insertions.extend(
                        (start_pos + offset, order, command)
                        for offset in self._term_end_positions(segment, term)
                    )

            # Structural additions sort after any \index command at the same offset.
            structural = len(self.index_entries)

            preamble = content[:doc_begin]
            preamble_additions = ''
            if '\\usepackage{makeidx}' not in preamble:
                preamble_additions += '\\usepackage{makeidx}\n'
            if '\\makeindex' not in preamble:
                preamble_additions += '\\makeindex\n'
            if preamble_additions:
                insertions.append((doc_begin, structural, preamble_additions))

            # Add \printindex before \end{document}
            if '\\printindex' not in content:
                end_doc = content.find('\\end{document}')
                if end_doc != -1:
                    insertions.append((end_doc, structural, '\n\\printindex\n'))

            content = self._apply_insertions(content, insertions)
            self.modified_content = content.split('\n')

        except Exception as e:
            print(f"Error processing file: {e}")
            raise

    def save_modified_file(self, output_file: str) -> None:
        """Write ``self.modified_content`` to ``output_file`` (UTF-8).

        With ``debug`` set, also prints the pdflatex/makeindex command sequence.
        """
        content = '\n'.join(self.modified_content)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        if self.debug:
            print(f"\nSaved modified content to {output_file}")
            print("To generate the index, run these commands in order:")
            print(f"1. pdflatex {output_file}")
            print(f"2. makeindex {output_file.replace('.tex', '.idx')}")
            print(f"3. pdflatex {output_file}")

def process_files(tex_file_path: str, index_file_path: str, output_file_path: str = None) -> str:
    """Tag a LaTeX file with page-aware index commands and save the result.

    Args:
        tex_file_path: path of the LaTeX source to tag.
        index_file_path: path of the UTF-8 text file holding the index list.
        output_file_path: destination path; when omitted it is derived from
            ``tex_file_path`` by replacing '.tex' with '_indexed.tex'.

    Returns:
        The path the tagged document was written to.
    """
    destination = output_file_path
    if destination is None:
        destination = tex_file_path.replace('.tex', '_indexed.tex')

    banner = [
        f"Processing LaTeX file: {tex_file_path}",
        f"Using index file: {index_file_path}",
        f"Output will be saved to: {destination}",
    ]
    print("\n".join(banner))

    with open(index_file_path, 'r', encoding='utf-8') as index_handle:
        index_text = index_handle.read()

    page_indexer = PageAwareLatexIndexer(debug=True)
    page_indexer.parse_index_list(index_text)
    page_indexer.process_latex_file(tex_file_path)
    page_indexer.save_modified_file(destination)

    return destination

In [None]:
# Run the page-aware pipeline; output goes to 'output-gpt-processed-2_indexed.tex'.
# NOTE(review): in the recorded output, entries with several pages show up as
# "Term: add, 29, 30, 282, Pages: [334]" -- only the last page was parsed as a page.
process_files('output-gpt-processed-2.tex', 'sample-indices-3.txt')

Processing LaTeX file: output-gpt-processed-2.tex
Using index file: sample-indices-3.txt
Output will be saved to: output-gpt-processed-2_indexed.tex
Parsed 18 index terms
Term: Absolute address, Pages: [274]
Term: add, 29, 30, 282, Pages: [334]
Term: Addition, Pages: [314]
Term: Addition instructions, Pages: [31]
Term: ADDR operator, Pages: [18, 19, 20]
Term: Aliasing, Pages: [160]
Term: American Standard Code for Information Interchange (ASCII), 311, Pages: [312]
Term: and, Pages: [323]
Term: And operator (&&), Pages: [62]
Term: Arithmetic instructions, Pages: [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
Term: Arithmetic shift, Pages: [105, 106, 107, 108]
Term: . asm, Pages: [291, 292]
Term: Array of strings, Pages: [204, 205, 206]
Term: Arrays, Pages: [159, 160, 161, 162]
Term: 64-bit arrays, Pages: [258, 259, 260]
Term: floating-point arrays, Pages: [232, 233]
Term: Assembler, Pages: [1]
Term: Assembly language, Pages: [1, 2]
Found 230 pages: [16, 17, 18,

'output-gpt-processed-2_indexed.tex'

In [None]:
import re
from pathlib import Path
from typing import Dict, List, Set

class SimpleLatexIndexer:
    r"""Place ``\index{term}`` lines inside the pages each term is listed on.

    Pages are delimited by lines that start with ``% Page N``. Instead of locating
    the term in the text, an ``\index{term}`` line is inserted roughly one third
    of the way through every page the term is listed on.

    Attributes:
        index_entries: dict mapping term -> set of page numbers (ints).
        page_map: dict mapping page number -> original text of that page.
        modified_content: the processed document as one string.
        debug: when true, progress information is printed.
    """

    # One index-list line: "<term>, <page>[, <page>...]". A page may be a range such
    # as "18-20" (hyphen or en dash); trailing backslashes are ignored. The lazy term
    # group makes the page list as long as possible, so "add, 29, 30" yields the
    # term "add" with two pages rather than the term "add, 29" with one.
    _ENTRY_RE = re.compile(
        r'^(?P<term>.*?),\s*'
        r'(?P<pages>\d+(?:\s*[-\u2013]\s*\d+)?(?:\s*,\s*\d+(?:\s*[-\u2013]\s*\d+)?)*)'
        r'\s*\\*\s*$'
    )
    _PAGE_MARKER_RE = re.compile(r'% Page (\d+)')

    def __init__(self, debug=True):
        self.index_entries: Dict[str, Set[int]] = {}  # term -> set of pages
        self.page_map: Dict[int, str] = {}  # page number -> content
        # Initialised here so save_modified_file works even before processing.
        self.modified_content: str = ''
        self.debug = debug

    def parse_index_list(self, index_text: str) -> None:
        """Parse the index list into ``self.index_entries``.

        Each line has the form ``term, page, page, ...`` where a page may be a
        range (``18-20``), which is expanded to every page it covers. Pages from
        repeated terms are merged. Lines without a trailing page list are skipped.
        """
        for raw_line in index_text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            entry = self._ENTRY_RE.match(line)
            if entry is None:
                continue

            term = entry.group('term').strip()
            if not term:
                continue

            pages = self.index_entries.setdefault(term, set())
            for page_item in entry.group('pages').split(','):
                # The regex guarantees digits on both sides of a range separator.
                bounds = [int(part) for part in re.split(r'[-\u2013]', page_item)]
                pages.update(range(min(bounds), max(bounds) + 1))

        if self.debug:
            print(f"\nParsed {len(self.index_entries)} index terms")
            for term, pages in self.index_entries.items():
                print(f"Term: '{term}', Pages: {sorted(pages)}")

    def build_page_map(self, content: str) -> None:
        """Fill ``self.page_map`` with the text that follows each page marker.

        A page's text is every line after its ``% Page N`` marker up to the next
        marker (or the end of the content). Pages with no lines are not stored.
        """
        current_page = None
        current_content = []
        self.page_map = {}

        for line in content.split('\n'):
            page_match = self._PAGE_MARKER_RE.match(line)
            if page_match:
                # `is not None` rather than truthiness so that page 0 is kept.
                if current_page is not None and current_content:
                    self.page_map[current_page] = '\n'.join(current_content)
                current_page = int(page_match.group(1))
                current_content = []
            elif current_page is not None:
                current_content.append(line)

        # Save last page
        if current_page is not None and current_content:
            self.page_map[current_page] = '\n'.join(current_content)

        if self.debug:
            print(f"\nBuilt page map with {len(self.page_map)} pages")
            print(f"Page numbers: {sorted(self.page_map.keys())}")

    def _tag_page(self, page_num, page_lines, terms_by_page):
        r"""Return ``page_lines`` with one ``\index`` line added per term of the page.

        Each command is inserted about one third of the way through the page as it
        stands at that moment. Text before the first marker (``page_num is None``)
        and pages without lines are returned unchanged.
        """
        if page_num is None or not page_lines:
            return page_lines

        tagged = list(page_lines)
        for term in terms_by_page.get(page_num, ()):
            tagged.insert(len(tagged) // 3, f"\\index{{{term}}}")
            if self.debug:
                print(f"Added index for '{term}' on page {page_num}")
        return tagged

    def _insert_page_index_lines(self, content: str, terms_by_page) -> str:
        r"""Walk ``content`` line by line and add the ``\index`` lines to each page.

        Working on the line stream, rather than searching for the page text with
        ``str.replace``, means every term listed for a page is inserted and an
        identical stretch of text elsewhere in the file is never touched.
        """
        output_lines = []
        page_num = None
        page_lines = []

        for line in content.split('\n'):
            marker = self._PAGE_MARKER_RE.match(line)
            if marker:
                output_lines.extend(self._tag_page(page_num, page_lines, terms_by_page))
                output_lines.append(line)
                page_num = int(marker.group(1))
                page_lines = []
            elif page_num is None:
                output_lines.append(line)
            else:
                page_lines.append(line)

        output_lines.extend(self._tag_page(page_num, page_lines, terms_by_page))
        return '\n'.join(output_lines)

    def process_latex_file(self, input_file: str) -> None:
        r"""Process the LaTeX file and store the result in ``self.modified_content``.

        Adds the per-page ``\index`` lines, inserts ``\usepackage{makeidx}`` and
        ``\makeindex`` before ``\begin{document}`` when missing, and adds
        ``\printindex`` before ``\end{document}`` when missing. Any exception is
        reported on stdout and re-raised.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Build page map
            self.build_page_map(content)

            # Group the terms by page, keeping the order of the index list.
            terms_by_page = {}
            for term, pages in self.index_entries.items():
                for page_num in pages:
                    terms_by_page.setdefault(page_num, []).append(term)

            modified_content = self._insert_page_index_lines(content, terms_by_page)

            # Add packages to preamble; each command is checked on its own.
            doc_begin = modified_content.find('\\begin{document}')
            if doc_begin != -1:
                preamble = modified_content[:doc_begin]
                package_text = ''
                if '\\usepackage{makeidx}' not in preamble:
                    package_text += '\\usepackage{makeidx}\n'
                if '\\makeindex' not in preamble:
                    package_text += '\\makeindex\n'
                modified_content = (
                    modified_content[:doc_begin] + package_text + modified_content[doc_begin:]
                )

            # Add \printindex if not present
            if '\\printindex' not in modified_content:
                end_doc = modified_content.find('\\end{document}')
                if end_doc != -1:
                    modified_content = (
                        modified_content[:end_doc] +
                        '\n\\printindex\n\n' +
                        modified_content[end_doc:]
                    )

            self.modified_content = modified_content

        except Exception as e:
            print(f"Error processing file: {e}")
            raise

    def save_modified_file(self, output_file: str) -> None:
        """Write ``self.modified_content`` to ``output_file`` (UTF-8).

        With ``debug`` set, also prints the pdflatex/makeindex command sequence.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(self.modified_content)

        if self.debug:
            print(f"\nSaved modified content to {output_file}")
            print("\nTo generate the index, run:")
            print(f"1. pdflatex {output_file}")
            print(f"2. makeindex {output_file.replace('.tex', '.idx')}")
            print(f"3. pdflatex {output_file}")

def process_files(tex_file_path: str, index_file_path: str, output_file_path: str = None) -> str:
    """Run the simple per-page indexing pipeline and save the result.

    Args:
        tex_file_path: path of the LaTeX source to tag.
        index_file_path: path of the UTF-8 text file holding the index list.
        output_file_path: destination path; when omitted it is derived from
            ``tex_file_path`` by replacing '.tex' with '_indexed.tex'.

    Returns:
        The path the tagged document was written to.
    """
    target_path = (
        output_file_path
        if output_file_path is not None
        else tex_file_path.replace('.tex', '_indexed.tex')
    )

    for report_line in (
        f"\nProcessing files:",
        f"LaTeX file: {tex_file_path}",
        f"Index file: {index_file_path}",
        f"Output file: {target_path}",
    ):
        print(report_line)

    simple_indexer = SimpleLatexIndexer(debug=True)

    with open(index_file_path, 'r', encoding='utf-8') as index_handle:
        index_text = index_handle.read()

    simple_indexer.parse_index_list(index_text)
    simple_indexer.process_latex_file(tex_file_path)
    simple_indexer.save_modified_file(target_path)

    return target_path

In [None]:
# Run the simple per-page pipeline; output goes to 'output-gpt-processed-2_indexed.tex'.
# NOTE(review): the recorded output still shows multi-page entries parsed with the
# earlier pages folded into the term (e.g. 'add, 29, 30, 282' -> [334]).
process_files('output-gpt-processed-2.tex', 'sample-indices-3.txt')


Processing files:
LaTeX file: output-gpt-processed-2.tex
Index file: sample-indices-3.txt
Output file: output-gpt-processed-2_indexed.tex

Parsed 18 index terms
Term: 'Absolute address', Pages: [274]
Term: 'add, 29, 30, 282', Pages: [334]
Term: 'Addition', Pages: [314]
Term: 'Addition instructions', Pages: [31]
Term: 'ADDR operator', Pages: [18, 19, 20]
Term: 'Aliasing', Pages: [160]
Term: 'American Standard Code for Information Interchange (ASCII), 311', Pages: [312]
Term: 'and', Pages: [323]
Term: 'And operator (&&)', Pages: [62]
Term: 'Arithmetic instructions', Pages: [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
Term: 'Arithmetic shift', Pages: [105, 106, 107, 108]
Term: '. asm', Pages: [291, 292]
Term: 'Array of strings', Pages: [204, 205, 206]
Term: 'Arrays', Pages: [159, 160, 161, 162]
Term: '64-bit arrays', Pages: [258, 259, 260]
Term: 'floating-point arrays', Pages: [232, 233]
Term: 'Assembler', Pages: [1]
Term: 'Assembly language', Pages: [1, 2]

B

'output-gpt-processed-2_indexed.tex'

In [None]:
# Compile the indexed document with pdflatex via a shell escape.
# NOTE(review): the recorded output shows "pdflatex: command not found", i.e. no TeX
# distribution was installed in the environment this was run in.
!pdflatex output-gpt-processed_indexed.tex

/bin/bash: line 1: pdflatex: command not found


In [None]:
import re
from pathlib import Path

class LatexIndexer:
    r"""Tag exact occurrences of index terms in a LaTeX file, line by line.

    Workflow: ``parse_index_list`` -> ``process_latex_file`` -> ``add_index_package``
    -> ``save_modified_file``.

    Attributes:
        index_entries: dict mapping term -> list of page strings (e.g. ``['3', '18-20']``).
            The pages are stored but not used when tagging.
        modified_content: list of output lines.
    """

    # One index-list line: "<term>, <page>[, <page>...]". A page may be a range such
    # as "18-20" (hyphen or en dash); trailing backslashes (LaTeX line breaks) are
    # ignored. The lazy term group makes the page list as long as possible.
    _ENTRY_RE = re.compile(
        r'(.*?),\s*'
        r'(\d+(?:\s*[-\u2013]\s*\d+)?(?:\s*,\s*\d+(?:\s*[-\u2013]\s*\d+)?)*)'
        r'\s*\\*\s*$'
    )

    def __init__(self):
        self.index_entries = {}
        self.modified_content = []

    def parse_index_list(self, index_text):
        """Parse the index list into ``self.index_entries``.

        Blank lines, lines starting with ``\\begin``/``\\end`` and lines that do
        not end in a page list are skipped.
        """
        for raw_line in index_text.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('\\begin') or line.startswith('\\end'):
                continue

            match = self._ENTRY_RE.match(line)
            if not match:
                continue

            term, pages = match.groups()
            term = term.strip()
            if not term:
                # An empty term would match at every position in the document.
                continue
            # Store as strings to preserve the exact page numbers we want in the index
            self.index_entries[term] = [p.strip() for p in pages.split(',')]

    def find_term_in_text(self, text, term):
        """Return the regex matches of every whole-term occurrence of ``term`` in ``text``.

        The term must not be directly preceded or followed by a word character.
        LaTeX commands in ``text`` get no special treatment.
        """
        if not term:
            return []
        escaped_term = re.escape(term).replace('\\ ', ' ')
        # Look-arounds behave like \b for ordinary words but also work for terms
        # that begin or end with punctuation, e.g. "And operator (&&)".
        pattern = rf'(?<!\w){escaped_term}(?!\w)'
        return list(re.finditer(pattern, text))

    @staticmethod
    def _apply_insertions(text, insertions):
        """Splice ``(offset, order, snippet)`` insertions into ``text``.

        Offsets refer to the unmodified ``text``; insertions at the same offset
        are emitted in ascending ``order``.
        """
        pieces = []
        cursor = 0
        for offset, _, snippet in sorted(insertions):
            pieces.append(text[cursor:offset])
            pieces.append(snippet)
            cursor = offset
        pieces.append(text[cursor:])
        return ''.join(pieces)

    def process_latex_file(self, input_file):
        r"""Read the LaTeX file and append ``\index{term}`` after every term occurrence.

        The result is stored as a list of lines in ``self.modified_content``.
        """
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')

        # Matches are located in the ORIGINAL lines and applied in one pass per line,
        # so an earlier insertion can never shift the offset of a later one.
        pending = {}  # line number -> [(offset, term order, command), ...]
        for order, term in enumerate(self.index_entries):
            index_cmd = f'\\index{{{term}}}'
            for i, line in enumerate(lines):
                for match in self.find_term_in_text(line, term):
                    pending.setdefault(i, []).append((match.end(), order, index_cmd))

        modified_lines = list(lines)
        for i, insertions in pending.items():
            modified_lines[i] = self._apply_insertions(lines[i], insertions)

        self.modified_content = modified_lines

    def save_modified_file(self, output_file):
        """Write ``self.modified_content`` to ``output_file`` (UTF-8), joined by newlines."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.modified_content))

    def add_index_package(self):
        r"""Add the makeidx boilerplate to ``self.modified_content`` when missing.

        ``\usepackage{makeidx}`` and ``\makeindex`` are inserted just before the
        first line containing ``\begin{document}`` (nothing is inserted if there is
        no such line). ``\printindex`` is inserted before the first line containing
        ``\end{document}``, or appended at the end if there is none.
        """
        preamble_end = -1
        for i, line in enumerate(self.modified_content):
            if '\\begin{document}' in line:
                preamble_end = i
                break

        if preamble_end != -1:
            preamble = '\n'.join(self.modified_content[:preamble_end])
            packages_to_add = []

            if '\\usepackage{makeidx}' not in preamble:
                packages_to_add.append('\\usepackage{makeidx}')
            if '\\makeindex' not in preamble:
                packages_to_add.append('\\makeindex')

            if packages_to_add:
                self.modified_content[preamble_end:preamble_end] = packages_to_add

        # Add \printindex before \end{document} if not present
        doc_end = len(self.modified_content)
        for i, line in enumerate(self.modified_content):
            if '\\end{document}' in line:
                doc_end = i
                break

        if '\\printindex' not in '\n'.join(self.modified_content):
            self.modified_content.insert(doc_end, '\\printindex')

def process_files(tex_file_path, index_file_path, output_file_path=None):
    """Tag a LaTeX file with index commands, add the makeidx boilerplate and save it.

    Args:
        tex_file_path: path of the LaTeX source to tag.
        index_file_path: path of the UTF-8 text file holding the index list.
        output_file_path: destination path; when omitted it is derived from
            ``tex_file_path`` by replacing '.tex' with '_indexed.tex'.

    Returns:
        The path the tagged document was written to.
    """
    destination = output_file_path
    if destination is None:
        destination = tex_file_path.replace('.tex', '_indexed.tex')

    with open(index_file_path, 'r', encoding='utf-8') as index_handle:
        index_text = index_handle.read()

    exact_indexer = LatexIndexer()
    exact_indexer.parse_index_list(index_text)
    exact_indexer.process_latex_file(tex_file_path)
    exact_indexer.add_index_package()
    exact_indexer.save_modified_file(destination)

    return destination

In [None]:
import re
from pathlib import Path
from difflib import SequenceMatcher

class Match:
    """Minimal stand-in for ``re.Match`` that also carries a similarity score.

    Exposes ``start()``, ``end()`` and ``group()`` so exact (regex) and fuzzy
    matches can be handled through the same interface.
    """

    def __init__(self, start_pos, end_pos, matched_text, similarity=1.0):
        # Offsets and text are private; read them through the accessor methods.
        self._start, self._end = start_pos, end_pos
        self._text = matched_text
        self.similarity = similarity

    def start(self):
        """Return the offset at which the match begins."""
        return self._start

    def end(self):
        """Return the offset just past the end of the match."""
        return self._end

    def group(self):
        """Return the matched text."""
        return self._text

class LatexIndexer:
    r"""Tag index terms in a LaTeX file, falling back to fuzzy matching per line.

    For every line an exact whole-term match is tried first; when a line has no
    exact match, windows of neighbouring words are compared with the term and
    accepted when their similarity ratio reaches ``similarity_threshold``.

    Attributes:
        index_entries: dict mapping term -> list of page strings (e.g. ``['3', '18-20']``).
            The pages are stored but not used when tagging.
        modified_content: list of output lines.
        similarity_threshold: minimum ``SequenceMatcher`` ratio for a fuzzy match.
    """

    # One index-list line: "<term>, <page>[, <page>...]". A page may be a range such
    # as "18-20" (hyphen or en dash); trailing backslashes (LaTeX line breaks) are
    # ignored. The lazy term group makes the page list as long as possible.
    _ENTRY_RE = re.compile(
        r'(.*?),\s*'
        r'(\d+(?:\s*[-\u2013]\s*\d+)?(?:\s*,\s*\d+(?:\s*[-\u2013]\s*\d+)?)*)'
        r'\s*\\*\s*$'
    )

    def __init__(self, similarity_threshold=0.8):
        self.index_entries = {}
        self.modified_content = []
        self.similarity_threshold = similarity_threshold

    def parse_index_list(self, index_text):
        """Parse the index list into ``self.index_entries``.

        Blank lines, lines starting with ``\\begin``/``\\end`` and lines that do
        not end in a page list are skipped.
        """
        for raw_line in index_text.split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('\\begin') or line.startswith('\\end'):
                continue

            match = self._ENTRY_RE.match(line)
            if not match:
                continue

            term, pages = match.groups()
            term = term.strip()
            if not term:
                # An empty term would match at every position in the document.
                continue
            self.index_entries[term] = [p.strip() for p in pages.split(',')]

    def calculate_similarity(self, str1, str2):
        """Return the case-insensitive ``SequenceMatcher`` ratio of the two strings."""
        return SequenceMatcher(None, str1.lower(), str2.lower()).ratio()

    def find_best_match_in_text(self, text, term):
        """Find occurrences of ``term`` in ``text``.

        Returns:
            A list of ``Match`` objects. If the term occurs exactly, all exact
            occurrences (similarity 1.0) in text order; otherwise every word window
            whose similarity reaches the threshold, best first. Offsets always
            refer to positions in ``text``.
        """
        if not term.strip():
            return []

        # First try exact match. Look-arounds behave like \b for ordinary words but
        # also work for terms that begin or end with punctuation.
        escaped_term = re.escape(term).replace('\\ ', ' ')
        pattern = rf'(?<!\w){escaped_term}(?!\w)'

        exact_matches = [
            Match(m.start(), m.end(), m.group(), 1.0)
            for m in re.finditer(pattern, text)
        ]
        if exact_matches:
            return exact_matches

        # If no exact match, try fuzzy matching. Keep each word's real offsets so
        # positions stay correct with leading or repeated whitespace.
        tokens = [(m.group(), m.start(), m.end()) for m in re.finditer(r'\S+', text)]
        best_matches = []

        # Look for matches in sliding windows of various sizes
        term_word_count = len(term.split())
        window_sizes = range(max(1, term_word_count - 1), term_word_count + 2)

        for window_size in window_sizes:
            for i in range(len(tokens) - window_size + 1):
                window_tokens = tokens[i:i + window_size]
                window = ' '.join(word for word, _, _ in window_tokens)
                similarity = self.calculate_similarity(term, window)

                if similarity >= self.similarity_threshold:
                    start_pos = window_tokens[0][1]
                    end_pos = window_tokens[-1][2]
                    best_matches.append(Match(start_pos, end_pos, window, similarity))

        # Sort by similarity and return the best matches
        return sorted(best_matches, key=lambda x: x.similarity, reverse=True)

    @staticmethod
    def _apply_insertions(text, insertions):
        """Splice ``(offset, order, snippet)`` insertions into ``text``.

        Offsets refer to the unmodified ``text``; insertions at the same offset
        are emitted in ascending ``order``.
        """
        pieces = []
        cursor = 0
        for offset, _, snippet in sorted(insertions):
            pieces.append(text[cursor:offset])
            pieces.append(snippet)
            cursor = offset
        pieces.append(text[cursor:])
        return ''.join(pieces)

    def process_latex_file(self, input_file):
        r"""Read the LaTeX file and append ``\index{term}`` after each matched term.

        Lines whose first non-blank character is a backslash are skipped. On a
        line with exact matches every occurrence is tagged; on a line with only
        fuzzy matches just the best one is tagged. Terms that match nowhere are
        listed in a warning. The result is stored in ``self.modified_content``.
        """
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')
        unmatched_terms = []

        # Matches are located in the ORIGINAL lines and applied in one pass per line,
        # so an earlier insertion can never shift or overwrite a later one.
        pending = {}  # line number -> [(offset, term order, command), ...]

        for order, term in enumerate(self.index_entries):
            term_found = False
            index_cmd = f'\\index{{{term}}}'

            for i, line in enumerate(lines):
                # Skip lines with LaTeX commands
                if line.strip().startswith('\\'):
                    continue

                matches = self.find_best_match_in_text(line, term)
                if not matches:
                    continue

                term_found = True
                for match in matches:
                    pending.setdefault(i, []).append((match.end(), order, index_cmd))
                    # Only use the first match if it's a fuzzy match
                    if match.similarity < 1.0:
                        break

            if not term_found:
                unmatched_terms.append(term)

        modified_lines = list(lines)
        for i, insertions in pending.items():
            modified_lines[i] = self._apply_insertions(lines[i], insertions)

        self.modified_content = modified_lines
        if unmatched_terms:
            print("Warning: The following terms could not be matched:")
            for term in unmatched_terms:
                print(f"  - {term}")

    def save_modified_file(self, output_file):
        """Write ``self.modified_content`` to ``output_file`` (UTF-8), joined by newlines."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.modified_content))

    def add_index_package(self):
        r"""Add the makeidx boilerplate to ``self.modified_content`` when missing.

        ``\usepackage{makeidx}`` and ``\makeindex`` are inserted just before the
        first line containing ``\begin{document}`` (nothing is inserted if there is
        no such line). ``\printindex`` is inserted before the first line containing
        ``\end{document}``, or appended at the end if there is none.
        """
        preamble_end = -1
        for i, line in enumerate(self.modified_content):
            if '\\begin{document}' in line:
                preamble_end = i
                break

        if preamble_end != -1:
            preamble = '\n'.join(self.modified_content[:preamble_end])
            packages_to_add = []

            if '\\usepackage{makeidx}' not in preamble:
                packages_to_add.append('\\usepackage{makeidx}')
            if '\\makeindex' not in preamble:
                packages_to_add.append('\\makeindex')

            if packages_to_add:
                self.modified_content[preamble_end:preamble_end] = packages_to_add

        # Add \printindex before \end{document} if not present
        doc_end = len(self.modified_content)
        for i, line in enumerate(self.modified_content):
            if '\\end{document}' in line:
                doc_end = i
                break

        if '\\printindex' not in '\n'.join(self.modified_content):
            self.modified_content.insert(doc_end, '\\printindex')

def process_files(tex_file_path, index_file_path, output_file_path=None, similarity_threshold=0.8):
    """Run the complete indexing pipeline on one LaTeX file.

    Reads the index list from ``index_file_path``, lets a ``LatexIndexer``
    parse it and process ``tex_file_path``, adds the index packages and
    writes the result.

    Args:
        tex_file_path: Path of the LaTeX source to index.
        index_file_path: Path of the UTF-8 text file holding the index list.
        output_file_path: Where to write the result. When ``None`` the path
            is derived from ``tex_file_path`` by inserting ``_indexed`` in
            front of the extension (``paper.tex`` -> ``paper_indexed.tex``).
        similarity_threshold: Passed through to ``LatexIndexer``.

    Returns:
        The path the modified file was written to.
    """
    if output_file_path is None:
        # Local import so the function does not depend on another cell's imports.
        import os

        # Only the final extension is touched. The previous
        # str.replace('.tex', ...) rewrote every '.tex' in the path and, for
        # names without '.tex', returned the input path itself, so the
        # source file was overwritten.
        root, ext = os.path.splitext(tex_file_path)
        output_file_path = f"{root}_indexed{ext or '.tex'}"

    with open(index_file_path, 'r', encoding='utf-8') as f:
        index_content = f.read()

    indexer = LatexIndexer(similarity_threshold=similarity_threshold)
    indexer.parse_index_list(index_content)
    indexer.process_latex_file(tex_file_path)
    indexer.add_index_package()
    indexer.save_modified_file(output_file_path)

    return output_file_path

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cd /content/drive/MyDrive/pdf2latex/

In [None]:
# Index the processed LaTeX file; no output path is passed, so process_files
# derives one from the input file name and returns it.
process_files('output-gpt-processed.tex', 'sample-indices-2.txt')

  - A/B testing
  - academic data
  - accuracy
  - activation function
  - AdaBoost
  - add-one discounting
  - agglomerative cluster trees
  - Akaike information criterion
  - algorithm analysis
  - Turkers
  - American basketball players
  - analogies
  - anchoring
  - angular distance
  - Anscombe's Quartet
  - Apple iPhone sales
  - Aristotle
  - Arrow's impossibility theorem
  - Ascombe quartet
  - associativity
  - Babbage, Charles
  - backpropagation
  - Bacon, Kevin
  - balanced training classes
  - bar charts
  - Barzun, Jacques
  - baseball encyclopedia
  - baseline models
  - for value prediction
  - Baysian information criteria
  - lexicographic
  - temporal
  - bias–variance trade-off
  - big oh analysis
  - binary relations
  - binary search


'output-gpt-processed_indexed.tex'

In [None]:
import re
from pathlib import Path
from difflib import SequenceMatcher

class Match:
    """Small match record exposing the ``start()``/``end()``/``group()`` accessors.

    Besides the span and the matched text it carries two public attributes:
    ``page_num`` (defaults to ``None``) and ``similarity`` (defaults to 1.0).
    """

    def __init__(self, start_pos, end_pos, matched_text, page_num=None, similarity=1.0):
        """Store the span, the matched text and the optional metadata."""
        self._start, self._end, self._text = start_pos, end_pos, matched_text
        self.page_num, self.similarity = page_num, similarity

    def start(self):
        """Return the start offset given at construction."""
        return self._start

    def end(self):
        """Return the end offset given at construction."""
        return self._end

    def group(self):
        """Return the matched text given at construction."""
        return self._text

class LatexIndexer:
    """Insert ``\\index{term}`` commands into a LaTeX file, page by page.

    Each index term comes with the page numbers it belongs to. Pages are
    recognised from explicit page-break commands and from
    ``\\setcounter{page}{N}`` in the source (see ``find_page_boundaries``);
    a term is only indexed on lines that fall on one of its pages.
    """

    # Any of these commands on a line closes the current page.
    _PAGE_BREAK_RE = re.compile(r'\\(?:newpage|pagebreak|clearpage|cleardoublepage)')
    # Explicit renumbering of the current page.
    _PAGE_SET_RE = re.compile(r'\\setcounter{page}{(\d+)}')
    # One whitespace-delimited word; used to obtain true character offsets.
    _WORD_RE = re.compile(r'\S+')

    def __init__(self, similarity_threshold=0.8):
        """Create an empty indexer.

        Args:
            similarity_threshold: Minimum ``SequenceMatcher`` ratio (0..1) a
                fuzzy candidate must reach to count as a match.
        """
        self.index_entries = {}  # Will store {term: [page_numbers]}
        self.modified_content = []  # Output lines, filled by process_latex_file
        self.similarity_threshold = similarity_threshold
        self.page_boundaries = {}  # Will store {page_number: (start_line, end_line)}

    def parse_index_list(self, index_text):
        """Parse the index list into a dictionary of terms and their pages.

        Every non-empty line of the form ``term, 12, 34`` (an optional
        trailing backslash is allowed) adds ``term -> [12, 34]`` to
        ``self.index_entries``; lines starting with ``\\begin`` or ``\\end``
        and lines that do not match are ignored.
        """
        lines = [line.strip() for line in index_text.split('\n') if line.strip()]
        lines = [line for line in lines if not line.startswith('\\begin') and not line.startswith('\\end')]

        for line in lines:
            match = re.match(r'(.*?),\s*((?:\d+(?:,\s*\d+)*))\\?$', line)
            if match:
                term, pages = match.groups()
                term = term.strip()
                # Convert page numbers to integers
                pages = [int(p.strip()) for p in pages.split(',')]
                self.index_entries[term] = sorted(pages)
                print(f"Parsed term: {term} with pages: {pages}")  # Debug output

    def find_page_boundaries(self, content):
        """Find the line ranges for each page in the document.

        Fills ``self.page_boundaries`` with ``{page: (start_line, end_line)}``
        where ``end_line`` is the line that closed the page (treated as
        inclusive by ``is_line_in_pages``). Boundaries left over from an
        earlier call are discarded first.
        """
        self.page_boundaries = {}
        lines = content.split('\n')
        current_page = 1
        start_line = 0

        for i, line in enumerate(lines):
            # Check for page number setting commands
            page_set = self._PAGE_SET_RE.search(line)
            if page_set:
                if start_line < i:
                    # Close the page that was open *before* the counter
                    # changed. Keying by the new number minus one mislabels
                    # the range whenever the counter jumps.
                    self.page_boundaries[current_page] = (start_line, i)
                current_page = int(page_set.group(1))
                start_line = i

            # Check for page breaks
            if self._PAGE_BREAK_RE.search(line):
                self.page_boundaries[current_page] = (start_line, i)
                current_page += 1
                start_line = i + 1

        # Add the last page
        self.page_boundaries[current_page] = (start_line, len(lines))
        print(f"Found page boundaries: {self.page_boundaries}")  # Debug output

    def is_line_in_pages(self, line_num, target_pages):
        """Check if a line falls within any of the target pages."""
        for page, (start, end) in self.page_boundaries.items():
            if start <= line_num <= end and page in target_pages:
                return True
        return False

    def _candidate_lines(self, target_pages, line_count):
        """Return the sorted line indices that lie on any of ``target_pages``."""
        selected = set()
        for page in target_pages:
            bounds = self.page_boundaries.get(page)
            if bounds is None:
                continue
            start, end = bounds
            # end is inclusive, matching is_line_in_pages.
            selected.update(range(start, min(end + 1, line_count)))
        return sorted(selected)

    def process_latex_file(self, input_file):
        """Read the LaTeX file and insert ``\\index`` commands on the target pages.

        The result is stored in ``self.modified_content`` as a list of lines.
        Terms for which no line on their pages matched are reported with a
        printed warning.

        Args:
            input_file: Path of the UTF-8 LaTeX source.
        """
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # First find page boundaries
        self.find_page_boundaries(content)

        lines = content.split('\n')
        # {line index: [(offset in the ORIGINAL line, command), ...]}.
        # All offsets refer to the unmodified text and are applied in one
        # go at the end, so insertions never shift one another.
        insertions = {}
        unmatched_terms = []

        for term, target_pages in self.index_entries.items():
            index_cmd = f'\\index{{{term}}}'
            term_found = False

            # Only process lines within the specified pages
            for i in self._candidate_lines(target_pages, len(lines)):
                line = lines[i]

                # Skip lines with LaTeX commands
                if line.strip().startswith('\\'):
                    continue

                matches = self.find_best_match_in_text(line, term)
                if not matches:
                    continue

                term_found = True
                for match in matches:
                    insertions.setdefault(i, []).append((match.end(), index_cmd))
                    # Only use the first match if it's a fuzzy match
                    if match.similarity < 1.0:
                        break

            if not term_found:
                unmatched_terms.append(f"{term} (pages {target_pages})")

        modified_lines = list(lines)
        for i, pending in insertions.items():
            text = lines[i]
            # Right-to-left keeps the remaining (smaller) offsets valid.
            for pos, cmd in sorted(pending, key=lambda item: item[0], reverse=True):
                text = text[:pos] + cmd + text[pos:]
            modified_lines[i] = text

        self.modified_content = modified_lines
        if unmatched_terms:
            print("Warning: The following terms could not be matched on their specified pages:")
            for term in unmatched_terms:
                print(f"  - {term}")

    def save_modified_file(self, output_file):
        """Save the modified content to a new file (UTF-8, newline-joined)."""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.modified_content))

    def add_index_package(self):
        """Add required index packages if not present.

        Inserts ``\\usepackage{makeidx}`` / ``\\makeindex`` in front of
        ``\\begin{document}`` when missing from the preamble, and
        ``\\printindex`` in front of ``\\end{document}`` (or at the end of
        the content) when it appears nowhere.
        """
        preamble_end = -1
        for i, line in enumerate(self.modified_content):
            if '\\begin{document}' in line:
                preamble_end = i
                break

        if preamble_end != -1:
            preamble = '\n'.join(self.modified_content[:preamble_end])
            packages_to_add = []

            if '\\usepackage{makeidx}' not in preamble:
                packages_to_add.append('\\usepackage{makeidx}')
            if '\\makeindex' not in preamble:
                packages_to_add.append('\\makeindex')

            if packages_to_add:
                self.modified_content[preamble_end:preamble_end] = packages_to_add

        # Add \printindex before \end{document} if not present
        doc_end = len(self.modified_content)
        for i, line in enumerate(self.modified_content):
            if '\\end{document}' in line:
                doc_end = i
                break

        if '\\printindex' not in '\n'.join(self.modified_content):
            self.modified_content.insert(doc_end, '\\printindex')

    def calculate_similarity(self, str1, str2):
        """Calculate the case-insensitive similarity ratio between two strings."""
        return SequenceMatcher(None, str1.lower(), str2.lower()).ratio()

    def find_best_match_in_text(self, text, term):
        """Find where ``term`` occurs in ``text``, exactly or approximately.

        Args:
            text: The text (one source line) to search.
            term: The index term to look for.

        Returns:
            A list of ``Match`` objects whose offsets refer to ``text``.
            All whole-word exact occurrences (similarity 1.0) are returned
            when there is at least one; otherwise every word window that
            reaches ``self.similarity_threshold``, best first. Empty when
            nothing qualifies or ``term`` is blank.
        """
        if not term.strip():
            return []

        # First try exact match
        escaped_term = re.escape(term).replace('\\ ', ' ')
        pattern = rf'\b{escaped_term}\b'

        # similarity must be passed by keyword: the fourth positional
        # parameter of Match is page_num.
        exact_matches = [
            Match(m.start(), m.end(), m.group(), similarity=1.0)
            for m in re.finditer(pattern, text)
        ]
        if exact_matches:
            return exact_matches

        # If no exact match, try fuzzy matching on word windows. Word spans
        # are taken from the text itself so offsets stay correct with
        # leading whitespace, tabs or repeated spaces.
        word_spans = [(m.start(), m.end()) for m in self._WORD_RE.finditer(text)]
        best_matches = []

        # Look for matches in sliding windows of various sizes
        term_word_count = len(term.split())
        window_sizes = range(max(1, term_word_count - 1), term_word_count + 2)

        for window_size in window_sizes:
            for first in range(len(word_spans) - window_size + 1):
                last = first + window_size - 1
                window = ' '.join(text[s:e] for s, e in word_spans[first:last + 1])
                similarity = self.calculate_similarity(term, window)

                if similarity >= self.similarity_threshold:
                    best_matches.append(Match(
                        word_spans[first][0],
                        word_spans[last][1],
                        window,
                        similarity=similarity,
                    ))

        # Sort by similarity and return the best matches
        return sorted(best_matches, key=lambda found: found.similarity, reverse=True)

def process_files(tex_file_path, index_file_path, output_file_path=None, similarity_threshold=0.8):
    """Index one LaTeX file using the page-aware ``LatexIndexer``.

    The index list is read from ``index_file_path`` and parsed, the LaTeX
    file is processed, the index packages are added and the result is saved.

    Args:
        tex_file_path: Path of the LaTeX source to index.
        index_file_path: Path of the UTF-8 text file holding the index list.
        output_file_path: Destination path. When ``None`` it is derived from
            ``tex_file_path`` by inserting ``_indexed`` before the extension.
        similarity_threshold: Passed through to ``LatexIndexer``.

    Returns:
        The path the modified file was written to.
    """
    if output_file_path is None:
        # Local import so the function does not rely on another cell's imports.
        import os

        # Touch only the final extension: str.replace('.tex', ...) altered
        # every '.tex' in the path and left names without '.tex' unchanged,
        # which made the output overwrite the input file.
        root, ext = os.path.splitext(tex_file_path)
        output_file_path = f"{root}_indexed{ext or '.tex'}"

    with open(index_file_path, 'r', encoding='utf-8') as f:
        index_content = f.read()

    indexer = LatexIndexer(similarity_threshold=similarity_threshold)
    indexer.parse_index_list(index_content)
    indexer.process_latex_file(tex_file_path)
    indexer.add_index_package()
    indexer.save_modified_file(output_file_path)

    return output_file_path

In [None]:
# Re-run the indexing with the page-aware LatexIndexer defined in the cell
# above; the output path is again derived from the input file name.
process_files('output-gpt-processed.tex', 'sample-indices-2.txt')

Parsed term: A/B testing with pages: [86]
Parsed term: Aaron Schwartz case with pages: [68]
Parsed term: AB testing with pages: [137]
Parsed term: academic data with pages: [66]
Parsed term: accuracy with pages: [215, 228]
Parsed term: activation function with pages: [380]
Parsed term: AdaBoost with pages: [364]
Parsed term: add-one discounting with pages: [357]
Parsed term: agglomerative cluster trees with pages: [338]
Parsed term: aggregation mechanisms with pages: [83]
Parsed term: Akaike information criterion with pages: [289, 335]
Parsed term: algorithm analysis with pages: [397]
Parsed term: Amazon Turk with pages: [67, 84]
Parsed term: tasks assigned with pages: [85]
Parsed term: Turkers with pages: [84]
Parsed term: American basketball players with pages: [97]
Parsed term: analogies with pages: [312]
Parsed term: anchoring with pages: [82]
Parsed term: angular distance with pages: [310]
Parsed term: Anscombe's Quartet with pages: [159]
Parsed term: AOL with pages: [64]
Parsed t

'output-gpt-processed_indexed.tex'

In [None]:
import re
from pathlib import Path
from difflib import SequenceMatcher

class Match:
    """Record describing one located occurrence of an index term.

    ``start()``, ``end()`` and ``group()`` return the span offsets and the
    matched text supplied to the constructor. ``page_num`` and
    ``similarity`` are plain public attributes.
    """

    def __init__(self, start_pos, end_pos, matched_text, page_num=None, similarity=1.0):
        """Remember the span, its text and the optional page/similarity values."""
        # Public metadata.
        self.similarity = similarity
        self.page_num = page_num
        # Span data, exposed through the accessor methods below.
        self._text = matched_text
        self._end = end_pos
        self._start = start_pos

    def start(self):
        """Offset at which the matched span begins."""
        return self._start

    def end(self):
        """Offset just past the matched span."""
        return self._end

    def group(self):
        """Text of the matched span."""
        return self._text

class LatexIndexer:
    """Insert ``\\index{term}`` commands using estimated page boundaries.

    The source is cut into pages of a fixed number of lines. For every
    (term, page) pair the term is first searched on that page with fuzzy
    matching; pairs that could not be matched get their ``\\index`` command
    appended to a line near the middle of the page instead.
    """

    # A LaTeX command with an optional single braced argument.
    _LATEX_CMD_RE = re.compile(r'\\[a-zA-Z]+(\{[^}]*\})?')
    # One whitespace-delimited word.
    _WORD_RE = re.compile(r'\S+')

    def __init__(self, similarity_threshold=0.8, lines_per_page=45):
        """Create an empty indexer.

        Args:
            similarity_threshold: Minimum ``SequenceMatcher`` ratio (0..1) a
                candidate must reach to count as a match.
            lines_per_page: Number of source lines assumed per page when
                estimating page boundaries.
        """
        self.index_entries = {}  # {term: sorted list of int pages}
        self.modified_content = []  # document lines, edited in place
        self.similarity_threshold = similarity_threshold
        self.lines_per_page = lines_per_page
        self.page_boundaries = {}  # {page: (start_line, end_line)}, end exclusive
        self.debug = True
        self.added_terms_tracking = {}  # {term: {page: bool added}}

    def parse_index_list(self, index_text):
        """Parse the index list into a dictionary of terms and their pages.

        Lines of the form ``term, 12, 34`` (optional trailing backslash)
        are stored as ``term -> [12, 34]``; ``\\begin``/``\\end`` lines and
        non-matching lines are ignored.
        """
        lines = [line.strip() for line in index_text.split('\n') if line.strip()]
        lines = [line for line in lines if not line.startswith('\\begin') and not line.startswith('\\end')]

        for line in lines:
            match = re.match(r'(.*?),\s*((?:\d+(?:,\s*\d+)*))\\?$', line)
            if match:
                term, pages = match.groups()
                term = term.strip()
                pages = [int(p.strip()) for p in pages.split(',')]
                self.index_entries[term] = sorted(pages)

        if self.debug:
            print(f"Parsed {len(self.index_entries)} index terms")

    def find_term_in_text(self, term, text, start_pos=0):
        """Find the word window of ``text`` most similar to ``term``.

        LaTeX commands (with one optional braced argument) are ignored
        during the search.

        Args:
            term: The index term to look for.
            text: The text to search; may span several lines.
            start_pos: Offset in ``text`` at which the search starts.

        Returns:
            A ``Match`` whose ``start()``/``end()`` are offsets into
            ``text`` and whose ``similarity`` is the ratio reached, or
            ``None`` when no window reaches ``self.similarity_threshold``.
        """
        # Blank commands out with spaces of equal length instead of deleting
        # them, so every offset in the masked text is also a valid offset in
        # `text`. Looking candidates up with str.find() failed (-1) as soon
        # as a window crossed a newline or a removed command.
        masked = self._LATEX_CMD_RE.sub(lambda found: ' ' * len(found.group(0)), text)
        words = [
            (found.group(0), found.start(), found.end())
            for found in self._WORD_RE.finditer(masked, start_pos)
        ]

        term_lower = term.lower()
        term_len = len(term_lower)
        best_match = None
        best_ratio = 0

        for i in range(len(words)):
            parts = []
            for j in range(i, len(words)):
                parts.append(words[j][0])
                candidate = ' '.join(parts)
                candidate_lower = candidate.lower()
                cand_len = len(candidate_lower)

                # ratio() is 2*M/(len_a+len_b) with M <= min(len_a, len_b),
                # so this is an upper bound on what the window can score.
                upper_bound = 2.0 * min(term_len, cand_len) / (term_len + cand_len)
                if upper_bound < self.similarity_threshold:
                    if cand_len > term_len:
                        break  # longer windows can only score lower
                    continue

                ratio = SequenceMatcher(None, term_lower, candidate_lower).ratio()
                if ratio > best_ratio and ratio >= self.similarity_threshold:
                    best_match = Match(words[i][1], words[j][2], candidate, similarity=ratio)
                    best_ratio = ratio

        return best_match

    def get_page_content(self, page_num):
        """Return the newline-joined lines of ``page_num``, or ``""`` if the page is unknown."""
        if page_num in self.page_boundaries:
            start_line, end_line = self.page_boundaries[page_num]
            return '\n'.join(self.modified_content[start_line:end_line])
        return ""

    def process_latex_file(self, input_file):
        """Read the LaTeX file and insert the ``\\index`` commands.

        The edited lines are kept in ``self.modified_content``. Any
        exception is printed and re-raised.

        Args:
            input_file: Path of the UTF-8 LaTeX source.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                content = f.read()

            self.modified_content = content.split('\n')
            self.estimate_page_boundaries(content)

            # Initialize tracking for each term and page
            self.added_terms_tracking = {
                term: {page: False for page in pages}
                for term, pages in self.index_entries.items()
            }

            # First pass: Try to find matches
            for term, target_pages in self.index_entries.items():
                for page_num in target_pages:
                    page_content = self.get_page_content(page_num)
                    match = self.find_term_in_text(term, page_content)

                    if match and self.add_index_at_match(term, match, page_num):
                        self.added_terms_tracking[term][page_num] = True

            # Second pass: Add remaining terms to page centers
            for term, page_status in self.added_terms_tracking.items():
                for page_num, added in page_status.items():
                    if not added:
                        if self.add_index_to_page(page_num, term, self.modified_content):
                            self.added_terms_tracking[term][page_num] = True

            # Verify and report
            self.verify_additions()

        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise

    def verify_additions(self):
        """Report how many (term, page) pairs were indexed; prints only when ``debug`` is set."""
        total_required = sum(len(pages) for pages in self.index_entries.values())
        total_added = sum(
            sum(1 for added in page_status.values() if added)
            for page_status in self.added_terms_tracking.values()
        )

        if self.debug:
            print(f"\nVerification results:")
            print(f"Required additions: {total_required}")
            print(f"Successful additions: {total_added}")

            if total_added < total_required:
                print("\nMissing terms:")
                for term, page_status in self.added_terms_tracking.items():
                    missing_pages = [
                        page for page, added in page_status.items()
                        if not added
                    ]
                    if missing_pages:
                        print(f"- '{term}' not added to pages: {missing_pages}")

    def estimate_page_boundaries(self, content):
        """Split the document into pages of ``self.lines_per_page`` lines.

        Fills ``self.page_boundaries`` with ``{page: (start, end)}`` using
        half-open line ranges numbered from page 1; boundaries from an
        earlier call are discarded.

        Raises:
            ValueError: If ``self.lines_per_page`` is smaller than 1.
        """
        lines_per_page = self.lines_per_page
        if lines_per_page < 1:
            raise ValueError("lines_per_page must be at least 1")

        total_lines = len(content.split('\n'))
        self.page_boundaries = {}

        # Ranges are half-open, so the next page starts exactly at the
        # previous end; starting at end + 1 left one line per page that
        # belonged to no page at all.
        for page, first_line in enumerate(range(0, total_lines, lines_per_page), start=1):
            last_line = min(first_line + lines_per_page, total_lines)
            self.page_boundaries[page] = (first_line, last_line)

        if self.debug:
            print(f"Created {len(self.page_boundaries)} page boundaries")

    def add_index_at_match(self, term, match, page_num):
        """Insert ``\\index{term}`` right after a match found on ``page_num``.

        Args:
            term: The index term.
            match: A ``Match`` whose offsets refer to the text returned by
                ``get_page_content(page_num)``.
            page_num: The page the match was found on.

        Returns:
            ``True`` when the command was inserted, ``False`` (after
            printing a message) when the page is unknown or the match end
            does not fall inside any line of the page.
        """
        bounds = self.page_boundaries.get(page_num)
        if bounds is None:
            print(f"Error adding index at match for '{term}' on page {page_num}: unknown page")
            return False

        start_line, end_line = bounds
        end_offset = match.end()
        line_offset = 0  # offset of the current line within the page text

        for line_num in range(start_line, min(end_line, len(self.modified_content))):
            line = self.modified_content[line_num]
            # The command goes on the line that holds the END of the match,
            # so a match wrapping over a line break lands after its last word.
            if line_offset < end_offset <= line_offset + len(line):
                column = end_offset - line_offset
                self.modified_content[line_num] = (
                    line[:column] + f'\\index{{{term}}}' + line[column:]
                )
                return True
            line_offset += len(line) + 1  # +1 for the joining newline

        print(f"Error adding index at match for '{term}' on page {page_num}: match lies outside the page")
        return False

    def add_index_to_page(self, page_num, term, lines):
        """Append ``\\index{term}`` to a line near the middle of ``page_num``.

        The first non-empty line that does not start with a backslash is
        used, searching outwards from the middle of the page. If the page
        has no such line the command is appended to the page's first line.

        Args:
            page_num: Page to add the command to.
            term: The index term.
            lines: The list of document lines, modified in place.

        Returns:
            ``True`` when the command was added, ``False`` when the page is
            unknown or starts beyond the end of ``lines``.
        """
        if page_num not in self.page_boundaries:
            return False

        start_line, end_line = self.page_boundaries[page_num]
        if start_line >= len(lines):
            return False
        middle_line = (start_line + end_line) // 2

        # Look around middle; the offsets reach every line of the page.
        for offset in range(end_line - start_line):
            for line_num in (middle_line + offset, middle_line - offset):
                if start_line <= line_num < end_line:
                    stripped = lines[line_num].strip()
                    if stripped and not stripped.startswith('\\'):
                        lines[line_num] = f"{lines[line_num].rstrip()}\\index{{{term}}}"
                        return True

        # Last resort: add to first line of page
        lines[start_line] = f"{lines[start_line].rstrip()}\\index{{{term}}}"
        return True

    def save_modified_file(self, output_file):
        """Save modified content to file (UTF-8, newline-joined)."""
        content = '\n'.join(self.modified_content)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        if self.debug:
            print(f"\nSaved modified content to: {output_file}")

    def add_index_package(self):
        """Add required index packages if not present.

        Inserts ``\\usepackage{makeidx}`` / ``\\makeindex`` in front of
        ``\\begin{document}`` when missing from the preamble, and
        ``\\printindex`` in front of ``\\end{document}`` (or at the end of
        the content) when it appears nowhere.
        """
        preamble_end = -1
        for i, line in enumerate(self.modified_content):
            if '\\begin{document}' in line:
                preamble_end = i
                break

        if preamble_end != -1:
            preamble = '\n'.join(self.modified_content[:preamble_end])
            packages_to_add = []

            if '\\usepackage{makeidx}' not in preamble:
                packages_to_add.append('\\usepackage{makeidx}')
            if '\\makeindex' not in preamble:
                packages_to_add.append('\\makeindex')

            if packages_to_add:
                self.modified_content[preamble_end:preamble_end] = packages_to_add

        # Add \printindex before \end{document}
        doc_end = len(self.modified_content)
        for i, line in enumerate(self.modified_content):
            if '\\end{document}' in line:
                doc_end = i
                break

        if '\\printindex' not in '\n'.join(self.modified_content):
            self.modified_content.insert(doc_end, '\\printindex')

def process_files(tex_file_path, index_file_path, output_file_path=None):
    """Index one LaTeX file with ``LatexIndexer`` and save the result.

    Prints the two input paths, parses the index list, processes the LaTeX
    file, adds the index packages and writes the output.

    Args:
        tex_file_path: Path of the LaTeX source to index.
        index_file_path: Path of the UTF-8 text file holding the index list.
        output_file_path: Destination path. When ``None`` it is derived from
            ``tex_file_path`` by inserting ``_indexed`` before the extension.

    Returns:
        The path the modified file was written to.
    """
    if output_file_path is None:
        # Local import so the function does not rely on another cell's imports.
        import os

        # Rewrite only the final extension. With str.replace('.tex', ...) a
        # name lacking '.tex' came back unchanged and the input file was
        # overwritten; any other '.tex' in the path was rewritten as well.
        root, ext = os.path.splitext(tex_file_path)
        output_file_path = f"{root}_indexed{ext or '.tex'}"

    print(f"Processing LaTeX file: {tex_file_path}")
    print(f"Using index file: {index_file_path}")

    with open(index_file_path, 'r', encoding='utf-8') as f:
        index_content = f.read()

    indexer = LatexIndexer()
    indexer.parse_index_list(index_content)
    indexer.process_latex_file(tex_file_path)
    indexer.add_index_package()
    indexer.save_modified_file(output_file_path)

    return output_file_path

In [None]:
# Run the estimated-page indexer defined in the cell above; the output path
# is derived from the input file name and returned.
process_files('output-gpt-processed.tex', 'sample-indices-2.txt')

Processing LaTeX file: output-gpt-processed.tex
Using index file: sample-indices-2.txt
Parsed 65 index terms
Created 41 page boundaries

Verification results:
Required additions: 75
Successful additions: 7

Missing terms:
- 'A/B testing' not added to pages: [86]
- 'Aaron Schwartz case' not added to pages: [68]
- 'AB testing' not added to pages: [137]
- 'academic data' not added to pages: [66]
- 'accuracy' not added to pages: [215, 228]
- 'activation function' not added to pages: [380]
- 'AdaBoost' not added to pages: [364]
- 'add-one discounting' not added to pages: [357]
- 'agglomerative cluster trees' not added to pages: [338]
- 'aggregation mechanisms' not added to pages: [83]
- 'Akaike information criterion' not added to pages: [289, 335]
- 'algorithm analysis' not added to pages: [397]
- 'Amazon Turk' not added to pages: [67, 84]
- 'tasks assigned' not added to pages: [85]
- 'Turkers' not added to pages: [84]
- 'American basketball players' not added to pages: [97]
- 'analogies' n

'output-gpt-processed_indexed.tex'

In [None]:
import re
from pathlib import Path

class LatexBibliographyMapper:
    """Turn ``[key]`` citations in a LaTeX file into ``\\cite{key}`` commands.

    A bibliography list maps keys to reference texts. Every bracketed key in
    the document body that is found in that list is replaced by
    ``\\cite{key}``, and a ``thebibliography`` environment holding the cited
    references is added in front of ``\\end{document}``.
    """

    # Encodings tried, in this order, after UTF-8 decoding fails. latin-1
    # maps every byte to a character and therefore always succeeds; the
    # entries after it are only reached if this tuple is reordered.
    _FALLBACK_ENCODINGS = ('latin-1', 'utf-16', 'cp1252')

    def __init__(self, debug=True):
        """Create an empty mapper; ``debug`` switches the progress prints on."""
        self.bib_entries = {}  # {citation key: reference text}
        self.modified_content = []  # output lines, filled by process_latex_file
        self.debug = debug
        self.citations_found = set()  # keys actually cited in the document

    def parse_bib_list(self, bib_text):
        """Parse the bibliography list into a dictionary of citations and their full references.

        Entries are separated by blank lines and must start with ``[key]``.
        Only the text up to the end of the entry's first line is stored as
        the reference, because ``.`` does not match newlines here.
        """
        # Split entries if they're separated by blank lines
        entries = re.split(r'\n\s*\n', bib_text)

        for entry in entries:
            entry = entry.strip()
            if not entry:
                continue

            # Try to match [key] format at the start of the line
            match = re.match(r'\[(.*?)\](.*)', entry)
            if match:
                key, reference = match.groups()
                key = key.strip()
                reference = reference.strip()
                self.bib_entries[key] = reference

        if self.debug:
            print(f"Parsed {len(self.bib_entries)} bibliography entries")
            print("Bibliography keys:", list(self.bib_entries.keys()))

    def _read_text(self, input_file):
        """Return the file's text, decoding as UTF-8 or else a fallback encoding.

        Raises:
            ValueError: If no encoding could decode the file.
        """
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            print("Trying with different encodings...")

        for encoding in self._FALLBACK_ENCODINGS:
            try:
                with open(input_file, 'r', encoding=encoding) as f:
                    content = f.read()
            except UnicodeError:
                continue
            print(f"Successfully read file with {encoding} encoding")
            return content

        raise ValueError(
            "Could not read file with any of these encodings: utf-8, "
            + ', '.join(self._FALLBACK_ENCODINGS)
        )

    def process_latex_file(self, input_file):
        """Read the LaTeX file, convert its citations and add the bibliography.

        The result is stored in ``self.modified_content`` as a list of
        lines; keys that were cited are collected in
        ``self.citations_found``.

        Args:
            input_file: Path of the LaTeX source.

        Raises:
            ValueError: If the file cannot be decoded, or lacks
                ``\\begin{document}`` or ``\\end{document}``.
        """
        # The text from a fallback encoding is processed like any other;
        # before, it was read and then dropped, leaving the output empty.
        content = self._read_text(input_file)

        if self.debug:
            print(f"File length: {len(content)} characters")
            print("Contains \\begin{document}:", '\\begin{document}' in content)

        # Normalize line endings
        content = content.replace('\r\n', '\n').replace('\r', '\n')

        # Find document structure
        doc_begin = content.find('\\begin{document}')
        doc_end = content.find('\\end{document}')

        if doc_begin == -1:
            raise ValueError("Could not find \\begin{document}")

        if doc_end == -1:
            raise ValueError("Could not find \\end{document}")

        # Split into parts
        preamble = content[:doc_begin]
        body = content[doc_begin:doc_end]
        ending = content[doc_end:]

        # Add required packages to preamble if not present
        if '\\usepackage{cite}' not in preamble:
            preamble = preamble.rstrip() + '\n\\usepackage{cite}\n'

        def _to_cite(found):
            """Replace a bracket group by \\cite{key} when the key is known."""
            cite_key = found.group(1).strip()
            if cite_key not in self.bib_entries:
                return found.group(0)
            self.citations_found.add(cite_key)
            return f'\\cite{{{cite_key}}}'

        # A callable replacement is inserted verbatim, so the backslash in
        # \cite needs no extra escaping.
        modified_body = re.sub(r'\[([^\]]+)\]', _to_cite, body)

        # Bibliography items for the cited references only.
        bib_items = ''.join(
            f'\\bibitem{{{key}}} {self.bib_entries[key]}\n'
            for key in sorted(self.citations_found)
            if key in self.bib_entries
        )

        # Add bibliography section before \end{document}. It is skipped when
        # nothing was cited: a thebibliography environment without any
        # \bibitem makes LaTeX stop with an error.
        if bib_items and '\\begin{thebibliography}' not in content:
            ending = (
                '\n\\begin{thebibliography}{99}\n'
                + bib_items
                + '\\end{thebibliography}\n'
                + ending
            )

        # Combine everything back
        final_content = preamble + modified_body + ending

        # Save the modified content
        self.modified_content = final_content.split('\n')

        if self.debug:
            print(f"Found and processed {len(self.citations_found)} citations")
            print("Citations found:", sorted(list(self.citations_found)))

    def save_modified_file(self, output_file):
        """Save the modified content to a new file (UTF-8, newline-joined)."""
        content = '\n'.join(self.modified_content)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(content)

        if self.debug:
            print(f"\nSaved modified content to {output_file}")
            print("\nTo generate the bibliography, run:")
            print(f"1. pdflatex {output_file}")
            print(f"2. pdflatex {output_file}")  # Second run to resolve citations

def process_bibliography(tex_file_path, bib_file_path, output_file_path=None):
    """Convert the citations of one LaTeX file and save the result.

    Prints the paths involved, reads the bibliography list, lets a
    ``LatexBibliographyMapper`` parse it and process the LaTeX file, then
    writes the output.

    Args:
        tex_file_path: Path of the LaTeX source.
        bib_file_path: Path of the UTF-8 text file holding the bibliography.
        output_file_path: Destination path. When ``None`` it is derived from
            ``tex_file_path`` by inserting ``_with_bib`` before the extension.

    Returns:
        The path the modified file was written to.
    """
    if output_file_path is None:
        # Local import so the function does not rely on another cell's imports.
        import os

        # Rewrite only the final extension. With str.replace('.tex', ...) a
        # name lacking '.tex' came back unchanged and the input file was
        # overwritten; any other '.tex' in the path was rewritten as well.
        root, ext = os.path.splitext(tex_file_path)
        output_file_path = f"{root}_with_bib{ext or '.tex'}"

    print(f"Processing LaTeX file: {tex_file_path}")
    print(f"Using bibliography file: {bib_file_path}")
    print(f"Output will be saved to: {output_file_path}")

    with open(bib_file_path, 'r', encoding='utf-8') as f:
        bib_content = f.read()

    mapper = LatexBibliographyMapper()
    mapper.parse_bib_list(bib_content)
    mapper.process_latex_file(tex_file_path)
    mapper.save_modified_file(output_file_path)

    return output_file_path

In [None]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from difflib import SequenceMatcher
import re
from typing import List, Dict, Tuple

def parse_citation(citation: str) -> Tuple[str, str]:
    """
    Split a bibliography row into its tag and its citation text.

    A row is expected to look like ``[tag] & citation text \\\\`` (a
    bracketed tag, an ampersand, the citation, then a double backslash).

    Args:
        citation (str): Raw citation text
    Returns:
        Tuple[str, str]: ``(tag, citation text)`` when the row has that
        shape; otherwise ``(None, stripped input)``.
    """
    stripped = citation.strip()
    row = re.match(r'\[(.*?)\]\s*&\s*(.*?)\\\\', stripped)
    if row is None:
        return None, stripped
    tag, text = row.groups()
    return tag, text

def generate_tag(citation: str) -> str:
    """
    Build a short bibliography tag from the text of a citation.

    The year is the first standalone 19XX/20XX number in the citation.
    Authors are the capitalized names found before the first period (and
    before any ``\\textit``); for each author the last word of the name is
    used.

    Args:
        citation (str): Full citation text
    Returns:
        str: The generated tag:
             - no author found: ``Unknown`` + the full year (if any)
             - one author: first 3 letters + last 2 year digits
             - 2-3 authors: one initial per author + last 2 year digits
             - more than 3: first 3 letters of the first author + initials
               of the next two authors + ``+`` + last 2 year digits
    """
    found_year = re.search(r'\b(19|20)\d{2}\b', citation)
    year = '' if found_year is None else found_year.group(0)

    # Author section: everything before the first period, cut at \textit.
    author_text = citation.split('.')[0].split(r'\textit')[0]
    authors = re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]*)*)', author_text)

    if not authors:
        return f"Unknown{year}"

    surnames = [name.split()[-1] for name in authors]
    short_year = year[-2:]
    lead = surnames[0][:3]
    author_count = len(surnames)

    if author_count == 1:
        return lead + short_year
    if author_count <= 3:
        return ''.join(surname[0] for surname in surnames) + short_year
    following = ''.join(surname[0] for surname in surnames[1:3])
    return lead + following + '+' + short_year

def bipartite_match_hungarian(generated_tags: List[str], actual_tags: List[str],
                            similarity_threshold: float = 0.3) -> Dict[str, str]:
    """
    Match generated tags to actual tags using Hungarian Algorithm.

    The score of a pair is the case-insensitive SequenceMatcher ratio plus a
    bonus of 0.2 when the first two-digit run of both tags is identical, so
    scores range from 0 to 1.2. The assignment maximises the total score and
    pairs whose score does not exceed the threshold are then dropped.

    Args:
        generated_tags (List[str]): List of generated citation tags
        actual_tags (List[str]): List of actual citation tags
        similarity_threshold (float): Minimum similarity score to consider a match
    Returns:
        Dict[str, str]: Mapping of generated tags to best matching actual tags.
        Empty when either list is empty. Because the result is keyed by
        generated tag, duplicate generated tags keep only their last match.
    """
    if not generated_tags or not actual_tags:
        return {}

    def first_two_digits(tag):
        # The first run of two digits is treated as the tag's year.
        found = re.search(r'\d{2}', tag)
        return found.group(0) if found else None

    # Per-tag values are computed once instead of once per pair.
    gen_years = [first_two_digits(tag) for tag in generated_tags]
    act_years = [first_two_digits(tag) for tag in actual_tags]
    gen_lower = [tag.lower() for tag in generated_tags]
    act_lower = [tag.lower() for tag in actual_tags]

    cost_matrix = np.zeros((len(generated_tags), len(actual_tags)))

    # Build cost matrix
    for i, gen_text in enumerate(gen_lower):
        for j, act_text in enumerate(act_lower):
            # Calculate base similarity
            base_score = SequenceMatcher(None, gen_text, act_text).ratio()

            # Add bonus for matching years
            same_year = gen_years[i] is not None and gen_years[i] == act_years[j]
            year_bonus = 0.2 if same_year else 0

            # Convert similarity to cost (negative for maximization)
            cost_matrix[i][j] = -(base_score + year_bonus)

    # Apply Hungarian algorithm
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Create matches dictionary with threshold
    matches = {}
    for i, j in zip(row_ind, col_ind):
        if -cost_matrix[i][j] > similarity_threshold:
            matches[generated_tags[i]] = actual_tags[j]

    return matches

def process_bibliography(file_path: str) -> Dict[str, Dict[str, str]]:
    """
    Process bibliography file and return matches with full citations.

    The file is read as UTF-8, one entry per line. Blank lines and LaTeX
    comment lines (starting with '%') are skipped so they are not turned into
    tags.

    Args:
        file_path (str): Path to bibliography file
    Returns:
        Dict with two keys:
            'matches': Dict mapping generated tags to actual tags
            'citations': Dict mapping generated tags to full citations
        Entries that produce the same generated tag overwrite one another in
        'citations'.
    """
    actual_tags = []
    generated_tags = []
    citations_dict = {}

    # Process each citation
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry or entry.startswith('%'):
                continue

            actual_tag, full_citation = parse_citation(entry)
            if actual_tag:
                actual_tags.append(actual_tag)

            generated_tag = generate_tag(full_citation)
            generated_tags.append(generated_tag)
            citations_dict[generated_tag] = full_citation

    # Find optimal matches
    matches = bipartite_match_hungarian(generated_tags, actual_tags)

    return {
        'matches': matches,
        'citations': citations_dict
    }

def main():
    """Process ``bib.txt`` and print every generated/actual tag pair that was matched."""
    result = process_bibliography("bib.txt")
    divider = "-" * 80

    print("\nMatched Citations:")
    print(divider)
    for generated, actual in result['matches'].items():
        # Plain string similarity, shown for information only.
        ratio = SequenceMatcher(None, generated.lower(), actual.lower()).ratio()
        print(f"\nGenerated Tag: {generated}")
        print(f"Actual Tag: {actual}")
        print(f"Similarity Score: {ratio:.3f}")
        print(f"Citation: {result['citations'][generated]}")
        print(divider)

# Run the demo only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Matched Citations:
--------------------------------------------------------------------------------

Generated Tag: Sam06
Actual Tag: Sam06
Similarity Score: 1.000
Citation: Hanan Samet. \textit{Foundations of Multidimensional and Metric Data Structures}. Morgan Kaufmann, 2006. 
--------------------------------------------------------------------------------

Generated Tag: SazAS+97
Actual Tag: SAMS97
Similarity Score: 0.714
Citation: George N Sazaklis, Esther M Arkin, Joseph SB Mitchell, and Steven S Skiena. Geometric decision trees for optical character recognition. In \textit{Proceedings of the 13th Annual Symposium on Computational Geometry}, pages 394–396. ACM, 1997. 
--------------------------------------------------------------------------------

Generated Tag: M12
Actual Tag: SF12
Similarity Score: 0.571
Citation: Gail M. Sullivan and Richard Feinn. Using effect size: or why the p-value is not enough. \textit{J. Graduate Medical Education}, 4:279282, 2012. 
-------------------

In [None]:
!pip install tqdm scholarly

Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl.metadata (7.4 kB)
Collecting arrow (from scholarly)
  Downloading arrow-1.3.0-py3-none-any.whl.metadata (7.5 kB)
Collecting bibtexparser (from scholarly)
  Downloading bibtexparser-1.4.2.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fake-useragent (from scholarly)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting free-proxy (from scholarly)
  Downloading free_proxy-1.1.3.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv (from scholarly)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting selenium (from scholarly)
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting sphinx-rtd-theme (from scholarly)
  Downloading sphinx_rtd_theme-3.0.2-py2.p

In [None]:
# Standard library imports
import re
from typing import List, Dict, Tuple
import time

# Third-party imports
import numpy as np
from scipy.optimize import linear_sum_assignment
from difflib import SequenceMatcher
from scholarly import scholarly
from tqdm import tqdm

In [None]:
import numpy as np
from scipy.optimize import linear_sum_assignment
from difflib import SequenceMatcher
import re
from typing import List, Dict, Tuple
from scholarly import scholarly
import time
from tqdm import tqdm

class ScholarCitationFetcher:
    """
    Handle Google Scholar queries using only title-based search.
    """
    def __init__(self, delay: float = 2.0):
        # Seconds to sleep after a query that returned a result.
        self.delay = delay

    def extract_title(self, citation: str) -> str:
        """
        Extract the title from citation by taking the longest segment between periods.

        The longest segment is a heuristic and is not always the title: a long
        "In Proceedings ..." venue wins over a shorter title.

        Args:
            citation (str): Raw citation text
        Returns:
            str: Extracted title (the first one when several segments tie)
        """
        # Clean the citation by removing LaTeX commands
        cleaned = citation.replace(r'\textit{', '').replace('}', '')

        # Split by periods and get all segments
        segments = [seg.strip() for seg in cleaned.split('.')]

        # Get the longest segment as the title
        if segments:
            title = max(segments, key=len)
            print(f"\nExtracted title: {title}")
            return title
        return ""

    @staticmethod
    def _to_result_data(search_result: Dict) -> Dict:
        """Flatten one scholarly publication record into the dict used by this notebook.

        scholarly 1.x keeps title, author, pub_year, venue and abstract in the
        nested 'bib' dict; the top-level keys are used as a fallback so flat
        records still work.
        """
        bib = search_result.get('bib') or {}

        def field(bib_key, flat_key, default):
            if bib_key in bib:
                return bib[bib_key]
            return search_result.get(flat_key, default)

        authors = field('author', 'author', []) or []
        if isinstance(authors, str):
            # A single string lists the authors joined by " and ".
            authors = [name.strip() for name in authors.split(' and ') if name.strip()]
        else:
            authors = list(authors)

        return {
            'title': field('title', 'title', ''),
            'authors': authors,
            'year': field('pub_year', 'year', ''),
            'citations': search_result.get('num_citations', 0),
            'url': search_result.get('url_scholarbib', ''),
            'venues': field('venue', 'venue', ''),
            'abstract': field('abstract', 'abstract', '')
        }

    def search_citation(self, citation: str) -> Dict:
        """
        Search Google Scholar using only the extracted title.

        Only the first search hit is used. Any exception raised while
        searching is printed and turned into a None result.

        Args:
            citation (str): Citation text to search for
        Returns:
            Dict: Enhanced citation data if found, None otherwise
        """
        try:
            # Extract title
            title = self.extract_title(citation)
            if not title:
                print("Could not extract title from citation")
                return None

            print(f"Searching for title: {title}")

            # Search using the title
            search_result = next(scholarly.search_pubs(title), None)

            if not search_result:
                print("No match found in Google Scholar")
                return None

            time.sleep(self.delay)

            result_data = self._to_result_data(search_result)

            print("\nFound match in Google Scholar:")
            print(f"Title: {result_data['title']}")
            print(f"Authors: {', '.join(result_data['authors'])}")
            print(f"Year: {result_data['year']}")

            return result_data

        except Exception as e:
            print(f"Error in search_citation: {e}")
            return None

def parse_citation(citation: str) -> Tuple[str, str]:
    """
    Extract the tag and full citation from a bibliography entry.

    An entry is expected to look like a LaTeX table row: a bracketed tag, an
    ampersand, the citation text and, optionally, the double-backslash row
    terminator. The terminator is optional so that a row written without it
    (typically the last row of a table) still yields its tag.

    Args:
        citation (str): Raw citation text
    Returns:
        Tuple[str, str]: (tag, citation text). The tag is None when the entry
        does not start with a bracketed tag followed by an ampersand; the
        stripped input is then returned unchanged as the citation text.
    """
    entry = citation.strip()
    # The citation text stops at the first row terminator when there is one,
    # otherwise it runs to the end of the entry. DOTALL lets the tail after
    # the terminator span several lines.
    match = re.match(r'\[(.*?)\]\s*&\s*(.*?)(?:\\\\.*)?$', entry, re.DOTALL)
    if match:
        return match.group(1), match.group(2)
    return None, entry

def bipartite_match_hungarian(generated_tags: List[str], actual_tags: List[str],
                            similarity_threshold: float = 0.3) -> Dict[str, str]:
    """
    Pair generated tags with actual tags through an optimal assignment.

    Each pair is scored with the case-insensitive SequenceMatcher ratio, plus
    0.2 when the first two-digit run of both tags is the same. The assignment
    maximises the total score; pairs scoring at or below the threshold are
    discarded afterwards.

    Args:
        generated_tags (List[str]): List of generated citation tags
        actual_tags (List[str]): List of actual citation tags
        similarity_threshold (float): Score a pair must exceed to be kept
    Returns:
        Dict[str, str]: Generated tag -> assigned actual tag; empty when either
        input list is empty
    """
    if not (actual_tags and generated_tags):
        return {}

    def year_digits(tag):
        # First run of two digits in the tag, or None when there is none.
        hit = re.search(r'\d{2}', tag)
        return hit.group(0) if hit else None

    generated_years = [year_digits(tag) for tag in generated_tags]
    actual_years = [year_digits(tag) for tag in actual_tags]

    # scores[row, col] holds the similarity of generated tag `row` and actual tag `col`.
    scores = np.zeros((len(generated_tags), len(actual_tags)))
    for row, generated in enumerate(generated_tags):
        for col, actual in enumerate(actual_tags):
            ratio = SequenceMatcher(None, generated.lower(), actual.lower()).ratio()
            years_agree = (generated_years[row] is not None
                           and generated_years[row] == actual_years[col])
            scores[row, col] = ratio + (0.2 if years_agree else 0)

    # linear_sum_assignment minimises, so hand it the negated scores.
    rows, cols = linear_sum_assignment(-scores)

    return {
        generated_tags[row]: actual_tags[col]
        for row, col in zip(rows, cols)
        if scores[row, col] > similarity_threshold
    }

def generate_tag_from_scholar(scholar_data: Dict) -> str:
    """
    Generate a standardized bibliography tag using Google Scholar data.

    Reads the 'authors' and 'year' entries of scholar_data. The last word of
    each author name is taken as the surname; blank author entries are
    ignored, and a single string of authors is split on " and ".

    Args:
        scholar_data (Dict): Scholar citation data containing authors, year, etc.
    Returns:
        str: Generated tag following the format:
             - Single author: First3Letters + Year2Digits
             - 2-3 authors: AuthorInitials + Year2Digits
             - >3 authors: FirstAuthor3Letters + Next2AuthorInitials + '+' + Year2Digits
             None when scholar_data is empty or has no usable author or no year.
    """
    if not scholar_data:
        return None

    authors = scholar_data.get('authors', [])
    year = scholar_data.get('year', '')

    if not authors or not year:
        return None

    if isinstance(authors, str):
        authors = authors.split(' and ')

    # Extract last names (assume last word in author name is last name);
    # blank names have no last word and are skipped
    last_names = [author.split()[-1] for author in authors if author and author.strip()]
    if not last_names:
        return None

    # Clean up year to get last 2 digits
    year_suffix = str(year)[-2:]

    if len(last_names) == 1:
        # Single author: First three letters of last name + year
        return f"{last_names[0][:3]}{year_suffix}"

    if len(last_names) <= 3:
        # 2-3 authors: Initial of each author's last name + year
        initials = ''.join(name[0] for name in last_names)
        return f"{initials}{year_suffix}"

    # >3 authors: First author's first 3 letters + next 2 author initials + '+' + year
    return f"{last_names[0][:3]}{last_names[1][0]}{last_names[2][0]}+{year_suffix}"

def generate_tag(citation: str, scholar_data: Dict = None) -> str:
    """
    Generate bibliography tag using Scholar data if available, falling back to regex parsing.

    In the fallback, the author list is the text before the first
    sentence-ending period (a period directly after a single-letter initial,
    as in "Gail M. Sullivan", does not end the list) and before any LaTeX
    textit command. Names are separated on commas, ampersands and the word
    "and"; the surname of each name is its last capitalised word that is not
    an all-caps initial group. This assumes names are written given-name first.

    Args:
        citation (str): Full citation text
        scholar_data (Dict): Optional Google Scholar data
    Returns:
        str: Generated tag. Fallback format:
             - No author found: 'Unknown' + Year2Digits
             - Single author: First3Letters + Year2Digits
             - 2-3 authors: AuthorInitials + Year2Digits
             - >3 authors: FirstAuthor3Letters + Next2AuthorInitials + '+' + Year2Digits
    """
    # Try generating tag from Scholar data first
    if scholar_data:
        scholar_tag = generate_tag_from_scholar(scholar_data)
        if scholar_tag:
            return scholar_tag

    # Fallback to regex-based parsing if Scholar data is unavailable or incomplete
    year_match = re.search(r'\b(19|20)\d{2}\b', citation)
    year_suffix = year_match.group(0)[-2:] if year_match else ''

    # Author list: text before \textit and before the first period that does
    # not follow a single-letter initial
    author_part = citation.split(r'\textit')[0]
    author_part = re.split(r'(?<!\b[A-Z])\.', author_part, maxsplit=1)[0]

    # One surname per name; names without a capitalised word are dropped
    # (this also discards the empty piece produced by ", and")
    surnames = []
    for name in re.split(r',|&|\band\b', author_part):
        words = re.findall(r"[^\W\d_][\w'-]*", name)
        capitalised = [w for w in words if w[0].isupper() and not w.isupper()]
        if capitalised:
            surnames.append(capitalised[-1])

    if not surnames:
        return f"Unknown{year_suffix}"

    if len(surnames) == 1:
        return f"{surnames[0][:3]}{year_suffix}"
    if len(surnames) <= 3:
        initials = ''.join(surname[0] for surname in surnames)
        return f"{initials}{year_suffix}"
    return f"{surnames[0][:3]}{surnames[1][0]}{surnames[2][0]}+{year_suffix}"

def process_bibliography(file_path: str, fetch_scholar: bool = True) -> Dict[str, Dict]:
    """
    Process bibliography file using enhanced tag generation.

    The file is read as UTF-8, one entry per line. Blank lines and LaTeX
    comment lines (starting with '%') are skipped, so they are neither sent
    to Google Scholar nor turned into tags.

    Args:
        file_path (str): Path to bibliography file
        fetch_scholar (bool): Whether to fetch additional data from Google Scholar
    Returns:
        Dict containing:
            'matches': Dict mapping generated tags to actual tags
            'citations': Dict mapping generated tags to full citations
            'scholar_data': Dict mapping generated tags to the Scholar result
                found for that entry (empty when fetch_scholar is False)
        Entries that produce the same generated tag overwrite one another in
        'citations' and 'scholar_data'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        entries = [line.strip() for line in f]
    entries = [entry for entry in entries if entry and not entry.startswith('%')]

    actual_tags = []
    generated_tags = []
    citations_dict = {}
    scholar_data = {}

    fetcher = None
    if fetch_scholar:
        fetcher = ScholarCitationFetcher()
        print("\nFetching citation data from Google Scholar...")
        # The progress bar now counts real entries only.
        entries = tqdm(entries)

    for entry in entries:
        actual_tag, full_citation = parse_citation(entry)

        # Without a fetcher, or when the search finds nothing, generate_tag
        # falls back to regex-based parsing of the citation text.
        scholar_result = fetcher.search_citation(full_citation) if fetcher else None
        generated_tag = generate_tag(full_citation, scholar_result)
        if scholar_result:
            scholar_data[generated_tag] = scholar_result

        if actual_tag:
            actual_tags.append(actual_tag)
        generated_tags.append(generated_tag)
        citations_dict[generated_tag] = full_citation

    # Find optimal matches
    matches = bipartite_match_hungarian(generated_tags, actual_tags)

    return {
        'matches': matches,
        'citations': citations_dict,
        'scholar_data': scholar_data
    }

def main():
    """Process ``bib.txt`` with Scholar lookups enabled and print each matched tag pair."""
    result = process_bibliography("bib.txt", fetch_scholar=True)
    divider = "=" * 100

    print("\nMatched Citations with Enhanced Tags:")
    print(divider)

    for generated, actual in result['matches'].items():
        # Plain string similarity, shown for information only.
        ratio = SequenceMatcher(None, generated.lower(), actual.lower()).ratio()
        scholar_info = result['scholar_data'].get(generated, {})

        print(f"\nGenerated Tag: {generated}")
        print(f"Actual Tag: {actual}")
        print(f"Similarity Score: {ratio:.3f}")
        print(f"Citation: {result['citations'][generated]}")

        if not scholar_info:
            print("\nTag generated using regex parsing (Scholar data unavailable)")
        else:
            print("\nTag generated using Google Scholar data:")
            print(f"Authors: {', '.join(scholar_info['authors'])}")
            print(f"Year: {scholar_info['year']}")
            print(f"Citations: {scholar_info['citations']}")

        print(divider)

# Run the demo only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Fetching citation data from Google Scholar...


  0%|          | 0/55 [00:00<?, ?it/s]


Extracted title: Foundations of Multidimensional and Metric Data Structures
Searching for title: Foundations of Multidimensional and Metric Data Structures


  2%|▏         | 1/55 [00:33<29:52, 33.19s/it]

Error in search_citation: Cannot Fetch from Google Scholar.

Extracted title: % Second row of bibliography
Searching for title: % Second row of bibliography


  5%|▌         | 3/55 [01:03<17:20, 20.01s/it]

Error in search_citation: Cannot Fetch from Google Scholar.

Extracted title: In Proceedings of the 13th Annual Symposium on Computational Geometry, pages 394–396
Searching for title: In Proceedings of the 13th Annual Symposium on Computational Geometry, pages 394–396


  7%|▋         | 4/55 [01:36<20:38, 24.29s/it]

Error in search_citation: Cannot Fetch from Google Scholar.

Extracted title: % Third row of bibliography
Searching for title: % Third row of bibliography


 11%|█         | 6/55 [02:07<16:17, 19.95s/it]

Error in search_citation: Cannot Fetch from Google Scholar.

Extracted title: Using effect size: or why the p-value is not enough
Searching for title: Using effect size: or why the p-value is not enough


 11%|█         | 6/55 [02:23<19:31, 23.91s/it]


KeyboardInterrupt: 