# In [2]:
import fitz  # PyMuPDF
import re
import json

class PDFParser:
    """Parse a PDF book into a chapter / section / subsection hierarchy.

    The hierarchy is derived from the PDF's table of contents (TOC), and the
    body text that follows each heading is located by searching the extracted
    page text for the heading's words.

    Attributes:
        pdf_path: Path that was passed to the constructor.
        doc: The open PyMuPDF document.
        toc: Cleaned TOC, a list of ``[level, title, page]`` entries where
            ``page`` is 1-based.
        text_pages: Plain text of every page, indexed from 0.
        structure: Nested dict built by :meth:`create_structure_from_toc`.

    The parser can be used as a context manager so that the underlying
    document is closed deterministically.
    """

    def __init__(self, pdf_path):
        """Open *pdf_path*, read its TOC and page text, and build the structure."""
        self.pdf_path = pdf_path
        self.doc = fitz.open(pdf_path)
        self.toc = self.clean_toc(self.doc.get_toc())
        self.text_pages = self.extract_text_from_pdf()
        self.structure = self.create_structure_from_toc(self.toc)

    def close(self):
        """Close the underlying PyMuPDF document if it is still open."""
        if not self.doc.is_closed:
            self.doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @staticmethod
    def clean_toc(toc):
        """Return *toc* without unwanted entries.

        Dropped are entries whose title contains a character in the range
        U+F000..U+F0FF and entries whose title starts with "Глава"
        ("Chapter").
        """
        return [
            item
            for item in toc
            if not re.search(r'[\uf000-\uf0ff]', item[1])
            and not item[1].startswith("Глава")
        ]

    @staticmethod
    def _toc_paths(toc):
        """Yield, for every TOC entry, the key path of its node in the structure.

        The path is ``(chapter,)``, ``(chapter, section)`` or
        ``(chapter, section, subsection)``. ``None`` is yielded for entries
        that get no node: levels other than 1-3, a level-2 entry before any
        chapter, or a level-3 entry without a preceding section in the
        current chapter.
        """
        chapter_key = None
        section_key = None
        chapter_count = section_count = subsection_count = 0

        for item in toc:
            level = item[0]
            if level == 1:
                chapter_count += 1
                chapter_key = str(chapter_count)
                section_key = None
                section_count = 0
                yield (chapter_key,)
            elif level == 2 and chapter_key:
                section_count += 1
                section_key = f"{chapter_key}.{section_count}"
                subsection_count = 0
                yield (chapter_key, section_key)
            elif level == 3 and section_key:
                subsection_count += 1
                yield (
                    chapter_key,
                    section_key,
                    f"{section_key}.{subsection_count}",
                )
            else:
                yield None

    def create_structure_from_toc(self, toc):
        """Build the hierarchical book structure from a TOC.

        Args:
            toc: Iterable of ``[level, title, page]`` entries.

        Returns:
            A dict keyed by chapter number (``"1"``, ``"2"``, ...). Each
            chapter holds ``title``, ``sections`` and ``text``; each section
            (``"1.1"``) holds ``title``, ``subsections`` and ``text``; each
            subsection (``"1.1.1"``) holds ``title`` and ``text``. All
            ``text`` values start out empty.
        """
        structure = {}
        for item, path in zip(toc, self._toc_paths(toc)):
            if path is None:
                continue
            title = item[1]
            if len(path) == 1:
                structure[path[0]] = {
                    "title": title,
                    "sections": {},
                    "text": "",
                }
            elif len(path) == 2:
                structure[path[0]]["sections"][path[1]] = {
                    "title": title,
                    "subsections": {},
                    "text": "",
                }
            else:
                sections = structure[path[0]]["sections"]
                sections[path[1]]["subsections"][path[2]] = {
                    "title": title,
                    "text": "",
                }
        return structure

    def extract_text_from_pdf(self):
        """Return the plain text of every page of the document, in page order."""
        return [
            self.doc.load_page(page_num).get_text("text")
            for page_num in range(len(self.doc))
        ]

    @staticmethod
    def _header_end_line(lines, title):
        """Return the index of the first line ending with the title's last word.

        Comparison is case-insensitive on stripped lines. Returns ``None``
        when the title has no words or no line matches.
        """
        words = title.lower().split()
        if not words:
            return None
        last_word = words[-1]
        for index, line in enumerate(lines):
            if line.strip().lower().endswith(last_word):
                return index
        return None

    @staticmethod
    def _header_start_line(lines, title, start=0):
        """Return the index of the first line at or after *start* that begins
        with the title's first word (case-insensitive, stripped).

        Returns ``len(lines)`` when nothing matches, so the result can be
        used directly as a slice end meaning "no cut".
        """
        words = title.lower().split()
        if words:
            first_word = words[0]
            for index in range(start, len(lines)):
                if lines[index].strip().lower().startswith(first_word):
                    return index
        return len(lines)

    def find_text_for_toc_index(self, toc_index):
        """Return the body text that belongs to the TOC entry at *toc_index*.

        The text starts after the line that ends with the last word of the
        entry's title and stops before the line that starts with the first
        word of the next entry's title (or at the end of the document for
        the last entry). Lines are joined with single spaces.

        Best-effort fallbacks instead of raising:
          * entry page outside the document -> empty string;
          * own heading not found on its page -> text starts at the top of
            that page;
          * next heading not found on its page -> that page is taken whole.

        Args:
            toc_index: Index into ``self.toc``.

        Returns:
            The collected text with runs of whitespace collapsed.
        """
        entry = self.toc[toc_index]
        page_count = len(self.text_pages)

        # TOC page numbers are 1-based; ``text_pages`` is indexed from 0.
        start_page = entry[2] - 1
        if not 0 <= start_page < page_count:
            # A negative index would silently read a page from the end.
            return ""

        if toc_index > len(self.toc) - 2:
            # Last entry: its text runs to the end of the document.
            next_title = None
            end_page = page_count - 1
        else:
            next_entry = self.toc[toc_index + 1]
            next_title = next_entry[1]
            end_page = next_entry[2] - 1

        lines = self.text_pages[start_page].split('\n')
        header_line = self._header_end_line(lines, entry[1])
        body_start = 0 if header_line is None else header_line + 1

        if end_page > start_page:
            collected = lines[body_start:]
            # Pages that lie entirely inside this entry; for the last entry
            # that includes the final page of the document.
            full_pages_end = end_page + 1 if next_title is None else end_page
            for page_text in self.text_pages[start_page + 1:full_pages_end]:
                collected.extend(page_text.split('\n'))
            if next_title is not None and end_page < page_count:
                next_lines = self.text_pages[end_page].split('\n')
                cut = self._header_start_line(next_lines, next_title)
                collected.extend(next_lines[:cut])
        elif next_title is None:
            collected = lines[body_start:]
        else:
            # Both headings share a page: look for the next heading only
            # after the current one, otherwise headings that share a first
            # word would produce an empty range.
            cut = self._header_start_line(lines, next_title, body_start)
            collected = lines[body_start:cut]

        joined = ' '.join(line.strip() for line in collected)
        return re.sub(r'\s{2,}', ' ', joined.strip())

    def _node_for_path(self, path):
        """Return the structure node addressed by a ``_toc_paths`` key path,
        or ``None`` if it does not exist in ``self.structure``."""
        node = self.structure.get(path[0])
        for container, key in zip(("sections", "subsections"), path[1:]):
            if node is None:
                return None
            node = node.get(container, {}).get(key)
        return node

    def populate_structure_with_text(self):
        """Fill the ``text`` field of every structure node from the page text.

        Nodes are addressed by their position in the TOC rather than by
        title, so headings that share a title each receive their own text.
        TOC entries that have no node in the structure are skipped.
        """
        for index, path in enumerate(self._toc_paths(self.toc)):
            if path is None:
                continue
            node = self._node_for_path(path)
            if node is not None:
                node["text"] = self.find_text_for_toc_index(index)

    @staticmethod
    def assign_text_to_structure(data, title, text):
        """Recursively store *text* on the first node whose title is *title*.

        Chapters are visited in order; within each, its sections and then
        their subsections are searched depth-first.

        Note: matching is by title only, so with duplicate titles only the
        first match is ever written.

        Returns:
            ``True`` if a node was updated, ``False`` otherwise.
        """
        for node in data.values():
            if not isinstance(node, dict):
                continue
            if node.get("title") == title:
                node["text"] = text
                return True
            for container in ("sections", "subsections"):
                children = node.get(container, {})
                if PDFParser.assign_text_to_structure(children, title, text):
                    return True
        return False

    def save_structure_to_json(self, output_path):
        """Write ``self.structure`` to *output_path* as UTF-8 JSON."""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.structure, f, ensure_ascii=False, indent=4)

    def parse_and_save(self, output_path):
        """Populate the structure with text, save it, and print a confirmation."""
        self.populate_structure_with_text()
        self.save_structure_to_json(output_path)
        print(f"Структура книги успешно сохранена в файл {output_path}")

# Usage example: parse the manual and write its structure to a JSON file.
pdf_parser = PDFParser(pdf_path="Руководство_Бухгалтерия_для_Узбекистана_ред_3_0.pdf")
pdf_parser.parse_and_save(output_path="structure.json")


# Output: Структура книги успешно сохранена в файл structure.json
