In [1]:
!pip install docling




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem

# Start from the default PDF pipeline options and override individual flags.
pipeline_options = PdfPipelineOptions()

# Recognition stages: OCR is switched off, table-structure recognition is on.
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
# NOTE(review): cell matching is disabled for the table-structure stage —
# confirm against the docling docs that this is the intended trade-off.
pipeline_options.table_structure_options.do_cell_matching = False

# Ask the pipeline to also produce images for tables and pictures.
pipeline_options.generate_picture_images = True
pipeline_options.generate_table_images = True

# Converter shared by every cell below; only the PDF format gets custom options.
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

In [2]:
import os

def definition_data(element, current_page_data):
    """Sort one document item into the per-page buckets.

    Args:
        element: the item to classify.
        current_page_data: dict holding the lists 'tables' and 'texts'.

    Returns:
        The same ``current_page_data`` dict, mutated in place. The element is
        appended to 'tables' when it is a ``TableItem`` and to 'texts' when it
        is a ``TextItem``; the two checks are independent, and items of any
        other type are ignored.
    """
    for bucket_key, item_type in (('tables', TableItem), ('texts', TextItem)):
        if isinstance(element, item_type):
            current_page_data[bucket_key].append(element)
    return current_page_data

def transform_data(current_page_data):
    """Collapse the items collected for one page into a single flat row.

    Args:
        current_page_data: dict with the lists 'tables' (objects exposing
            ``export_to_markdown()``) and 'texts' (objects exposing ``label``
            and ``text``).

    Returns:
        dict with the keys 'document', 'page_num', 'table_markdown' and 'text'.
        'document' and 'page_num' are left as empty strings for the caller to
        fill in. 'table_markdown' holds the markdown of every table on the
        page, separated by blank lines. 'text' holds the page's text items
        (page footers excluded), one per line, followed by the table markdown
        when the page has any.
    """
    # Keep every table: assigning inside a loop would retain only the last one.
    exported_tables = (table.export_to_markdown() for table in current_page_data['tables'])
    table_markdown = '\n\n'.join(markdown for markdown in exported_tables if markdown)

    paragraphs = [
        text.text
        for text in current_page_data['texts']
        if text.label != 'page_footer'
    ]
    # Skip the table part on table-free pages so 'text' gets no trailing newline.
    if table_markdown:
        paragraphs.append(table_markdown)

    return {
        'document': '',
        'page_num': '',
        'table_markdown': table_markdown,
        'text': '\n'.join(paragraphs),
    }

def get_data(conv_res):
    """Split a converted document into one row per page.

    Items are read from ``conv_res.document.iterate_items()`` and grouped by
    the page number of their first provenance entry. Page numbering starts at
    1, and a row is emitted every time the page number changes plus once more
    for the final page, so a document without items still yields a single
    empty row for page 1.

    Args:
        conv_res: conversion result exposing ``document.iterate_items()`` and
            ``input.file.name``.

    Returns:
        list of row dicts as produced by ``transform_data``, with 'document'
        set to the input file name and 'page_num' set to the page number.
    """
    document_name = conv_res.input.file.name

    def build_row(page_data, page_number):
        # Flatten one page and stamp it with its document name and page number.
        row = transform_data(page_data)
        row['document'] = document_name
        row['page_num'] = page_number
        return row

    rows = []
    current_page_number = 1
    current_page_data = {'tables': [], 'texts': []}

    for element, _level in conv_res.document.iterate_items():
        # Items without provenance carry no page number; keep them on the
        # current page instead of failing on prov[0].
        prov = getattr(element, 'prov', None)
        new_page_number = prov[0].page_no if prov else current_page_number

        if new_page_number != current_page_number:
            rows.append(build_row(current_page_data, current_page_number))
            current_page_number = new_page_number
            current_page_data = {'tables': [], 'texts': []}

        current_page_data = definition_data(element, current_page_data)

    # Emit the row for the last page, which the loop never flushes.
    rows.append(build_row(current_page_data, current_page_number))

    return rows

# Module-level result list; proccessDocs() rebinds this name (via `global rows`)
# to the rows it collects.
rows = []

import os
from multiprocessing import Pool, Manager
from functools import partial
from tqdm import tqdm

def proccessDoc(shared_rows, input_doc_path, file_name=None):
    """Convert one document and append its page rows to a shared list.

    Args:
        shared_rows: list-like object exposing ``extend`` (for example a
            ``multiprocessing.Manager().list()`` proxy).
        input_doc_path: path of the document to convert, or a
            ``(path, file_name)`` pair as handed over by ``Pool.imap*`` when
            the task list consists of tuples.
        file_name: label used in the status message; defaults to the base
            name of the path when omitted.

    Returns:
        ``(True, message)`` on success, ``(False, message)`` when conversion
        or row extraction raised; the exception text is part of the message.
    """
    # Pool.imap* passes each task as ONE argument, so accept the packed form.
    if file_name is None and isinstance(input_doc_path, (tuple, list)):
        input_doc_path, file_name = input_doc_path
    if file_name is None:
        file_name = os.path.basename(str(input_doc_path))

    try:
        conv_result = doc_converter.convert(input_doc_path)
        new_rows = get_data(conv_result)
        # Manager list proxies have no get_lock() (that belongs to Value/Array);
        # extend() is a single proxied call, so no extra locking is applied.
        shared_rows.extend(new_rows)
        return True, f"Успешно обработан файл: {file_name}"
    except Exception as e:
        return False, f"Ошибка при обработке файла {file_name}: {str(e)}"

def _proccess_doc_task(shared_rows, task):
    """Unpack one (path, name) task and forward it to proccessDoc."""
    input_doc_path, file_name = task
    return proccessDoc(shared_rows, input_doc_path, file_name)


def proccessDocs(dir, num_processes=None):
    """Convert every file of a directory in a process pool.

    The collected rows are stored in the module-level ``rows`` list, and a
    progress bar plus a summary are printed.

    Args:
        dir: directory whose regular files are converted (sub-directories are
            skipped).
        num_processes: worker count; defaults to one less than the CPU count,
            but never fewer than one.

    Returns:
        None. The result is published through the global ``rows``.
    """
    global rows

    if num_processes is None:
        # cpu_count() may return None or 1; a pool needs at least one worker.
        num_processes = max(1, (os.cpu_count() or 1) - 1)

    # Collect (full path, name without extension) for every regular file.
    # splitext keeps dotted names such as "report.v2.pdf" intact.
    files = [(os.path.join(dir, file), os.path.splitext(file)[0])
             for file in os.listdir(dir)
             if os.path.isfile(os.path.join(dir, file))]
    total_files = len(files)

    # The manager-backed list is shared between the worker processes.
    with Manager() as manager:
        shared_rows = manager.list()

        # imap_unordered passes each (path, name) tuple as a single argument,
        # so bind the shared list to a helper that unpacks the tuple.
        process_func = partial(_proccess_doc_task, shared_rows)

        # Counters for the final summary.
        success_count = 0
        error_count = 0

        print(f"\nНачало обработки {total_files} файлов используя {num_processes} процессов")

        # Run the pool behind a progress bar.
        with Pool(processes=num_processes) as pool:
            with tqdm(total=total_files, desc="Обработка файлов",
                      unit="файл", ncols=100) as pbar:

                # imap_unordered yields results as they finish, which lets the
                # progress bar advance per file.
                for success, message in pool.imap_unordered(process_func, files):
                    if success:
                        success_count += 1
                    else:
                        error_count += 1
                        print(f"\n{message}")
                    pbar.update()

        # Copy the proxy into a plain list while the manager is still alive.
        rows = list(shared_rows)

        # Print the final summary.
        print("\nИтоги обработки:")
        print(f"✓ Успешно обработано: {success_count} файлов")
        if error_count > 0:
            print(f"✗ Ошибок обработки: {error_count} файлов")
        print(f"Всего извлечено строк: {len(rows)}")

In [3]:
def convert_doc(input_doc_path: str) -> list:
    """Convert a single document and return its per-page rows.

    Args:
        input_doc_path: path handed to ``doc_converter.convert``.

    Returns:
        The list produced by ``get_data`` for the conversion result.
    """
    return get_data(doc_converter.convert(input_doc_path))

In [4]:
import os
# Folder (relative path) that holds the documents to convert.
_dir = 'input'

# One joined path per directory entry, in the order os.listdir() reports them.
files = [os.path.join(_dir, entry_name) for entry_name in os.listdir(_dir)]

In [6]:
from tqdm import tqdm

def process_documents(files):
    """Convert the given documents one after another and gather their rows.

    Each path is passed to ``convert_doc``. A non-empty result counts as a
    success and its rows are collected; an empty result or a raised exception
    counts as a failure (exceptions are printed and the loop continues). A
    short summary is printed at the end.

    Args:
        files: iterable of document paths.

    Returns:
        list with the rows of every successfully converted document.
    """
    all_rows = []
    ok_count = 0
    error_count = 0

    for path in tqdm(files, desc="Обработка файлов"):
        try:
            page_rows = convert_doc(path)
            if not page_rows:
                # Nothing extracted: counted as a failure, no message printed.
                error_count += 1
                continue
            all_rows.extend(page_rows)
            ok_count += 1
        except Exception as exc:
            error_count += 1
            print(f"Ошибка при обработке {path}: {str(exc)}")

    print("\nОбработка завершена:")
    print(f"Успешно: {ok_count}")
    print(f"С ошибками: {error_count}")
    print(f"Всего извлечено строк: {len(all_rows)}")

    return all_rows
# Convert every path collected in `files`; the extracted rows are kept in `converted_docs`.
converted_docs = process_documents(files)

Обработка файлов:  24%|███████████████                                               | 17/70 [17:51<1:05:20, 73.97s/it]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Обработка файлов:  34%|████████████████████▏                                      | 24/70 [1:37:24<3:22:26, 264.06s/it]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Обработка файлов:  47%|█████████████████████████████▏                                | 33/70 [1:42:30<22:49, 37.02s/it]Encountered an error during conversion of document 62fd8b2fb60a5693c23aab39c30f0cb769e155912b0ffa90e4d0563d949e7361:
Traceback (most recent call last):

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 149, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 116, in _apply_on_pages
    yield from page_batch

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\page_assemble_model.py", line 59, in __call__
    for page in page_batch:

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\table_structure_model.py", line 166, in __call__
    text_piece = page._backend.get_text_in

Ошибка при обработке input\39.pdf: float division by zero


Обработка файлов:  50%|███████████████████████████████                               | 35/70 [1:43:25<19:33, 33.54s/it]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Обработка файлов:  54%|█████████████████████████████████▋                            | 38/70 [1:50:19<42:46, 80.20s/it]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Обработка файлов:  56%|██████████████████████████████████▌                           | 39/70 [1:51:05<36:10, 70.01s/it]Encountered an error during conversion of document e291b305b4cbf12dbe79b51429c1abf848e641f515dffbb32b8769eff2b7860c:
Traceback (most recent call last):

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 149, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 116, in _apply_on_pages
    yield from page_batch

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\page_assemble_model.py", line 59, in __call__
    for page in page_batch:

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\table_structure_model.py", line 166, in __call__
    text_piece = page._backend.get_text_in

Ошибка при обработке input\44.pdf: float division by zero


Обработка файлов:  80%|████████████████████████████████████████████████▊            | 56/70 [3:33:09<58:49, 252.07s/it]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Обработка файлов:  87%|█████████████████████████████████████████████████████▏       | 61/70 [3:44:02<15:02, 100.33s/it]Encountered an error during conversion of document 61d2f566e0e6a25594b13f8727761b5b458e74300bd94f0599550519df7a3a08:
Traceback (most recent call last):

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 149, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 116, in _apply_on_pages
    yield from page_batch

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\page_assemble_model.py", line 59, in __call__
    for page in page_batch:

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\table_structure_model.py", line 166, in __call__
    text_piece = page._backend.get_text_in

Ошибка при обработке input\64.pdf: float division by zero


Encountered an error during conversion of document d78ada7eb0aa81c08a3e691c3b4078df94a65312e4cebb4f428d38f54c250d02:
Traceback (most recent call last):

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 149, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\pipeline\base_pipeline.py", line 116, in _apply_on_pages
    yield from page_batch

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\page_assemble_model.py", line 59, in __call__
    for page in page_batch:

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\models\table_structure_model.py", line 166, in __call__
    text_piece = page._backend.get_text_in_rect(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python31

Ошибка при обработке input\65.pdf: float division by zero


Обработка файлов: 100%|█████████████████████████████████████████████████████████████| 70/70 [3:59:01<00:00, 204.87s/it]


Обработка завершена:
Успешно: 66
С ошибками: 4
Всего извлечено строк: 4452





In [31]:
# NOTE(review): this repeats the full conversion of `files` and stores the rows
# under a second name (`res`) — confirm the re-run is intended rather than
# reusing `converted_docs`.
res = process_documents(files)

ERROR:docling.datamodel.document:An unexpected error occurred while opening the document 0.pdf
Traceback (most recent call last):
  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\datamodel\document.py", line 120, in __init__
    self._init_doc(backend, path_or_stream)
  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\datamodel\document.py", line 173, in _init_doc
    self._backend = backend(self, path_or_stream=path_or_stream)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\docling\backend\docling_parse_backend.py", line 194, in __init__
    self._pdoc = pdfium.PdfDocument(self.path_or_stream)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wurfil\AppData\Local\Programs\Python\Python311\Lib\site-packages\pypdfium2\_helpers\document.py", line 78, in __init__
    self.raw, to_hold, to_

In [None]:
import pandas as pd

# Export the rows gathered by the sequential run (`converted_docs`). The
# module-level `rows` list is only filled by proccessDocs(), which is not called
# in the cells shown here, so building the frame from `rows` would write an
# empty table.
df = pd.DataFrame(converted_docs)
df.to_csv('stucture_data.csv')