In [1]:
import os
import re
import torch
from multiprocessing import Pool
from docling.document_converter import DocumentConverter

# Set OMP_NUM_THREADS to 12 in this process's environment; child processes
# started later (e.g. pool workers) inherit the value.
# NOTE(review): this assignment runs after `import torch`. OpenMP runtimes
# usually read OMP_NUM_THREADS when they initialise, so confirm the setting
# actually takes effect here (or move it above the imports).
# NOTE(review): with several worker processes each allowed 12 threads, the
# machine may be oversubscribed; verify against the available core count.
os.environ["OMP_NUM_THREADS"] = "12"

def decode_unicode_sequences(text):
    """Replace ``/uniXXXX`` escapes in *text* with the characters they name.

    ``XXXX`` is exactly four hexadecimal digits (either case). Escapes whose
    value falls in the UTF-16 surrogate range (U+D800..U+DFFF) are left
    untouched: ``chr()`` would turn them into lone surrogates, which cannot
    be encoded as UTF-8 and would make a later file write fail.

    Args:
        text: Input string, possibly containing ``/uniXXXX`` sequences.

    Returns:
        The string with every decodable escape replaced by its character.
    """

    def _decode(match):
        code_point = int(match.group(1), 16)
        if 0xD800 <= code_point <= 0xDFFF:
            # Keep the original escape rather than emit an unencodable char.
            return match.group(0)
        return chr(code_point)

    return re.sub(r"/uni([0-9a-fA-F]{4})", _decode, text)

# Per-process converter instance. Each pool worker fills this in lazily on
# its first task, so the converter is built once per worker rather than once
# per PDF.
_WORKER_CONVERTER = None


def _get_converter():
    """Return this process's DocumentConverter, creating it on first use."""
    global _WORKER_CONVERTER
    if _WORKER_CONVERTER is None:
        _WORKER_CONVERTER = DocumentConverter()
    return _WORKER_CONVERTER


def process_file(args):
    """Convert one PDF to Markdown and write it under the output tree.

    The output path mirrors the PDF's location relative to ``pdf_root``,
    re-rooted at ``md_root`` with the extension replaced by ``.md``.

    Args:
        args: A ``(pdf_path, md_root, pdf_root)`` tuple. It is packed into a
            single argument so the function can be used with ``Pool.map``.

    Returns:
        ``True`` if the Markdown file was written, ``False`` if any step
        raised. The error is printed rather than propagated, so one bad file
        does not abort the whole batch.
    """
    pdf_path, md_root, pdf_root = args
    try:
        relative_path = os.path.relpath(pdf_path, pdf_root)
        md_path = os.path.join(md_root, os.path.splitext(relative_path)[0] + ".md")

        try:
            result = _get_converter().convert(pdf_path)
            markdown_text = result.document.export_to_markdown()
            markdown_text = decode_unicode_sequences(markdown_text)

            # dirname is empty when md_path has no directory component;
            # os.makedirs("") would raise in that case.
            md_dir = os.path.dirname(md_path)
            if md_dir:
                os.makedirs(md_dir, exist_ok=True)
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
        finally:
            # Release cached GPU memory after failed conversions as well,
            # not only after successful ones.
            torch.cuda.empty_cache()

        print(f"✅ Файл {pdf_path} сохранен в {md_path}")
        return True
    except Exception as e:
        # Worker-level boundary: report the failure and keep the pool running.
        print(f"❌ Ошибка при обработке {pdf_path}: {e}")
        return False

def convert_all_pdfs(pdf_root="../pdf", md_root="../md_benchmark", workers=4):
    """Convert every PDF under ``pdf_root`` to Markdown under ``md_root``.

    Walks ``pdf_root`` recursively, collects files whose name ends in
    ``.pdf`` (case-insensitive), and hands each one to ``process_file`` in a
    process pool. Prints a summary of how many files were converted.

    Args:
        pdf_root: Directory tree to search for PDF files.
        md_root: Root directory for the generated Markdown files.
        workers: Maximum number of worker processes. ``None`` lets ``Pool``
            pick its default.

    Returns:
        None.
    """
    tasks = []
    for root, _, files in os.walk(pdf_root):
        for file in files:
            if file.lower().endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                tasks.append((pdf_path, md_root, pdf_root))

    results = []
    if tasks:
        # Never start more workers than there are files to convert.
        processes = workers if workers is None else min(workers, len(tasks))
        # Start the process pool. chunksize=1 hands out one PDF at a time, so
        # a worker stuck on a large document does not also hold a batch of
        # pre-assigned files while other workers sit idle.
        with Pool(processes=processes) as pool:
            results = pool.map(process_file, tasks, chunksize=1)

    print(f"Обработано {sum(results)} файлов из {len(tasks)}")

# Guard the entry point so that worker processes which re-import this module
# (the "spawn" start method) do not start a nested conversion run.
if __name__ == "__main__":
    convert_all_pdfs(workers=3)

2025-05-10 16:47:58.113958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746884878.136140   25893 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746884878.143075   25893 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746884878.161233   25893 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746884878.161250   25893 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746884878.161252   25893 computation_placer.cc:177] computation placer alr

✅ Файл ../pdf/benchmark/ai_industry_reports/investments3.pdf сохранен в ../md_benchmark/benchmark/ai_industry_reports/investments3.md
✅ Файл ../pdf/benchmark/oil/Нефтяная_и_газовая_промышленность.pdf сохранен в ../md_benchmark/benchmark/oil/Нефтяная_и_газовая_промышленность.md
✅ Файл ../pdf/benchmark/ai_industry_reports/strategy3.pdf сохранен в ../md_benchmark/benchmark/ai_industry_reports/strategy3.md
✅ Файл ../pdf/benchmark/ai_industry_reports/report3.pdf сохранен в ../md_benchmark/benchmark/ai_industry_reports/report3.md
✅ Файл ../pdf/benchmark/ai_industry_reports/trends3.pdf сохранен в ../md_benchmark/benchmark/ai_industry_reports/trends3.md
✅ Файл ../pdf/benchmark/linux/linux.pdf сохранен в ../md_benchmark/benchmark/linux/linux.md
✅ Файл ../pdf/benchmark/ai_industry_reports/index3.pdf сохранен в ../md_benchmark/benchmark/ai_industry_reports/index3.md
✅ Файл ../pdf/benchmark/oil/ИТС-28-2017_добыча_нефти.pdf сохранен в ../md_benchmark/benchmark/oil/ИТС-28-2017_добыча_нефти.md
✅ Файл