### Using docling v1

In [None]:
data_dir = 'document_collection/ibm-annual-report'
!OMP_NUM_THREADS=32 mamba run -n docling python ../scripts/docparser.py --input-dir {data_dir} --output-dir {data_dir}

### Using docling v2

In [None]:
data_dir = 'document_collection/ibm-annual-report'
!OMP_NUM_THREADS=32 mamba run -n docling python ../scripts/docparser_v2.py --input-dir {data_dir} --output-dir {data_dir} --c docling_v2_config.yaml

#### Docling Hybrid Chunking

In [6]:
%pip install -q -U docling

Note: you may need to restart the kernel to use updated packages.


In [51]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.chunking import HybridChunker
import yaml
from docling.datamodel.base_models import InputFormat
from transformers import AutoTokenizer
from datasets import Dataset


# Standard
from pathlib import Path
import yaml

# Third Party
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
import click

In [39]:
def chunk_document(document_path, max_tokens=500):
    """
    Chunk the document using docling hybrid chunking
    """
    doc = DocumentConverter().convert(source=document_path).document
    chunker = HybridChunker(max_tokens=max_tokens, tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"), merge_peers=True)
    chunk_iter = chunker.chunk(dl_doc=doc)
    document_chunks = []
    for _, chunk in enumerate(chunk_iter):
        enriched_text = chunker.serialize(chunk=chunk)
        document_chunks.append(enriched_text)
    return document_chunks

def add_icls_to_chunks(chunks: list[str], icls: list[dict]):
    # Product of chunks and icls
    chunk_icl_product = []
    for chunk in chunks:
        for icl in icls:
            chunk_icl_product.append({'document': chunk})
            chunk_icl_product[-1].update(icl)
    return chunk_icl_product


In [9]:
data_dir = 'document_collection/ibm-annual-report'
with open(f'../{data_dir}/qna.yaml', 'r') as f:
    qna_yaml = yaml.safe_load(f)

In [52]:
pdf_file_path = f'../{data_dir}/ibm-annual-report-2024.pdf'
### Conver the document into markdown
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["es"]
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=4,
    device=AcceleratorDevice.AUTO
)
doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
conv_result = doc_converter.convert(pdf_file_path)
content = getattr(conv_result.document, "export_to_html")()
output_path = data_dir / f"{conv_result.input.file.stem}.html"

content = getattr(conv_result.document, "export_to_markdown")()
output_path = data_dir / f"{conv_result.input.file.stem}.md"

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
chunks = chunk_document(f'../{data_dir}/ibm-annual-report-2024.pdf')

In [40]:
chunks_with_icls = add_icls_to_chunks(chunks, qna_yaml['seed_examples'])

In [41]:
chunks_with_icls = Dataset.from_list(chunks_with_icls)