In [1]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline, PdfPipelineOptions
from pathlib import Path

In [408]:
# Location of the source PDF, relative to the notebook's working directory.
test_pdf_path = Path("..") / "data" / "files" / "12-583-x2021001-eng.pdf"

In [409]:
# PDF pipeline configuration: OCR is switched off, table-structure
# recognition is switched on.
pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)
# Cell matching lives on the nested table-structure options object, so it is
# set after construction.
pipeline_options.table_structure_options.do_cell_matching = False

# Register the options for PDF inputs only.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: pdf_format_option}
)

# Run the conversion; `result.document` is what the later cells consume.
result = doc_converter.convert(test_pdf_path)

In [410]:
def get_hierarchy_level(text: str) -> tuple[int, str | None]:
    """
    Determine the hierarchy level of a text based on its numeric prefix.
    
    Args:
        text (str): The input text to analyze
        
    Returns:
        tuple[int, str | None]: A tuple containing:
            - hierarchy level (1-4, or 0 if no numeric prefix)
            - the numeric prefix if found, None otherwise
            
    Examples:
        >>> get_hierarchy_level("52 Technical occupations")      # Returns (1, "52")
        >>> get_hierarchy_level("5210 Technical occupations")    # Returns (3, "5210")
        >>> get_hierarchy_level("No numbers here")               # Returns (0, None)
    """
    text = text.strip()
    # Find the numeric prefix
    numeric_prefix = ""
    for char in text:
        if char.isdigit():
            numeric_prefix += char
        else:
            break

    if not numeric_prefix:
        return (0, None)
    
    # Check that there are at least 5 non-numeric characters after the prefix
    remaining_text = text[len(numeric_prefix):].strip()
    if len(remaining_text) < 5:
        return (0, None)
        
    # Map the length of the numeric prefix to hierarchy level
    level_map = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
    
    level = level_map.get(len(numeric_prefix), 0)
    return (level, numeric_prefix if level > 0 else None)

from pydantic import BaseModel
from typing import Type

def filter_object_item_fields(item_dict: dict, object_type: Type[BaseModel]) -> dict:
    """Return the entries of ``item_dict`` whose keys are fields of ``object_type``.

    Args:
        item_dict: Mapping of field names to values (for example the result of
            ``model_dump()`` on another model).
        object_type: The pydantic model class whose declared ``model_fields``
            decide which keys are kept.

    Returns:
        A new dict containing only the keys declared on ``object_type``;
        ``item_dict`` itself is not modified.
    """
    # Build the lookup set once instead of once per key in the comprehension.
    allowed_fields = set(object_type.model_fields)
    return {key: value for key, value in item_dict.items() if key in allowed_fields}

In [411]:
from docling_core.types.doc import TextItem, SectionHeaderItem, DocItemLabel

# Labels whose items are turned back into plain text items (unless the text
# carries a numeric heading prefix, which takes priority below).
labels_demoted_to_text = (
    DocItemLabel.SECTION_HEADER,
    DocItemLabel.TITLE,
    DocItemLabel.DOCUMENT_INDEX,
    DocItemLabel.PARAGRAPH,
)

# NOTE(review): `elements` is not consumed by any cell visible here; the
# chunking cell operates on `result.document` directly. Confirm whether the
# relabelled items are meant to be written back into the document.
elements = []
for item in result.document.texts:
    print(item.text)
    print(item.label)
    level, prefix = get_hierarchy_level(item.text)
    print(level, prefix)
    item_dict = item.model_dump()

    if level > 0:
        # Numeric prefix found: rebuild the item as a section header whose
        # level is the prefix length.
        print('renaming text section')
        item_dict = filter_object_item_fields(item_dict, SectionHeaderItem)
        item_dict['label'] = DocItemLabel.SECTION_HEADER
        item = SectionHeaderItem(**item_dict)
        item.level = level
    elif item.label in labels_demoted_to_text:
        # No numeric prefix: rebuild headers/titles/index/paragraph items as
        # plain text items for chunking.
        print('renaming section or title')
        item_dict = filter_object_item_fields(item_dict, TextItem)
        item_dict['label'] = DocItemLabel.TEXT
        item = TextItem(**item_dict)
    else:
        # Any other label is passed through untouched.
        print('keep as is')

    elements.append(item)

Catalogue no. 12-583-X ISBN 978-0-660-40067-9
page_header
0 None
keep as is
National Occupational Classification (NOC) 2021 Version 1.0
section_header
0 None
renaming section or title
Release date: September 21, 2021 PDF publish date: November 2, 2022
text
0 None
keep as is
How to obtain more information
section_header
0 None
renaming section or title
For information about this product or the wide range of services and data available from Statistics Canada, visit our website, www.statcan.gc.ca.
text
0 None
keep as is
You can also contact us by
text
0 None
keep as is
Email at infostats@statcan.gc.ca
text
0 None
keep as is
Telephone, from Monday to Friday, 8:30 a.m. to 4:30 p.m., at the following numbers:
text
0 None
keep as is
• Statistical Information Service
list_item
0 None
keep as is
1-800-263-1136
text
1 1
renaming text section
• National telecommunications device for the hearing impaired
list_item
0 None
keep as is
1-800-363-7629
text
1 1
renaming text section
• Fax line
list_item

In [421]:
from docling.chunking import HybridChunker, HierarchicalChunker

# NOTE(review): docling's HybridChunker documentation describes `tokenizer`,
# `max_tokens` and `merge_peers`; confirm that `chunk_size` / `chunk_overlap`
# are actually honoured by the installed version. The tokenizer warning this
# cell emits (1518 > 512) suggests the 1000 limit may not be applied.
chunker_kwargs = {
    'chunk_size': 1000,
    'chunk_overlap': 100,
    'merge_list_items': True,
}
chunker = HybridChunker(**chunker_kwargs)

# Chunk the converted document; the result is consumed in the next cell.
chunk_iter = chunker.chunk(result.document)

Token indices sequence length is longer than the specified maximum sequence length for this model (1518 > 512). Running this sequence through the model will result in indexing errors


In [422]:
# Materialise the chunk iterator into a list so it can be counted and reused.
# NOTE(review): `chunk_iter` is presumably single-use, so re-running this cell
# on its own would yield an empty list; re-run the chunking cell first.
chunks = [*chunk_iter]
len(chunks)

3593

In [423]:
def make_chunk_dict(chunk):
    """Flatten one chunk into a plain dict suitable for a DataFrame row.

    Args:
        chunk: Object exposing ``text`` and ``meta``, where ``meta`` provides
            ``doc_items`` (each with ``prov`` entries carrying ``page_no`` and
            a ``self_ref``), ``headings`` (a list or None) and ``origin``
            (an object with ``filename``, or None).

    Returns:
        dict with keys, in order: ``filename``, ``pages`` (sorted unique page
        numbers), ``doc_refs``, ``text`` and ``h1`` .. ``h5`` (the first five
        headings, padded with None). Headings beyond the fifth are dropped.
    """
    max_heading_levels = 5
    meta = chunk.meta

    # A set comprehension de-duplicates; sorting makes the page order
    # deterministic instead of depending on set iteration order.
    unique_pages = sorted(
        {prov.page_no for doc_item in meta.doc_items for prov in doc_item.prov}
    )
    doc_refs = [doc_item.self_ref for doc_item in meta.doc_items]

    # Pad (or truncate) the headings to exactly five slots; a missing or empty
    # headings value yields five Nones.
    headings = list(meta.headings or [])
    padded_headings = (headings + [None] * max_heading_levels)[:max_heading_levels]

    # Tolerate chunks without origin information rather than raising.
    origin = meta.origin
    filename = origin.filename if origin is not None else None

    record = {
        'filename': filename,
        'pages': unique_pages,
        'doc_refs': doc_refs,
        'text': chunk.text,
    }
    for position, heading in enumerate(padded_headings, start=1):
        record[f'h{position}'] = heading
    return record

In [426]:
import pandas as pd

# One row per chunk: flatten every chunk into a dict, then build the frame once.
chunk_records = [make_chunk_dict(chunk) for chunk in chunks]
df = pd.DataFrame(chunk_records)

In [428]:
# Top-level (`h1`) headings to keep. Matching is on the exact heading text, so
# any difference in wording or spacing excludes the chunk.
kept_h1_headings = [
    '1 Business, finance and administration occupations',
    '2 Natural and applied sciences and related occupations',
    '3 Health occupations',
    '4 Occupations in education, law and social, community and government services',
    '5 Occupations in art, culture, recreation and sport',
    '6 Sales and service occupations',
    '7 Trades, transport and equipment operators and related occupations',
    '8 Natural resources, agriculture and related production occupations',
    '9 Occupations in manufacturing and utilities',
    '0 Legislative and senior management occupations',
]

# Boolean mask first, then select: keeps only chunks under one of those headings.
has_kept_h1 = df['h1'].isin(kept_h1_headings)
df_filt = df[has_kept_h1]

In [433]:
# Write the filtered chunks to an Excel workbook (the DataFrame index is
# written too, since `index` is left at its default).
clean_chunks_xlsx = '../data/clean_noc_chunks.xlsx'
df_filt.to_excel(clean_chunks_xlsx)