In [12]:
import os
import win32com.client as win32

def split_word_by_heading2_and_export_pdfs(docx_path, output_dir=None, heading_style="Heading 1"):
    """Split a Word document at each heading paragraph and export every section as a PDF.

    The document is scanned paragraph by paragraph. Every non-empty paragraph
    whose style name starts with ``heading_style`` opens a new section that
    runs up to (but not including) the next such paragraph, or to the end of
    the document. Each section is copied with its formatting into a fresh
    document and saved as ``NN_<sanitized heading text>.pdf``.

    Text that precedes the first matching heading is not exported.

    NOTE: despite the function name, the default matches "Heading 1", which is
    what this function has always matched. Pass ``heading_style="Heading 2"``
    to split on second-level headings instead.

    NOTE(review): the match is made against ``Style.NameLocal``, which
    presumably is localized on non-English Word installations -- confirm and
    pass the localized style name if so.

    Args:
        docx_path: Path to the source document. Converted to an absolute path
            before being handed to Word.
        output_dir: Folder that receives the PDFs. Defaults to
            ``<docx_path without extension>_split``. Created if missing.
        heading_style: Style-name prefix that marks the start of a section.

    Returns:
        None. A summary line is printed when the export finishes.

    Raises:
        Whatever the Word COM layer raises (e.g. when the document cannot be
        opened). Word is shut down and open documents are closed before the
        exception propagates.
    """
    WD_FORMAT_PDF = 17  # WdSaveFormat.wdFormatPDF

    # Word runs as a separate process, so a relative path may not resolve
    # against Python's working directory; always hand it absolute paths.
    docx_path = os.path.abspath(docx_path)
    if output_dir is None:
        output_dir = os.path.splitext(docx_path)[0] + "_split"
    output_dir = os.path.abspath(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Start Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = None
    try:
        word.Visible = False
        doc = word.Documents.Open(docx_path)

        paragraphs = doc.Paragraphs
        paragraph_count = paragraphs.Count  # each COM property read is a round trip
        heading_indices = []

        # Step 1: find the 1-based paragraph index and text of every matching heading
        for i in range(1, paragraph_count + 1):
            rng = paragraphs(i).Range

            try:
                style_name = rng.Style.NameLocal
            except Exception:
                continue  # Skip paragraphs without a valid style

            if style_name.startswith(heading_style):
                text = rng.Text.strip()
                if text:
                    heading_indices.append((i, text))

        # Sentinel so the last real heading's section ends at the final paragraph
        heading_indices.append((paragraph_count + 1, "END"))

        # Step 2: export the span between each heading and the next one
        for idx in range(len(heading_indices) - 1):
            start_idx, title = heading_indices[idx]
            end_idx, _ = heading_indices[idx + 1]

            start_range = paragraphs(start_idx).Range.Start
            end_range = paragraphs(end_idx - 1).Range.End
            section_range = doc.Range(Start=start_range, End=end_range)

            # Keep letters, digits, space, underscore and hyphen; replace the rest
            safe_title = "".join(c if c.isalnum() or c in " _-" else "_" for c in title)
            pdf_filename = f"{idx + 1:02d}_{safe_title}.pdf"
            pdf_path = os.path.join(output_dir, pdf_filename)

            # Copy the section into a scratch document and export it as PDF
            new_doc = word.Documents.Add()
            try:
                new_doc.Range().FormattedText = section_range.FormattedText
                new_doc.SaveAs(pdf_path, FileFormat=WD_FORMAT_PDF)
            finally:
                # Discard the scratch document even if the export failed
                new_doc.Close(False)
    finally:
        # Always release Word; otherwise a hidden WINWORD.EXE process is left
        # running (and may keep the source document locked) after any failure.
        try:
            if doc is not None:
                doc.Close(False)
        finally:
            word.Quit()

    print(f"Exported {len(heading_indices)-1} sections to: {output_dir}")

# Example usage: split demo.docx. No output_dir is passed, so the PDFs are written
# to the default "<input path without extension>_split" folder (here: demo_split).
split_word_by_heading2_and_export_pdfs(r"C:\Users\MSI\Documents\DOC_Extractor\demo.docx")


Exported 5 sections to: C:\Users\MSI\Documents\DOC_Extractor\demo_split


In [None]:
# Second input document; this variable is passed to the split call in the next cell.
docx_path = r"C:\Users\MSI\Documents\DOC_Extractor\demo2.docx"


In [None]:
# Split the document selected via docx_path. The splitter defined in this notebook
# is split_word_by_heading2_and_export_pdfs; the previously used name is not defined
# here and would raise NameError on a fresh kernel.
split_word_by_heading2_and_export_pdfs(docx_path)