In [12]:
import os
import win32com.client as win32

def split_word_by_heading2_and_export_pdfs(docx_path, output_dir=None):
    # Setup output directory
    if output_dir is None:
        output_dir = os.path.splitext(docx_path)[0] + "_split"
    os.makedirs(output_dir, exist_ok=True)

    # Start Word
    word = win32.gencache.EnsureDispatch('Word.Application')
    word.Visible = False
    doc = word.Documents.Open(docx_path)

    paragraphs = doc.Paragraphs
    heading_indices = []

    # Step 1: Find all Heading 2 positions
    for i in range(1, paragraphs.Count + 1):
        para = paragraphs(i)
        rng = para.Range

        try:
            style = rng.Style
            style_name = style.NameLocal  # Get the name string
        except Exception:
            continue  # Skip paragraphs without a valid style

        if style_name.startswith("Heading 1"):
            text = rng.Text.strip()
            if text:
                heading_indices.append((i, text))

    # Add an artificial end index
    heading_indices.append((paragraphs.Count + 1, "END"))

    # Step 2: Loop through each Heading 2 section
    for idx in range(len(heading_indices) - 1):
        start_idx, title = heading_indices[idx]
        end_idx, _ = heading_indices[idx + 1]

        start_range = paragraphs(start_idx).Range.Start
        end_range = paragraphs(end_idx - 1).Range.End

        section_range = doc.Range(Start=start_range, End=end_range)

        # Create a new document and copy content
        new_doc = word.Documents.Add()
        new_doc.Range().FormattedText = section_range.FormattedText

        # Sanitize filename
        safe_title = "".join(c if c.isalnum() or c in " _-" else "_" for c in title)
        pdf_filename = f"{idx + 1:02d}_{safe_title}.pdf"
        pdf_path = os.path.join(output_dir, pdf_filename)

        # Export as PDF
        new_doc.SaveAs(pdf_path, FileFormat=17)  # 17 = wdFormatPDF
        new_doc.Close(False)

    # Cleanup
    doc.Close(False)
    word.Quit()

    print(f"Exported {len(heading_indices)-1} sections to: {output_dir}")

# Example usage
split_word_by_heading2_and_export_pdfs(r"C:\Users\MSI\Documents\DOC_Extractor\demo.docx")


Exported 5 sections to: C:\Users\MSI\Documents\DOC_Extractor\demo_split


In [None]:
docx_path = r"C:\Users\MSI\Documents\DOC_Extractor\demo2.docx"


In [None]:
split_word_by_heading_and_export_pdf(docx_path)

In [2]:
import os
import time
import win32com.client as win32

WD_FORMAT_PDF = 17
WD_FIND_STOP  = 0  # don't wrap

def _collect_headings_by_find(doc, level):
    """Return [(start_pos, title), ...] for Heading {level} using Word's Find on Style."""
    rng = doc.Content
    find = rng.Find
    find.ClearFormatting()
    # Restrict to style "Heading {level}"
    find.Style = doc.Styles(f"Heading {level}")
    find.Text = ""          # match anything with that style
    find.Forward = True
    find.Format = True
    find.Wrap = WD_FIND_STOP

    headings = []
    # search incrementally by moving range past each hit
    while find.Execute():
        hit = rng.Duplicate  # snapshot before we move rng
        title = hit.Text.strip()
        if title:            # keep empty headings out
            headings.append((hit.Start, title))
        # advance search to end of current match
        rng.Start = hit.End
        rng.End = doc.Content.End

    # Add sentinel end position for the last section
    headings.append((doc.Content.End, "END"))
    return headings

def split_word_to_pdf(docx_path, study, level=2, use_find=False):
    # Output dir
    output_dir = os.path.abspath(f"Output_Pdfs_{study.upper()}")
    os.makedirs(output_dir, exist_ok=True)

    word = win32.DispatchEx('Word.Application')
    word.Visible = False
    word.DisplayAlerts = 0  # wdAlertsNone

    doc = None
    export_doc = None
    try:
        doc = word.Documents.Open(docx_path)

        if use_find:
            # FAST PATH: Find all "Heading {level}" by style
            heading_pos = _collect_headings_by_find(doc, level)
            # mirror your original behavior: print titles and build sections
            for _, t in heading_pos[:-1]:
                print(t)
        else:
            # ORIGINAL LOGIC: paragraph scan (kept here for parity)
            paragraphs = doc.Paragraphs
            n_paras = paragraphs.Count
            heading_pos = []
            for i in range(1, n_paras + 1):
                p = paragraphs(i)
                if p.OutlineLevel == level:
                    t = p.Range.Text.strip()
                    print(t)
                    if t:
                        heading_pos.append((p.Range.Start, t))
            heading_pos.append((doc.Content.End, "END"))

        export_doc = word.Documents.Add()
        export_range = export_doc.Range()

        keep = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789")
        def make_safe(name: str) -> str:
            return ''.join(ch if ch in keep else '_' for ch in name)

        # Export each section: [heading_k.start, heading_{k+1}.start)
        for idx in range(len(heading_pos) - 1):
            start_pos, title = heading_pos[idx]
            next_start, _ = heading_pos[idx + 1]

            section_range = doc.Range(Start=start_pos, End=next_start)
            export_range.FormattedText = section_range.FormattedText

            safe_title = make_safe(title)
            pdf_filename = f"{study.upper()}_{safe_title}"[:46] + ".pdf"
            print(pdf_filename)
            pdf_path = os.path.join(output_dir, pdf_filename)
            export_doc.SaveAs(pdf_path, FileFormat=WD_FORMAT_PDF)

        print(f"Exported {len(heading_pos)-1} sections to: {output_dir}")

    finally:
        if export_doc is not None:
            try:
                export_doc.Close(False)
            except Exception:
                pass
        if doc is not None:
            try:
                doc.Close(False)
            except Exception:
                pass
        try:
            word.Quit()
        except Exception:
            pass


In [6]:
start = time.time()
split_word_to_pdf(r"C:\Users\MSI\Documents\DOC_Extractor\demo.docx", "MyStudy2", level=2, use_find=True)
end = time.time()
print(f"Time taken: {end- start:.2f}s")

Inline formatting
Fun with fonts
Paragraph level formatting
Footnotes & Endnotes
Dropcaps
Links
Table of Contents
Bulleted List
Numbered List
Multi-level Lists
Continued Lists
MYSTUDY2_Inline_formatting.pdf
MYSTUDY2_Fun_with_fonts.pdf
MYSTUDY2_Paragraph_level_formatting.pdf
MYSTUDY2_Footnotes___Endnotes.pdf
MYSTUDY2_Dropcaps.pdf
MYSTUDY2_Links.pdf
MYSTUDY2_Table_of_Contents.pdf
MYSTUDY2_Bulleted_List.pdf
MYSTUDY2_Numbered_List.pdf
MYSTUDY2_Multi_level_Lists.pdf
MYSTUDY2_Continued_Lists.pdf
Exported 11 sections to: C:\Users\MSI\Documents\DOC_Extractor\Output_Pdfs_MYSTUDY2
Time taken: 6.33s
