In [11]:
from docx import Document
from docx.shared import RGBColor
from reportlab.lib.pagesizes import LETTER
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
from reportlab.lib import colors
import os

def get_text_and_style(run):
    """Capture a run's text and character formatting in a plain dict.

    Args:
        run: Object exposing ``text``, ``bold``, ``italic``, ``underline``,
            ``font.color.rgb`` and ``font.size``.

    Returns:
        dict: Keys 'text', 'bold', 'italic', 'underline', 'color' (an
        ``(r, g, b)`` tuple) and 'size' (the font size's ``pt`` value).
    """
    font = run.font

    # Black is used when the run has no RGB colour value.
    rgb = font.color.rgb
    color = (rgb[0], rgb[1], rgb[2]) if rgb else (0, 0, 0)

    # 12 is the fallback when the run does not carry a font size.
    font_size = font.size
    points = font_size.pt if font_size else 12

    return dict(
        text=run.text,
        bold=run.bold,
        italic=run.italic,
        underline=run.underline,
        color=color,
        size=points,
    )

def convert_to_paragraph_text(paragraph):
    """Convert a paragraph's runs into a list of styled-run dicts.

    Args:
        paragraph: Object exposing ``runs``, each run having a ``text``
            attribute and the attributes read by ``get_text_and_style``.

    Returns:
        list: One dict per run that has any text. An empty list is returned
        when the paragraph contains no non-whitespace text at all, so callers
        can keep using truthiness to skip blank paragraphs.
    """
    runs = paragraph.runs

    # A paragraph made only of whitespace (or with no runs) yields nothing.
    if not any(run.text.strip() for run in runs):
        return []

    # Whitespace-only runs are kept: dropping them would glue together the
    # words on either side when the separating space lives in its own run.
    return [get_text_and_style(run) for run in runs if run.text]

def styled_run_to_html(run):
    """Render one styled-run dict as inline markup for a ReportLab Paragraph.

    Args:
        run: Dict with the keys 'text', 'bold', 'italic', 'underline',
            'color' (an ``(r, g, b)`` tuple of ints) and 'size'.

    Returns:
        str: The escaped text inside a ``<font>`` tag carrying colour and
        size, wrapped in ``<b>``/``<i>``/``<u>`` for each truthy flag
        (opened in that order, closed in reverse).
    """
    # Imported locally so this function does not depend on a file-level import.
    from xml.sax.saxutils import escape

    flag_tags = (("b", 'bold'), ("i", 'italic'), ("u", 'underline'))
    active = [tag for tag, key in flag_tags if run[key]]

    r, g, b = run['color']
    color = f"#{r:02x}{g:02x}{b:02x}"
    size = run['size']

    # The result is consumed as markup, so literal &, < and > coming from the
    # document text must be escaped or they would be read as tags/entities.
    text = escape(run['text'])

    opening = ''.join(f"<{tag}>" for tag in active)
    closing = ''.join(f"</{tag}>" for tag in reversed(active))
    return f'{opening}<font color="{color}" size="{size}">{text}</font>{closing}'

def build_html_from_runs(runs):
    """Concatenate the markup of every styled-run dict in ``runs``.

    Args:
        runs: Iterable of styled-run dicts accepted by ``styled_run_to_html``.

    Returns:
        str: The per-run markup joined with no separator ('' for no runs).
    """
    fragments = []
    for styled_run in runs:
        fragments.append(styled_run_to_html(styled_run))
    return ''.join(fragments)

def is_list(paragraph):
    """Tell whether a paragraph's style name begins with "list".

    The comparison is case-insensitive.

    Args:
        paragraph: Object exposing ``style.name`` as a string.

    Returns:
        bool: True when the lower-cased style name starts with "list".
    """
    style_name = paragraph.style.name
    lowered = style_name.lower()
    return lowered.startswith("list")

def extract_tables(document):
    """Convert every table of ``document`` into nested lists of cell markup.

    Args:
        document: Object exposing ``tables``; each table exposes ``rows``,
            each row ``cells`` and each cell ``paragraphs``.

    Returns:
        list: One entry per table, each a list of rows, each row a list of
        strings. A cell's string is the markup of each of its paragraphs,
        every paragraph followed by "<br/>".
    """
    def cell_markup(cell):
        # Each paragraph contributes its run markup plus a trailing line break.
        parts = [
            build_html_from_runs(convert_to_paragraph_text(para)) + "<br/>"
            for para in cell.paragraphs
        ]
        return ''.join(parts)

    return [
        [[cell_markup(cell) for cell in row.cells] for row in table.rows]
        for table in document.tables
    ]

def save_section_to_pdf(header_obj, content, filename):
    """Write one section (heading plus content) to a Letter-size PDF.

    Args:
        header_obj: Dict with "runs" (styled-run dicts for the heading) and
            "alignment". Alignment values equal to 0, 1 or 2 select left,
            centre or right; any other value (including None) falls back to
            left.
        content: Iterable of content items. Supported items are:
            * a list of styled-run dicts -> body paragraph;
            * a dict with type "list" ("ordered", "index", "runs") -> a
              bullet or numbered line;
            * a dict with type "table" ("data": rows of cell markup strings).
            Any other item is ignored.
        filename: Path of the PDF file to create.
    """
    doc = SimpleDocTemplate(filename, pagesize=LETTER)
    styles = getSampleStyleSheet()
    body_style = ParagraphStyle(
        name='BodyStyle',
        parent=styles['Normal'],
        fontSize=12,
        leading=14,
        alignment=TA_LEFT
    )

    align_map = {0: TA_LEFT, 1: TA_CENTER, 2: TA_RIGHT}
    alignment = align_map.get(header_obj["alignment"], TA_LEFT)
    header_style = ParagraphStyle(
        name='HeaderStyle',
        parent=styles['Heading1'],
        alignment=alignment
    )

    story = [Paragraph(build_html_from_runs(header_obj["runs"]), header_style), Spacer(1, 12)]

    for item in content:
        if isinstance(item, list):
            para_html = build_html_from_runs(item)
            story.append(Paragraph(para_html, body_style))
            story.append(Spacer(1, 12))
        elif isinstance(item, dict) and item.get("type") == "list":
            bullet = "• " if not item.get("ordered") else f"{item.get('index')}. "
            para_html = build_html_from_runs(item["runs"])
            story.append(Paragraph(bullet + para_html, body_style))
            story.append(Spacer(1, 8))
        elif isinstance(item, dict) and item.get("type") == "table":
            # Cell strings contain <font>/<br/> markup. A Table draws plain
            # strings verbatim, so each one is wrapped in a Paragraph to have
            # the markup interpreted instead of printed.
            data = [
                [Paragraph(cell, body_style) if isinstance(cell, str) else cell
                 for cell in row]
                for row in item["data"]
            ]
            # Table needs at least one row and one column; skip empty tables
            # rather than failing the whole section.
            if not data or not data[0]:
                continue
            table = Table(data, hAlign='LEFT')
            table.setStyle(TableStyle([
                ("GRID", (0, 0), (-1, -1), 0.5, colors.grey),
                ("VALIGN", (0, 0), (-1, -1), "TOP"),
                ("LEFTPADDING", (0, 0), (-1, -1), 5),
                ("RIGHTPADDING", (0, 0), (-1, -1), 5),
            ]))
            story.append(table)
            story.append(Spacer(1, 12))

    doc.build(story)

def extract_headers_and_content(docx_path, allowed_levels):
    """Split a .docx file into sections that start at the selected headings.

    Args:
        docx_path: Path of the document to open.
        allowed_levels: A single heading level (int) or an iterable of
            levels. A paragraph styled "Heading N" opens a new section when
            N is one of these levels.

    Returns:
        list: ``(header, content)`` tuples in document order. ``header`` is a
        dict with "runs" (styled-run dicts) and "alignment" (the heading
        paragraph's alignment). ``content`` is a list whose items are a list
        of styled-run dicts (plain paragraph), a dict with type "list"
        ("ordered", "index", "runs"), or a dict with type "table" ("data").
        Content that precedes the first matching heading is discarded.
    """
    # Imported under an alias because the file-level name ``Table`` is the
    # ReportLab flowable.
    from docx.table import Table as DocxTable

    # Accept either one level or a collection of levels.
    if isinstance(allowed_levels, int):
        wanted_levels = {allowed_levels}
    else:
        wanted_levels = {int(level) for level in allowed_levels}

    document = Document(docx_path)
    sections = []
    current_header = None
    current_content = []

    table_data = extract_tables(document)
    next_table = 0  # index into table_data of the next table met in the body

    # Walk paragraphs and tables together in document order when the installed
    # python-docx offers it, so a table lands in the section it belongs to.
    iter_blocks = getattr(document, "iter_inner_content", None)
    blocks = iter_blocks() if iter_blocks is not None else document.paragraphs

    # Numbering state for runs of consecutive list items of the same kind.
    list_kind = None      # True = ordered, False = bulleted, None = not in a list
    list_position = 0

    for block in blocks:
        if isinstance(block, DocxTable):
            if next_table < len(table_data):
                current_content.append({"type": "table", "data": table_data[next_table]})
            next_table += 1
            list_kind = None
            continue

        para = block
        style = para.style.name

        if style.startswith("Heading "):
            # Style names such as "Heading Foo" have no numeric level; treat
            # them as ordinary paragraphs instead of raising.
            try:
                level = int(style.split(" ")[-1])
            except ValueError:
                level = None
            if level in wanted_levels:
                if current_header is not None:
                    sections.append((current_header, current_content))
                current_header = {
                    "runs": convert_to_paragraph_text(para),
                    "alignment": para.alignment
                }
                current_content = []
                list_kind = None
                continue

        if is_list(para):
            ordered = 'Number' in style
            # Number items by their position in the current list rather than
            # by how many content items the section already holds.
            list_position = list_position + 1 if ordered == list_kind else 1
            list_kind = ordered
            current_content.append({
                "type": "list",
                "ordered": ordered,
                "index": list_position,
                "runs": convert_to_paragraph_text(para)
            })
            continue

        # Any non-list paragraph ends the current list.
        list_kind = None
        styled_runs = convert_to_paragraph_text(para)
        if styled_runs:
            current_content.append(styled_runs)

    # Fallback for python-docx versions without iter_inner_content: tables
    # were not seen in the loop, so they are appended after the last paragraph.
    for table in table_data[next_table:]:
        current_content.append({"type": "table", "data": table})

    if current_header is not None:
        sections.append((current_header, current_content))

    return sections

def prompt_user_for_levels():
    """Ask on stdin for the heading levels to extract until the answer is valid.

    The answer is a comma-separated list; only 1, 2 and 3 are accepted.
    Empty entries (for example a trailing comma) are ignored.

    Returns:
        list: The chosen levels as ints, in the order they were typed.
    """
    valid_tokens = {'1', '2', '3'}
    while True:
        user_input = input("Which header levels do you want to extract? (e.g., 1,2,3): ")
        tokens = [piece.strip() for piece in user_input.split(',') if piece.strip()]
        accepted = [token for token in tokens if token in valid_tokens]

        if not accepted:
            print("Please enter at least one valid level: 1, 2, or 3.")
            continue

        # Reject a partly invalid answer instead of silently dropping the bad
        # entries, so a typo cannot quietly change which levels are extracted.
        if len(accepted) != len(tokens):
            print("Invalid input. Use a comma-separated list like 1,2.")
            continue

        return [int(token) for token in accepted]

def get_plain_text_from_runs(runs):
    """Join the 'text' value of every styled-run dict, dropping all styling.

    Args:
        runs: Iterable of dicts that each have a 'text' key.

    Returns:
        str: The texts concatenated in order ('' for no runs).
    """
    pieces = [styled_run['text'] for styled_run in runs]
    return ''.join(pieces)

def main():
    """Export each level-1 section of demo.docx as its own PDF.

    Creates the ``output_pdfs`` directory if needed, extracts the sections,
    and writes one file per section named after its position and heading
    text. Prints a message and returns early when no section is found.
    """
    docx_path = "demo.docx"
    output_dir = "output_pdfs"
    os.makedirs(output_dir, exist_ok=True)

    # The level is fixed here; the interactive prompt_user_for_levels() call
    # is currently disabled.
    levels = 1
    print(f"Extracting headers: {levels}")

    sections = extract_headers_and_content(docx_path, allowed_levels=levels)
    if not sections:
        print("No matching headers found.")
        return

    i = 0
    for header_obj, content in sections:
        i += 1  # sections are numbered from 1 in the file names
        header_text = get_plain_text_from_runs(header_obj["runs"])
        # Every non-alphanumeric character becomes "_" in the file name.
        safe_title = ''.join('_' if not c.isalnum() else c for c in header_text)
        filename = os.path.join(output_dir, f"section_{i}_{safe_title}.pdf")
        save_section_to_pdf(header_obj, content, filename)
        print(f"Saved: {filename}")


# Run the export when this code is executed as the main module.
if __name__ == "__main__":
    main()


Extracting headers: 1
Saved: output_pdfs\section_1_Text_Formatting.pdf
Saved: output_pdfs\section_2_Tables.pdf
Saved: output_pdfs\section_3_Structural_Elements.pdf
Saved: output_pdfs\section_4_Images.pdf
Saved: output_pdfs\section_5_Lists.pdf


In [4]:
# Scratch cell: build a sample RGBColor so its element access can be inspected.
a=RGBColor(66, 36, 233)

In [5]:
# Element 0 of the sample colour; the recorded output (66) is the first constructor argument.
a[0]

66