In [8]:
from zipfile import ZipFile
from lxml import etree
from fpdf import FPDF
import re
import os

OUTPUT_DIR = "output_lxml"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Namespace for Word XML
NAMESPACE = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Class to handle Unicode PDF creation
class UnicodePDF(FPDF):
    def __init__(self):
        super().__init__()
        self.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
        self.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
        self.set_font("DejaVu", size=12)

# PDF creation function
def create_pdf(title, content, filename):
    pdf = UnicodePDF()
    pdf.add_page()

    pdf.set_font("DejaVu", style='B', size=16)
    pdf.cell(200, 10, txt=title, ln=True, align='C')
    pdf.ln(10)

    pdf.set_font("DejaVu", size=12)
    pdf.multi_cell(0, 10, content)
    
    pdf.output(filename)

# Sanitize filename (remove illegal characters)
def sanitize_filename(name):
    return re.sub(r'[\\/*?:"<>|]', "_", name.strip())

# Extract and split text from .docx file
def extract_text_from_docx(docx_file):
    with ZipFile(docx_file) as docx:
        xml_content = docx.read('word/document.xml')

    tree = etree.XML(xml_content)
    paragraphs = tree.xpath('//w:p', namespaces=NAMESPACE)

    current_title = None
    content = ""

    for para in paragraphs:
        style_elem = para.xpath('./w:pPr/w:pStyle/@w:val', namespaces=NAMESPACE)
        text_nodes = para.xpath('.//w:t', namespaces=NAMESPACE)
        text = ''.join(t for t in [node.text for node in text_nodes] if t)

        if not text.strip():
            continue

        if style_elem:
            style = style_elem[0]
            if style in ['Heading1']:  # Treat both as new sections
                if current_title and content:
                    filename = os.path.join(OUTPUT_DIR, sanitize_filename(current_title) + '.pdf')
                    create_pdf(current_title, content.strip(), filename)
                current_title = text
                content = ""
                continue

        # Accumulate content under the current header
        content += text + "\n"

    # Save last section
    if current_title and content:
        filename = os.path.join(OUTPUT_DIR, sanitize_filename(current_title) + '.pdf')
        create_pdf(current_title, content.strip(), filename)

# 🔧 Run the script
if __name__ == "__main__":
    docx_file = 'demo.docx'  # Your input Word file
    extract_text_from_docx(docx_file)
    print("✅ PDFs created successfully.")


  self.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
  self.add_font("DejaVu", "B", "DejaVuSans-Bold.ttf", uni=True)
  pdf.cell(200, 10, txt=title, ln=True, align='C')
  pdf.cell(200, 10, txt=title, ln=True, align='C')


✅ PDFs created successfully.
