Word Document Formatter

1. importing libraries

In [1]:
from langgraph.graph import StateGraph
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from typing import TypedDict, Annotated, List, Dict, Optional, Literal
from langgraph.graph import add_messages
from langchain_core.output_parsers import StrOutputParser
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser

In [2]:
from dotenv import load_dotenv
load_dotenv()

False

In [3]:
model = ChatOpenAI(
    model="mistralai/devstral-2512:free",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key="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",
)

2.states define

In [17]:
class DocStructure(BaseModel):
    document_type: str = Field(
        description="Type of document: academic, research, business, general"
    )

    headings: Dict[str, int] = Field(
        description="Detected headings mapped to heading level (e.g., {'Introduction': 1})"
    )

    body_font_size: int = Field(
        description="Target font size for body text"
    )

    heading_font_size: int = Field(
        description="Target font size for headings"
    )

    alignment: str = Field(
        description="Text alignment: left, right, center, justify"
    )

    issues_found: List[str] = Field(
        description="Formatting or structural issues detected"
    )


In [18]:
class wordFormatter(TypedDict):
    doc: str
    structDoc: Annotated[list, lambda x, y: y]
    formatPlan: List[DocStructure]
    output_doc: Optional[str]
    output_pdf: Optional[str]

3. nodes Building

NodeFn1: Loading the document

In [19]:
from docx import Document

def load_docx(path: str):
    doc = Document(path)
    paragraphs = []

    for p in doc.paragraphs:
        paragraphs.append({
            "text": p.text.strip(),
            "style": p.style.name if p.style else None,
            "alignment": p.alignment,
            "runs": [
                {
                    "text": r.text,
                    "bold": r.bold,
                    "italic": r.italic,
                    "font_size": r.font.size.pt if r.font.size else None
                }
                for r in p.runs
            ]
        })
    return paragraphs


In [20]:
loader = load_docx('Draft.docx')
# doc = loader.load()
print(loader)

[{'text': 'Introduction', 'style': 'Heading 1', 'alignment': None, 'runs': [{'text': 'Introduction', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'Qwertyujnbvcddx', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'Qwertyujnbvcddx', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'Ertyhhbvcx', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'Ertyhhbvcx', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'sdfgbhcd', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'sdfgbhcd', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'This document is created for a word processing formatting project. The purpose of this file is to allow the user to apply formatting such as text justification, font size, font color, line spacing, and paragraph alignment.', 'style': 'Normal', 'alignment': None, 'runs': [{'text': 'This ', 'bold': None, 'italic': None, 'font_size': None}, {'text': 'document is created fo

In [21]:
def loadDocNode(state: wordFormatter) -> wordFormatter:
    doc = state['doc']
    state['structDoc'] = load_docx(doc)
    return state

NodeFn2: Structure Analysis Agent node

In [22]:
def docStructureNode(state: wordFormatter) -> wordFormatter:
    struct_doc = state["structDoc"]

    parser = PydanticOutputParser(pydantic_object=DocStructure)

    prompt = PromptTemplate(
        template="""
You are a document structure analysis agent.

Analyze the following document structure and produce a formatting plan.

Document structure:
{struct_doc}

{format_instructions}
""",
        input_variables=["struct_doc"],
        partial_variables={
            "format_instructions": parser.get_format_instructions()
        }
    )

    response = model.invoke(
        prompt.format(struct_doc=struct_doc)
    )

    state["formatPlan"] = parser.parse(response.content)
    return state

NodeFn3: Formatter Agent node

3.1 word formatter

In [23]:
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt

def formatterNode(state: wordFormatter) -> wordFormatter:
    plan = state["formatPlan"]
    doc_path = state["doc"]

    doc = Document(doc_path)

    for p in doc.paragraphs:
        # Headings
        if p.style.name.startswith("Heading"):
            for run in p.runs:
                run.font.size = Pt(plan.heading_font_size)
        else:
            # Body text
            p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
            for run in p.runs:
                run.font.size = Pt(plan.body_font_size)

    output_path = "formatted.docx"
    doc.save(output_path)

    state["output_doc"] = output_path
    return state

3.2 pdf generater

In [24]:
import subprocess

def pdfGeneratorNode(state: wordFormatter):
    docx_path = state["output_doc"]
    pdf_path = docx_path.replace(".docx", ".pdf")

    subprocess.run([
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        docx_path,
        "--outdir", "."
    ], check=True)

    return {"output_pdf": pdf_path}


4.Graph Struct

In [25]:
graph = StateGraph(wordFormatter)

graph.add_node("load_docx", loadDocNode)
graph.add_node("doc_structure", docStructureNode)
graph.add_node("doc_formatter", formatterNode)
graph.add_node("doc_pdf", pdfGeneratorNode)

graph.set_entry_point("load_docx")
graph.add_edge("load_docx","doc_structure")
graph.add_edge("doc_structure", "doc_formatter")
graph.set_finish_point("doc_formatter")

app = graph.compile()


In [26]:
app.get_graph().print_ascii()

  +-----------+    
  | __start__ |    
  +-----------+    
        *          
        *          
        *          
  +-----------+    
  | load_docx |    
  +-----------+    
        *          
        *          
        *          
+---------------+  
| doc_structure |  
+---------------+  
        *          
        *          
        *          
+---------------+  
| doc_formatter |  
+---------------+  
        *          
        *          
        *          
   +---------+     
   | __end__ |     
   +---------+     


In [27]:
result = app.invoke({
    "doc": "Draft.docx",
    "structDoc": [],
    "formatPlan": [],
    "output_doc": []
})

print(result["output_doc"])
print(result["structDoc"])
print(result["formatPlan"])


formatted.docx
[{'text': 'Introduction', 'style': 'Heading 1', 'alignment': None, 'runs': [{'text': 'Introduction', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'Qwertyujnbvcddx', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'Qwertyujnbvcddx', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'Ertyhhbvcx', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'Ertyhhbvcx', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'sdfgbhcd', 'style': 'List Paragraph', 'alignment': None, 'runs': [{'text': 'sdfgbhcd', 'bold': None, 'italic': None, 'font_size': None}]}, {'text': 'This document is created for a word processing formatting project. The purpose of this file is to allow the user to apply formatting such as text justification, font size, font color, line spacing, and paragraph alignment.', 'style': 'Normal', 'alignment': None, 'runs': [{'text': 'This ', 'bold': None, 'italic': None, 'font_size': None}, {'text': 'documen