In [None]:
pip install landingai_ade openpyxl pandas

In [None]:
"""
Supports: PDF, DOCX, XLS, XLSX
"""

from dotenv import load_dotenv
from pathlib import Path
import re
import pandas as pd
from html.parser import HTMLParser
from landingai_ade import LandingAIADE
load_dotenv()

class TableHTMLParser(HTMLParser):
    """Collect the cell text of an HTML table as a list of rows.

    After ``feed()``, ``rows`` holds one list of stripped cell strings for
    every ``<tr>`` that contained at least one ``<td>``/``<th>``.
    """

    def __init__(self):
        super().__init__()
        self.rows: list[list[str]] = []  # completed rows, in document order
        self.current_row: list[str] = []  # cells of the row being parsed
        self.current_cell = ""  # text accumulated for the currently open cell
        self.in_cell = False  # True between a cell's start tag and end tag

    def handle_starttag(self, tag: str, attrs) -> None:
        """Open a new row or cell; treat ``<br>`` inside a cell as a space."""
        if tag == "tr":
            self.current_row = []
        elif tag in ("td", "th"):
            self.in_cell = True
            self.current_cell = ""
        elif tag == "br" and self.in_cell:
            # A line break separates words; without this "a<br>b" would be
            # collected as "ab". A space (not "\n") keeps the cell on one line.
            self.current_cell += " "

    def handle_endtag(self, tag: str) -> None:
        """Close the current cell or row and store what was collected."""
        if tag in ("td", "th"):
            if not self.in_cell:
                # Stray closing tag: no open cell, so there is nothing new to
                # store (appending here would duplicate the previous cell).
                return
            self.in_cell = False
            self.current_row.append(self.current_cell.strip())
        elif tag == "tr" and self.current_row:
            self.rows.append(self.current_row)

    def handle_data(self, data: str) -> None:
        """Accumulate text, but only while inside a table cell."""
        if self.in_cell:
            self.current_cell += data


def html_table_to_markdown(html_table: str) -> str:
    """Convert one HTML ``<table>`` fragment into a Markdown pipe table.

    The first parsed row becomes the header. Every row (header included) is
    padded with empty cells to the width of the widest row, and every cell has
    ``|`` escaped and newlines replaced by spaces so that cell content cannot
    break the table layout.

    Args:
        html_table: HTML markup containing the table rows.

    Returns:
        The Markdown table without a trailing newline, or ``""`` when no row
        with cells was found.
    """
    parser = TableHTMLParser()
    parser.feed(html_table)
    if not parser.rows:
        return ""

    # Use the widest row so that body cells beyond the header's length (for
    # example under a colspan header) still get a column of their own.
    width = max(len(row) for row in parser.rows)

    def _format_row(cells: list[str]) -> str:
        # Build a new list instead of padding the parser's rows in place.
        padded = list(cells) + [""] * (width - len(cells))
        escaped = [
            cell.replace("|", "\\|").replace("\n", " ").strip() for cell in padded
        ]
        return "| " + " | ".join(escaped) + " |"

    header, *body = parser.rows
    md = [_format_row(header), "| " + " | ".join(["---"] * width) + " |"]
    md.extend(_format_row(row) for row in body)
    return "\n".join(md)


def _remove_images_and_figures(text: str) -> str:
    """Strip <img>, <figure>, and markdown image syntax."""
    text = re.sub(r"<img[^>]*>", "", text, flags=re.IGNORECASE)
    text = re.sub(r"<figure[^>]*>.*?</figure>", "", text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", text)  # ![alt](url)
    return text


def _remove_html_anchors(text: str) -> str:
    """Remove empty anchors and anchor wrappers."""
    text = re.sub(r"<a[^>]*>\s*</a>", "", text, flags=re.IGNORECASE)
    text = re.sub(r"<a[^>]*>.*?</a>", "", text, flags=re.IGNORECASE | re.DOTALL)
    return text


def _remove_html_comments(text: str) -> str:
    """Remove HTML comments."""
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)


def _convert_html_tables(text: str) -> str:
    """Replace every HTML ``<table>...</table>`` with a Markdown table.

    Each match is converted with ``html_table_to_markdown`` and surrounded by
    blank lines. The substitution is done in a single ``re.sub`` pass with a
    callback, so each table is rewritten exactly where it was matched; a
    whole-text ``str.replace`` per match would rescan the document for every
    table and could also rewrite an identical substring inside another table.
    """
    table_pattern = re.compile(r"<table[^>]*>.*?</table>", re.DOTALL | re.IGNORECASE)

    def _as_markdown(match: "re.Match[str]") -> str:
        # The callback's return value is inserted literally (no backslash
        # escape processing), which matters because cells may contain "\\|".
        return "\n\n" + html_table_to_markdown(match.group(0)) + "\n\n"

    return table_pattern.sub(_as_markdown, text)


def _collapse_blank_lines(text: str) -> str:
    return re.sub(r"\n{3,}", "\n\n", text)


def _clean_markdown(raw: str) -> str:
    """Run raw markdown through every cleanup pass and normalise the ending.

    The passes run in this order: HTML anchors, HTML comments, HTML tables,
    images/figures, blank-line collapsing. The result is stripped of
    surrounding whitespace and terminated with a single newline.
    """
    cleanup_passes = (
        _remove_html_anchors,
        _remove_html_comments,
        _convert_html_tables,
        _remove_images_and_figures,
        _collapse_blank_lines,
    )
    cleaned = raw
    for cleanup in cleanup_passes:
        cleaned = cleanup(cleaned)
    return cleaned.strip() + "\n"


def _excel_to_markdown(file_path: Path) -> str:
    """Read all sheets from Excel and convert to Markdown tables."""
    md_parts: list[str] = []
    sheets = pd.read_excel(file_path, sheet_name=None, engine="openpyxl")
    for sheet_name, df in sheets.items():
        md_parts.append(f"## Sheet: {sheet_name}\n")
        if df.empty:
            md_parts.append("_Empty sheet_\n")
            continue
        headers = [str(h) for h in df.columns.tolist()]
        md_parts.append("| " + " | ".join(headers) + " |")
        md_parts.append("| " + " | ".join(["---"] * len(headers)) + " |")
        for _, row in df.iterrows():
            cells = [str(x) if pd.notna(x) else "" for x in row.tolist()]
            md_parts.append("| " + " | ".join(cells) + " |")
        md_parts.append("")
    return "\n".join(md_parts).strip() + "\n"


def parse_file_to_markdown(path: str, model: str = "dpt-2-latest") -> str:
    """Convert a document to Markdown and write it next to the source file.

    Excel workbooks (``.xls``/``.xlsx``) are rendered locally with pandas via
    ``_excel_to_markdown``. Every other file is sent to ``LandingAIADE`` and
    the markdown it returns is passed through ``_clean_markdown``. The output
    file has the input's name with the suffix replaced by ``.md``.

    Args:
        path: Location of the input document.
        model: Model name forwarded to ``LandingAIADE.parse``.

    Returns:
        The path of the written ``.md`` file, as a string.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    is_excel = source.suffix.lower() in (".xls", ".xlsx")
    if not is_excel:
        ade_response = LandingAIADE().parse(document=source, model=model)
        # A missing or empty ``markdown`` attribute is treated as "".
        markdown = _clean_markdown(getattr(ade_response, "markdown", "") or "")
    else:
        markdown = _excel_to_markdown(source)

    destination = source.with_suffix(".md")
    destination.write_text(markdown, encoding="utf-8")
    print(f"Markdown saved to: {destination}")
    return str(destination)

In [None]:
# Change the path below to point at your own file.
# Supported formats: PDF, DOCX, XLSX
# The result is written next to the input file, with the suffix replaced by ".md".
parse_file_to_markdown("test pdf.pdf")

✅ Markdown saved to: test pdf.md


'test pdf.md'