In [None]:
import os
import xml.etree.ElementTree as ET
from lxml import etree
from utils import check_polygone_overlap

In [None]:
def extract_textline(file_path):
    """
    Extract textlines with coordinates and text from a PAGE XML file.

    Only ``TextLine`` elements that are direct children of a ``TextRegion`` and
    that carry their own ``TextEquiv`` child are returned. The document is
    queried with the PAGE 2013-07-15 namespace.

    Args:
        file_path: Source handed to ``lxml.etree.parse`` (typically a path).

    Returns:
        A list of dictionaries, one per matching line, with the keys
        ``region_id``, ``line_id``, ``coords``, ``baseline`` and ``text``.
        ``coords`` and ``baseline`` hold the ``points`` attribute of the
        corresponding child element, or ``""`` when that element is absent.
        ``text`` is ``""`` when the line has no transcription.
        An empty list is returned (after printing a message) when no line
        matches or when the file is not well-formed XML.
    """
    ns = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

    # Only parsing can raise XMLSyntaxError, so keep the guarded region small.
    try:
        root = etree.parse(file_path)
    except etree.XMLSyntaxError:
        print(f"Error parsing {file_path}. File may be malformed.")
        return []

    # TextLines owning a TextEquiv child; Word-level TextEquiv are excluded.
    result = root.xpath(
        '//ns:TextRegion/ns:TextLine[ns:TextEquiv[not(ancestor::ns:Word)]]',
        namespaces=ns
    )

    if not result:
        print(f"No TextEquiv tag found in {file_path}.")
        return []

    textlines = []
    for line in result:
        region_id = line.getparent().get("id")
        line_id = line.get("id")
        coords = line.find("ns:Coords", namespaces=ns)
        coords_points = coords.get("points") if coords is not None else ""
        baseline = line.find("ns:Baseline", namespaces=ns)
        baseline_points = baseline.get("points") if baseline is not None else ""

        # PlainText is tried first to keep the previous behaviour; Unicode is
        # the fallback so files that only carry Unicode do not lose their
        # transcription. Empty elements (text is None) are normalised to "".
        text = ""
        for tag in ("PlainText", "Unicode"):
            text_node = line.find(f"ns:TextEquiv/ns:{tag}", namespaces=ns)
            if text_node is not None and text_node.text:
                text = text_node.text
                break

        textlines.append({
            "region_id": region_id,
            "line_id": line_id,
            "coords": coords_points,
            "baseline": baseline_points,
            "text": text
        })

    return textlines


In [None]:
def _read_table_cells(cells_file, structure_file, wired=False):
    """Read the cell polygon and structure files line by line into cell dicts."""
    cells = []
    with open(cells_file, "r", encoding="utf-8") as f_cells, \
            open(structure_file, "r", encoding="utf-8") as f_struct:
        # The two files are consumed in lockstep: line i of one describes the
        # same cell as line i of the other.
        for i, (cell_line, struct_line) in enumerate(zip(f_cells, f_struct)):
            cell_line = cell_line.strip()
            struct_line = struct_line.strip()
            if not cell_line or not struct_line:
                continue

            # Replace ';' with ' ' for PAGE XML coords format.
            # NOTE(review): every '.' is additionally turned into ','. The
            # input format is not visible here -- confirm this is intended,
            # since it would split decimal coordinates into separate numbers.
            coords_points = cell_line.replace(";", " ").replace('.', ',')

            # The field order of the structure line depends on `wired`.
            if not wired:
                sr, er, sc, ec = map(int, struct_line.split(","))
            else:
                sc, ec, sr, er = map(int, struct_line.split(","))

            cells.append({
                "id": f"{i+1}",
                "coords": coords_points,
                "row": str(sr),
                "col": str(sc),
                "rowSpan": str(er - sr + 1),
                "colSpan": str(ec - sc + 1)
            })
    return cells


def build_table_from_cells(image_name, cells_file, structure_file, htr_file, output_dir, wired=False):
    """
    Build a PAGE XML file containing TableRegion/TableCell/TextLine elements.

    Uses cell polygons from cells_file, structure info from structure_file,
    and TextLine geometry/text from htr_file (PAGE XML).
    Uses user-defined check_polygone_overlap() for textline-to-cell mapping.

    Args:
        image_name: Written to the ``imageFilename`` attribute of ``Page`` and
            used as the stem of the output file (``<image_name>.xml``).
        cells_file: Text file with one cell polygon per line.
        structure_file: Text file with four comma-separated integers per line,
            aligned line-by-line with ``cells_file``.
        htr_file: PAGE XML file read through ``extract_textline``.
        output_dir: Directory receiving the result; created when missing.
        wired: When False a structure line is read as
            ``start_row,end_row,start_col,end_col``; when True as
            ``start_col,end_col,start_row,end_row``.

    Returns:
        None. The document is written to ``output_dir`` and a confirmation
        message is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{image_name}.xml")

    # Load textlines
    textlines = extract_textline(htr_file)

    # Load cells + structure mapping
    cells = _read_table_cells(cells_file, structure_file, wired)

    # Create PAGE XML structure
    pcgts = ET.Element("PcGts", xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15")
    page_el = ET.SubElement(pcgts, "Page", imageFilename=f"{image_name}")
    table_region = ET.SubElement(page_el, "TableRegion", id="r1", custom="structure {type:table}")

    # Add table cells
    for cell in cells:
        cell_el = ET.SubElement(
            table_region,
            "TableCell",
            id=f"cell_{cell['id']}",
            row=cell["row"],
            col=cell["col"],
            rowSpan=cell["rowSpan"],
            colSpan=cell["colSpan"]
        )
        ET.SubElement(cell_el, "Coords", points=cell["coords"])

        # Find textlines overlapping this cell
        for tl in textlines:
            try:
                overlaps = check_polygone_overlap(tl["coords"], cell["coords"], threshold=0.2)
            except Exception as e:
                print(f"Warning: failed to check overlap for textline {tl['line_id']}: {e}")
                continue
            if not overlaps:
                continue

            # A None attribute value cannot be serialised and would make
            # tree.write() fail, so the id is only emitted when present.
            line_attrs = {}
            if tl["line_id"] is not None:
                line_attrs["id"] = tl["line_id"]
            textline_el = ET.SubElement(cell_el, "TextLine", line_attrs)

            if tl["coords"]:
                ET.SubElement(textline_el, "Coords", points=tl["coords"])
            if tl["baseline"]:
                ET.SubElement(textline_el, "Baseline", points=tl["baseline"])
            textequiv_el = ET.SubElement(textline_el, "TextEquiv")
            unicode_el = ET.SubElement(textequiv_el, "Unicode")
            unicode_el.text = tl["text"]

    # Save PAGE XML
    tree = ET.ElementTree(pcgts)
    ET.indent(tree, space="  ", level=0)
    tree.write(output_path, encoding="utf-8", xml_declaration=True)
    # Escape sequence instead of a literal emoji so the source file's encoding
    # can no longer garble the check mark.
    print(f"\u2705 PAGE XML saved to: {output_path}")


In [None]:
DATA_DIR = "../data/images"

# Build one PAGE XML table per .jpg image found in DATA_DIR.
# sorted() makes the processing order -- and therefore the value `image_name`
# holds once the loop has finished -- reproducible across runs and platforms.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith(".jpg"):
        continue
    image_name = file

    build_table_from_cells(
        image_name=image_name,
        cells_file=f"../data/tables/cells/center/{image_name}.txt",
        structure_file=f"../data/tables/cells/logi/{image_name}.txt",
        # Swap only the trailing extension; str.replace would also rewrite any
        # earlier ".jpg" that happens to occur inside the file name.
        htr_file=f"../data/htr/page/{os.path.splitext(image_name)[0]}.xml",
        output_dir="../data/tables/pagexml",
        wired=True
    )


In [None]:
from utils import pagexml_to_html

# Convert one generated PAGE XML table into an HTML file.
# NOTE(review): `image_name` is not assigned in this cell; it is whatever value
# an earlier cell left behind -- confirm that this is the intended image.
pagexml_file = os.path.join("../data/tables/pagexml", f"{image_name}.xml")
output_file = os.path.join("../data/tables/html", f"{image_name}.html")
pagexml_to_html(pagexml_file, output_file)