In [1]:
import os
from bs4 import BeautifulSoup
from xml.dom.minidom import parseString


class HTMLToPageXMLConverter:
    """
    Converts HTML tables into Page XML format and updates spans and coords.
    """

    def __init__(self, html_content: str, image_filename: str, mapping: dict, coords: list):
        self.html_content = html_content
        self.image_filename = image_filename
        self.mapping = mapping
        self.coords = coords
        self.soup = BeautifulSoup(html_content, "html.parser")

    def convert(self) -> str:
        """
        Convert HTML table to Page XML with indentation and updated spans and coords.
        Returns:
            str: Indented Page XML string.
        """
        raw_xml = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">',
            f'<Page imageFilename="{self.image_filename}">',
            '<TableRegion id="r1" custom="structure {type:table}">'
        ]

        table = self.soup.find("table")
        if not table:
            raise ValueError("No <table> found in HTML.")

        cells = table.find_all("td")
        for idx, cell in enumerate(cells, start=1):
            text = cell.get_text(strip=True)
            row = cell.get("row", "")
            col = cell.get("col", "")
            cell_id = f"cell_{idx}"
            html_id = cell.get("id", f"html_{idx}")

            # Determine rowSpan and colSpan from mapping
            row_span, col_span = self._get_spans(int(row), int(col))

            # Get coords from coords list based on index
            coords_points = self.coords[idx - 1] if idx - 1 < len(self.coords) else ""

            raw_xml.append(
                f'<TableCell id="{cell_id}" row="{row}" col="{col}" rowSpan="{row_span}" colSpan="{col_span}">'
                f'<Coords points="{coords_points}" />'
                f'<TextLine id="{html_id}">'
                f'<TextEquiv><Unicode>{text}</Unicode></TextEquiv>'
                f'</TextLine>'
                f'</TableCell>'
            )

        raw_xml.extend(['</TableRegion>', '</Page>', '</PcGts>'])

        dom = parseString("".join(raw_xml))
        return dom.toprettyxml(indent="  ")

    def _get_spans(self, row: int, col: int) -> tuple:
        """
        Get rowSpan and colSpan for a cell based on mapping.
        Args:
            row (int): Row index of the cell.
            col (int): Column index of the cell.
        Returns:
            tuple: (rowSpan, colSpan)
        """
        for (start_row, end_row, start_col, end_col) in self.mapping.keys():
            if start_row <= row <= end_row and start_col <= col <= end_col:
                row_span = end_row - start_row + 1
                col_span = end_col - start_col + 1
                return row_span, col_span
        return 1, 1  # Default 1 if no match


def read_html_file(file_path: str) -> str:
    """Read HTML file content."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


def read_mapping_file(file_path: str) -> dict:
    """
    Read mapping file and return a dictionary of ranges.
    Format: start_row,end_row,start_col,end_col
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Mapping file not found: {file_path}")
    mapping = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split(",")
            if len(parts) == 4:
                start_row, end_row, start_col, end_col = map(int, parts)
                mapping[(start_row, end_row, start_col, end_col)] = True
    return mapping


def read_coords_file(file_path: str) -> list:
    """
    Read coords file and return a list of coords strings.
    Format: "x1,y1;x2,y2;x3,y3;x4,y4",#id
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Coords file not found: {file_path}")
    coords_list = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split(",#")
            if parts:
                coords_list.append(parts[0].replace('"', '').replace(';', ' '))
    return coords_list


def save_page_xml(content: str, output_path: str):
    """Save Page XML content to a file."""
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(content)

In [2]:
import os

def build_table_from_cells_html(image_filename, coords_path, mapping_path, html_path, output_dir):
    html_content = read_html_file(html_path)
    mapping = read_mapping_file(mapping_path)
    coords = read_coords_file(coords_path)

    converter = HTMLToPageXMLConverter(html_content, image_filename, mapping, coords)
    page_xml = converter.convert()
    output_file = os.path.join(output_dir, f"{image_filename}.xml")
    save_page_xml(page_xml, output_file)
    print(f"Conversion successful! Output saved to {output_file}")


In [3]:
html_path = "../data/tables/html/NL-HaNA_2.10.50_45_0110.jpg.html"  
image_filename = "NL-HaNA_2.10.50_45_0110.jpg"
mapping_path = "../data/tables/cells/logi/NL-HaNA_2.10.50_45_0110.jpg.txt"
coords_path = "../data/tables/cells/center/NL-HaNA_2.10.50_45_0110.jpg.txt"
output_dir = "../data/tables/pagexml"

build_table_from_cells_html(
    image_filename=image_filename,
    coords_path=coords_path,
    mapping_path=mapping_path,
    html_path=html_path,
    output_dir=output_dir
)

Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0110.jpg.xml


In [3]:
DATA_DIR = "../data/images"
for file in os.listdir(DATA_DIR): 
    print(f"Processing file: {file}")
    if not file.endswith(".jpg"): 
        continue 
    image_filename=file
    
    coords_path = f"../data/tables/cells/center/{image_filename}.txt"
    mapping_path = f"../data/tables/cells/logi/{image_filename}.txt"
    html_path = f"../data/tables/html/{image_filename}.html"
    output_dir = "../data/tables/pagexml"

    build_table_from_cells_html(
        image_filename=image_filename,
        coords_path=coords_path,
        mapping_path=mapping_path,
        html_path=html_path,
        output_dir=output_dir
    )
    


Processing file: .DS_Store
Processing file: NL-HaNA_2.10.50_45_0131.jpg
Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0131.jpg.xml
Processing file: NL-HaNA_2.10.50_45_0091.jpg
Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0091.jpg.xml
Processing file: NL-HaNA_2.10.50_45_0143.jpg
Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0143.jpg.xml
Processing file: NL-HaNA_2.10.50_45_0151.jpg
Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0151.jpg.xml
Processing file: NL-HaNA_2.10.50_45_0110.jpg
Conversion successful! Output saved to ../data/tables/pagexml/NL-HaNA_2.10.50_45_0110.jpg.xml


In [1]:

#!/usr/bin/env python3
"""
PageXML Visualiser
Author: Your Name
Description:
    Reads a PageXML file and its associated image.
    Draws bounding boxes for each TableCell based on Coords points.
    Annotates each box with row, col, rowSpan, and colSpan.
"""

import os
import cv2
import numpy as np
import xml.etree.ElementTree as ET


class PageXMLVisualizer:
    """
    Class to handle PageXML parsing and visualisation.
    """

    def __init__(self, pagexml_path: str, image_path: str):
        self.pagexml_path = pagexml_path
        self.image_path = image_path
        self.cells = []

    def parse_pagexml(self):
        """
        Parse PageXML and extract TableCell information.
        Each cell includes coords, row, col, rowSpan, colSpan.
        """
        if not os.path.exists(self.pagexml_path):
            raise FileNotFoundError(f"PageXML file not found: {self.pagexml_path}")

        tree = ET.parse(self.pagexml_path)
        root = tree.getroot()

        # Namespace handling
        ns = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}

        for cell in root.findall(".//pc:TableCell", ns):
            coords_elem = cell.find("pc:Coords", ns)
            coords_points = coords_elem.get("points") if coords_elem is not None else ""
            row = cell.get("row", "")
            col = cell.get("col", "")
            row_span = cell.get("rowSpan", "")
            col_span = cell.get("colSpan", "")

            # Convert coords to list of tuples
            points = []
            if coords_points.strip():
                for pt in coords_points.split():
                    x, y = pt.split(",")
                    points.append((int(x), int(y)))

            self.cells.append({
                "points": points,
                "row": row,
                "col": col,
                "rowSpan": row_span,
                "colSpan": col_span
            })

    def draw_visualisation(self):
        """
        Draw bounding boxes and annotations on the image.
        """
        if not os.path.exists(self.image_path):
            raise FileNotFoundError(f"Image file not found: {self.image_path}")

        image = cv2.imread(self.image_path)
        if image is None:
            raise ValueError("Failed to load image. Check file format and path.")

        for cell in self.cells:
            points = cell["points"]
            if len(points) >= 4:
                # Convert points to NumPy array for OpenCV
                pts = np.array(points, dtype=np.int32).reshape((-1, 1, 2))

                # Draw polygon
                cv2.polylines(image, [pts], True, (0, 255, 0), 2)

                # Compute text position (top-left corner)
                x, y = points[0]
                label = f"r:{cell['row']} c:{cell['col']} rs:{cell['rowSpan']} cs:{cell['colSpan']}"
                cv2.putText(image, label, (x + 5, y + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

        return image

In [None]:
image_filename = "../data/images/NL-HaNA_2.10.50_45_0110.jpg"
pagexml_file = "../data/tables/pagexml/NL-HaNA_2.10.50_45_0110.jpg.xml"

visualizer = PageXMLVisualizer(pagexml_file, image_filename)
visualizer.parse_pagexml()
result_image = visualizer.draw_visualisation()
# Display the image
cv2.imshow("PageXML Visualisation", result_image)
cv2.waitKey(0)
cv2.destroyAllWindows()


ValueError: invalid literal for int() with base 10: '889.6211'