In [1]:
import os
import xml.etree.ElementTree as ET
from shapely.geometry import Polygon
from shapely.geometry import Point

In [2]:
directory = "data/labels/"
image_name = "NL-HaNA_2.10.50_45_0355.xml"

In [3]:
# Go outside the src directory
os.chdir("..")
current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: /Users/sarah_shoilee/codeProjects/stamboekn_KE


need to construct HTML Table form transkribus XML

In [4]:
# -------- Helper Functions --------
def parse_points(points_str):
    """Parse 'x,y x,y ...' string to a list of (x,y) tuples"""
    return [tuple(map(int, p.split(','))) for p in points_str.strip().split()]

In [5]:
# -------- Main Script --------
# Load XML
xml_path = os.path.join(directory, image_name)
tree = ET.parse(xml_path)
root = tree.getroot()

# Namespace (needed because PageXML uses namespaces)
ns = {'pc': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

# Extract table cells
cells = {}  # cell_id -> Polygon
for cell in root.findall('.//pc:TableCell', ns):
    cell_id = cell.attrib['id']
    coords = cell.find('pc:Coords', ns).attrib['points']
    cells[cell_id] = Polygon(parse_points(coords))

# Extract text lines
text_lines = []  # list of dicts {id, polygon, text}
for textline in root.findall('.//pc:TextLine', ns):
    tl_id = textline.attrib['id']
    coords = textline.find('pc:Coords', ns).attrib['points']
    text = textline.find('.//pc:Unicode', ns)
    text_value = text.text if text is not None else ''
    text_lines.append({
        'id': tl_id,
        'polygon': Polygon(parse_points(coords)),
        'text': text_value
    })

In [6]:
# -------- Overlap Mapping --------
cell_text_map = {cid: [] for cid in cells}

for tl in text_lines:
    tl_poly = tl['polygon']
    for cid, cell_poly in cells.items():
        # Check if textline intersects or lies within cell polygon
        if cell_poly.intersects(tl_poly):
            cell_text_map[cid].append({
                'textline_id': tl['id'],
                'text': tl['text']
            })

cell_texts_combined = {}

for cid, lines in cell_text_map.items():
    # Extract only the text from each line in reverse order
    texts = [line['text'] for line in reversed(lines) if line['text']]
    combined_text = "<br/>".join(texts)
    print(f"Cell {cid} combined text (reversed):\n  {combined_text}")
    
    # Combine with <br/> if multiple lines
    combined_text = "<br/>".join(texts)
    
    cell_texts_combined[cid] = combined_text

    # Print result
    print(f"Cell {cid} combined text:\n  {combined_text}")

Cell c_44 combined text (reversed):
  Hop
Cell c_44 combined text:
  Hop
Cell c_63 combined text (reversed):
  Geboren Den 9 October 1775<br/>Moeder Cakleuna Ballert<br/>Jacob Beorg
Cell c_63 combined text:
  Geboren Den 9 October 1775<br/>Moeder Cakleuna Ballert<br/>Jacob Beorg
Cell c_86 combined text (reversed):
  Laatste Woonplaats Bruille<br/>beboortplaats Schoorkoven<br/>Geboren Den 9 October 1775<br/>Moeder Cakleuna Ballert
Cell c_86 combined text:
  Laatste Woonplaats Bruille<br/>beboortplaats Schoorkoven<br/>Geboren Den 9 October 1775<br/>Moeder Cakleuna Ballert
Cell c_113 combined text (reversed):
  
Cell c_113 combined text:
  
Cell c_144 combined text (reversed):
  
Cell c_144 combined text:
  
Cell c_145 combined text (reversed):
  
Cell c_145 combined text:
  
Cell c_816 combined text (reversed):
  Grest Herdrch<br/>van Schack
Cell c_816 combined text:
  Grest Herdrch<br/>van Schack
Cell c_817 combined text (reversed):
  Laatste Woonplaats ssage<br/>beboorkplaats Stip<br/>

In [7]:
from lxml import etree

def build_html_table_from_pagexml(xml_path, cell_texts_combined):
    """
    Build an HTML <table> from a PAGE XML file and a mapping
    of cell_id -> combined text.
    """
    ns = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
    tree = etree.parse(xml_path)
    root = tree.getroot()

    # Collect all TableCells
    cells_info = []
    for cell in root.xpath('//ns:TableCell', namespaces=ns):
        cell_id = cell.attrib['id']
        row = int(cell.attrib.get('row', 0))
        col = int(cell.attrib.get('col', 0))
        rowspan = int(cell.attrib.get('rowSpan', 1))
        colspan = int(cell.attrib.get('colSpan', 1))
        text = cell_texts_combined.get(cell_id, '')

        cells_info.append({
            'id': cell_id,
            'row': row,
            'col': col,
            'rowspan': rowspan,
            'colspan': colspan,
            'text': text
        })

    # Determine table size (max rows/columns)
    max_row = max(c['row'] for c in cells_info)
    max_col = max(c['col'] for c in cells_info)

    # Build an empty grid
    grid = [[None for _ in range(max_col+1)] for _ in range(max_row+1)]

    # Place cells in grid
    for c in cells_info:
        grid[c['row']][c['col']] = c

    # Generate HTML table string
    html_lines = ['<table>']
    for r in range(max_row+1):
        html_lines.append('  <tr>')
        for c in range(max_col+1):
            cell = grid[r][c]
            if cell:
                td_attrs = f' id="{cell["id"]}"'
                if cell['rowspan'] > 1:
                    td_attrs += f' rowspan="{cell["rowspan"]}"'
                if cell['colspan'] > 1:
                    td_attrs += f' colspan="{cell["colspan"]}"'
                html_lines.append(f'    <td{td_attrs}>{cell["text"]}</td>')
            else:
                # empty cell
                html_lines.append('    <td></td>')
        html_lines.append('  </tr>')
    html_lines.append('</table>')

    return '\n'.join(html_lines)


In [8]:
directory = "data/labels"
xml_path = os.path.join(directory, image_name)

html_table = build_html_table_from_pagexml(xml_path, cell_texts_combined)

# Get the folder of the XML file
folder = "data/tables/html/"

# Build the HTML file path (same name, .html extension)
html_path = os.path.join(folder, image_name.replace('.xml', '.jpg') + ".html")

# Save the HTML
with open(html_path, "w", encoding="utf-8") as f:
    f.write(html_table)

print(f"HTML saved to {html_path}")


HTML saved to data/tables/html/NL-HaNA_2.10.50_45_0355.jpg.html


In [9]:
from Image2Table_LLM.parse import format_td
from Image2Table_LLM.metric import TEDS
def calculate_TEDS(ground_truth_html, predicted_html):
    # predicted_html = format_td(predicted_html)
    ground_truth_html = format_td(ground_truth_html)

    teds = TEDS(structure_only=False)
    teds_score = teds.evaluate(ground_truth_html, predicted_html)

    teds_struct = TEDS(structure_only=True)
    teds_struct_score = teds_struct.evaluate(ground_truth_html, predicted_html)
    
    print(f"TEDS: {teds_score:.4f}")
    print(f"TEDS-Struct: {teds_struct_score:.4f}")

    return teds_score, teds_struct_score

In [11]:
import re
with open(os.path.join("data/tables/html/", image_name.replace('.xml', '.jpg')+'.html'), 'r', encoding='utf-8') as f:
    constructed_html = f.read()

with open(os.path.join("data/labels/", image_name.replace('.xml', '.html')), 'r', encoding='utf-8') as f:
    label_html = f.read()

calculate_TEDS(label_html, constructed_html)

TEDS: 0.6343
TEDS-Struct: 0.7857


(0.6343309061474394, 0.7857142857142857)

### Information Extraction

In [None]:
def read_html(html_path):
    if not os.path.exists(html_path):
        raise FileNotFoundError(f"HTML file not found: {html_path}")
    with open(html_path, 'r', encoding='utf-8') as file:
        table = file.read()
        return table
    # print(f"Successfully read HTML file: {html_path}")

In [None]:
from bs4 import BeautifulSoup

html = read_html(f"data/labels/{image_name}.html")

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table")

In [None]:
import re
import json

persons = []

for row in table.find_all("tr"):
    cells = row.find_all("td")

    if not cells:
        continue  # skip header or empty rows

    person = {}
    for cell in cells:
        # preserve breaks as \n
        text = cell.get_text(separator="\n", strip=True)
        
        vader_match = re.search(r'Vader\s+([^\n]+)', text, re.IGNORECASE)
        moeder_match = re.search(r'Moeder\s+([^\n]+)', text, re.IGNORECASE)
        geboorte_datum_match = re.search(r'Geboren\s*Den\s*([^\n]+)', text, re.IGNORECASE)
        geboorte_plaats_match = re.search(r'Geboortplaats\s*([^\n]+)', text, re.IGNORECASE)
        laatste_woonplaats_match = re.search(r'Laatste\s*Woonplaats\s*([^\n]+)', text, re.IGNORECASE)
        
        if vader_match:
            person['vader'] = {'value': vader_match.group(1).strip(), 'cell': cell.get('id')}
        if moeder_match:
            person['moeder'] = {'value': moeder_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_datum_match:
            person['geboorte_datum'] = {'value': geboorte_datum_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_plaats_match:
            person['geboorte_plaats'] = {'value': geboorte_plaats_match.group(1).strip(), 'cell': cell.get('id')}
        if laatste_woonplaats_match:
            person['laatste_woonplaats'] = {'value': laatste_woonplaats_match.group(1).strip(), 'cell': cell.get('id')}

    if person:
        persons.append(person)

json_obj = {"persons": persons}
print(json.dumps(json_obj, indent=2, ensure_ascii=False))

In [None]:
with open(f"data/json/{image_name}.json", "w", encoding='utf-8') as json_file:
    json.dump(json_obj, json_file, ensure_ascii=False, indent=2)#!/usr/bin/env python3


### KG Construction

In [None]:
# load json from file
with open(f"data/json/{image_name}.json", "r", encoding="utf-8") as f:
    json_obj = json.load(f)


In [None]:
# CONSTRUCT ASSERSION TRIPLES
from rdflib import Graph, ConjunctiveGraph, Namespace, URIRef, Literal, RDF

FOAF = Namespace("http://xmlns.com/foaf/0.1/")
EX = Namespace("http://example.org/ontology/")
PROV = Namespace("http://www.w3.org/ns/prov#")

cg = ConjunctiveGraph()
cg.bind("foaf", FOAF)
cg.bind("ex", EX)
cg.bind("prov", PROV)

# Mapping from json keys to RDF predicates
predicate_map = {
    "vader": EX.vader,
    "moeder": EX.moeder,
    "geboorte_datum": EX.geboorteDatum,
    "geboorte_plaats": EX.geboortePlaats,
    "laatste_woonplaats": EX.laatsteWoonplaats
}

for idx, person in enumerate(json_obj["persons"], start=1):
    person_uri = URIRef(f"http://example.org/person/{idx}")
    assertion_graph_uri = URIRef("http://example.org/assertion")
    assertion_graph = Graph(store=cg.store, identifier=assertion_graph_uri)
    assertion_graph.add((person_uri, RDF.type, FOAF.Person))

    provenance_graph_uri = URIRef("http://example.org/provenance")
    provenance_graph = Graph(store=cg.store, identifier=provenance_graph_uri)

    for key, value_dict in person.items():
        value = value_dict["value"]
        cell_id = value_dict["cell"]
        predicate = predicate_map.get(key)
        if predicate:
            # Named graph for each cell
            graph_uri = URIRef(f"http://example.org/graph/{cell_id}")
            ng = Graph(store=cg.store, identifier=graph_uri)
            ng.add((person_uri, predicate, Literal(value)))
            

cg.serialize(f"data/triples/{image_name}_assersion.trig", format='trig')


In [None]:
end_time = "2025-09-01T12:00:00Z"  # Example end time, replace with actual time if needed
start_time = "2025-09-01T10:00:00Z"  # Example start time, replace with actual time if needed

# CONSTRUCT PROVENANCE TRIPLES
from lxml import etree

def add_provenance_graph(pagexml_path, stamboek_nummer=image_name):
    tree = etree.parse(pagexml_path)
    root = tree.getroot()

    EX = Namespace("http://example.org/ontology/")
    IMG = Namespace("http://example.org/image_ontology/")
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")  
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    PROV = Namespace("http://www.w3.org/ns/prov#")
    CSVW = Namespace("http://www.w3.org/ns/csvw#")

    # Create RDF graph
    g = Graph()
    g.bind("ex", EX)
    g.bind("img", IMG)
    g.bind("rdf", RDF)
    g.bind("rdfs", RDFS)
    g.bind("prov", PROV)
    g.csvw = ("csvw", CSVW)


    # Find TableRegion(s)
    table_regions = root.findall(".//{*}TableRegion")

    for table_region in table_regions:
        for cell in table_region.findall(".//{*}TableCell"):
            cell_id = cell.get('id')
            rows = cell.get('row')
            cols = cell.get('col')
            Coords = cell.find(".//{*}Coords")
            coords_points = Coords.get('points') if Coords is not None else None
            
            # cell uri
            named_graph_uri = URIRef(f"http://example.org/graph/{cell_id}")
            cell_uri = URIRef(f"http://example.org/id/{cell_id}")
            provenance_graph.add((named_graph_uri, PROV.wasDerivedFrom, cell_uri))
            g.add((cell_uri, RDF.type, PROV.Entity))
            g.add((cell_uri, RDFS.label, Literal(f"Cell {cell_id} from {stamboek_nummer}")))
            
            g.add((cell_uri, RDF.type, CSVW.Cell))
            g.add((cell_uri, CSVW.rowNumber, Literal(rows)))
            g.add((cell_uri, CSVW.columnNumber, Literal(cols)))
            g.add((cell_uri, EX.ImageRegion, Literal(coords_points)))

            # agents
            agent_1 = URIRef("http://example.org/agent/1")
            g.add((agent_1, RDF.type, PROV.Agent))
            g.add((agent_1, RDFS.label, Literal("Sarah Shoilee")))
            g.add((named_graph_uri, PROV.wasAttributedTo, agent_1))
            project_agent = URIRef("http://example.org/agent/2")
            g.add((project_agent, RDF.type, PROV.Agent))
            g.add((project_agent, RDFS.label, Literal("Pressing Matter Project")))
            g.add((agent_1, PROV.actedOnBehalfOf, project_agent))

            # activity
            stamboekenKGConstructionactivity = URIRef(f"http://example.org/activity/stamboekenKGConstructionactivity/{cell_id}")
            tableConstructionactivity = URIRef(f"http://example.org/activity/TableExtraction/{cell_id}")
            informationExtractionactivity = URIRef(f"http://example.org/activity/InformationExtraction/{cell_id}")
            KGConstructionactivity = URIRef(f"http://example.org/activity/KGConstruction/{cell_id}")
            
            g.add((stamboekenKGConstructionactivity, RDF.type, PROV.Activity))
            g.add((named_graph_uri, PROV.wasGeneratedBy, stamboekenKGConstructionactivity))
            g.add((stamboekenKGConstructionactivity, PROV.wasAssociatedWith, agent_1))
            g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, tableConstructionactivity))
            g.add((tableConstructionactivity, RDF.type, PROV.Activity))
            g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, informationExtractionactivity))
            g.add((informationExtractionactivity, RDF.type, PROV.Activity))
            g.add((informationExtractionactivity, PROV.used, cell_uri))
            g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, KGConstructionactivity))
            g.add((KGConstructionactivity, RDF.type, PROV.Activity))
            g.add((KGConstructionactivity, PROV.used, cell_uri))

            g.add((stamboekenKGConstructionactivity,PROV.endedAtTime, Literal(end_time)))
            g.add((stamboekenKGConstructionactivity,PROV.startedAtTime, Literal(start_time)))

            # Create a Table instance URI
            table_uri = URIRef(f"http://example.org/Table/{cell_id}")
            g.add((table_uri, RDF.type, PROV.Entity))
            g.add((table_uri, RDF.type, CSVW.Table))
            g.add((table_uri, PROV.wasGeneratedBy, tableConstructionactivity))
            g.add((cell_uri, PROV.wasDerivedFrom, table_uri))
            
            # stamboeken
            stamboek_uri = URIRef(f"http://example.org/stamboek/{stamboek_nummer}")
            g.add((stamboek_uri, RDF.type, PROV.Entity))
            g.add((tableConstructionactivity, PROV.used, stamboek_uri))
            g.add((table_uri, PROV.wasDerivedFrom, stamboek_uri))
            national_archives = URIRef("http://example.org/agent/3")
            g.add((national_archives, RDF.type, PROV.Agent))
            g.add((national_archives, RDFS.label, Literal("Nationaal Archief")))
            g.add((stamboek_uri, PROV.wasAttributedTo, national_archives))
    
    g.serialize(f"data/triples/{image_name}_provenance.ttl", format='ttl')
add_provenance_graph(f"data/labels/{image_name}.xml")