In [1]:
image_name = "NL-HaNA_2.10.50_45_0355"

In [2]:
import os
# Go outside the src directory
os.chdir("../")
current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: /Users/sarah_shoilee/codeProjects/stamboekn_KE


In [3]:
def read_html(html_path):
    if not os.path.exists(html_path):
        raise FileNotFoundError(f"HTML file not found: {html_path}")
    with open(html_path, 'r', encoding='utf-8') as file:
        table = file.read()
        return table
    # print(f"Successfully read HTML file: {html_path}")

In [4]:
from bs4 import BeautifulSoup

html = read_html(f"data/labels/{image_name}.html")

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table")

In [5]:
import re
import json

persons = []

for row in table.find_all("tr"):
    cells = row.find_all("td")

    if not cells:
        continue  # skip header or empty rows

    person = {}
    for cell in cells:
        # preserve breaks as \n
        text = cell.get_text(separator="\n", strip=True)
        
        vader_match = re.search(r'Vader\s+([^\n]+)', text, re.IGNORECASE)
        moeder_match = re.search(r'Moeder\s+([^\n]+)', text, re.IGNORECASE)
        geboorte_datum_match = re.search(r'Geboren\s*Den\s*([^\n]+)', text, re.IGNORECASE)
        geboorte_plaats_match = re.search(r'Geboortplaats\s*([^\n]+)', text, re.IGNORECASE)
        laatste_woonplaats_match = re.search(r'Laatste\s*Woonplaats\s*([^\n]+)', text, re.IGNORECASE)
        
        if vader_match:
            person['vader'] = {'value': vader_match.group(1).strip(), 'cell': cell.get('id')}
        if moeder_match:
            person['moeder'] = {'value': moeder_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_datum_match:
            person['geboorte_datum'] = {'value': geboorte_datum_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_plaats_match:
            person['geboorte_plaats'] = {'value': geboorte_plaats_match.group(1).strip(), 'cell': cell.get('id')}
        if laatste_woonplaats_match:
            person['laatste_woonplaats'] = {'value': laatste_woonplaats_match.group(1).strip(), 'cell': cell.get('id')}

    if person:
        persons.append(person)

json_obj = {"persons": persons}
print(json.dumps(json_obj, indent=2, ensure_ascii=False))

{
  "persons": [
    {
      "vader": {
        "value": "Jacobus",
        "cell": "c_86"
      },
      "moeder": {
        "value": "Catharina Ballert",
        "cell": "c_86"
      },
      "geboorte_datum": {
        "value": "9 October 1775",
        "cell": "c_86"
      },
      "geboorte_plaats": {
        "value": "Schoonhoven",
        "cell": "c_86"
      },
      "laatste_woonplaats": {
        "value": "Bruille",
        "cell": "c_86"
      }
    },
    {
      "vader": {
        "value": "Coert Hendrik",
        "cell": "c_818"
      },
      "moeder": {
        "value": "Maria Van Onbeek",
        "cell": "c_818"
      },
      "geboorte_datum": {
        "value": "23 december 1771",
        "cell": "c_818"
      },
      "geboorte_plaats": {
        "value": "S'Hage",
        "cell": "c_818"
      },
      "laatste_woonplaats": {
        "value": "S'Hage",
        "cell": "c_818"
      }
    }
  ]
}


In [6]:
with open(f"data/json/{image_name}.json", "w", encoding='utf-8') as json_file:
    json.dump(json_obj, json_file, ensure_ascii=False, indent=2)#!/usr/bin/env python3


In [7]:
from rdflib import Graph, ConjunctiveGraph, Namespace, URIRef, Literal, RDF

FOAF = Namespace("http://xmlns.com/foaf/0.1/")
EX = Namespace("http://example.org/ontology/")

g = Graph()
g.bind("foaf", FOAF)
g.bind("ex", EX)

cg = ConjunctiveGraph()
cg.bind("foaf", FOAF)
cg.bind("ex", EX)

# Mapping from json keys to RDF predicates
predicate_map = {
    "vader": EX.vader,
    "moeder": EX.moeder,
    "geboorte_datum": EX.geboorteDatum,
    "geboorte_plaats": EX.geboortePlaats,
    "laatste_woonplaats": EX.laatsteWoonplaats
}

for idx, person in enumerate(json_obj["persons"], start=1):
    person_uri = URIRef(f"http://example.org/person/{idx}")
    assertion_graph_uri = URIRef("http://example.org/assertion")
    assertion_graph = Graph(store=cg.store, identifier=assertion_graph_uri)
    assertion_graph.add((person_uri, RDF.type, FOAF.Person))

    provenance_graph_uri = URIRef("http://example.org/provenance")
    provenance_graph = Graph(store=cg.store, identifier=provenance_graph_uri)

    for key, value_dict in person.items():
        value = value_dict["value"]
        cell_id = value_dict["cell"]
        predicate = predicate_map.get(key)
        if predicate:
            # Named graph for each cell
            graph_uri = URIRef(f"http://example.org/graph/{cell_id}")
            ng = Graph(store=cg.store, identifier=graph_uri)
            ng.add((person_uri, predicate, Literal(value)))
            id_uri = URIRef(f"http://example.org/id/{cell_id}")
            provenance_graph.add((graph_uri, EX.prov, id_uri))

cg.serialize(f"data/triples/{image_name}_assersion.trig", format='trig')


  cg = ConjunctiveGraph()


<Graph identifier=Naf9a9400513e43cf98992b43c08d53eb (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [8]:
from lxml import etree

def add_provenance_graph(pagexml_path):
    tree = etree.parse(pagexml_path)
    root = tree.getroot()

    EX = Namespace("http://example.org/ontology/")
    TAB = Namespace("http://example.org/table_ontology/")
    IMG = Namespace("http://example.org/image_ontology/")

    # Create RDF graph
    g = Graph()
    g.bind("ex", EX)
    g.bind("tab", TAB)
    g.bind("img", IMG)


    # Find TableRegion(s)
    table_regions = root.findall(".//{*}TableRegion")

    for table_region in table_regions:
        for cell in table_region.findall(".//{*}TableCell"):
            cell_id = cell.get('id')
            rows = cell.get('row')
            cols = cell.get('col')
            Coords = cell.find(".//{*}Coords")
            coords_points = Coords.get('points') if Coords is not None else None
            
            # create cell uri
            cell_uri = URIRef(f"http://example.org/id/{cell_id}")

            # Create a Table instance URI
            table_uri = URIRef(f"http://example.org/Table/{cell_id}")
            g.add((cell_uri, EX.table_prov, table_uri))
            g.add((table_uri, RDF.type, TAB.table_provanence))

            # Add row and col attributes to the Table instance
            g.add((table_uri, TAB.row, Literal(rows)))
            g.add((table_uri, TAB.col, Literal(cols)))

            # Create image URI
            image_uri = URIRef(f"http://example.org/image/{cell_id}")

            # Add image instance and coord attribute
            g.add((cell_uri, IMG.image_prov, image_uri))
            g.add((image_uri, RDF.type, IMG.image_provenance))
            g.add((image_uri, IMG.coord, Literal(coords_points)))
    
    g.serialize(f"data/triples/{image_name}_provenance.trig", format='trig')
add_provenance_graph(f"data/labels/{image_name}.xml")