In [1]:
image_name = "NL-HaNA_2.10.50_45_0110.jpg"

In [2]:
# load json from file
import json
with open(f"../data/json/{image_name}.json", "r", encoding="utf-8") as f:
    json_obj = json.load(f)

### Create assertion KG

In [None]:
from rdflib import Graph, Dataset, Namespace, URIRef, Literal, RDF, BNode

# Namespaces from schema
SCHEMA = Namespace("https://schema.org/")
PVN = Namespace("https://personvocab.nl/")
DBO = Namespace("http://dbpedia.org/ontology/")
PERSON = Namespace("https://pressingmatter.nl/personbaiscinfo/")
EX = Namespace("https://www.example.com/")

cg = Dataset()
cg.bind("schema", SCHEMA)
cg.bind("pvn", PVN)
cg.bind("dbo", DBO)
cg.bind("personbasicinfo", PERSON)
cg.bind("ex", EX)

# Predicate mapping based on schema
predicate_map = {
    "date_of_birth": SCHEMA.birthDate,
    "birth_place": SCHEMA.birthPlace,
    "last_residence": SCHEMA.homeLocation,
    "country_of_nationality": SCHEMA.nationality,
    "military_rank": DBO.militaryRank,
    "basesurname": PVN.baseName,
    "firstnames": PVN.firstName,
    "infix": PVN.infix
}

# Iterate over persons
for idx, person in enumerate(json_obj["persons"], start=1):
    person_uri = URIRef(f"http://example.org/person/{image_name}/{idx}")
    
    # Assertion graph
    assertion_graph_uri = URIRef("http://example.org/assertion")
    assertion_graph = Graph(store=cg.store, identifier=assertion_graph_uri)
    assertion_graph.add((person_uri, RDF.type, PERSON.Person))

    # Process keys
    for key, value_dict in person.items():
        if key == "name":
            name_blank = BNode() # Blank node for name
            assertion_graph.add((person_uri, PVN.hasName, name_blank))

            # Handle nested name structure
            for sub_key, sub_value_dict in value_dict.items():
                value = sub_value_dict.get("value")
                row = sub_value_dict.get("row")
                cells = sub_value_dict.get("cell")
                text_spans = sub_value_dict.get("original_spans")
                predicate = predicate_map.get(sub_key)
                if value and predicate:
                    assertion_graph.add((name_blank, predicate, Literal(value)))
                    # Add to specific graphs based on row number
                    if row is not None:
                        graph_uri = URIRef(f"http://example.org/graph/{image_name}/row_{int(row)}")
                        ng = Graph(store=cg.store, identifier=graph_uri)
                        ng.add((name_blank, predicate, Literal(value)))
                    # Add to specific graphs based on cell ids
                    if isinstance(cells, list):
                        for cell_id in cells:
                            graph_uri = URIRef(f"http://example.org/graph/{image_name}/{cell_id}")
                            ng = Graph(store=cg.store, identifier=graph_uri)
                            ng.add((name_blank, predicate, Literal(value)))
                    elif cells:
                        graph_uri = URIRef(f"http://example.org/graph/{image_name}/{cells}")
                        ng = Graph(store=cg.store, identifier=graph_uri)
                        ng.add((name_blank, predicate, Literal(value)))
                    
                    # Add to specific graphs based on text spans    
                    if isinstance(text_spans, list):
                        for text_span in text_spans:
                            graph_uri = URIRef(f"http://example.org/text_span/{image_name}/{str(text_span).replace(':', '_')}")
                            ng = Graph(store=cg.store, identifier=graph_uri)
                            ng.add((name_blank, predicate, Literal(value)))
                    elif text_spans:
                        graph_uri = URIRef(f"http://example.org/text_span/{image_name}/{str(text_spans).replace(':', '_')}")
                        ng = Graph(store=cg.store, identifier=graph_uri)
                        ng.add((name_blank, predicate, Literal(value)))
                    else:
                        assertion_graph.add((name_blank, predicate, Literal(value)))
        else:
            # Handle normal keys
            value = value_dict.get("value")
            row = value_dict.get("row")
            cells = value_dict.get("cell")
            text_spans = value_dict.get("original_spans")
            predicate = predicate_map.get(key)
            if value and predicate:
                assertion_graph.add((person_uri, predicate, Literal(value)))
                # Add to specific graphs based on row number
                if row is not None:
                    graph_uri = URIRef(f"http://example.org/graph/{image_name}/row_{int(row)}")
                    ng = Graph(store=cg.store, identifier=graph_uri)
                    ng.add((name_blank, predicate, Literal(value)))
                # Add to specific graphs based on cell ids
                if isinstance(cells, list):
                    for cell_id in cells:
                        graph_uri = URIRef(f"http://example.org/graph/{image_name}/{cell_id}")
                        ng = Graph(store=cg.store, identifier=graph_uri)
                        ng.add((person_uri, predicate, Literal(value)))
                elif cells:
                    graph_uri = URIRef(f"http://example.org/graph/{image_name}/{cells}")
                    ng = Graph(store=cg.store, identifier=graph_uri)
                    ng.add((person_uri, predicate, Literal(value)))
                # Add to specific graphs based on text spans
                if isinstance(text_spans, list):
                        for text_span in text_spans:
                            graph_uri = URIRef(f"http://example.org/text_span/{image_name}/{str(text_span).replace(':', '_')}")
                        ng = Graph(store=cg.store, identifier=graph_uri)
                        ng.add((person_uri, predicate, Literal(value)))
                elif text_spans:
                    graph_uri = URIRef(f"http://example.org/text_span/{image_name}/{str(text_spans).replace(':', '_')}")
                    ng = Graph(store=cg.store, identifier=graph_uri)
                    ng.add((person_uri, predicate, Literal(value)))
                

# Serialize to TRiG format
cg.serialize(f"../data/triples/{image_name.replace('.jpg', '')}_assertion.trig", format='trig')


<Graph identifier=N53221da6035b4aea9d1a10c9d1505c71 (<class 'rdflib.graph.Dataset'>)>

### Count number of triples

In [4]:
from rdflib import Dataset

# Path to your TRiG file
file_path = f"../data/triples/{image_name.replace('.jpg', '')}_assertion.trig"

# Load TRiG file into a ConjunctiveGraph
cg = Dataset()
cg.parse(file_path, format="trig")

# Count unique triples across all graphs
unique_triples = set(cg.quads((None, None, None, None)))  # includes graph info
# If you only want subject-predicate-object uniqueness (ignore graph), do:
unique_spo = set((s, p, o) for s, p, o, g in cg.quads((None, None, None, None)))

print(f"Total graphs: {len(set(g for _, _, _, g in cg.quads((None, None, None, None))))}")
print(f"Unique triples (with graph context): {len(unique_triples)}")
print(f"Unique triples (ignoring graph): {len(unique_spo)}")


Total graphs: 9
Unique triples (with graph context): 36
Unique triples (ignoring graph): 26


In [5]:
from rdflib import Dataset

# Path to your TRiG file
file_path = f"../data/triples/{image_name.replace('.jpg', '')}_assertion.trig"

# Load TRiG file
cg = Dataset()
cg.parse(file_path, format="trig")

# Sets for unique triples
original_span_triples = set()
cell_id_triples = set()

for s, p, o, g in cg.quads((None, None, None, None)):
    graph_uri = str(g)
    # Check if graph URI indicates text span or cell
    if "text_span" in graph_uri:
        original_span_triples.add((s, p, o))
    elif "graph/t" in graph_uri:  # cell graphs like graph/t1c10
        cell_id_triples.add((s, p, o))

print(f"Unique triples from text_span graphs (original_spans): {len(original_span_triples)}")
print(f"Unique triples from cell graphs (cell_id): {len(cell_id_triples)}")


Unique triples from text_span graphs (original_spans): 3
Unique triples from cell graphs (cell_id): 0


### Create KG for Proveneance Graph

In [8]:
end_time = "2025-09-01T12:00:00Z"  # Example end time, replace with actual time if needed
start_time = "2025-09-01T10:00:00Z"  # Example start time, replace with actual time if needed

In [None]:
import json
from lxml import etree

def extract_elements_with_row(data):
    """Recursively extract all elements that have a 'row' key."""
    result = []

    def recurse(obj):
        if isinstance(obj, dict):
            if 'row' in obj:
                result.append(obj)
            for v in obj.values():
                recurse(v)
        elif isinstance(obj, list):
            for item in obj:
                recurse(item)

    recurse(data)
    return result

def get_cell_info(root, cell_id):
    """Find TableCell by ID and extract details."""
    cell = root.find(f".//{{*}}TableCell[@id='{cell_id}']")
    if cell is not None:
        rows = cell.get('row')
        cols = cell.get('col')
        coords_elem = cell.find(".//{*}Coords")
        coords_points = coords_elem.get('points') if coords_elem is not None else None
        return {
            "cell_id": cell_id,
            "row": rows,
            "col": cols,
            "coords": coords_points
        }
    return None

def add_provenance_graph(json_path, pagexml_path, stamboek_nummer=image_name):
    with open(json_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # load all elements with 'row' from JSON
    elements_with_row = extract_elements_with_row(json_data)

    # parse PAGE XML
    tree = etree.parse(pagexml_path)
    root = tree.getroot()

    EX = Namespace("http://example.org/ontology/")
    IMG = Namespace("http://example.org/image_ontology/")
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")  
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    PROV = Namespace("http://www.w3.org/ns/prov#")
    CSVW = Namespace("http://www.w3.org/ns/csvw#")
    SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")

    # Create RDF graph
    # TODO: consider using g = Dataset() and g.store
    g = Graph()
    g.bind("ex", EX)
    g.bind("img", IMG)
    g.bind("rdf", RDF)
    g.bind("rdfs", RDFS)
    g.bind("prov", PROV)
    g.bind("csvw", CSVW)
    g.bind("skos", SKOS)

    # provenance_graph_uri = URIRef("http://example.org/provenance")
    # provenance_graph = Graph(store=g.store, identifier=provenance_graph_uri)

    results = []

    for elem in elements_with_row:
        row_id = elem.get('row')
        cell_value = elem.get('cell')
        spans = elem.get('original_spans')

        # row uri
        row_graph_uri = URIRef(f"http://example.org/graph/{stamboek_nummer}/row_{int(row_id)}")
        row_uri = URIRef(f"http://example.org/id/{stamboek_nummer}/row_{int(row_id)}")
        g.add((row_graph_uri, PROV.wasDerivedFrom, row_uri))
        g.add((row_uri, RDF.type, PROV.Entity))
        g.add((row_uri, RDF.type, EX.Row))
        g.add((row_uri, RDFS.label, Literal(f"Row {row_id} from {stamboek_nummer}")))

        # agents
        agent_1 = URIRef("http://example.org/agent/1")
        g.add((agent_1, RDF.type, PROV.Agent))
        g.add((agent_1, RDFS.label, Literal("Jane Doe")))
        g.add((row_graph_uri, PROV.wasAttributedTo, agent_1))
        project_agent = URIRef("http://example.org/agent/2")
        g.add((project_agent, RDF.type, PROV.Agent))
        g.add((project_agent, RDFS.label, Literal("Pressing Matter Project")))
        g.add((agent_1, PROV.actedOnBehalfOf, project_agent))

        # activity
        stamboekenKGConstructionactivity = URIRef(f"http://example.org/activity/stamboekenKGConstructionactivity/{stamboek_nummer}")
        tableConstructionactivity = URIRef(f"http://example.org/activity/TableExtraction/{stamboek_nummer}")
        informationExtractionactivity = URIRef(f"http://example.org/activity/InformationExtraction/{stamboek_nummer}/row_{row_id}")
        KGConstructionactivity = URIRef(f"http://example.org/activity/KGConstruction/{stamboek_nummer}/row_{row_id}")
            
        g.add((stamboekenKGConstructionactivity, RDF.type, PROV.Activity))
        # g.add((named_graph_uri, PROV.wasGeneratedBy, stamboekenKGConstructionactivity))
        g.add((stamboekenKGConstructionactivity, PROV.wasAssociatedWith, agent_1))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, tableConstructionactivity))
        g.add((tableConstructionactivity, RDF.type, PROV.Activity))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, informationExtractionactivity))
        g.add((informationExtractionactivity, RDF.type, PROV.Activity))
        g.add((informationExtractionactivity, PROV.used, row_uri))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, KGConstructionactivity))
        g.add((KGConstructionactivity, RDF.type, PROV.Activity))

        json_URI = URIRef(f"http://example.org/json/{stamboek_nummer}.json")
        g.add((json_URI, RDF.type, PROV.Entity))
        g.add((json_URI, RDFS.label, Literal(f"JSON file path {json_path}")))
        g.add((KGConstructionactivity, PROV.used, json_URI))

        # TODO: consider if I need it
        g.add((stamboekenKGConstructionactivity,PROV.endedAtTime, Literal(end_time)))
        g.add((stamboekenKGConstructionactivity,PROV.startedAtTime, Literal(start_time)))

        # Create a Table instance URI
        table_uri = URIRef(f"http://example.org/Table/{stamboek_nummer}")
        g.add((table_uri, RDF.type, PROV.Entity))
        g.add((table_uri, RDF.type, EX.Table))
        g.add((table_uri, PROV.wasGeneratedBy, tableConstructionactivity))
        g.add((row_uri, SKOS.partOf, table_uri))
        
        # stamboeken
        stamboek_uri = URIRef(f"http://example.org/Image/{stamboek_nummer}")
        g.add((stamboek_uri, RDF.type, PROV.Entity))
        g.add((tableConstructionactivity, PROV.used, stamboek_uri))
        g.add((table_uri, PROV.wasDerivedFrom, stamboek_uri))
        national_archives = URIRef("http://example.org/agent/3")
        g.add((national_archives, RDF.type, PROV.Agent))
        g.add((national_archives, RDFS.label, Literal("Nationaal Archief")))
        g.add((stamboek_uri, PROV.wasAttributedTo, national_archives))

        # Handle cell values
        if cell_value is not None:
            if isinstance(cell_value, list):
                for val in cell_value:
                    cell_info = get_cell_info(root, val)
                    cell_id = cell_info['cell_id']
                    rows = cell_info['row']
                    cols = cell_info['col']
                    coords_points = cell_info['coords']

                    # cell uri
                    cell_graph_uri = URIRef(f"http://example.org/graph/{stamboek_nummer}/{cell_id}")
                    cell_uri = URIRef(f"http://example.org/id/{stamboek_nummer}/{cell_id}")
                    g.add((cell_graph_uri, PROV.wasDerivedFrom, cell_uri))
                    g.add((cell_uri, RDF.type, PROV.Entity))
                    g.add((cell_uri, RDFS.label, Literal(f"Cell {cell_id} from {stamboek_nummer}")))

                    g.add((cell_uri, RDF.type, EX.Cell))
                    g.add((cell_uri, CSVW.rowNumber, Literal(rows)))
                    g.add((cell_uri, CSVW.columnNumber, Literal(cols)))
                    g.add((cell_uri, EX.ImageRegion, Literal(coords_points)))

                    g.add((cell_uri, SKOS.partOf, row_uri))
                    
            else:
                cell_info = get_cell_info(root, cell_value)
                cell_id = cell_info['cell_id']
                rows = cell_info['row']
                cols = cell_info['col']
                coords_points = cell_info['coords']

                # cell uri
                cell_graph_uri = URIRef(f"http://example.org/graph/{stamboek_nummer}/{cell_id}")
                cell_uri = URIRef(f"http://example.org/id/{stamboek_nummer}/{cell_id}")
                g.add((cell_graph_uri, PROV.wasDerivedFrom, cell_uri))
                g.add((cell_uri, RDF.type, PROV.Entity))
                g.add((cell_uri, RDFS.label, Literal(f"Cell {cell_id} from {stamboek_nummer}")))

                g.add((cell_uri, RDF.type, EX.Cell))
                g.add((cell_uri, CSVW.rowNumber, Literal(rows)))
                g.add((cell_uri, CSVW.columnNumber, Literal(cols)))
                g.add((cell_uri, EX.ImageRegion, Literal(coords_points)))

                g.add((cell_uri, SKOS.partOf, row_uri))
                

        # Handle original spans
        if spans is not None:
            if isinstance(spans, list):
                for span in spans:
                    # span uri
                    span_graph_uri = URIRef(f"http://example.org/text_span/{stamboek_nummer}/{str(span).replace(':', '_')}")
                    span_uri = URIRef(f"http://example.org/id/{stamboek_nummer}/{str(span).replace(':', '_')}")
                    g.add((span_graph_uri, PROV.wasDerivedFrom, span_uri))
                    g.add((span_uri, RDF.type, PROV.Entity))
                    g.add((span_uri, RDFS.label, Literal(f"Original Spans {span} from {stamboek_nummer}")))

                    g.add((span_uri, RDF.type, EX.TextSpan))
                    g.add((span_uri, EX.range, Literal(span)))

                    g.add((span_uri, SKOS.partOf, row_uri))
            else:
                print(f"  Original span: {spans}")

    g.serialize(f"../data/triples/{image_name.replace('.jpg', '')}_provenance.ttl", format='ttl')


In [10]:
json_path = f"../data/json/{image_name}.json"
pagexml_path = "../data/tables/pagexml/NL-HaNA_2.10.50_45_0110.jpg.xml"
results = add_provenance_graph(json_path, pagexml_path, image_name)

In [None]:
from pyshacl import validate
r = validate(f"../data/triples/{image_name.replace('.jpg', '')}_provenance.ttl", "../data/triples/data_provenance.ttl")
conforms, results_graph, results_text = r