First, activate conda environment:
```bash
conda activate NGTR
```

In [None]:
import os, re, csv

os.chdir("../Image2Table_LLM")
current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: /Users/sarah_shoilee/codeProjects/stamboekn_KE/Image2Table_LLM


In [2]:
from multi_turn_conversation import call_LLM
from LLM_key import llm_model
from parse import extract_HTML, format_td
from metric import TEDS

In [3]:
image_path = "../data/images/NL-HaNA_2.10.50_45_0355.jpg"
image_name = os.path.basename(image_path)

In [4]:
llm_response = call_LLM(image_path, model_name=llm_model, temperature=0)

### Step 1: Detecting Table Cells

To detect the table cells, we need to analyze the provided image. The image appears to be a scanned page from an old book, containing a table with handwritten text.

Upon closer inspection, we can identify the table structure and the coordinates of each cell.

The detected cell coordinates are as follows:
```plaintext
246,91;573,91;573,170;246,170 #c_1
573,91;702,91;702,170;573,170 #c_2
702,91;806,91;806,170;702,170 #c_3
806,91;910,91;910,170;806,170 #c_4
246,170;573,170;573,546;246,546 #c_5
573,170;702,170;702,546;573,546 #c_6
702,170;806,170;806,546;702,546 #c_7
806,170;910,170;910,546;806,546 #c_8
246,546;573,546;573,868;246,868 #c_9
573,546;702,546;702,868;573,868 #c_10
702,546;806,546;806,868;702,868 #c_11
806,546;910,546;910,868;806,868 #c_12
```

The logical sequence mapping is as follows:
```plaintext
0, 0, 0, 0 #c_1
0, 0, 1, 1 #c_2
0, 0, 2, 2 #c_3
0, 0, 3, 3 #c_4
1, 1, 0, 0 #c_5
1, 1, 1, 1 #c_6
1, 1, 2, 2 #c_7
1, 1, 3, 3 #c_8
2, 2, 0, 0 #c_9


In [5]:
detected_block = re.search(r"coordinates.*?```plaintext(.*?)```", llm_response, re.S | re.I)
detected_lines = detected_block.group(1).strip().splitlines() if detected_block else []

with open(os.path.join("../data/tables/cells/center", image_name+'.txt'), "w+", newline="") as f:
    writer = csv.writer(f)
    for line in detected_lines:
        if not line.strip():
            continue
        parts = line.split("#")
        polygon = parts[0].strip()
        cell_id = "#" + parts[1].strip() if len(parts) > 1 else ""
        writer.writerow([polygon, cell_id])

In [6]:
logical_block = re.search(r"logical sequence.*?```plaintext(.*?)```", llm_response, re.S | re.I)
logical_lines = logical_block.group(1).strip().splitlines() if logical_block else []

with open(os.path.join("../data/tables/cells/logi", image_name+'.txt'), "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["sequence", "cell_id"])  # header
    for line in logical_lines:
        if not line.strip() or line.strip().startswith("</"):
            continue
        parts = line.split("#")
        sequence = parts[0].strip()
        cell_id = "#" + parts[1].strip() if len(parts) > 1 else ""
        writer.writerow([sequence, cell_id])

In [7]:
llm_html = extract_HTML(llm_response)
with open(os.path.join("../data/tables/html", image_name+'.html'), 'w', encoding='utf-8') as f:
    f.write(llm_html)

In [8]:
def calculate_TEDS(ground_truth_html, predicted_html):
    predicted_html = format_td(predicted_html)
    ground_truth_html = format_td(ground_truth_html)

    teds = TEDS(structure_only=False)
    teds_score = teds.evaluate(ground_truth_html, predicted_html)

    teds_struct = TEDS(structure_only=True)
    teds_struct_score = teds_struct.evaluate(ground_truth_html, predicted_html)
    
    print(f"TEDS: {teds_score:.4f}")
    print(f"TEDS-Struct: {teds_struct_score:.4f}")

    return teds_score, teds_struct_score

In [9]:
with open(os.path.join("../data/labels", image_name.replace('.jpg', '.html')), 'r', encoding='utf-8') as f:
    label_html = f.read()

calculate_TEDS(label_html, llm_html)


TEDS: 0.8377
TEDS-Struct: 0.9286


(0.8377424529245707, 0.9285714285714286)

### Information Extraction

In [10]:
# Go outside the src directory
os.chdir("..")
current_dir = os.getcwd()
print("Current directory:", current_dir)

Current directory: /Users/sarah_shoilee/codeProjects/stamboekn_KE


In [11]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(llm_html, "html.parser")
table = soup.find("table")

In [None]:
import re
import json

persons = []

for row in table.find_all("tr"):
    cells = row.find_all("td")

    if not cells:
        continue  # skip header or empty rows

    person = {}
    for cell in cells:
        # preserve breaks as \n
        text = cell.get_text(separator="\n", strip=True)
        
        vader_match = re.search(r'Vader.?\s+([^\n,<]+)', text, re.IGNORECASE)
        moeder_match = re.search(r'Moeder.?\s+([^\n,<]+)', text, re.IGNORECASE)
        geboorte_datum_match = re.search(r'Geboorte.?\s*Den\s*([^\n,<]+)', text, re.IGNORECASE)
        geboorte_plaats_match = re.search(r'Geboortplaats.?\s*([^\n,<]+)', text, re.IGNORECASE)
        laatste_woonplaats_match = re.search(r'Laatste\s*Woonplaats.?\s*([^\n,<]+)', text, re.IGNORECASE)
        
        if vader_match:
            person['vader'] = {'value': vader_match.group(1).strip(), 'cell': cell.get('id')}
        if moeder_match:
            person['moeder'] = {'value': moeder_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_datum_match:
            person['geboorte_datum'] = {'value': geboorte_datum_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_plaats_match:
            person['geboorte_plaats'] = {'value': geboorte_plaats_match.group(1).strip(), 'cell': cell.get('id')}
        if laatste_woonplaats_match:
            person['laatste_woonplaats'] = {'value': laatste_woonplaats_match.group(1).strip(), 'cell': cell.get('id')}

    if person:
        persons.append(person)

json_obj = {"persons": persons}
print(json.dumps(json_obj, indent=2, ensure_ascii=False))

In [12]:
import re
import json

persons = []

for row in table.find_all("tr"):
    cells = row.find_all("td")

    if not cells:
        continue  # skip header or empty rows

    person = {}
    for cell in cells:
        # preserve breaks as \n
        text = cell.get_text(separator="\n", strip=True)
        
        vader_match = re.search(r'Vader.?\s+([^\n,<]+)', text, re.IGNORECASE)
        moeder_match = re.search(r'Moeder.?\s+([^\n,<]+)', text, re.IGNORECASE)
        geboorte_datum_match = re.search(r'Geboorte.?\s*Den\s*([^\n,<]+)', text, re.IGNORECASE)
        geboorte_plaats_match = re.search(r'Geboortplaats.?\s*([^\n,<]+)', text, re.IGNORECASE)
        laatste_woonplaats_match = re.search(r'Laatste\s*Woonplaats.?\s*([^\n,<]+)', text, re.IGNORECASE)
        
        if vader_match:
            person['vader'] = {'value': vader_match.group(1).strip(), 'cell': cell.get('id')}
        if moeder_match:
            person['moeder'] = {'value': moeder_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_datum_match:
            person['geboorte_datum'] = {'value': geboorte_datum_match.group(1).strip(), 'cell': cell.get('id')}
        if geboorte_plaats_match:
            person['geboorte_plaats'] = {'value': geboorte_plaats_match.group(1).strip(), 'cell': cell.get('id')}
        if laatste_woonplaats_match:
            person['laatste_woonplaats'] = {'value': laatste_woonplaats_match.group(1).strip(), 'cell': cell.get('id')}

    if person:
        persons.append(person)

json_obj = {"persons": persons}
print(json.dumps(json_obj, indent=2, ensure_ascii=False))

{
  "persons": [
    {
      "vader": {
        "value": "Isaac Swaart Moeder Catharina Canters",
        "cell": "c_7"
      },
      "moeder": {
        "value": "Catharina Canters",
        "cell": "c_7"
      }
    },
    {
      "vader": {
        "value": "Goert Arend Moeder Maria Canters",
        "cell": "c_12"
      },
      "moeder": {
        "value": "Maria Canters",
        "cell": "c_12"
      }
    }
  ]
}


In [13]:
with open(f"data/json/{image_name}.json", "w", encoding='utf-8') as json_file:
    json.dump(json_obj, json_file, ensure_ascii=False, indent=2)#!/usr/bin/env python3


In [17]:
from src.metrics import calculate_normalized_information_distance

with open(f"data/json/{image_name}.json", 'r', encoding='utf-8') as f:
    constructed_html = json.load(f)
with open(os.path.join("data/labels", image_name.replace('.jpg', '.json')), 'r', encoding='utf-8') as f:
    label_html = json.load(f)

calculate_normalized_information_distance(constructed_html, label_html)


--- Person 1 ---
vader: predicted='Isaac Swaart Moeder Catharina Canters' | ground_truth='Jacobus' | normalized_edit_distance=0.892
moeder: predicted='Catharina Canters' | ground_truth='Catharina Ballert' | normalized_edit_distance=0.235
Overall normalized edit distance for person 1: 0.564

--- Person 2 ---
vader: predicted='Goert Arend Moeder Maria Canters' | ground_truth='Coert Hendrik' | normalized_edit_distance=0.688
moeder: predicted='Maria Canters' | ground_truth='Maria Van Onbeek' | normalized_edit_distance=0.438
Overall normalized edit distance for person 2: 0.562

Normalized edit distance over all persons and fields: 0.563


0.5630465023847377

### KG construction

In [None]:
# load json from file
with open(f"data/json/{image_name}.json", "r", encoding="utf-8") as f:
    json_obj = json.load(f)

In [None]:
# CONSTRUCT ASSERSION TRIPLES
from rdflib import Graph, ConjunctiveGraph, Namespace, URIRef, Literal, RDF

FOAF = Namespace("http://xmlns.com/foaf/0.1/")
EX = Namespace("http://example.org/ontology/")
PROV = Namespace("http://www.w3.org/ns/prov#")

cg = ConjunctiveGraph()
cg.bind("foaf", FOAF)
cg.bind("ex", EX)
cg.bind("prov", PROV)

# Mapping from json keys to RDF predicates
predicate_map = {
    "vader": EX.vader,
    "moeder": EX.moeder,
    "geboorte_datum": EX.geboorteDatum,
    "geboorte_plaats": EX.geboortePlaats,
    "laatste_woonplaats": EX.laatsteWoonplaats
}

for idx, person in enumerate(json_obj["persons"], start=1):
    person_uri = URIRef(f"http://example.org/person/{idx}")
    assertion_graph_uri = URIRef("http://example.org/assertion")
    assertion_graph = Graph(store=cg.store, identifier=assertion_graph_uri)
    assertion_graph.add((person_uri, RDF.type, FOAF.Person))

    provenance_graph_uri = URIRef("http://example.org/provenance")
    provenance_graph = Graph(store=cg.store, identifier=provenance_graph_uri)

    for key, value_dict in person.items():
        value = value_dict["value"]
        cell_id = value_dict["cell"]
        
        predicate = predicate_map.get(key)
        
        # if cell_id is null, no named graph can be created
        if not cell_id:
            assertion_graph.add((person_uri, predicate, Literal(value)))
            continue
        
        if predicate:
            # Named graph for each cell
            graph_uri = URIRef(f"http://example.org/graph/{cell_id}")
            ng = Graph(store=cg.store, identifier=graph_uri)
            ng.add((person_uri, predicate, Literal(value)))
            

cg.serialize(f"data/triples/{image_name.replace('.jpg','')}_assersion.trig", format='trig')


In [None]:
end_time = "2025-09-01T12:00:00Z"  # Example end time, replace with actual time if needed
start_time = "2025-09-01T10:00:00Z"  # Example start time, replace with actual time if needed

def add_provenance_graph(html_path, coordinate_path, stamboek_nummer=image_name):
    import pandas as pd

    df = pd.read_csv(coordinate_path, header=None)
    print(df)

    with open(html_path, 'r', encoding='utf-8') as f:
        llm_html = f.read()

    soup = BeautifulSoup(llm_html, "html.parser")

    EX = Namespace("http://example.org/ontology/")
    IMG = Namespace("http://example.org/image_ontology/")
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")  
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    PROV = Namespace("http://www.w3.org/ns/prov#")
    CSVW = Namespace("http://www.w3.org/ns/csvw#")

    # Create RDF graph
    g = Graph()
    g.bind("ex", EX)
    g.bind("img", IMG)
    g.bind("rdf", RDF)
    g.bind("rdfs", RDFS)
    g.bind("prov", PROV)
    g.csvw = ("csvw", CSVW)

    for cell in soup.find_all("td"):
        cell_id = cell.get('id')
        rows = cell.get('row')
        cols = cell.get('col')
        cell_number = int(cell_id.lstrip('#c_'))  # Assuming cell_id is like "#c_1", "#2", etc.
        
        try:
            coords_points = df.iloc[cell_number - 1, 0]  # get polygon from dataframe
        except Exception as e:
            print(f"Error retrieving coordinates for cell {cell_id}: {e}")
            continue
        
        # cell uri
        named_graph_uri = URIRef(f"http://example.org/graph/{cell_id}")
        cell_uri = URIRef(f"http://example.org/id/{cell_id}")
        provenance_graph.add((named_graph_uri, PROV.wasDerivedFrom, cell_uri))
        g.add((cell_uri, RDF.type, PROV.Entity))
        g.add((cell_uri, RDFS.label, Literal(f"Cell {cell_id} from {stamboek_nummer}")))
            
        g.add((cell_uri, RDF.type, CSVW.Cell))
        g.add((cell_uri, CSVW.rowNumber, Literal(rows)))
        g.add((cell_uri, CSVW.columnNumber, Literal(cols)))
        g.add((cell_uri, EX.ImageRegion, Literal(coords_points)))

        # agents
        agent_1 = URIRef("http://example.org/agent/1")
        g.add((agent_1, RDF.type, PROV.Agent))
        g.add((agent_1, RDFS.label, Literal("Sarah Shoilee")))
        g.add((named_graph_uri, PROV.wasAttributedTo, agent_1))
        project_agent = URIRef("http://example.org/agent/2")
        g.add((project_agent, RDF.type, PROV.Agent))
        g.add((project_agent, RDFS.label, Literal("Pressing Matter Project")))
        g.add((agent_1, PROV.actedOnBehalfOf, project_agent))

        # activity
        stamboekenKGConstructionactivity = URIRef(f"http://example.org/activity/stamboekenKGConstructionactivity/{cell_id}")
        tableConstructionactivity = URIRef(f"http://example.org/activity/TableExtraction/{cell_id}")
        informationExtractionactivity = URIRef(f"http://example.org/activity/InformationExtraction/{cell_id}")
        KGConstructionactivity = URIRef(f"http://example.org/activity/KGConstruction/{cell_id}")
            
        g.add((stamboekenKGConstructionactivity, RDF.type, PROV.Activity))
        g.add((named_graph_uri, PROV.wasGeneratedBy, stamboekenKGConstructionactivity))
        g.add((stamboekenKGConstructionactivity, PROV.wasAssociatedWith, agent_1))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, tableConstructionactivity))
        g.add((tableConstructionactivity, RDF.type, PROV.Activity))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, informationExtractionactivity))
        g.add((informationExtractionactivity, RDF.type, PROV.Activity))
        g.add((informationExtractionactivity, PROV.used, cell_uri))
        g.add((stamboekenKGConstructionactivity, PROV.wasInformedBy, KGConstructionactivity))
        g.add((KGConstructionactivity, RDF.type, PROV.Activity))
        g.add((KGConstructionactivity, PROV.used, cell_uri))

        g.add((stamboekenKGConstructionactivity,PROV.endedAtTime, Literal(end_time)))
        g.add((stamboekenKGConstructionactivity,PROV.startedAtTime, Literal(start_time)))

        # Create a Table instance URI
        table_uri = URIRef(f"http://example.org/Table/{cell_id}")
        g.add((table_uri, RDF.type, PROV.Entity))
        g.add((table_uri, RDF.type, CSVW.Table))
        g.add((table_uri, PROV.wasGeneratedBy, tableConstructionactivity))
        g.add((cell_uri, PROV.wasDerivedFrom, table_uri))
            
        # stamboeken
        stamboek_uri = URIRef(f"http://example.org/stamboek/{stamboek_nummer}")
        g.add((stamboek_uri, RDF.type, PROV.Entity))
        g.add((tableConstructionactivity, PROV.used, stamboek_uri))
        g.add((table_uri, PROV.wasDerivedFrom, stamboek_uri))
        national_archives = URIRef("http://example.org/agent/3")
        g.add((national_archives, RDF.type, PROV.Agent))
        g.add((national_archives, RDFS.label, Literal("Nationaal Archief")))
        g.add((stamboek_uri, PROV.wasAttributedTo, national_archives))
    
    g.serialize(f"data/triples/{image_name.replace('.jpg','')}_provenance.ttl", format='ttl')


In [None]:
csv_path = os.path.join("data/tables/cells/center", image_name+'.txt')
table_path = os.path.join("data/tables/html", image_name+'.html')

add_provenance_graph(table_path, csv_path, stamboek_nummer=image_name)