In [1]:
import os
import json
import pandas as pd
from lxml import etree
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, SKOS, DCTERMS, DC
import uuid


source_file = './out/monuments.xml'
tree = etree.parse(source_file)
root = tree.getroot()

concept_nodes = {"Religion Type": 'religionURI', "Religious Order Type": "religiousOrder", "Dedication Subject Type": "dedicationURI"}
concept_singels = {"Building Type": 'typeOfBuilding', "Building AAT Type": "additionalType", "Monument Status Type": "monumentStatus", "Rights Type": "rightsURI"}
ark_id = "1552"

In [2]:
def generate_skos(_list, _voc, _type):
    
    NS = Namespace("")
    g = Graph()
    g.bind("ns", NS)
    g.bind("skos", SKOS)
    g.bind("dcterms", DCTERMS)
    g.bind("dc", DC)

    voc_name = _voc
    voc_uri = f'https://n2t.net/ark:/{ark_id}/{voc_name.replace(" ", "")}/{str(uuid.uuid5(uuid.NAMESPACE_DNS, _voc))}'

    ns_voc_name = NS[voc_name]
    ns_voc_uri = NS[voc_uri]
    
    g.add((ns_voc_uri, RDF.type, SKOS['ConceptScheme']))
    g.add((ns_voc_uri, DCTERMS.title, Literal(voc_name,lang="nl")))
    g.add((ns_voc_uri, DCTERMS.title, Literal(voc_name,lang="en")))

    for row in _list:
        if _type == 'nested':
            uri = NS[str(row['about'])]
        elif _type == 'flat':
            flat_url = str(uuid.uuid5(uuid.NAMESPACE_DNS, row['prefLabel'].replace("resource=", "").replace("\"", "")))
            uri = NS[f'https://n2t.net/ark:/{ark_id}/{flat_url}']
        g.add((uri, SKOS.inScheme, ns_voc_uri))
        g.add((ns_voc_uri, SKOS.hasTopConcept, uri))
        
        g.add((uri, RDF.type, SKOS.Concept))
        g.add((uri, SKOS.prefLabel, Literal(row['prefLabel'].replace("resource=", "").replace("\"", ""), lang="nl")))
        g.add((uri, SKOS.inScheme, ns_voc_uri))
        if 'altLabel' in row:            
            for alt_label in row['altLabel']:
                g.add((uri, SKOS.altLabel, Literal(alt_label, lang="nl")))

    skos_data = g.serialize(format='pretty-xml').decode('utf-8')
    #print(skos_data)
    with open(f'./skos/{voc_name}.xml', "w") as f:
        f.write(skos_data)
    
    
    #skos_data = ''
    del g
    return skos_data

In [3]:
def remove_duplicates(dict_list):
    seen = set()  # Set to track unique prefLabel values
    unique_dicts = []  # List to store dictionaries without duplicates

    for d in dict_list:
        pref_label = d["prefLabel"]
        if pref_label not in seen:
            seen.add(pref_label)  # Add prefLabel to the seen set
            unique_dicts.append(d)  # Add the dictionary to the unique list

    return unique_dicts

In [None]:
def extract_nested_concepts(node):
    
    records_list = []

    for concepts in root.findall(f'.//{node}'):
        concept = concepts.find("Concept")
        if concept is not None:
            id_elem = concept.find("id")
            pref_label_elem = concept.find("prefLabel")
            alt_label_elems = concept.findall("altLabel")

            id_value = id_elem.text.replace("about=", "") if id_elem is not None and id_elem.text is not None else ""
            pref_label_value = pref_label_elem.text if pref_label_elem is not None and pref_label_elem.text is not None else ""

            alt_labels = [alt_label.text for alt_label in alt_label_elems if alt_label.text is not None]

            if pref_label_value:
                record_dict = {
                    "prefLabel": pref_label_value,
                    "about": id_value,
                    "altLabel": alt_labels
                }
                records_list.append(record_dict)

    return records_list

for voc_label, voc_name in concept_nodes.items():
    concepts = extract_nested_concepts(voc_name)
    unique_keys = remove_duplicates(concepts)
    skos = generate_skos(unique_keys, voc_label, 'nested')
    #print(skos)


In [None]:
def extract_flat_concepts(node):
    
    records_list = []

    for concept in root.findall(f'.//{node}'):
        if concept is not None:
            pref_label_value = concept.text
            if pref_label_value:
                record_dict = {
                    "prefLabel": pref_label_value,
                }
            
            records_list.append(record_dict)

    return records_list
for voc_label, voc_name in concept_singels.items():
    flat_concepts = extract_flat_concepts(voc_name)
    flat_unique_keys = remove_duplicates(flat_concepts)
    skos = generate_skos(flat_unique_keys, voc_label, 'flat')

#for voc_label, voc_name in concept_singels.items():
#    flat_concepts = extract_nested_concepts(voc_name)

    #generate_skos(concepts, voc_label)



In [16]:
def extract_rights_concepts(node):
    records_list = []

    for concept in root.findall(f'.//{node}'):
        if concept is not None:
            pref_label_value = concept.text
            if pref_label_value:
                record_dict = {
                    "rights_uri": pref_label_value.replace("resource=", "").replace("\"", ""),
                }
            
            records_list.append(record_dict)

    return records_list

flat_rights_concepts = extract_rights_concepts('rightsURI')
flat_unique_keys = remove_duplicates(flat_rights_concepts)
print(len(flat_unique_keys))
print(json.dumps(flat_rights_concepts, indent=2))


172
[
  {
    "prefLabel": "http://creativecommons.org/publicdomain/zero/1.0/deed.en"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q2274"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q1759"
  },
  {
    "prefLabel": "https://creativecommons.org/licenses/by-sa/3.0"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q1764"
  },
  {
    "prefLabel": "http://creativecommons.org/licenses/by-sa/3.0/"
  },
  {
    "prefLabel": "https://creativecommons.org/licenses/by-sa/4.0"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q2195"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q1752"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q2181"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q1850"
  },
  {
    "prefLabel": "https://creativecommons.org/licenses/by-sa/4.0/"
  },
  {
    "prefLabel": "http://gebouwen.brabantcloud.nl/entity/Q6848"
  },
  {
    "prefLabel": "ht