The RDF Star program

In [1]:
pip install rdflib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import FOAF, DCTERMS, XSD, RDF, RDFS, SDO
import pprint
import os

In [3]:
# returns all proxies for a provided person
def linkedProxies(ProvidedPerson, g):
    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    select distinct ?s
    where { 
        ?s idm:person_proxy_for <%s> .
    }
    """ % ProvidedPerson
    
    proxies = g.query(query)
    return proxies

In [4]:
# returns biodes source for a proxy
def findSource(Proxy, g):
    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    PREFIX ore: <http://www.openarchives.org/ore/terms/>
    select ?biodes
    where { 
        <%s> ore:proxyIn ?biodes .
    }
    """ % Proxy
    
    Biodes = g.query(query)
    for stmt in  Biodes:
        return(stmt.biodes)

In [5]:
#returns all triples connected to proxy to depth of 2
def allTriples(Proxy, g): 
    # create a graph for a description
    # add all triples with subject that matches the description
    new_graph = Graph()
    new_graph += g.triples((Proxy, None, None))
    for stmt in g.triples((Proxy, None, None)):
        subject = stmt[2]
        new_graph += g.triples((subject, None, None))
    return new_graph

In [6]:
#for triples with proxy as subject, we can now use the actual person
def replaceProxyByPerson(triple, Person, Proxy):
    s, p, o = triple
    if s == Proxy:
        s = Person
    new_triple = s, p, o
    return new_triple

In [7]:
def write_to_file(file, file_path, s, p, o, derive_predicate, Source):
        if type(o) == Literal and "\n" in o:
            o = o.replace("\n", "")
        if type(o) == Literal:
            file.write("<< <" + str(s) +"> <" + str(p) + "> \"" + str(o) + "\" >> <" + derive_predicate + "> <" + Source + "> .\n")
        else:
            file.write("<< <" + str(s) +"> <" + str(p) + "> <" + str(o) + "> >> <" + derive_predicate + "> <" + Source + "> .\n")

In [8]:
# for one provided person, write all rdf star statements
def PersonToRDFSTAR(ProvidedPerson, g, path, new_file_name):
    for row in linkedProxies(ProvidedPerson, g):
        Proxy = row['s']
        Source = findSource(Proxy, g)
        all_triples = allTriples(Proxy, g)
        file_path = os.path.join(path, new_file_name)
        with open(file_path, 'w', encoding="utf-8") as file:
            for triple in all_triples:
                triple = replaceProxyByPerson(triple, ProvidedPerson, Proxy)
                s, p, o = triple
                derive_predicate = URIRef('https://www.w3.org/TR/prov-o/#wasDerivedFrom')
                write_to_file(file, file_path, s, p, o, derive_predicate, Source)

In [9]:
#make a query to find all the persons in the knowledge graph
def find_persons(g):
    query = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    select distinct ?person
    where { 
        ?person rdf:type idm:Provided_Person .
    }
    """

    persons = g.query(query)
    return persons

In [10]:
def iterate_through_folder(folder_path, path):
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            new_file_name = file_name.split('.')[0] + '.ttls'
            #create a graph for the knowledge graph, for other knowledge graphs paste the file location and name within the parse function
            g = Graph()
            g.parse(file_path)
            Persons = find_persons(g)
            for row in Persons:
                Person = row['person']
                PersonToRDFSTAR(Person, g, path, new_file_name)

In [None]:
path = "D:\\bioport_xml_batch_15_rdf_star"
if not os.path.exists(path):
  os.mkdir(path)
  print("Folder %s created!" % path)
else:
  print("Folder %s already exists" % path)
folder_path = "D:\\bioport_xml_batch_15"
iterate_through_folder(folder_path, path)

Folder D:\bioport_xml_batch_15_rdf_star already exists


<< <http://www.cidoc-crm.org/cidoc-crm/E21_Person> <http://www.w3.org/2000/01/rdf-schema#label> "Person" >> <https://www.w3.org/TR/prov-o/#wasDerivedFrom does not look like a valid URI, trying to serialize this will break.
<< <http://www.cidoc-crm.org/cidoc-crm/E21_Person> <http://www.w3.org/2000/01/rdf-schema#label> "Person" >> <https://www.w3.org/TR/prov-o/#wasDerivedFrom> < does not look like a valid URI, trying to serialize this will break.
<< <http://www.cidoc-crm.org/cidoc-crm/E21_Person> <http://www.w3.org/2000/01/rdf-schema#label> "Person" >> <https://www.w3.org/TR/prov-o/#wasDerivedFrom> <http://data.biographynet.nl/rdf/BioDes-10000083_01 does not look like a valid URI, trying to serialize this will break.
<< <http://www.cidoc-crm.org/cidoc-crm/E21_Person> <http://www.w3.org/2000/01/rdf-schema#label> "Person" >> <https://www.w3.org/TR/prov-o/#wasDerivedFrom> <http://data.biographynet.nl/rdf/BioDes-10000083_01 does not look like a valid URI, trying to serialize this will break.