Named graphs programs

In [14]:
pip install rdflib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import FOAF, DCTERMS, XSD, RDF, RDFS, SDO
import pprint
import os

In [16]:
def linkedProxies(ProvidedPerson, g):
    """Return all proxies that stand for a provided person.

    Runs a SPARQL SELECT on ``g`` for every distinct subject ``?s`` that has
    an ``idm:person_proxy_for`` statement pointing at ``ProvidedPerson``.

    Args:
        ProvidedPerson: URI of the provided person; it is inserted into the
            query text between angle brackets.
        g: object exposing ``query(sparql_text)`` (presumably an rdflib
            Graph; verify against caller).

    Returns:
        The value returned by ``g.query``; the query selects the single
        variable ``s``.
    """
    sparql_template = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    select distinct ?s
    where { 
        ?s idm:person_proxy_for <%s> .
    }
    """
    return g.query(sparql_template % ProvidedPerson)

In [17]:
def findSource(Proxy, g):
    """Return the biodes source a proxy belongs to.

    Runs a SPARQL SELECT on ``g`` for the ``ore:proxyIn`` value of ``Proxy``
    and hands back the ``biodes`` binding of the first result row.

    Args:
        Proxy: URI of the proxy; it is inserted into the query text between
            angle brackets.
        g: object exposing ``query(sparql_text)`` (presumably an rdflib
            Graph; verify against caller).

    Returns:
        The ``biodes`` attribute of the first row, or ``None`` when the query
        yields no rows. Any further rows are ignored.
    """
    sparql_template = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    PREFIX ore: <http://www.openarchives.org/ore/terms/>
    select ?biodes
    where { 
        <%s> ore:proxyIn ?biodes .
    }
    """
    first_row = next(iter(g.query(sparql_template % Proxy)), None)
    return None if first_row is None else first_row.biodes

In [18]:
def allTriples(Proxy, g):
    """Return all triples connected to a proxy, to a depth of 2.

    The result is a new Graph holding every triple of ``g`` whose subject is
    ``Proxy`` plus, for each object of those triples, every triple of ``g``
    that has that object as its subject.

    Args:
        Proxy: node whose outgoing statements are collected.
        g: source graph; only its ``triples((s, p, o))`` pattern lookup is
            used.

    Returns:
        A freshly created Graph with the collected triples.
    """
    collected = Graph()
    for subj, pred, obj in g.triples((Proxy, None, None)):
        # first hop: the statement about the proxy itself
        collected.add((subj, pred, obj))
        # second hop: everything stated about the node the proxy points to
        collected += g.triples((obj, None, None))
    return collected

In [19]:
def replaceProxyByPerson(triple, Person, Proxy):
    """Swap the proxy for the actual person in the subject position.

    Args:
        triple: a ``(subject, predicate, object)`` sequence of length three.
        Person: value to use as subject when the subject equals ``Proxy``.
        Proxy: the proxy to look for in the subject position.

    Returns:
        A ``(subject, predicate, object)`` tuple. Only the subject is ever
        replaced; predicate and object are passed through unchanged.
    """
    subject, predicate, obj = triple
    return (Person if subject == Proxy else subject), predicate, obj

In [55]:
def deleteNewLine(s, p, o):
    """Remove newline characters from a literal object of a triple.

    When ``o`` is a Literal whose text contains ``"\\n"``, a new Literal is
    built from the text with every newline removed. The language tag or the
    datatype of the original literal is carried over to the new one, so a
    tagged or typed literal does not silently become a plain literal.

    Args:
        s: subject of the triple, returned unchanged.
        p: predicate of the triple, returned unchanged.
        o: object of the triple.

    Returns:
        The ``(s, p, o)`` tuple, with ``o`` replaced only when it was a
        Literal containing a newline.
    """
    if isinstance(o, Literal) and "\n" in o:
        cleaned = o.replace("\n", "")
        # A literal carries either a language tag or a datatype, never both,
        # so pass along only the one that is set.
        if o.language is not None:
            o = Literal(cleaned, lang=o.language)
        else:
            o = Literal(cleaned, datatype=o.datatype)
    return s, p, o

In [56]:
def create_graph_uri(uri):
    """Build an ``http://example.org/`` URI from one path part of ``uri``.

    The string form of ``uri`` is split on ``'/'`` and the part at index 4 is
    appended to ``'http://example.org/'``. For a URI shaped like
    ``scheme://host/dir/name`` that part is ``name``.

    Args:
        uri: any value whose ``str()`` contains at least four ``'/'``.

    Returns:
        The new URI as a plain string.

    Raises:
        IndexError: if the split yields fewer than five parts.
    """
    identifier = str(uri).split('/')[4]
    return 'http://example.org/' + identifier

In [57]:
def create_named_graph_filename(Proxy):
    """Derive the file name of a proxy's named-graph file.

    The string form of ``Proxy`` is split on ``'/'``; the part at index 4 is
    used as the stem and ``'_named_graph.trig'`` is appended.

    Args:
        Proxy: any value whose ``str()`` contains at least four ``'/'``.

    Returns:
        The file name (without directory) as a string.

    Raises:
        IndexError: if the split yields fewer than five parts.
    """
    stem = str(Proxy).split('/')[4]
    return stem + '_named_graph.trig'

In [58]:
def create_main_filename(Person):
    """Derive the file name of a person's main-graph file.

    The string form of ``Person`` is split on ``'/'``; the part at index 4 is
    used as the stem and ``'_main_graph.trig'`` is appended.

    Args:
        Person: any value whose ``str()`` contains at least four ``'/'``.

    Returns:
        The file name (without directory) as a string.

    Raises:
        IndexError: if the split yields fewer than five parts.
    """
    stem = str(Person).split('/')[4]
    return stem + '_main_graph.trig'

In [59]:
def PersonToNG(ProvidedPerson, g, main_graph, path):
    """Write all RDF statements about one provided person into named graphs.

    For every proxy of ``ProvidedPerson`` found in ``g`` a separate graph is
    built and written to its own TriG file in ``path``. Statements whose
    subject is the proxy's source go into ``main_graph`` instead, together
    with a ``prov:wasDerivedFrom`` link from the proxy's graph URI to that
    source. ``main_graph`` is written to ``path`` once, after all proxies
    have been handled.

    Args:
        ProvidedPerson: URI of the provided person.
        g: graph holding the full knowledge graph to read from.
        main_graph: graph that receives the provenance links and the
            source-level statements; it is modified in place.
        path: directory the TriG files are written to.

    Returns:
        None. The results are the files written to ``path`` and the triples
        added to ``main_graph``.
    """
    # PROV-O properties live in the http://www.w3.org/ns/prov# namespace.
    # 'https://www.w3.org/TR/prov-o/#wasDerivedFrom' is only the anchor of the
    # HTML specification, not the property IRI, so the namespace IRI is used.
    was_derived_from = URIRef('http://www.w3.org/ns/prov#wasDerivedFrom')

    for row in linkedProxies(ProvidedPerson, g):
        Proxy = row['s']
        graph_uri = URIRef(create_graph_uri(Proxy))
        new_graph = Graph(identifier=graph_uri)
        Source = findSource(Proxy, g)
        # findSource gives None for a proxy without a source; None is not a
        # valid triple object, so the provenance link is skipped in that case.
        if Source is not None:
            main_graph.add((graph_uri, was_derived_from, Source))
        for triple in allTriples(Proxy, g):
            if Source is not None and triple[0] == Source:
                # statements about the source itself belong to the main graph
                main_graph.add(triple)
            else:
                s, p, o = replaceProxyByPerson(triple, ProvidedPerson, Proxy)
                s, p, o = deleteNewLine(s, p, o)
                new_graph.add((s, p, o))
        file_named_graph_name = os.path.join(path, create_named_graph_filename(Proxy))
        new_graph.serialize(destination=file_named_graph_name, format='trig')
    file_main_name = os.path.join(path, create_main_filename(ProvidedPerson))
    main_graph.serialize(destination=file_main_name, format='trig')

In [60]:
def find_persons(g):
    """Find all the persons in the knowledge graph.

    Runs a SPARQL SELECT on ``g`` for every distinct ``?person`` that has
    ``rdf:type idm:Provided_Person``.

    Args:
        g: object exposing ``query(sparql_text)`` (presumably an rdflib
            Graph; verify against caller).

    Returns:
        The value returned by ``g.query``; the query selects the single
        variable ``person``.
    """
    sparql = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX ns2: <http://data.biographynet.nl/rdf/>
    PREFIX idm: <http://www.intavia.eu/idm-core/>
    select distinct ?person
    where { 
        ?person rdf:type idm:Provided_Person .
    }
    """
    return g.query(sparql)

In [61]:
def iterate_through_folder(folder_path, path):
    """Convert every file below a folder into per-person named-graph files.

    Walks ``folder_path`` recursively. Each file found is parsed into a fresh
    Graph; for every person that ``find_persons`` reports in it, a new main
    graph identified by ``create_graph_uri(person)`` is created and handed to
    ``PersonToNG`` together with the parsed graph and the output directory.

    Args:
        folder_path: root directory containing the knowledge-graph files.
        path: directory the generated files are written to.

    Returns:
        None.
    """
    for current_dir, _subdirs, file_names in os.walk(folder_path):
        for file_name in file_names:
            # one graph per input file, so persons of different files never mix
            g = Graph()
            g.parse(os.path.join(current_dir, file_name))
            for person_row in find_persons(g):
                Person = person_row['person']
                main_graph = Graph(identifier=create_graph_uri(Person))
                PersonToNG(Person, g, main_graph, path)

In [62]:
# Output directory for the generated TriG files and input directory with the
# knowledge-graph files to convert.
path = "D:\\bioport_xml_batch_15_named_graphs"
folder_path = "D:\\bioport_xml_batch_15"

# Make sure the output directory is there before anything is written to it.
if os.path.exists(path):
    print("Folder %s already exists" % path)
else:
    os.mkdir(path)
    print("Folder %s created!" % path)

iterate_through_folder(folder_path, path)

Folder D:\bioport_xml_batch_15_named_graphs already exists


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000002B29F8DDB20>
Traceback (most recent call last):
  File "C:\Users\cjtya\AppData\Local\Programs\Python\Python311\Lib\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\cjtya\AppData\Local\Programs\Python\Python311\Lib\site-packages\isodate\isodates.py", line 181, in parse_date
    ret = date(sign * int(groups['year']), 1, 1)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: year 0 is out of range
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x000002B29F8DDB20>
Traceback (most recent call last):
  File "C:\Users\cjtya\AppData\Local\Programs\Python\Python311\Lib\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_