# Testing provbook

This notebook explores the metadata produced by [ProvBook](https://github.com/Sheeba-Samuel/ProvBook). The metadata was generated by running `notebook_rdf` from the command line on the notebooks that reproduce published results.

In [11]:
from rdflib import Graph, Literal, BNode, Namespace, URIRef
from rdflib.namespace import FOAF, RDF

# library to support visualisation of graphs
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph

# pretty-printing helper for inspecting individual triples while debugging
from pprint import pprint

#pip install pyvis
from pathlib import Path

# pyvis used to display the graphs
from pyvis.network import Network

# Demo graph: two Pokemon described with FOAF (friend of a friend), an
# ontology for describing people.
g = Graph()
g.bind("foaf", FOAF)  # bind the RDFLib-provided FOAF namespace to the "foaf" prefix

pokedex = Namespace("https://michiganpython.org/pokedex/")
pikachu = pokedex.pikachu  # named node inside the pokedex namespace
catezar = BNode()  # blank node: an identifier is generated for it

name = Literal("Pikachu")
age = Literal(24)

# Every statement of the demo graph, as (subject, predicate, object).
demo_triples = [
    # pikachu: type, name, age and a link to catezar
    (pikachu, RDF.type, pokedex.Pokemon),
    (pikachu, FOAF.name, name),
    (pikachu, FOAF.age, age),
    (pikachu, FOAF.knows, catezar),
    # catezar: type, name and age
    (catezar, RDF.type, pokedex.Pokemon),
    (catezar, FOAF.name, Literal("Catezar")),
    (catezar, FOAF.age, Literal(1)),
]
for demo_triple in demo_triples:
    g.add(demo_triple)

# Print the serialised graph (no format is passed to serialize()).
print(g.serialize())


@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://michiganpython.org/pokedex/pikachu> a <https://michiganpython.org/pokedex/Pokemon> ;
    foaf:age 24 ;
    foaf:knows [ a <https://michiganpython.org/pokedex/Pokemon> ;
            foaf:age 1 ;
            foaf:name "Catezar" ] ;
    foaf:name "Pikachu" .




## Reading RDF
The individual RDF documents generated with `notebook_rdf` are loaded into a single graph, `g_nbs`.

In [27]:
# Merge every Turtle (*.ttl) file found in `files_path` into one graph, `g_nbs`.
g_nbs = Graph()

files_path = "."
os_path = Path(files_path)
# Path.glob yields entries in no guaranteed order; sorting keeps the file
# listing printed below identical from run to run.
for a_file in sorted(os_path.glob('*.ttl')):
    print(a_file)
    g_nbs.parse(a_file, format="ttl")

print(f"\nrdflib Graph loaded successfully with {len(g_nbs)} triples")

Paper 01 Reproduce XAS.ttl
Paper 02 Reproduce XAS.ttl
Paper 03 Reproduce XAS.ttl
Paper 04 Reproduce XAS.ttl
Paper 05 Reproduce XAS.ttl
Paper 06 Reproduce XAS.ttl
Paper 07 Reproduce XAS.ttl
Paper 08 Reproduce XAS.ttl
Paper 09 Reproduce XAS.ttl
Reproduce Results Larch-Python.ttl
Vary fit parameters.ttl

rdflib Graph loaded successfully with 1580 triples


## Create a subgraph

A subgraph is created to show part of the information in the graph. 


In [14]:
print(f"rdflib Graph loaded successfully with {len(g_nbs)} triples")

# Prefixes used in the Turtle files:
#   p-plan: <http://purl.org/net/p-plan/#>
#   rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
#   repr:   <https://w3id.org/reproduceme#>
#   xsd:    <http://www.w3.org/2001/XMLSchema#>
triple_object = URIRef("https://w3id.org/reproduceme#Notebook")
triple_subject = URIRef("https://w3id.org/reproduceme#Cell0")

# Statements whose object is repr:Notebook ("<something> a repr:Notebook").
# The query is evaluated once and reused for both the copy and the listing.
notebook_triples = list(g_nbs.triples((None, None, triple_object)))

sub_graph = Graph()
sub_graph += notebook_triples
for idx, (s, p, o) in enumerate(notebook_triples):
    print(f"{idx} S: {s} P:{p} O: {o}")
    # Statements that have this notebook as their object.
    linked_triples = list(g_nbs.triples((None, None, s)))
    for idx2, (s1, p1, o1) in enumerate(linked_triples):
        print(f"\t {idx2} S: {s1} P: {p1} O: {o1}")
    sub_graph += linked_triples


rdflib Graph loaded successfully with 1580 triples
0 S: https://w3id.org/reproduceme#Paper01ReproduceXAS P:http://www.w3.org/1999/02/22-rdf-syntax-ns#type O: https://w3id.org/reproduceme#Notebook
	 0 S: https://w3id.org/reproduceme#Cell0 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/reproduceme#Paper01ReproduceXAS
	 1 S: https://w3id.org/reproduceme#Cell1 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/reproduceme#Paper01ReproduceXAS
	 2 S: https://w3id.org/reproduceme#Cell10 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/reproduceme#Paper01ReproduceXAS
	 3 S: https://w3id.org/reproduceme#Cell11 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/reproduceme#Paper01ReproduceXAS
	 4 S: https://w3id.org/reproduceme#Cell12 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/reproduceme#Paper01ReproduceXAS
	 5 S: https://w3id.org/reproduceme#Cell13 P: http://purl.org/net/p-plan/#isStepOfPlan O: https://w3id.org/repr

## Display subgraph

Loading the data as it is causes clashes, because the cell names (Cell0, Cell1, ..., Celln) are used as IDs in every notebook, so cells from different notebooks end up sharing the same node. Need to check whether this is acceptable.

In [28]:
# Convert the rdflib sub-graph to a networkx graph, which pyvis can load.
nx_graph = rdflib_to_networkx_graph(sub_graph)
print(f"Newtworkx {nx_graph} loaded successfully")

# `notebook` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy (the string "False" would behave the same way).
nt = Network("800px", "100%", notebook=True)
nt.from_nx(nx_graph)
nt.show_buttons(filter_=["physics"])  # only show the physics controls
# NOTE: "nx.html" is also the target of the other display cells in this
# notebook, so each run replaces the previously written file.
nt.show("nx.html")

Newtworkx Graph with 261 nodes and 936 edges loaded successfully
nx.html


## Rename cells

Renaming the cells fixes some of the issues. However, the renaming needs to be done when the files are read, because loading more data into the graph further down causes new clashes.

In [29]:
# Prefixes used in the Turtle files:
#   p-plan: <http://purl.org/net/p-plan/#>
#   rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
#   repr:   <https://w3id.org/reproduceme#>
#   xsd:    <http://www.w3.org/2001/XMLSchema#>
repr_prefix = "https://w3id.org/reproduceme#"

# Statements whose object is repr:Notebook ("<something> a repr:Notebook").
notebook_triples = list(g_nbs.triples((None, None, triple_object)))

sub_graph = Graph()
sub_graph += notebook_triples
for s, p, o in notebook_triples:
    book_id = s.removeprefix(repr_prefix)
    # Copy every statement pointing at this notebook, renaming its subject
    # to "<notebook id>_<cell id>" so that subjects with the same name in
    # different notebooks become distinct nodes.
    for s1, p1, o1 in g_nbs.triples((None, None, s)):
        cell_id = s1.removeprefix(repr_prefix)
        new_s1 = URIRef(f"{repr_prefix}{book_id}_{cell_id}")
        sub_graph.add((new_s1, p1, o1))
 

In [30]:
# Convert the rdflib sub-graph to a networkx graph, which pyvis can load.
nx_graph = rdflib_to_networkx_graph(sub_graph)
print(f"Newtworkx {nx_graph} loaded successfully")

# `notebook` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy (the string "False" would behave the same way).
nt = Network("800px", "100%", notebook=True)
nt.from_nx(nx_graph)
nt.show_buttons(filter_=["physics"])  # only show the physics controls
# NOTE: "nx.html" is also the target of the other display cells in this
# notebook, so each run replaces the previously written file.
nt.show("nx.html")

Newtworkx Graph with 258 nodes and 257 edges loaded successfully
nx.html


In [31]:
# Worked example of the renaming scheme. `s` (a notebook) and `s1` (a cell)
# are not defined here: they are whatever values the loops of a previously
# executed cell finished on.
prefix = "https://w3id.org/reproduceme#"
book_id = s.removeprefix(prefix)
cell_id = s1.removeprefix(prefix)
new_s1 = URIRef(f"{prefix}{book_id}_{cell_id}")

# One value per line: notebook URI, notebook id, cell URI, cell id, new URI.
for shown in (s, book_id, s1, cell_id, new_s1):
    print(shown)

https://w3id.org/reproduceme#Varyfitparameters
Varyfitparameters
https://w3id.org/reproduceme#Cell9
Cell9
https://w3id.org/reproduceme#Varyfitparameters_Cell9


In [32]:
# Prefixes used in the Turtle files:
#   p-plan: <http://purl.org/net/p-plan/#>
#   rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
#   repr:   <https://w3id.org/reproduceme#>
#   xsd:    <http://www.w3.org/2001/XMLSchema#>
repr_prefix = "https://w3id.org/reproduceme#"
type_pred = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')

# Statements whose object is repr:Notebook ("<something> a repr:Notebook").
notebook_triples = list(g_nbs.triples((None, None, triple_object)))

sub_graph = Graph()
sub_graph += notebook_triples
for s, p, o in notebook_triples:
    book_id = s.removeprefix(repr_prefix)
    # Copy every statement pointing at this notebook, renaming its subject
    # to "<notebook id>_<cell id>".
    for s1, p1, o1 in g_nbs.triples((None, None, s)):
        cell_id = s1.removeprefix(repr_prefix)
        new_s1 = URIRef(f"{repr_prefix}{book_id}_{cell_id}")
        sub_graph.add((new_s1, p1, o1))
        # Carry the rdf:type statements of the original subject over to the
        # renamed one. The ignored subject is named `_subj` rather than `_`,
        # which IPython uses for the last cell output.
        for _subj, p2, o2 in g_nbs.triples((s1, type_pred, None)):
            sub_graph.add((new_s1, p2, o2))

In [33]:
# Convert the rdflib sub-graph to a networkx graph, which pyvis can load.
nx_graph = rdflib_to_networkx_graph(sub_graph)
print(f"Newtworkx {nx_graph} loaded successfully")

# `notebook` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy (the string "False" would behave the same way).
nt = Network("800px", "100%", notebook=True)
nt.from_nx(nx_graph)
nt.show_buttons(filter_=["physics"])  # only show the physics controls
# NOTE: "nx.html" is also the target of the other display cells in this
# notebook, so each run replaces the previously written file.
nt.show("nx.html")

Newtworkx Graph with 259 nodes and 503 edges loaded successfully
nx.html


In [34]:
# Prefixes used in the Turtle files:
#   p-plan: <http://purl.org/net/p-plan/#>
#   rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
#   repr:   <https://w3id.org/reproduceme#>
#   xsd:    <http://www.w3.org/2001/XMLSchema#>
repr_prefix = "https://w3id.org/reproduceme#"
type_pred = URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
cell_type_pred = URIRef("https://w3id.org/reproduceme#hasCellType")

# Statements whose object is repr:Notebook ("<something> a repr:Notebook").
notebook_triples = list(g_nbs.triples((None, None, triple_object)))

sub_graph = Graph()
sub_graph += notebook_triples
for s, p, o in notebook_triples:
    book_id = s.removeprefix(repr_prefix)
    # Copy every statement pointing at this notebook, renaming its subject
    # to "<notebook id>_<cell id>".
    for s1, p1, o1 in g_nbs.triples((None, None, s)):
        cell_id = s1.removeprefix(repr_prefix)
        new_s1 = URIRef(f"{repr_prefix}{book_id}_{cell_id}")
        sub_graph.add((new_s1, p1, o1))
        # Carry the rdf:type and repr:hasCellType statements of the original
        # subject over to the renamed one.
        # NOTE(review): g_nbs was merged from all files before any renaming,
        # so a name such as Cell0 that occurs in several notebooks may return
        # the cell types recorded for each of them here; every renamed copy
        # then inherits all of those values. Renaming while reading each file
        # would avoid this - to be confirmed against the data.
        for copied_pred in (type_pred, cell_type_pred):
            # The ignored subject is named `_subj` rather than `_`, which
            # IPython uses for the last cell output.
            for _subj, p2, o2 in g_nbs.triples((s1, copied_pred, None)):
                sub_graph.add((new_s1, p2, o2))

In [35]:
# Convert the rdflib sub-graph to a networkx graph, which pyvis can load.
nx_graph = rdflib_to_networkx_graph(sub_graph)
print(f"Newtworkx {nx_graph} loaded successfully")

# `notebook` is a boolean flag. The string "True" only worked because any
# non-empty string is truthy (the string "False" would behave the same way).
nt = Network("800px", "100%", notebook=True)
nt.from_nx(nx_graph)
nt.show_buttons(filter_=["physics"])  # only show the physics controls
# NOTE: "nx.html" is also the target of the other display cells in this
# notebook, so each run replaces the previously written file.
nt.show("nx.html")

Newtworkx Graph with 261 nodes and 936 edges loaded successfully
nx.html


## Review
Similar clashes are expected for other metadata elements that are named with a keyword followed by a consecutive index, such as "Source", "Output" and "SubOutput". Need to verify whether this can, or should, be corrected at read time or at generation time.