In [1]:
from pathlib import Path
import rdflib
from prov.serializers.provxml import ProvXMLSerializer

In [2]:
# Prerequisite: a local checkout of the cwlprov repository in the working
# directory. Uncomment to fetch it:
#   !git clone https://github.com/common-workflow-language/cwlprov
# Directory holding the provenance files of the "revsort-run-1" example.
prov_dir = Path("cwlprov", "examples", "revsort-run-1", "metadata", "provenance")

The RDF serialization should be the easiest to query, since it can be loaded into an `rdflib` graph and queried with SPARQL:

In [3]:
# Load the N-Triples serialization of the provenance into an in-memory graph.
prov_path = prov_dir / "primary.cwlprov.nt"
g = rdflib.Graph()
# Bind the prefix on the graph so the SPARQL query below can use `wfprov:`.
g.bind("wfprov", "http://purl.org/wf4ever/wfprov#")
# State the format explicitly instead of relying on file-extension guessing.
g.parse(prov_path, format="nt")

# Select every resource typed as a wfprov:WorkflowRun.
query = """\
SELECT ?run
WHERE {
  ?run a wfprov:WorkflowRun .
}
"""
res = g.query(query)
for row in res:
    print(row)

(rdflib.term.URIRef('urn:uuid:1f767ad4-ac52-4623-b5bc-dd9faf2b869f'),)


We'd like to extend the query to find out when the workflow run started and ended. This information is available through the `wasStartedBy` and `wasEndedBy` relations (with `WorkflowEngine` as the subject), but these relations are not included in the RDF serialization. They are included in the XML file, but querying that with the `prov` library is more cumbersome. For instance, to find the `WorkflowRun`:

In [4]:
# Namespace IRIs used to build the full attribute / type URIs compared below.
WFPROV = "http://purl.org/wf4ever/wfprov#"
PROV = "http://www.w3.org/ns/prov#"
prov_path = prov_dir / "primary.cwlprov.xml"
# Explicit encoding: without it the file is decoded with the platform's
# default locale encoding, which is not necessarily UTF-8.
with open(prov_path, encoding="utf-8") as f:
    doc = ProvXMLSerializer().deserialize(f)

# Build the URIs to match once, outside the loops.
prov_type_uri = f"{PROV}type"
workflow_run_uri = f"{WFPROV}WorkflowRun"

# Print every record that carries prov:type == wfprov:WorkflowRun.
for r in doc.get_records():
    for k, v in r.attributes:
        # A prov:type value is not guaranteed to expose `.uri` (it could be a
        # plain literal), so read it defensively instead of raising.
        if k.uri == prov_type_uri and getattr(v, "uri", None) == workflow_run_uri:
            print(r)
            break  # one line per record, even if the type is listed twice

activity(id:1f767ad4-ac52-4623-b5bc-dd9faf2b869f, 2018-10-25T15:46:35.211026, -, [prov:label="Run of workflow/packed.cwl#main", prov:type='wfprov:WorkflowRun'])
