# SPARQL queries of CWLProv provenance files
This document provides an overview of different SPARQL queries, together with their (expected) results.

## Import modules & queries

In [35]:
import os
import rdflib
from pathlib import Path
from rdflib.plugins.sparql import prepareQuery
from rdflib.namespace import Namespace
import pandas as pd

In [2]:
# SPARQL query files are looked up in a "queries" folder under the current working directory.
cwd = Path.cwd()
queries_dir = cwd.joinpath('queries')

In [44]:
from rdflib import Literal

In [37]:
# Vocabulary namespaces that the query cells bind to the "wfdesc" and "schema" prefixes.
WFDESC = Namespace("http://purl.org/wf4ever/wfdesc#")
SCHEMA = Namespace("http://schema.org/")

## Functions

In [49]:
def run_query(rdf_file, query_file, namespaces):
    """
    rdf_file = RDF file; query_file = path to sparql query file.
    """
    g = rdflib.Graph()
    g.parse(rdf_file)
    with open(query_file, 'r')  as f:
        query_string = f.read()
        query = prepareQuery(
            queryString = query_string,
            initNs = namespaces,
        )

    print(f"SPARQL QUERY IS:\n{query}")
    
    qres = g.query(query)
    
    results = pd.DataFrame(qres.bindings).map(str).rename(columns=str)
    return results

In [34]:
def extract_wf_namespace(rdf_file):
    """
    Look up the namespace bound to the ``wf`` prefix in an RDF file.

    The file is parsed into a fresh rdflib graph and its prefix bindings are
    searched for ``wf``. Returns the namespace bound to that prefix, or an
    empty string when the graph declares no ``wf`` prefix.
    """
    graph = rdflib.Graph()
    graph.parse(rdf_file)

    # dict() keeps the last binding seen for a prefix, which matches a scan
    # that overwrites its result on every match.
    prefix_bindings = dict(graph.namespaces())
    return prefix_bindings.get("wf", "")

## SPARQL queries

Return the doc, label, and intent fields of the main workflow.

In [50]:
# Provenance graph to query. Set the CWLPROV_PROVENANCE_FILE environment
# variable to point at your own copy; the fallback is an absolute path on the
# original author's machine and will not exist elsewhere.
provenance_file = os.environ.get(
    "CWLPROV_PROVENANCE_FILE",
    "/Users/r.d.wit/Documents/GitHub/cwlprov-provenance/cwlprov_rdf_examples/scenario1/ro/metadata/provenance/primary.cwlprov.ttl",
)
wf_namespace = extract_wf_namespace(provenance_file)

# Prefix bindings made available to the SPARQL query.
namespaces = {"wf": wf_namespace,
              "wfdesc": WFDESC,
              "schema": SCHEMA}
extract_wf_doc_query = queries_dir / "wf_metadata_fields.sparql"
response = run_query(provenance_file, extract_wf_doc_query, namespaces)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x13084b740>
      doc                                  intent     label  \
0  WF_doc  http://edamontology.org/operation_0004  WF_label   

                                                  wf  
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


Return doc, label, and intent fields of every command-line tool/nested workflow that is run by each of the steps.

In [40]:
# Prefix bindings made available to the SPARQL query.
namespaces = dict(wf=wf_namespace, wfdesc=WFDESC, schema=SCHEMA)
extract_clt_doc_query = queries_dir.joinpath("clt_nested_wf_metadata_fields.sparql")
response = run_query(
    rdf_file=provenance_file,
    query_file=extract_clt_doc_query,
    namespaces=namespaces,
)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x1308eb620>
       doc                                  intent      label  \
0  CLT_doc  http://edamontology.org/operation_0004  CLT_label   

                                             main_wf  
0  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List the doc, label, and format fields of all input parameters of the main workflow.

In [51]:
# Prefix bindings made available to the SPARQL query.
namespaces = dict(wf=wf_namespace, wfdesc=WFDESC, schema=SCHEMA)
extract_wf_inputs_query = queries_dir.joinpath("wf_input_params_metadata_fields.sparql")
response = run_query(
    rdf_file=provenance_file,
    query_file=extract_wf_inputs_query,
    namespaces=namespaces,
)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x1308f2090>
                  doc                                             format  \
0  wf_input_param_doc  https://www.iana.org/assignments/media-types/t...   

                  label                                                 wf  
0  wf_input_param_label  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List the doc, label, and format fields of all output parameters of the main workflow.

In [52]:
# Prefix bindings made available to the SPARQL query.
namespaces = dict(wf=wf_namespace, wfdesc=WFDESC, schema=SCHEMA)
extract_wf_outputs_query = queries_dir.joinpath("wf_output_params_metadata_fields.sparql")
response = run_query(
    rdf_file=provenance_file,
    query_file=extract_wf_outputs_query,
    namespaces=namespaces,
)
print(response)

SPARQL QUERY IS:
<rdflib.plugins.sparql.sparql.Query object at 0x1308ecf80>
                   doc                                             format  \
0  wf_output_param_doc  https://www.iana.org/assignments/media-types/t...   

                   label                                                 wf  
0  wf_output_param_label  arcp://uuid,d589fe1c-9550-46b1-b2ed-260a515e74...  


List the doc, label, and format fields of all input and output parameters of nested workflows/command-line tools.

In [None]:
# Prefix bindings made available to the SPARQL query.
namespaces = {"wf": wf_namespace,
              "wfdesc": WFDESC,
              "schema": SCHEMA}
# Use a dedicated name for this query path so that extract_wf_outputs_query,
# which holds the workflow-outputs query path, is not overwritten.
extract_clt_inputs_query = queries_dir / "clt_nested_wf_input_params_metadata_fields.sparql"
response = run_query(provenance_file, extract_clt_inputs_query, namespaces)
print(response)