# SPARQL queries of CWLProv provenance files
This document provides an overview of different SPARQL queries, together with their (expected) results.

## Import modules & queries

In [7]:
import os
import rdflib
from pathlib import Path
import pandas as pd

In [8]:
# All paths in this notebook are resolved relative to the kernel's working directory.
cwd = Path.cwd()
# Directory holding the .sparql query files.
queries_dir = cwd.joinpath('queries')

## Functions

In [39]:
def run_query(rdf_file, query_file):
    """
    Run a SPARQL query stored in a file against an RDF file.

    Parameters
    ----------
    rdf_file : path-like or str
        RDF file to load; passed unchanged to ``rdflib.Graph.parse``.
    query_file : path-like or str
        Text file containing the SPARQL query.

    Returns
    -------
    pandas.DataFrame
        One row per result binding and one column per bound variable.
        Every cell and every column label is converted with ``str``, so a
        variable that is unbound in a row shows up as the string ``'nan'``.

    Side effects
    ------------
    Prints the query text before executing it.
    """
    g = rdflib.Graph()
    g.parse(rdf_file)
    # Explicit encoding so the query reads the same on every platform.
    with open(query_file, 'r', encoding='utf-8') as f:
        query = f.read()

    print(f"SPARQL QUERY IS:\n{query}")

    qres = g.query(query)

    bindings = pd.DataFrame(qres.bindings)
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class (not the instance) so a result column
    # that happens to be called "map" cannot confuse the lookup.
    if hasattr(pd.DataFrame, "map"):
        as_text = bindings.map(str)
    else:
        as_text = bindings.applymap(str)
    return as_text.rename(columns=str)

## SPARQL queries

List all the Docker images used in this workflow run.

In [10]:
provenance_file = cwd / "labels_wf_primary_cwlprov.ttl" # replace this with provenance file of full workflow run
extract_images_query = queries_dir / "docker_images.sparql"
response = run_query(provenance_file, extract_images_query)
# run_query returns a DataFrame; iterating it directly yields column labels
# (plain strings), so walk the rows as named tuples to keep `row.<variable>`
# attribute access working.
for row in response.itertuples(index=False):
    print(f"{row.step} used {row.image}")


arcp://uuid,a914217a-5cd2-457d-85cc-7472eeb17bfd/workflow/packed.cwl#main/generate_pc7 used amancevice/pandas:1.3.4-slim


List every entity that has a DOI.

In [11]:
provenance_file = cwd / 'data_ann_ex1_primary.cwlprov.ttl' # replace this with provenance file of full workflow run
extract_doi_query = queries_dir / 'dois.sparql'
response = run_query(provenance_file, extract_doi_query)
# run_query returns a DataFrame; iterate its rows (not its column labels)
# so that `row.doi` resolves to the value of the ?doi variable.
for row in response.itertuples(index=False):
    print(f"{row.doi}")

<doi>
<doi>


Extract the formats of all files for which a format is specified.

In [12]:
provenance_file = cwd / 'data_ann_ex2_primary.cwlprov.ttl'
extract_format_query = queries_dir / 'format.sparql'
response = run_query(provenance_file, extract_format_query)
# run_query returns a DataFrame; iterate its rows (not its column labels)
# so that `row.basename` / `row.format` resolve to the bound values.
for row in response.itertuples(index=False):
    print(f"{row.basename} has format {row.format}")

sabdab_summary_all_20220527.tsv has format https://www.iana.org/assignments/media-types/text/tab-separated-values
7mb7.cif has format http://edamontology.org/format_1477
7zxf.cif has format http://edamontology.org/format_1477


## Queries of emulated provenance file
Emulated RDF graph of epitope prediction workflow run.

In [16]:
workflow_provenance = cwd / 'niaa_wf_run/primary.cwlprov.ttl'
extract_citations_query = queries_dir / 'dois.sparql'
response = run_query(workflow_provenance, extract_citations_query)
# run_query returns a DataFrame; iterate its rows (not its column labels)
# so that `row.doi` resolves to the value of the ?doi variable.
for row in response.itertuples(index=False):
    print(f"{row.doi}")

Extract description of workflow run.

In [21]:
workflow_provenance = cwd / 'niaa_wf_run/primary.cwlprov.ttl'
extract_wf_desc_query = queries_dir / 'execution_desc.sparql'
response = run_query(workflow_provenance, extract_wf_desc_query)
# run_query returns a DataFrame; iterate its rows (not its column labels)
# so that `row.desc` resolves to the value of the ?desc variable.
for row in response.itertuples(index=False):
    print(f"Description of workflow run: {row.desc}")

Query is:
PREFIX s: <http://schema.org/>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#> 

SELECT ?desc

WHERE {
    ?id a wfprov:WorkflowRun .
    ?id s:description ?desc .
}
Description of workflow run: Demonstration run of epitope prediction workflow. Some steps are emulated, so the results of the workflow are not yet biologically meaningful.


Extract metadata of workflow.

In [33]:
workflow_provenance = cwd / 'niaa_wf_run/primary.cwlprov.ttl'
extract_wf_annot_query = queries_dir / 'wf_annotations.sparql'
response = run_query(workflow_provenance, extract_wf_annot_query)
# run_query returns a DataFrame; iterate its rows (not its column labels)
# so that `row.doc` / `row.intent` resolve to the bound values.
for row in response.itertuples(index=False):
    # Message typo fixed ("workflbow" -> "workflow").
    print(f"Doc field of workflow: {row.doc}\nIntent field of workflow: {row.intent}")

SPARQL QUERY IS:
PREFIX s: <http://schema.org/>
PREFIX wfdesc: <http://purl.org/wf4ever/wfdesc#> 

SELECT ?id ?doc ?intent

WHERE {
    ?id a wfdesc:Workflow .
    ?id s:description ?doc .
    ?id s:featureList ?intent .
}
Doc field of workflow: This workflow calculates input features and labels which are used to train a deep learning model for epitope prediction.
Intent field of workflow: http://edamontology.org/operation_2423
<rdflib.plugins.sparql.processor.SPARQLResult object at 0x7ff430be2ac0>


List all the steps of the workflow, with metadata.

In [54]:
workflow_provenance = cwd / 'niaa_wf_run/primary.cwlprov.ttl'
extract_wf_steps_query = queries_dir / 'wf_steps.sparql'
response = run_query(workflow_provenance, extract_wf_steps_query)
# pandas does not create missing parent directories, so make sure the
# output folder exists before writing the tab-separated result table.
results_dir = cwd / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
response.to_csv(results_dir / 'step_descriptions.tsv', sep='\t')


SPARQL QUERY IS:
PREFIX s: <http://schema.org/>
PREFIX wfdesc: <http://purl.org/wf4ever/wfdesc#> 

SELECT ?step_id ?doc ?label ?clt

WHERE {
    ?wf a wfdesc:Workflow .
    ?wf wfdesc:hasSubProcess ?step_id .
    OPTIONAL { ?step_id s:description ?doc . } .
    OPTIONAL { ?step_id s:name ?label . } .
    OPTIONAL { ?step_id wfdesc:hasSubProcess ?clt . } .
}


List all the input parameters of one particular step.

In [55]:
workflow_provenance = cwd / 'niaa_wf_run/primary.cwlprov.ttl'
extract_step_params_query = queries_dir / 'step_metadata.sparql'
response = run_query(workflow_provenance, extract_step_params_query)
# pandas does not create missing parent directories, so make sure the
# output folder exists before writing the tab-separated result table.
results_dir = cwd / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
response.to_csv(results_dir / 'step_params.tsv', sep='\t')

SPARQL QUERY IS:
PREFIX s: <http://schema.org/>
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX wfdesc: <http://purl.org/wf4ever/wfdesc#> 

SELECT DISTINCT ?step_input

WHERE {
    ?wf wfdesc:hasSubProcess <arcp://uuid,eb41f41c-d7b4-4999-9ce9-719fdc8c12b1/workflow/packed.cwl#main/combine_labels> .
    OPTIONAL { <arcp://uuid,eb41f41c-d7b4-4999-9ce9-719fdc8c12b1/workflow/packed.cwl#main/combine_labels> wfdesc:hasInput ?step_input } .
}
