### Bioremediation Use Case Prototype

Federating between: a pollutants list (possible sources: EU Open Data / Wikidata), IDSM, Rhea, UniProt, and OMA.



In [None]:
# helper functions

# first we install and import the dependencies; the SPARQL endpoints of each source, used later in the protocols, are defined right after
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON, CSV
import sys, os, time
import pandas as pd

# maximum number of result rows shown per query (passed to DataFrame.head in the cells below)
NUM_EXAMPLES = 10

# the endpoints must be defined as wrappers for executing SPARQL queries
# NOTE: each wrapper is a single module-level object that is reused; execute_query
# sets the query text and the return format on it before every request
sparql_Wikidata = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql_IDSM = SPARQLWrapper("https://idsm.elixir-czech.cz/sparql/endpoint/idsm")
sparql_Rhea = SPARQLWrapper("https://sparql.rhea-db.org/sparql/")
sparql_Uniprot = SPARQLWrapper("https://sparql.uniprot.org/sparql")
sparql_OMA = SPARQLWrapper("https://sparql.omabrowser.org/sparql/")

# function to convert the results of a SPARQL query into a table (pandas DataFrame)
def pretty_print(results):
    """Convert SPARQL JSON results into a pandas DataFrame.

    Parameters
    ----------
    results : dict
        A SPARQL JSON result document, i.e. a dict with a
        ``results["results"]["bindings"]`` list and, optionally, a
        ``results["head"]["vars"]`` list naming the projected variables.

    Returns
    -------
    pandas.DataFrame
        One row per binding and one column per variable. A variable that is
        unbound in a given row is stored as a missing value (``None``).
        When there are no bindings the frame is empty, but it still carries
        the projected variables as columns when the document declares them.
    """
    bindings = results["results"]["bindings"]

    # SPARQL JSON omits unbound variables from an individual binding, so the
    # keys of the first row are not a reliable header. Prefer the declared
    # projection ...
    header = list(results.get("head", {}).get("vars") or [])
    if not header:
        # ... and fall back to the ordered union of keys seen in any row.
        header = list(dict.fromkeys(key for entry in bindings for key in entry))

    if not bindings:
        return pd.DataFrame(columns=header)

    # Build a plain list of rows first; pandas then creates the frame in one go.
    table = [
        [entry[column]["value"] if entry.get(column) is not None else None for column in header]
        for entry in bindings
    ]
    return pd.DataFrame(table, columns=header)

def execute_query(sparql_endpoint, query):
    """Run ``query`` on ``sparql_endpoint`` and return the converted JSON results.

    ``sparql_endpoint`` is one of the endpoint wrappers defined above; it is
    configured in place (query text and JSON return format) before the
    request is sent.
    """
    # Configure the wrapper: query text first, then the result format.
    sparql_endpoint.setQuery(query)
    sparql_endpoint.setReturnFormat(JSON)

    # Send the request and hand back the converted response directly.
    return sparql_endpoint.query().convert()



### Wikidata query to retrieve the list of pollutants with their CAS numbers, to be used in IDSM:


In [None]:
# SPARQL text for the Wikidata endpoint. The inner SELECT returns, per use type and
# compound, the EC number, the CAS number and the average of the ?ld50 values,
# ordered by that average. The trailing "#limit 100" is a SPARQL comment inside the
# string, so no limit is applied.
query_Wikidata = """
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT * WHERE
{
  SERVICE <https://query.wikidata.org/sparql> {
   SELECT distinct ?use_type ?use_typeLabel ?compound ?compoundLabel ?ec_number  ?cas_number (avg(?ld50) as ?avg_ld50)
   { ?compound  wdt:P31 wd:Q113145171 ;
            wdt:P232 ?ec_number ;
            wdt:P231 ?cas_number ;
            wdt:P2240 ?ld50 ; # toxicity level - NOTE: not many data points have this info (only 9 results in total)
            wdt:P366 ?use_type .
     ?use_type  wdt:P279* wd:Q131656 .
     ?compound p:P2240 ?ref.
     ?ref pq:P636 wd:Q285166 .
     ?rats wdt:P279 wd:Q184224 .
     ?ref  pq:P689|pq:P2352 ?rats

    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],mul,en". } # Helps get the label in your language, if not, then default for all languages, then en language
    }
  group by ?use_type ?use_typeLabel ?compound ?compoundLabel ?ec_number  ?cas_number
  order by ?avg_ld50 # rank by toxicity
  }
}


#limit 100

"""

In [None]:
# Run the Wikidata query and show the first NUM_EXAMPLES rows of the resulting table.
results_Wikidata = execute_query(sparql_Wikidata, query_Wikidata)

pretty_print(results_Wikidata).head(NUM_EXAMPLES)

Unnamed: 0,compoundLabel,avg_ld50,use_type,ec_number,cas_number,use_typeLabel,compound
0,amitrole,5333.333333333333,http://www.wikidata.org/entity/Q178266,200-521-5,61-82-5,herbicide,http://www.wikidata.org/entity/Q423314
1,ammonium sulfamate,2952.4,http://www.wikidata.org/entity/Q178266,231-871-7,7773-06-0,herbicide,http://www.wikidata.org/entity/Q1014257
2,atrazine,1503.5454545454545,http://www.wikidata.org/entity/Q178266,217-617-8,1912-24-9,herbicide,http://www.wikidata.org/entity/Q408652
3,ANTU,709.5708333333333,http://www.wikidata.org/entity/Q924146,201-706-3,86-88-4,rodenticide,http://www.wikidata.org/entity/Q413532
4,aldrin,136.64074074074074,http://www.wikidata.org/entity/Q181322,206-215-8,309-00-2,insecticide,http://www.wikidata.org/entity/Q409054
5,benomyl,37.0,http://www.wikidata.org/entity/Q193237,241-775-7,17804-35-2,fungicide,http://www.wikidata.org/entity/Q420172
6,benomyl,37.0,http://www.wikidata.org/entity/Q50377184,241-775-7,17804-35-2,industrial fungicides,http://www.wikidata.org/entity/Q420172


Query to retrieve CAS compound labels from an RDF document stored in a Solid Pod. Query was executed using the [Comunica web client](https://query.comunica.dev/#datasources=https%3A%2F%2Ftriple.ilabt.imec.be%2Ftest%2Fbio-usecase%2Fnbn-chist-era-annex-1-chemicals.ttl&query=prefix%20sio%3A%20%3Chttp%3A%2F%2Fsemanticscience.org%2Fresource%2F%3E%0ASELECT%20%3FCAS%20WHERE%20%7B%0A%09%3Fs%20sio%3ASIO_000300%20%3FCAS%0A%7D).

In [None]:
# SPARQL text of the Solid Pod query, kept here for reference: it selects every
# value attached through sio:SIO_000300 (bound to ?CAS). No cell visible in this
# notebook executes it; per the note above it was run with the Comunica web client.
# A stray "[ ]" in front of the assignment made this cell a SyntaxError and has been removed.
query_SolidPod = """
    prefix sio: <http://semanticscience.org/resource/>

    SELECT ?CAS WHERE {
	    ?s sio:SIO_000300 ?CAS
    }

    """

### IDSM query to retrieve similar compounds:

In [None]:
# SPARQL text for the IDSM endpoint, made of two UNION branches:
#   1. each CAS number listed in VALUES is resolved to a vocab:Compound, and the
#      compound's molfile is used as the query of a Sachem similarity search
#      (cutoff 0.7) against the chebi service;
#   2. a Sachem substructure search for '[As]' against the same service.
# The second branch binds only ?SIMILAR_COMPOUND, so its rows have no ?CAS,
# ?COMPOUND or ?SCORE.
query_IDSM = """
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX vocab: <http://rdf.ncbi.nlm.nih.gov/pubchem/vocabulary#>
PREFIX sachem: <http://bioinfo.uochb.cas.cz/rdf/v1.0/sachem#>
PREFIX endpoint: <https://idsm.elixir-czech.cz/sparql/endpoint/>

SELECT ?CAS ?COMPOUND ?SIMILAR_COMPOUND ?SCORE WHERE {
  {
    VALUES ?CAS { "75-35-4" "71-55-6" "79-00-5" "630-20-6" "79-34-5" "106-93-4" "107-06-2" "542-75-6" "10061-01-5" "13952-84-6" "91-59-8" "553-00-4" "612-52-2" "120-23-0" "121-14-2" "93-76-5" "10519-33-2" "92-67-1" "2113-61-3" "92-93-3" "101-77-9" "81-15-2" "30560-19-1" "34256-82-1" "50594-66-6" "15972-60-8" "116-06-3" "834-12-8" "33089-61-1" "61-82-5" "84-65-1" "1332-21-4" "77536-66-4" "77536-67-5" "12172-73-5" "12001-28-4" "77536-68-6" "3337-71-1" "2302-17-2" "1912-24-9" "2642-71-9" "86-50-0" "41083-11-8" "71626-11-4" "82560-54-1" "17606-31-4" "71-43-2" "92-87-5" "36341-27-2" "85-68-7" "1820573-27-0" "65731-84-2" "82657-04-3" "485-31-4" }

    ?SYNONYM a sio:CHEMINF_000446. # CAS registry number
    ?SYNONYM sio:SIO_000300 ?CAS.
    ?SYNONYM sio:SIO_000011 ?COMPOUND.

    ?COMPOUND a vocab:Compound.

    ?ATTRIBUTE a sio:SIO_011120. # molecular structure file
    ?ATTRIBUTE sio:SIO_000011 ?COMPOUND.
    ?ATTRIBUTE sio:SIO_000300 ?MOLFILE.

    SERVICE endpoint:chebi {
      [ sachem:compound ?SIMILAR_COMPOUND;
        sachem:score ?SCORE ] sachem:similaritySearch [
        sachem:query ?MOLFILE;
        sachem:cutoff "0.7"^^xsd:double;
        sachem:similarityRadius '3'^^xsd:integer;
        sachem:aromaticityMode sachem:aromaticityDetectIfMissing;
        sachem:tautomerMode sachem:inchiTautomers
      ].
    }
  }
  UNION
  {
    SERVICE endpoint:chebi {
      ?SIMILAR_COMPOUND sachem:substructureSearch [ sachem:query '[As]' ].
    }
  }
}
ORDER BY DESC(?SCORE)
"""

In [None]:
# Run the IDSM query and show the first NUM_EXAMPLES rows of the resulting table.
results_IDSM = execute_query(sparql_IDSM, query_IDSM)

pretty_print(results_IDSM).head(NUM_EXAMPLES)

Unnamed: 0,CAS,COMPOUND,SIMILAR_COMPOUND,SCORE
0,92-67-1,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_1784,1.0
1,34256-82-1,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_2394,1.0
2,485-31-4,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_83371,1.0
3,485-31-4,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_83367,1.0
4,1820573-27-0,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_4034,1.0
5,1820573-27-0,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_39313,1.0
6,1820573-27-0,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_39309,1.0
7,1820573-27-0,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_39310,1.0
8,1820573-27-0,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_39312,1.0
9,121-14-2,http://rdf.ncbi.nlm.nih.gov/pubchem/compound/C...,http://purl.obolibrary.org/obo/CHEBI_920,1.0


### Query Rhea using the similar compounds (ChEBI identifiers) from IDSM

In [None]:
# SPARQL text for the Rhea endpoint: for the ChEBI compound given in VALUES, return
# every reaction (with its equation) in which the compound takes part.
# The projection previously also listed ?uniprot, but no pattern in the WHERE
# clause binds that variable, so it could never carry a value; it has been dropped.
query_Rhea = """
PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rh: <http://rdf.rhea-db.org/>
SELECT distinct ?chebi ?rhea ?equation
WHERE {
  VALUES (?chebi) { ( <http://purl.obolibrary.org/obo/CHEBI_15930> ) } # from IDSM
  ?rhea rdfs:subClassOf rh:Reaction .
  ?rhea rh:equation ?equation .
  ?rhea rh:side/rh:contains/rh:compound ?compound .
  #
  # the ChEBI can be used either as a small molecule, the reactive part of a macromolecule or as a polymer.
  #
  ?compound (rh:chebi|(rh:reactivePart/rh:chebi)|(rh:underlyingChebi/rh:chebi)) ?chebi . # ?chebi comes from IDSM higher
}
"""


In [None]:
# Run the Rhea query and show the first NUM_EXAMPLES rows of the resulting table.
results_Rhea = execute_query(sparql_Rhea, query_Rhea)

pretty_print(results_Rhea).head(NUM_EXAMPLES)

Unnamed: 0,chebi,rhea,equation
0,http://purl.obolibrary.org/obo/CHEBI_15930,http://rdf.rhea-db.org/11312,atrazine + H2O = hydroxyatrazine + chloride + ...


### Follow the link to UniProt (if needed; otherwise just use the cross-reference from the previous query)


In [None]:
# Federated SPARQL text, sent to the Rhea endpoint by the cell below: the SERVICE
# block asks the UniProt endpoint for reviewed entries whose catalytic-activity
# annotation points at one of the Rhea reactions listed in VALUES; the accession
# and equation of each reaction are then read outside the SERVICE block.
query_Rhea_link_to_Uniprot = """
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?uniprot ?mnemo ?rhea ?accession ?equation
WHERE {
  SERVICE <https://sparql.uniprot.org/sparql> {
    GRAPH <http://sparql.uniprot.org/uniprot> {
      VALUES (?rhea) { (<http://rdf.rhea-db.org/11312>) (<http://rdf.rhea-db.org/11313>) }
      ?uniprot up:reviewed true .
      ?uniprot up:mnemonic ?mnemo .
      ?uniprot up:organism ?taxid .
      ?uniprot up:annotation/up:catalyticActivity/up:catalyzedReaction ?rhea . # where ?rhea comes from query upwards
    }
  }
  ?rhea rh:accession ?accession .
  ?rhea rh:equation ?equation .
}
"""

In [None]:
# The federated query is sent to the Rhea endpoint (it reaches UniProt through its
# SERVICE clause); show the first NUM_EXAMPLES rows of the resulting table.
results_Rhea_Uniprot = execute_query(sparql_Rhea, query_Rhea_link_to_Uniprot)

pretty_print(results_Rhea_Uniprot).head(NUM_EXAMPLES)

Unnamed: 0,rhea,uniprot,mnemo,equation,accession
0,http://rdf.rhea-db.org/11312,http://purl.uniprot.org/uniprot/P72156,ATZA_PSESD,atrazine + H2O = hydroxyatrazine + chloride + ...,RHEA:11312


### Cross-reference to OMA to get organisms with orthologous genes


In [None]:
# SPARQL text for the OMA endpoint: starting from the UniProt entry given in VALUES,
# it looks for proteins reachable from a different member node of the same
# orth:OrthologsCluster and returns each such protein with its rdfs:seeAlso link
# and the scientific name attached to its organism.
query_OMA = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
PREFIX oma: <http://omabrowser.org/ontology/oma#>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX lscr: <http://purl.org/lscr#>

select ?protein2 ?OMA_LINK ?taxon_sci_name
where {

VALUES (?Uniprot_link) { ( <http://purl.uniprot.org/uniprot/P72156> ) } # here
#The three that contains Orthologs. The leafs are proteins.
#This graph pattern defines the relationship protein1 is Orthologs to protein2
?cluster a orth:OrthologsCluster.
?cluster orth:hasHomologousMember ?node1.
?cluster orth:hasHomologousMember ?node2.
?node2 orth:hasHomologousMember* ?protein2.
?node1 orth:hasHomologousMember* ?protein1.
########

#Specify the protein to look for its orthologs
?protein1 lscr:xrefUniprot ?Uniprot_link.
########

#The OMA link to the second protein
?protein2 rdfs:seeAlso ?OMA_LINK.
?protein2 orth:organism/obo:RO_0002162/up:scientificName ?taxon_sci_name.
########

filter(?node1 != ?node2)
}
"""

In [None]:
# Run the OMA query and show the first NUM_EXAMPLES rows of the resulting table.
results_OMA = execute_query(sparql_OMA, query_OMA)

pretty_print(results_OMA).head(NUM_EXAMPLES)