### Initialize a list of queries (size = num examples available)

##### These will be parsed to generate core queries + ways to augment then

In [1]:
# Bgee queries (with variants) from tutorial online: 
# https://www.bgee.org/support/tutorial-query-bgee-knowledge-graph-sparql#querying-with-controlled-vocabularies-and-identifiers

qs = [None] * 16 

qs[0] = []

qs[1] = ["""
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?species {
	?species a up:Taxon .}
    """]

qs[2] = ["""
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?species ?sci_name ?common_name {
	?species a up:Taxon .
	?species up:scientificName ?sci_name .
	?species up:rank up:Species .
	OPTIONAL { ?species up:commonName ?common_name .}
}
"""]

qs[3] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?anat .
	?seq rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
}

"""]

qs[4] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?anat.
	?seq rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
		### Specifying species:
		?seq orth:organism ?organism .
		?organism obo:RO_0002162  ?species .
		?species a up:Taxon .
		?species up:scientificName "Homo sapiens" .
}
"""]

qs[5] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?condition.
    ?condition a genex:ExpressionCondition.
	?seq rdfs:label "APOC1" .
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?condition genex:hasAnatomicalEntity obo:GO_0005575 .
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage ?stage .
    ?stage a efo:EFO_0000399 .
	?stage rdfs:label "life cycle" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
	?strain rdfs:label "wild-type" .
}
"""]

qs[6] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?stage {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?condition.
    ?condition a genex:ExpressionCondition.
	?seq rdfs:label "APOC1" .
	?condition genex:hasAnatomicalEntity ?anat .
	?anat rdfs:label ?anatName .
		?condition genex:hasAnatomicalEntity obo:GO_0005575 .
		?condition genex:hasDevelopmentalStage ?stage .
        ?stage a efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
		?strain rdfs:label "wild-type" .
		?seq orth:organism ?organism .
		?organism obo:RO_0002162  ?species .
		?species a up:Taxon .
		?species up:commonName "human" .
}
"""]

qs[7] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?stage {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?condition.
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
		?condition genex:hasAnatomicalEntity obo:GO_0005575 .
		?condition genex:hasDevelopmentalStage ?stage .
        ?stage a efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
		?seq orth:organism ?organism .
		?organism obo:RO_0002162  ?species .
		?species a up:Taxon .
		?species up:commonName "human" .
}
"""]

qs[8] = [None] * 3

qs[8][0] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName ?score ?stage {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
	?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
	?condition genex:hasAnatomicalEntity obo:GO_0005575 .
	?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage ?stage .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[8][1] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX efo: <http://www.ebi.ac.uk/efo/>

SELECT DISTINCT ?anat ?anatName ?score  {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq lscr:xrefEnsemblGene ensembl:ENSG00000130208 .
	?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
	?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage obo:UBERON_0000113 .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  up-taxon:9606 .
    FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[8][2] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?anat ?anatName ?score  {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq lscr:xrefEnsemblGene ensembl:ENSG00000130208 .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage obo:UBERON_0000113 .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[9] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?cellType ?anatName ?cellTypeName ?score ?stage {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasAnatomicalEntity ?cellType .
	?cellType rdfs:label ?cellTypeName .
	?condition genex:hasDevelopmentalStage ?stage .
	?stage rdfs:label "post-juvenile" .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
    FILTER (?anat != obo:GO_0005575)
    FILTER (?anat != ?cellType)
} ORDER BY DESC(?score)
"""]

qs[10] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?cellType ?anatName ?cellTypeName ?score ?stage {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq rdfs:label "APOC1" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
	?condition genex:hasAnatomicalEntity ?cellType .
	?cellType rdfs:label ?cellTypeName .
	?condition genex:hasDevelopmentalStage ?stage .
    ?stage rdf:type efo:EFO_0000399 .
	?stage rdfs:label "post-juvenile" .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:commonName "human" .
    FILTER (?anat !=  obo:GO_0005575)
    FILTER (?anat != ?cellType)
} ORDER BY DESC(?score)
"""]

qs[11] = [None] * 2

qs[11][0] = """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX efo: <http://www.ebi.ac.uk/efo/EFO_0000399>

SELECT DISTINCT ?stage ?stageName ?stageDescription {
        ?stage rdf:type efo:EFO_0000399 .
		?stage rdfs:label ?stageName .
		?stage dcterms:description ?stageDescription .
}
"""

qs[11][1] = """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX efo: <http://www.ebi.ac.uk/efo/EFO_0000399>

SELECT DISTINCT ?stage ?stageName ?stageDescription {
        ?stage rdf:type efo:EFO_0000399 . 
		?stage rdfs:label ?stageName .
		?stage dcterms:description ?stageDescription .
		FILTER (CONTAINS(?stageName,"post-juvenile"))
}
"""

qs[12] = [None] * 3

qs[12][0] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score  {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq lscr:xrefNCBIGene <https://www.ncbi.nlm.nih.gov/gene/118230125> .
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
	?anat rdfs:label ?anatName .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasDevelopmentalStage ?stageIRI .
    ?stageIRI rdf:type efo:EFO_0000399 . 
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
    FILTER (?anat != obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[12][1] = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score  {
	?seq a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?condition .
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
	?seq dcterms:identifier "118230125" .
    ?condition a genex:ExpressionCondition.
	?condition genex:hasAnatomicalEntity ?anat .
	?anat rdfs:label ?anatName .
	?condition genex:hasDevelopmentalStage ?stageIRI .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
    ?strain a efo:EFO_0005135.
	?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

qs[13] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up-protein:<http://purl.uniprot.org/uniprot/>
PREFIX lscr: <http://purl.org/lscr#>

SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene .
	?seq genex:isExpressedIn ?anat .
	?seq lscr:xrefUniprot up-protein:P02654 .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
}
"""]

qs[14] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?symbol ?description ?id
?links ?organism ?uniprot ?ensembl ?ncbi  {
    ?seq a orth:Gene .
    ?seq rdfs:label ?symbol .
    ?seq rdfs:seeAlso ?links .
    ?seq dcterms:description ?description .
    ?seq dcterms:identifier ?id .
    ?seq orth:organism ?organism .
    OPTIONAL{?seq lscr:xrefUniprot ?uniprot .}
    OPTIONAL{?seq lscr:xrefEnsemblGene ?ensembl .}
    OPTIONAL{?seq lscr:xrefNCBIGene ?ncbi .}
    FILTER (?id = "ENSG00000130208")
}
"""]

qs[15] = ["""
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?anat ?anatName {
	?seq a orth:Gene .
	?seq genex:isAbsentIn ?anat.
	?seq rdfs:label "APOC1" .
	?anat a genex:AnatomicalEntity .
	?anat rdfs:label ?anatName .
	?seq orth:organism ?organism .
	?organism obo:RO_0002162  ?species .
	?species a up:Taxon .
	?species up:scientificName "Homo sapiens" .
}
"""]

In [2]:
# also store NL questions corresponding to the queries

nlqs = [None] *16

nlqs[0] = ""
nlqs[1] = "What are the species present in Bgee?"
nlqs[2] = "What are the species present in Bgee and their scientific and common names?"
nlqs[3] = "What are the anatomical entities where the APOC1 gene is expressed?"
nlqs[4] = "What are the anatomical entities where the APOC1 Homo sapiens gene is expressed?"
nlqs[5] = "What are the anatomical entities where the APOC1 gene is expressed independently of the developmental stage, sex, strain and cell type?"
nlqs[6] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage?"
nlqs[7] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage?"
nlqs[8] = "What are the anatomical entities where the human gene APOC1 is expressed in the post-juvenile stage along with its expression score independently of the strain, sex, and cell type?"
nlqs[9] = "What are the anatomical entities including cell types, if any, where the human gene APOC1 is expressed at the post-juvenile stage along with its expression score independently of the strain and sex?"
nlqs[10] = "What are the anatomical entities including cell types, if any, where the human gene APOC1 is expressed at the post-juvenile stage along with its expression score independently of the strain and sex?"
nlqs[11] = "What are the developmental stages present in Bgee?"
nlqs[12] = "What are the anatomical entities where the eel gene apoc1 is expressed along with its expression score independently of the strain, sex, and cell type?"
nlqs[13] = "What are the anatomical entities where the P02654 gene is expressed? Note that P02654 is a UniProtKB identifier of the APOC1 human gene."
nlqs[14] = "What is all the metadata related to the ENSG00000130208 gene, where ENSG00000130208 is the identifier of the APOC1 human gene. "
nlqs[15] = "What are the anatomical entities where the APOC1 Homo sapiens gene is not expressed, that is where is APOC1 absent ?"

In [3]:
# 2. construct augmented SPARQL query

def construct_query(prefixes, vars_list, triples, optionals, filters):
    sparql_query = ''
    
    for prefix in prefixes :
        sparql_query += prefix + "\n"
    
    select_stmt = "SELECT DISTINCT "

    for var in vars_list:
        select_stmt += "?" + var + " "
            
    select_stmt += "WHERE "
    
    sparql_query += select_stmt + "{ \n"

    
    for triple in triples:
        sparql_query += "\t" + triple + "\n"
    
    for optional in optionals:
        sparql_query += "\t" + "OPTIONAL { " + optional + " } " + "\n"
        # here add also class...property label (if it exists)...or COMMENT for the property label e.g. #encoded by
     
    for filter in filters:
        sparql_query += "\t" + "FILTER( " + filter + " )" + "\n"
        
    sparql_query += "}"
    
    return sparql_query

In [4]:
# first we need to define the SPARQL endpoints of each source, to use later in the protocols
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON, CSV
import sys, os, time
import pandas as pd

NUM_EXAMPLES = 10
# always display full column results (don't truncate output)
pd.set_option('display.max_colwidth', -1)

# the endpoints must be defined as wrappers for executing SPARQL queries
sparql_OMA = SPARQLWrapper("https://sparql.omabrowser.org/sparql")
sparql_Bgee = SPARQLWrapper("https://www.bgee.org/sparql/")
sparql_RIKEN = SPARQLWrapper("https://knowledge.brc.riken.jp/sparql")

# function to print in a table results of a SPARQL query
def pretty_print(results):
    
    # how to transform SPARQL results into Pandas dataframes
    
    # get header (column names) from results
    header = results["results"]["bindings"][0].keys()

    # display table of results:
    table = []
    
    # the SPARQL JSON results to the query are available in the "results", "bindings" entry:
    for entry in results["results"]["bindings"]:
        # append entries from the results to a regular Python list of rows, which we can then transform to a Pandas DF
        row = [entry[column]["value"] if entry.get(column, None) != None else None for column in header]
        table.append(row)
    df = pd.DataFrame(table, columns=list(header))
    return df



ValueError: Value must be a nonnegative integer or None

In [6]:
def execute_query(sparql_endpoint, query):
    sparql_endpoint.setQuery(query)
    sparql_endpoint.setReturnFormat(JSON)

    results = sparql_endpoint.query().convert()
    return results

### TODOs
- identify parts in graph where the literal properties occur - assign comments with the appropriate labels to them
=> augmented set 1
- break down queries into more general ones (fewer triples)
- progressively add more triple patterns

How to? example: https://pypi.org/project/SPARQL-parser/ . Library doesn't work but we reuse code from it here.

In [7]:
import re

def get_core_triples(sparql_query):
    # separate prefixes
    # get triples as list of strings
    # NOTE: must parse OPTIONALS separately too
    # these should be handled separately, since we want to add them only as optionals
    query_string = sparql_query
    
    prefixes_string = query_string[:query_string.lower().find('select')]
    prefixes = prefixes_string.split("\n")
    prefixes = [prefix for prefix in prefixes if prefix != '']
    
    # get variables mentioned in query
    vars_list = set(re.findall(r'\?([a-z_0-9A-Z]+)', query_string))
    
    # split query by ". or ;"
    if query_string.find('{') != -1:  
        query_start = query_string.find('{') + 1
        query_end = query_string.rfind("}")
        
    query_string_core = query_string[query_start : query_end].replace("\t", "")

    # SPLIT triple patterns. assumption: "nicely written" queries, with 1 triple pattern per line
    triples = query_string_core.split("\n")
    
    triples = [triple.strip() for triple in triples if triple and not triple.isspace()]
    
    #print("Query fragments (triple patterns, OPTIONALs, FILTERs): \n{}".format(triples))
    
    optionals = []
    filters = []
    
    final_triples = []
    
    # also return FILTERs separately
    for triple in triples:
        triple = triple.strip()
        if(triple.startswith("}")):
            triple = triple[1:].strip()
        if(triple.lower().find("filter") != -1):
            matches = re.findall(r'\((.*?)\)',triple)
            for filter in matches:
                filters += [filter]
            continue
        if(triple.lower().startswith("optional")):
            optional_start = triple.find("{") + 1
            optionals += [triple[optional_start:triple.rfind("}")].strip()]
            continue
        final_triples += [triple]

    return [prefixes, vars_list, final_triples, optionals, filters]
    

query = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score  {
	?seq a orth:Gene .
		?expression a genex:Expression .
		?expression genex:hasExpressionCondition ?condition .
		?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
		?seq dcterms:identifier "118230125" .
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
		?anat rdfs:label ?anatName .
        ?condition a genex:ExpressionCondition.
		?condition genex:hasDevelopmentalStage ?stageIRI .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
		?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)
"""

get_core_triples(query)

[['PREFIX orth: <http://purl.org/net/orth#>',
  'PREFIX genex: <http://purl.org/genex#>',
  'PREFIX obo: <http://purl.obolibrary.org/obo/>',
  'PREFIX up: <http://purl.uniprot.org/core/>',
  'PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>',
  'PREFIX lscr: <http://purl.org/lscr#>',
  'PREFIX dcterms: <http://purl.org/dc/terms/>'],
 {'anat',
  'anatName',
  'condition',
  'expression',
  'score',
  'seq',
  'stageIRI',
  'strain'},
 ['?seq a orth:Gene .',
  '?expression a genex:Expression .',
  '?expression genex:hasExpressionCondition ?condition .',
  '?expression genex:hasExpressionLevel ?score .',
  '?expression genex:hasSequenceUnit ?seq .',
  '?seq dcterms:identifier "118230125" .',
  '?condition genex:hasAnatomicalEntity ?anat .',
  '?anat a genex:AnatomicalEntity.',
  '?anat rdfs:label ?anatName .',
  '?condition a genex:ExpressionCondition.',
  '?condition genex:hasDevelopmentalStage ?stageIRI .',
  '?condition genex:hasSex "any".',
  '?condition genex:hasStrain ?strain .',

In [8]:
def extract_name_from_URI(Uri):
    if(not Uri.startswith("http")): # defined via a PREFIX statement
        return Uri[Uri.find(":") + 1 : ]
    fragmentName = urlparse(Uri).fragment
    if fragmentName:
        return fragmentName
    else:
        return Uri[Uri.rfind("/") + 1 : ]

Note: Simplifying Assumption: the variable must have a type so that it can be augmented with a description

In [9]:
# How to augment? 
from urllib.parse import urlparse, urldefrag, parse_qs

# parse file with extra text properties to include in the query

import pandas as pd
df_extra_props = pd.read_csv ('extra_props_bgee.csv')
df_extra_props = df_extra_props.drop_duplicates()
print(df_extra_props.head(3))
classNameURIs = df_extra_props['class'].drop_duplicates().tolist()

# build a json from className to label, props_List : {URI, label}
classNamesProps = {}

for classNameURI in classNameURIs:
    className = extract_name_from_URI(classNameURI)
    propNames = df_extra_props[df_extra_props['class']== classNameURI]
    classNamesProps[className] = {}
    classNamesProps[className]["classLabel"] = set(propNames["classLabel"].tolist())
    classNamesProps[className]["propNamesLabels"] = {x["property"] : x["propertyLabel"] for index, x in propNames.iterrows()}
    
print(classNamesProps)

                           class classLabel  \
0  http://purl.org/net/orth#Gene       Gene   
1  http://purl.org/net/orth#Gene       Gene   
2  http://purl.org/net/orth#Gene       Gene   

                                     property propertyLabel  
0  http://www.w3.org/2000/01/rdf-schema#label         label  
1         http://purl.org/dc/terms/identifier    Identifier  
2        http://purl.org/dc/terms/description   Description  
{'Gene': {'classLabel': {'gene', 'Gene'}, 'propNamesLabels': {'http://www.w3.org/2000/01/rdf-schema#label': 'label', 'http://purl.org/dc/terms/identifier': 'Identifier', 'http://purl.org/dc/terms/description': 'Description', 'http://www.w3.org/2000/01/rdf-schema#seeAlso': 'see also'}}, 'AnatomicalEntity': {'classLabel': {'Anatomical entity'}, 'propNamesLabels': {'http://purl.org/dc/terms/description': 'Description', 'http://www.w3.org/2000/01/rdf-schema#label': 'label'}}, 'AbsenceExpression': {'classLabel': {'absence of gene expression'}, 'propNamesLabels':

In [10]:
def rename_vars_meaningfully(query):
    # here, idea is to test whether meaningful var names make a difference
    # e.g. if we refer to a Gene as ?gene as opposed to ?x
    # assumption: vars have classes defined in the original query text
    [prefixes, vars_list, triples, optionals, filters] = get_core_triples(query)

    replacements = {}
    
    # get all class declarations and match var name with expected name
    for triple in triples:
        s_p_o = triple.replace(".", "").split(" ")[:3]
        if(s_p_o[1] == "a" or s_p_o[1] == "rdf:type"): # here, consider multiple cases, e.g. rdf:type etc
            var_name = s_p_o[0]
            className = extract_name_from_URI(s_p_o[2])
            if(var_name[1:].lower != className.lower()):
                # replace var name with className everywhere
                replacements[var_name] = "?" + className.lower()

    # reconstruct query and replace all occurences of old var names with the new ones
    reconstructed = construct_query(prefixes, vars_list, triples, optionals, filters)

    for varName in replacements.keys():
        reconstructed = reconstructed.replace(varName, replacements[varName])
    
    return reconstructed
        

test_query = """
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
SELECT DISTINCT ?anatomicalentity ?anatomicalentityName ?efo_343_4343 ?gene ?gene_seealso WHERE { 
	?gene a orth:Gene .
	?gene genex:isExpressedIn ?anatomicalentity .
	?gene rdfs:label "APOC1" .
	?anatomicalentity a genex:AnatomicalEntity .
	?anatomicalentity rdfs:label ?anatomicalentityName .
	?anatomicalentity <http://www.w3.org/2000/01/rdf-schema#label> ?anatomicalentity_label.
	?anatomicalentity <http://purl.org/dc/terms/description> ?anatomicalentity_description.
	?gene <http://www.w3.org/2000/01/rdf-schema#seeAlso> ?gene_seealso.
}
"""
vars_renamed_query = rename_vars_meaningfully(test_query)
print(vars_renamed_query)

#extract_name_from_URI("orth:Gene")
#extract_name_from_URI("http://purl.org/net/orth#Gene")
#extract_name_from_URI("http://purl.org/net/orth/Gene") 

PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
SELECT DISTINCT ?anatomicalentity ?anatomicalentityName ?efo_343_4343 ?anatomicalentity_label ?gene ?anatomicalentity_description ?gene_seealso WHERE { 
	?gene a orth:Gene .
	?gene genex:isExpressedIn ?anatomicalentity .
	?gene rdfs:label "APOC1" .
	?anatomicalentity a genex:AnatomicalEntity .
	?anatomicalentity rdfs:label ?anatomicalentityName .
	?anatomicalentity <http://www.w3.org/2000/01/rdf-schema#label> ?anatomicalentity_label.
	?anatomicalentity <http://purl.org/dc/terms/description> ?anatomicalentity_description.
	?gene <http://www.w3.org/2000/01/rdf-schema#seeAlso> ?gene_seealso.
}


In [11]:
def rename_vars_randomly(query):
    # here, idea is to test whether meaningful var names make a difference
    # we create a function to replace all var names with random ones
    
    [prefixes, vars_list, triples, optionals, filters] = get_core_triples(query)
    
    replacements = {}
    
    var_index = 0
    
    # get all class declarations and match var name with expected name
    for triple in triples:
        s_p_o = triple.replace(".", "").split(" ")[:3]
        if(s_p_o[0].startswith("?")):
            var_name = s_p_o[0] 
            if(replacements.get(var_name) is None):
                replacements[var_name] = "?x" + str(var_index)
                var_index += 1
        if(s_p_o[1].startswith("?")):
            var_name = s_p_o[1]
            if(replacements.get(var_name) is None):
                replacements[var_name] = "?x" + str(var_index)
                var_index += 1
        if(s_p_o[2].startswith("?")):
            var_name = s_p_o[2]
            if(replacements.get(var_name) is None):
                replacements[var_name] = "?x" + str(var_index)
                var_index += 1

    # reconstruct query and replace all occurences of old var names with the new ones
    reconstructed = construct_query(prefixes, vars_list, triples, optionals, filters)

    for varName in replacements.keys():
        reconstructed = reconstructed.replace(varName, replacements[varName])
    
    return reconstructed

random_query = rename_vars_randomly(query)
print(random_query)

PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>
SELECT DISTINCT ?x4Name ?x3 ?x1 ?x2 ?x0 ?x7 ?x6 ?x4 WHERE { 
	?x0 a orth:Gene .
	?x1 a genex:Expression .
	?x1 genex:hasExpressionCondition ?x2 .
	?x1 genex:hasExpressionLevel ?x3 .
	?x1 genex:hasSequenceUnit ?x0 .
	?x0 dcterms:identifier "118230125" .
	?x2 genex:hasAnatomicalEntity ?x4 .
	?x4 a genex:AnatomicalEntity.
	?x4 rdfs:label ?x4Name .
	?x2 a genex:ExpressionCondition.
	?x2 genex:hasDevelopmentalStage ?x6 .
	?x2 genex:hasSex "any".
	?x2 genex:hasStrain ?x7 .
	?x7 rdfs:label "wild-type" .
	FILTER( ?x4 !=  obo:GO_0005575 )
}


In [12]:
# Task I: Augment queries with 1 more hop (e.g. a label/comment/description triple pattern)

# 1. get fragment from each Class URI in the list above, e.g. "gene"
# Simplifying assumption: every fragment is unique (e.g. there are no 2 different Gene classes 
# with distinct namespaces in the same query) - otherwise we would need to parse the prefixes in the query
# 2. look for triple with the fragment in the query text. if exists, get variable name. e.g. "?gene a orth:Gene."
# 3. attach triple pattern with the extra property to the corresponding variable
# 4. Generate corresponding NL question deterministically 
# future work: improve using e.g. ChatGPT? OpenAI API?

def augment_query_1_hop(query, question):
    # first rename all var names to meaningful ones to make it easy to check if we can add more props
    renamed_query = rename_vars_meaningfully(query)
    
    # parse triples from query
    [prefixes, vars_list, triples, optionals, filters] = get_core_triples(renamed_query)
    
    augment_with_triple = None
    
    augmented_questions = []
    augmented_queries = []
    
    already_added = set()
    
    # iterate through type declarations in query to check whether new datatype properties can be added
    for triple in triples:
        s_p_o = triple.replace(".", "").split(" ")[:3]
        if(s_p_o[1] == "a" or s_p_o[1] == "rdf:type"): # here, consider multiple cases, e.g. rdf:type etc
            
            var_name = s_p_o[0] 
            classNameQuery = extract_name_from_URI(s_p_o[2])
            
            # iterate through list of known properties for this className and see if triple can be added
            # put the varName to be the label of the property wihtout spaces
            for className in classNamesProps.keys():
                if(classNameQuery.lower() == className.lower()):
                    propNamesLabels = classNamesProps[className]["propNamesLabels"]
                    # iterate through all properties attached to this class
                    # all of these can be used to generate 1 extra variant of the query
                    for propName in propNamesLabels.keys():
                        propNameLabel = classNamesProps[className]["propNamesLabels"][propName]
                        propNameLabel = propNameLabel.replace(" ", "").replace("-", "_").lower()
                        propLabelVarName = className.lower() + "_" + propNameLabel
                        to_add_triple = var_name + " " + "<" + propName + ">" + " " + "?" + propLabelVarName + "."
                        if(to_add_triple not in set(triples) and to_add_triple not in already_added):
                            classNameText = list(classNamesProps[className]["classLabel"])[0].lower()
                            propNameText = classNamesProps[className]["propNamesLabels"][propName].lower()
                            if(question.find("Provide") != -1):
                                augmented_question = question[:-1] + " and the " + classNameText + " " + propNameText + "."
                            else:    
                                augmented_question = question + " Provide also the " + classNameText + " " + propNameText + "."
                            augmented_questions += [augmented_question]
                            augmented_queries += [construct_query(prefixes, list(vars_list)+[propLabelVarName], triples+[to_add_triple], optionals, filters)]
                            already_added.add(to_add_triple)
                            
    if(len(augmented_queries) == 0):
        # we added all properties possible
        return [None, None]
    
    return [list(augmented_questions), list(augmented_queries)]
    
    

In [13]:
print("Before: " + query)
[aug_questions, augmented_queries] = augment_query_1_hop(query, nlqs[12])

print("Augmented Question: ")
for question in aug_questions:
    print(question)
    
print("Augmented: " + str(len(augmented_queries)))
for query in augmented_queries:
    print(query)
    print("************")

Before: 
PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT DISTINCT ?anat ?anatName ?stageIRI ?score  {
	?seq a orth:Gene .
		?expression a genex:Expression .
		?expression genex:hasExpressionCondition ?condition .
		?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?seq .
		?seq dcterms:identifier "118230125" .
	?condition genex:hasAnatomicalEntity ?anat .
    ?anat a genex:AnatomicalEntity.
		?anat rdfs:label ?anatName .
        ?condition a genex:ExpressionCondition.
		?condition genex:hasDevelopmentalStage ?stageIRI .
	?condition genex:hasSex "any".
	?condition genex:hasStrain ?strain .
		?strain rdfs:label "wild-type" .
FILTER (?anat !=  obo:GO_0005575)
} ORDER BY DESC(?score)

Augmented Questio

In [14]:
# Task II: Label existing query with comments (e.g. describing property or class names)

# first we need the object properties labels
df_labels_props = pd.read_csv ('object_properties_labels.csv')
df_labels_props = df_labels_props.drop_duplicates()
print(df_labels_props.head(3))

print(df_labels_props[df_labels_props['propertyURI']== "http://omabrowser.org/ontology/oma#domain"]["label"].tolist()[0])

                                         propertyURI                label
0  http://bioontology.org/ontologies/biositemap.o...  version information
1        http://purl.obolibrary.org/obo/CDAO_0000190           belongs_to
2          http://omabrowser.org/ontology/oma#domain               domain
domain


In [15]:
def augment_query_with_comments(query):
    [prefixes, vars_list, triples, optionals, filters] = get_core_triples(query)
    
    final_triples = [] 
    for triple in triples:
        # get the property name and add the property label as an inline comment to the triple
        s_p_o = triple.replace(".", "").split(" ")[:3]
      
        if(not s_p_o[1].startswith("?")):
            propUri = s_p_o[1]
            propName = extract_name_from_URI(propUri).lower()
            for propertyURI in df_labels_props['propertyURI'].tolist():
                if(extract_name_from_URI(propertyURI).lower() == propName):
                    triple += " # " + df_labels_props[df_labels_props['propertyURI'] == propertyURI]['label'].tolist()[0]
        final_triples += [triple]
                    
    return construct_query(prefixes, vars_list, final_triples, optionals, filters)

res = augment_query_with_comments(query)
print(res)

PREFIX orth: <http://purl.org/net/orth#>
PREFIX genex: <http://purl.org/genex#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX up-taxon:<http://purl.uniprot.org/taxonomy/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX dcterms: <http://purl.org/dc/terms/>
SELECT DISTINCT ?expressioncondition ?anatomicalentity ?expression ?score ?anatomicalentityName ?strain ?stageIRI ?anatomicalentity_label ?gene WHERE { 
	?gene a orth:Gene .
	?expression a genex:Expression .
	?expression genex:hasExpressionCondition ?expressioncondition . # has an expression condition
	?expression genex:hasExpressionLevel ?score .
	?expression genex:hasSequenceUnit ?gene . # has sequence unit
	?gene dcterms:identifier "118230125" .
	?expressioncondition genex:hasAnatomicalEntity ?anatomicalentity . # has anatomical entity
	?anatomicalentity a genex:AnatomicalEntity.
	?anatomicalentity rdfs:label ?anatomicalentityName .
	?expressioncondition a genex:ExpressionCondition.
	?e

# Next, generate 3 sets of queries:
 1. random variable names
 2. meaningful variable names
 3. meaningful variable names + inline comments for property names

In [16]:
# iterate through existing queries, progressively augment with 1 hop
# create 4 folders:
# 0. original queries (with 1-hop augments)
# 1. random_vars
# 2. meaningful_vars
# 3. meaningful_vars + comments

In [79]:
printed = False

index_main_query = 0

generated_variants = 0

for queries in qs: # iterate through queries, e.g. from 1 to 15
    for query in queries: # iterate through variants, e.g. 8a 8b etc
     if(query):
        index_variant = 0
        
        index_augment_variant = 0
        
        # write original question to file
        
        current_question = nlqs[index_main_query]
        with open("dataset/questions/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".txt", "w") as f:
            f.write(current_question)
        
        # write original query to file
        with open("dataset/original_augmented/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
            f.write("# " + current_question + "\n")
            f.write(query)
        
        with open("dataset/random_vars/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
            f.write("# " + current_question + "\n")
            f.write(rename_vars_randomly(query))
        
        with open("dataset/meaningful_vars/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
            f.write("# " + current_question + "\n")
            f.write(rename_vars_meaningfully(query))
            
        with open("dataset/meaningful_vars_comments/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
            f.write("# " + current_question + "\n")
            f.write(augment_query_with_comments(rename_vars_meaningfully(query)))
        
        
        # augment each query with 1 hop iteratively until no more additions are possible
        [augmented_questions, augmented_queries] = augment_query_1_hop(query, nlqs[index_main_query])
            
        while(augmented_queries != None): # iterate through augmentations
        
            for augmented_query in augmented_queries:
                index_augment_variant += 1
                generated_variants += 1
                if(generated_variants % 100 == 0):
                    print("Generated...{} variants".format(generated_variants))
                    
                current_question = augmented_questions[augmented_queries.index(augmented_query)]
                    
                with open("dataset/questions/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".txt", "w") as f:
                    f.write(current_question)
                    
                with open("dataset/original_augmented/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(augmented_query)
        
                with open("dataset/random_vars/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(rename_vars_randomly(augmented_query))
                
                with open("dataset/meaningful_vars/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(rename_vars_meaningfully(augmented_query))
                
                with open("dataset/meaningful_vars_comments/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(augment_query_with_comments(rename_vars_meaningfully(augmented_query)))
                    
            [augmented_questions, augmented_queries] = augment_query_1_hop(augmented_query, augmented_questions[augmented_queries.index(augmented_query)])

        index_variant += 1
            
    index_main_query += 1
    if(index_main_query == len(nlqs) - 1):
        break
    
    

Generated...100 variants
Generated...200 variants
Generated...300 variants
Generated...400 variants
Generated...500 variants
Generated...600 variants


In [18]:
printed = False

index_main_query = 0

generated_variants = 0

for queries in qs: # iterate through queries, e.g. from 1 to 15
    for query in queries: # iterate through variants, e.g. 8a 8b etc
     if(query):
        index_variant = 0
        
        index_augment_variant = 0
        
        # write original question to file
        
        current_question = nlqs[index_main_query]
        
        with open("dataset/new_original_augmented/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(query)
            
        with open("dataset/original_with_comments/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
            f.write("# " + current_question + "\n")
            f.write(augment_query_with_comments(query))
        
        
        # augment each query with 1 hop iteratively until no more additions are possible
        [augmented_questions, augmented_queries] = augment_query_1_hop(query, nlqs[index_main_query])
            
        while(augmented_queries != None): # iterate through augmentations
        
            for augmented_query in augmented_queries:
                index_augment_variant += 1
                generated_variants += 1
                if(generated_variants % 100 == 0):
                    print("Generated...{} variants".format(generated_variants))
                
                current_question = augmented_questions[augmented_queries.index(augmented_query)]
                    
                with open("dataset/new_original_augmented/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(augmented_query)
                
                with open("dataset/original_with_comments/query_"+ str(index_main_query) + "_" + str(index_variant) + "_" + str(index_augment_variant) + ".rq", "w") as f:
                    f.write("# " + current_question + "\n")
                    f.write(augment_query_with_comments(augmented_query))
                    
            [augmented_questions, augmented_queries] = augment_query_1_hop(augmented_query, augmented_questions[augmented_queries.index(augmented_query)])

        index_variant += 1
            
    index_main_query += 1
    if(index_main_query == len(nlqs) - 1):
        break

Generated...100 variants
Generated...200 variants
Generated...300 variants
Generated...400 variants
Generated...500 variants
Generated...600 variants
