# Requêtes pour la recherche de workflows

In [12]:
from py2neo import *
import json
IP="bolt://localhost:7687"
USER="neo4j"
PASSWORD="pass"
graph_db = Graph(IP, auth=(USER, PASSWORD))

In [13]:
# Exemple de requête qui part de "Sequence set" pour aller vers un "Alignment"
# On cherche les chemins passant par au plus 4 relations (*1..4), tout en triant les realtions selon leur type (inputOf, outputOf)
# Enfin on trie les chemins pour éviter de passer 2 fois par la même fonction
graph_db.run("""
        MATCH path=(i:IO {term: 'Sequence set'})-[:inputOf|outputOf *1..4]->(o:IO {term: 'Alignment'})
        WHERE NONE (n IN nodes(path) WHERE size([x IN nodes(path) WHERE n = x]) > 1 )
        RETURN path
        LIMIT 10
    """)

path
"(_768)-[:inputOf {format: []}]->(_649)-[:outputOf {format: ['{""term"":""Stockholm format"",""uri"":""http://edamontology.org/format_1961""}']}]->(_770)"


In [82]:
def workflow_simple(entree, sortie, longueur_max, limit):
    request = f"""
        MATCH path=(i:IO {{term: '{inputTerm}'}})-[:inputOf|outputOf *1..{profMax}]->(o:IO {{term: '{outputTerm}'}})
        WHERE NONE (n IN nodes(path) WHERE size([x IN nodes(path) WHERE n = x]) > 1 )
        RETURN path AS p ORDER BY length(path) ASC
        LIMIT {limit}
    """
    return graph_db.run(request)

In [14]:
# Variante en précisant des parametres

inputTerm = "Sequence set"
outputTerm = "Alignment"
profMax = 5
limit = 100

print(workflow_simple(inputTerm, outputTerm, profMax, limit))

NameError: name 'workflow_simple' is not defined

In [None]:
# ajouter contrainte qui passe par un outil précis
# logiciels sur un système précis
# logiciels avec language précis
# associés à un terme d'ontologie précis
# mots clés pertinents associés au workflow ? naviguer dans l'arbre de l'ontologie ?
# Calcul du score du workflow ?
# longueur du workflow, termes en commun entre les outils

In [17]:
# Vérificateur de terme
def verif(term):
    res = graph_db.run(f"""MATCH (t) RETURN apoc.convert.toJson(t) AS Node LIMIT 10""").data()
    return res

print(verif("ADNet"))

[{'Node': '{"id":"0","type":"node","labels":["Tool"],"properties":{"owner":"Jennifer","description":"Xconnector is a software package designed to easily retrieve, and visualize metabolomics data from different database sources.","toolType":["Plug-in"],"license":"Other","elixirCommunity":[],"validated":0,"lastUpdate":"2021-10-05T21:35:11.947166Z","homepageStatus":0,"name":"Xconnector","elixirNode":[],"elixirPlatform":[],"additionDate":"2021-10-05T21:35:11.944560Z","collectionID":[],"confidenceFlag":"tool","elixirBadge":0,"homepage":"https://github.com/Proteomicslab57357/Xconnector"}}'}, {'Node': '{"id":"1","type":"node","labels":["Tool"],"properties":{"owner":"Jennifer","cost":"Free of charge","description":"xTea (comprehensive transposable element analyzer) is designed to identify TE insertions from paired-end Illumina reads, barcode linked-reads, long reads (PacBio or Nanopore), or hybrid data from different sequencing platforms and takes whole-exome sequencing (WES) or whole-genome s

# Recherche des données en plsu du resultat de la requête
# TODO aller chercher les relations également (même si elles sont bien vides pour l'instant)

In [20]:
def requete_format():
    return graph_db.run("""
// Première requête renvoyant le résultat sous forme d'une liste de chemins (juste les ID)
CALL {
    MATCH path=(i:IO {term: 'Sequence'})-[:inputOf|outputOf *1..5]->(o:IO {term: 'Alignment'})
    WHERE NONE (n IN nodes(path) WHERE size([x IN nodes(path) WHERE n = x]) > 1 )
    WITH apoc.path.elements(path) AS pathAsList
    WITH collect([x in pathAsList | ID(x)]) AS pathAsIdList
    RETURN pathAsIdList AS pathList
}
// Post traitement pour aller chercher les données en lien avec les outils proposés
CALL {
    WITH pathList // importation de la variable dans la sous requête
    UNWIND pathList AS pathAsIdList
    UNWIND pathAsIdList AS stepID // On parcourt tout les noeuds de chaque chemin
    MATCH (i)--(t:Tool)--(f:Function) WHERE ID(f)=stepID // on va chercher le noeud et ses infos, uniquement si on est bien sur une Function (et non un IO)

    // Plusieurs étapes en une ligne:
    // - On reparcours tout les noeuds trouvés
    // - Pour chaque on va chercher son ID et son label
    // - Puis on les ajoute dans un format de Json correct
    // - Enfin on met toutes ces infos dans une liste (définie par compréhension) (renommée data)
    // - on ajoute aussi l'ID pour la fonction utilisée et l'outil
    WITH [ n in collect(i) | { id:ID(n), label:labels(n)[0], properties: properties(n)}] AS data,
       { id:ID(f), label:labels(f)[0], properties: properties(f) } AS function,
       { id:ID(t), label:labels(t)[0], properties: properties(t) } AS tool

    return { function: function, tool: tool, data:  data } AS nodes
}
WITH collect(nodes) AS nodes, pathList
CALL {
    WITH pathList // importation de la variable dans la sous requête
    UNWIND pathList AS pathAsIdList
    UNWIND pathAsIdList AS stepID // On parcourt tout les noeuds de chaque chemin
    WITH collect( DISTINCT stepID) as filtered
    UNWIND filtered AS stepID
    MATCH (a)-[r]->(b) WHERE ID(r)=stepID // on va chercher la relation
    WITH ID(a) as s, ID(b) as e, r
    WITH { id:ID(r), start: s, end: e, properties: properties(r) } AS relationship
    RETURN relationship
}
    WITH collect(relationship) AS relationships, nodes, pathList
    RETURN { paths: pathList, nodes: nodes, relationships: relationships } AS result
    """)
res = requete_format().evaluate()
print(res)
print(json.dumps(res['datas'][0], indent=2))

None


TypeError: 'NoneType' object is not subscriptable

In [9]:
datas = res['nodes']
s = set()
t = set()
r = set()
cpt=0
for d in datas:
    if (s.__contains__(d['function']['id'])):
        print("Error, duplicated datas")
    else:
        s.add(d['function']['id'])
    if (t.__contains__(d['tool']['id'])):
        print("Error duplicated tool")
    else:
        t.add(d['tool']['id'])

print("Nb de fonctions : ", s.__len__())
print("Nb outils différents : ", t.__len__())
paths = res['paths']
sf = set()
cpt=0
for p in paths:
    for step in p:
        sf.add(step)

print("fin nb d'étapes différentes: ", sf.__len__())

rels = res['relationships']
for rel in rels:
    r.add(rel['id'])

print("Nb relations : ", r.__len__())

print(json.dumps(res, indent=2))


TypeError: 'NoneType' object is not subscriptable

In [6]:
def requete_format():
    return graph_db.run("""
// Première requête renvoyant le résultat sous forme d'une liste de chemins (juste les ID)
CALL {
    MATCH path=(i:IO {term: 'Sequence'})-[:inputOf|outputOf *1..5]->(o:IO {term: 'Alignment'})
    WHERE NONE (n IN nodes(path) WHERE size([x IN nodes(path) WHERE n = x]) > 1 )
    WITH apoc.path.elements(path) AS pathAsList
    WITH collect([x in pathAsList | ID(x)]) AS pathAsIdList
    RETURN pathAsIdList AS pathList
}
// Post traitement pour aller chercher les données en lien avec les outils proposés
CALL {
    WITH pathList // importation de la variable dans la sous requête
    UNWIND pathList AS pathAsIdList
    UNWIND pathAsIdList AS stepID // On parcourt tout les noeuds de chaque chemin
    MATCH (t:Tool)--(f:Function) WHERE ID(f)=stepID // on va chercher le noeud et ses infos, uniquement si on est bien sur une Function (et non un IO)

    // Plusieurs étapes en une ligne:
    // - On reparcours tout les noeuds trouvés
    // - Pour chaque on va chercher son ID et son label
    // - Puis on les ajoute dans un format de Json correct
    // - Enfin on met toutes ces infos dans une liste (définie par compréhension) (renommée data)
    // - on ajoute aussi l'ID pour la fonction utilisée et l'outil
    WITH { id:ID(f), label:labels(f)[0], properties: properties(f) } AS function,
       { id:ID(t), label:labels(t)[0], properties: properties(t) } AS tool, f
       

    CALL {
        WITH f 
        MATCH (f)-[i:inputOf|outputOf]-(io: IO) 
        WITH 
            { id:ID(io), label:labels(io)[0], properties: properties(io) } AS inOut,
            { id:ID(i), type:type(i), properties: properties(i)} AS inputOutput
        RETURN {inOut: inOut, inputOuput: inputOutput} AS io
    }
    
    WITH collect(io) AS iolist, function, tool

    return { function: function, tool: tool, io: iolist} AS nodes
}
    WITH collect(nodes) AS nodes, pathList
CALL {
    WITH pathList // importation de la variable dans la sous requête
    UNWIND pathList AS pathAsIdList
    UNWIND pathAsIdList AS stepID // On parcourt tout les noeuds de chaque chemin
    WITH collect( DISTINCT stepID) as filtered
    UNWIND filtered AS stepID
    MATCH (a)-[r]->(b) WHERE ID(r)=stepID // on va chercher la relation
    WITH ID(a) as s, ID(b) as e, r
    WITH { id:ID(r), start: s, end: e, properties: properties(r) } AS relationship
    RETURN relationship
}
    WITH collect(relationship) AS relationships, nodes, pathList
    RETURN { paths: pathList, nodes: nodes, relationships: relationships } AS result
    """)
res = requete_format().evaluate()
print(json.dumps(res, indent=2))

null


In [25]:
def requete_toolList(idList):
    query = """
    MATCH (t:Tool) WHERE ID(t) IN $idList
    WITH t, {} as res
CALL {
    WITH t
    MATCH (t)--(f:Function)
    CALL {
        WITH f
        MATCH (f)-[format :inputOf]-(io:IO)
        WITH collect({format: [form in format.format | apoc.convert.fromJsonMap(form)], data: properties(io)}) AS input, f
        MATCH (f)-[format :outputOf]-(io:IO)
        WITH collect({format: [form in format.format | apoc.convert.fromJsonMap(form)], data: properties(io)}) AS output, input, f
        MATCH (f)-[:doOperation]-(op:Operation)
        WITH collect(properties(op)) AS operation, output, input
        RETURN input, output, operation
    }
    WITH collect(f{.*, input:input, output:output, operation:operation}) AS function
    RETURN function
}
CALL {
    WITH t
    MATCH (t)-[pubID]-(pub:Publication)
    RETURN collect(pubID{.*, metadata: properties(pub)}) AS publication
}
CALL { WITH t MATCH (t)--(lang:Language) RETURN collect(properties(lang)) as language }
CALL { WITH t MATCH (t)--(link:Link) RETURN collect(properties(link)) as link }
CALL { WITH t MATCH (t)--(d:Download) RETURN collect(properties(d)) AS download }
CALL { WITH t MATCH (t)--(c:Credit) RETURN collect(properties(c)) AS credit }
CALL { WITH t MATCH (t)--(d:Documentation) RETURN collect(properties(d)) AS documentation}
CALL { WITH t MATCH (t)--(e:EditPermission) RETURN collect(properties(e)) AS editPermission}

WITH collect(t{.*, neo4jId: ID(t), function: function,
            link:link,
            language:language,
            publication:publication,
            download:download,
            credit:credit,
            documentation:documentation,
            editPermission:editPermission
            }) AS tools, res
UNWIND tools AS tool
    WITH apoc.map.setKey(res, apoc.convert.toString(tool['neo4jId']), tool) AS res
RETURN collect(res) AS toolMap
            """
    params = {}
    params['idList'] = idList
    return graph_db.run(query, parameters=params)

print(json.dumps(requete_toolList([161374, 161393]).data()[0], indent=2))

{
  "toolMap": [
    {
      "161374": {
        "accessibility": "Open access",
        "link": [],
        "description": "The tool offers a wide range of multivariate methods for the exploration and integration of biological datasets with a particular focus on variable selection.",
        "language": [
          {
            "name": "R"
          }
        ],
        "download": [
          {
            "type": "Binaries",
            "url": "https://cran.r-project.org/web/packages/mixOmics/"
          }
        ],
        "validated": 1,
        "function": [
          {
            "output": [
              {
                "data": {
                  "term": "Score",
                  "uri": "http://edamontology.org/data_1772"
                },
                "format": [
                  {
                    "term": "DSV",
                    "uri": "http://edamontology.org/format_3751"
                  }
                ]
              }
            ],
            "inpu