# Cypher Queries for Determining Regulatory Paths
*Núria Queralt Rosinach, Andrew Su*

**Queries for the Neo4j guide online**

## Overview
NGLY1 - AQP1 **regulatory review** (*NGLY1 v3.1*)

## Servers 

    * Local: bolt://kylo.scripps.edu:7689
    * AWS: bolt://52.87.232.110:7689

### Imports

In [16]:
import os

from neo4j.v1 import GraphDatabase, basic_auth
import pandas as pd
# Show complete result tables: no limit on the number of columns or rows.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# Do not truncate long cell contents (e.g. provenance sentences).
# pandas >= 1.0 expects None for "unlimited"; the historical -1 is deprecated
# there and rejected by later releases, while older releases only accept ints.
try:
    pd.set_option('display.max_colwidth', None)  # or 199
except ValueError:
    pd.set_option('display.max_colwidth', -1)  # legacy pandas (< 1.0)

### Functions

In [17]:
def runQuery( driver, query ):
    '''
    This function runs the query onto the database and returns its records.

    The records are fetched while the session is still open, so the returned
    value stays usable after the session has been closed.
        in: driver - object whose session() returns a context manager exposing run()
            query  - cypher query string
        out: list of the records yielded by session.run(query), in result order
    '''

    with driver.session() as session:
        # Materialize inside the ``with`` block: a result cursor may no longer
        # be readable once the session that produced it has been closed.
        return list(session.run(query))


def parseNode( node ):
    '''
    This function parses the information gathered in the node data structure object resulting after querying neo4j.
        in: node neo4j object exposing ``id``, ``labels`` and ``properties``
        out: node as dict with keys idx, type, id, preflabel, name, description

    Only the first label is kept as the node type. A property that is absent
    from ``node.properties`` raises KeyError.
    '''

    n = {
        "idx": int(node.id),
        "type": list(node.labels)[0],
    }
    for key in ("id", "preflabel", "name", "description"):
        n[key] = str(node.properties[key])

    return n


def _parseEdge( edge ):
    '''
    This function parses one relationship of a path into a plain dict.
        in: relationship neo4j object exposing ``id``, ``start``, ``end``, ``type`` and ``properties``
        out: edge as dict with keys idx, start_node, end_node, type and the
             provenance properties copied as strings

    A property that is absent from ``edge.properties`` raises KeyError.
    '''

    e = {
        'idx': int(edge.id),
        'start_node': int(edge.start),
        'end_node': int(edge.end),
        'type': str(edge.type),
    }
    for key in ('property_label', 'property_uri', 'reference_uri',
                'reference_date', 'reference_supporting_text'):
        e[key] = str(edge.properties[key])

    return e


def parsePath( path ):
    '''
    This function parses the information gathered in the path data structure object resulting after querying neo4j.
        in: path record neo4j object; the path itself is read from its 'path' key
        out: path as dict {'Nodes': [node dicts], 'Edges': [edge dicts]}

    Nodes are converted with parseNode so that both functions always produce
    the same node representation.
    '''

    graph = path['path']
    return {
        'Nodes': [parseNode(node) for node in graph.nodes],
        'Edges': [_parseEdge(edge) for edge in graph.relationships],
    }

### Initialize neo4j

In [18]:
# Connection settings are read from the environment so that hosts and
# credentials can be changed without editing (or committing) them in the
# notebook. The fallbacks reproduce the original local-server configuration.
# To use the AWS server listed under "Servers", set
# NEO4J_URI=bolt://52.87.232.110:7689 before starting the kernel.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://kylo.scripps.edu:7689")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "xena")

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
# NOTE(review): the original AWS alternative connected without credentials:
#driver = GraphDatabase.driver("bolt://52.87.232.110:7689")

In [None]:
# Question
## Query topology graph
## Table of summary
## Graph of paths
## Explore paths=> executable cypher query

# NETWORK HYPOTHESES

### Open query to check whether NGLY1 and AQP1 are connected => metapaths
* Are associated at expression level?
    * Are connected through the human transcriptome? -imposing **ONLY** rna genes-

In [7]:
%%time
# Count the simple paths (no node visited twice) that leave the source gene
# (HGNC:17646; presumably NGLY1 -- confirm) through an outgoing `RO:0002434`
# edge into a GENE node and then reach the target gene (HGNC:633; presumably
# AQP1 -- confirm) in 1 to 3 further hops of any type and direction.
# Filters: no node of the path may carry one of the listed generic preflabels,
# and the first edge's reference_supporting_text must contain "freeze"
# (case-insensitive).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results: the aggregate query yields one row with the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 2349447
CPU times: user 13.7 ms, sys: 5.2 ms, total: 18.9 ms
Wall time: 6min 47s


**Metapaths:**

In [8]:
%%time
# Same MATCH and filters as the path count above (first edge `RO:0002434` with
# "freeze" provenance, then 1 to 3 hops of any kind, no generic nodes), but the
# paths are grouped by metapath: the sequence of edge types, the sequence of
# node labels and the path length. Each group reports its number of paths;
# rows are ordered by length, then by path count (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*'

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per metapath row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 20.3 ms, sys: 5.65 ms, total: 25.9 ms
Wall time: 7min 1s


In [9]:
# Display the metapath summary table built in the previous cell.
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,2,"[[GENE], [GENE], [GENE]]",7,"[RO:0002434, RO:0002434]"
1,3,"[[GENE], [GENE], [GENE], [GENE]]",5120,"[RO:0002434, RO:0002434, RO:0002434]"
2,3,"[[GENE], [GENE], [ANAT], [GENE]]",2564,"[RO:0002434, RO:0002206, RO:0002206]"
3,3,"[[GENE], [GENE], [PHYS], [GENE]]",936,"[RO:0002434, BFO:0000050, BFO:0000050]"
4,3,"[[GENE], [GENE], [PHYS], [GENE]]",504,"[RO:0002434, RO:0002331, RO:0002331]"
5,3,"[[GENE], [GENE], [GENE], [GENE]]",162,"[RO:0002434, RO:HOM0000011, RO:0002434]"
6,3,"[[GENE], [GENE], [PHYS], [GENE]]",110,"[RO:0002434, RO:0002327, RO:0002327]"
7,3,"[[GENE], [GENE], [GENE], [GENE]]",80,"[RO:0002434, RO:0002325, RO:0002434]"
8,3,"[[GENE], [GENE], [DISO], [GENE]]",64,"[RO:0002434, RO:0002607, RO:0002607]"
9,3,"[[GENE], [GENE], [GENE], [GENE]]",30,"[RO:0002434, RO:0002434, RO:HOM0000011]"


* Are associated at expression level?
    * Are connected through the human transcriptome? -imposing rna genes **AND** regulatory edges-

In [11]:
%%time
# Count the simple paths from the source gene (HGNC:17646; presumably NGLY1 --
# confirm) to the target gene (HGNC:633; presumably AQP1 -- confirm) whose
# first edge is an outgoing `RO:0002434` edge with "freeze" provenance into a
# GENE node, followed by 1 to 3 hops of any type and direction.
# In addition to excluding the listed generic nodes, at least one edge of the
# path must mention "tftargets" or "msigdb" in its reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results: the aggregate query yields one row with the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 932233
CPU times: user 14.5 ms, sys: 5.29 ms, total: 19.8 ms
Wall time: 7min 52s


**Metapaths:**

In [12]:
%%time
# Metapath breakdown of the paths that combine both constraints: the first edge
# is an outgoing `RO:0002434` edge with "freeze" provenance, and at least one
# edge of the path mentions "tftargets" or "msigdb" in its provenance text.
# Paths are grouped by edge-type sequence, node-label sequence and length;
# rows are ordered by length, then by path count (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per metapath row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 19.6 ms, sys: 4.6 ms, total: 24.2 ms
Wall time: 8min 10s


In [13]:
# Display the metapath summary table built in the previous cell.
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,2,"[[GENE], [GENE], [GENE]]",3,"[RO:0002434, RO:0002434]"
1,3,"[[GENE], [GENE], [GENE], [GENE]]",2560,"[RO:0002434, RO:0002434, RO:0002434]"
2,3,"[[GENE], [GENE], [GENE], [GENE]]",20,"[RO:0002434, RO:0002435, RO:0002434]"
3,3,"[[GENE], [GENE], [GENE], [GENE]]",20,"[RO:0002434, RO:0002325, RO:0002434]"
4,3,"[[GENE], [GENE], [GENE], [GENE]]",10,"[RO:0002434, RO:HOM0000011, RO:0002434]"
5,3,"[[GENE], [GENE], [GENE], [GENE]]",10,"[RO:0002434, RO:0002434, RO:HOM0000011]"
6,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",754924,"[RO:0002434, RO:0002434, RO:0002434, RO:0002434]"
7,4,"[[GENE], [GENE], [GENE], [ANAT], [GENE]]",50195,"[RO:0002434, RO:0002434, RO:0002206, RO:0002206]"
8,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",19962,"[RO:0002434, RO:HOM0000011, RO:0002434, RO:0002434]"
9,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",19648,"[RO:0002434, RO:0002325, RO:0002434, RO:0002434]"


* Are associated at expression level?
    * Are connected through TFs? -imposing **ONLY** regulatory edges-

In [27]:
%%time
# Count the simple paths of 2 to 4 hops between the source gene (HGNC:17646;
# presumably NGLY1 -- confirm) and the target gene (HGNC:633; presumably AQP1
# -- confirm). Here the first hop is unconstrained (any edge type, any
# direction, any node label).
# Filters: no node with one of the listed generic preflabels, and at least one
# edge mentioning "tftargets" or "msigdb" in its reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results: the aggregate query yields one row with the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1051945
CPU times: user 8.48 ms, sys: 11.9 ms, total: 20.3 ms
Wall time: 8min 36s


**Metapaths:**

In [28]:
%%time
# Metapath breakdown of the paths with an unconstrained first hop that contain
# at least one edge mentioning "tftargets" or "msigdb" in its provenance text
# (generic nodes excluded). Paths are grouped by edge-type sequence, node-label
# sequence and length; rows are ordered by length, then by path count
# (descending).
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per metapath row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 25.4 ms, sys: 5.71 ms, total: 31.1 ms
Wall time: 9min


In [29]:
# Display the metapath summary table built in the previous cell.
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,2,"[[GENE], [GENE], [GENE]]",3,"[RO:0002434, RO:0002434]"
1,3,"[[GENE], [GENE], [GENE], [GENE]]",2666,"[RO:0002434, RO:0002434, RO:0002434]"
2,3,"[[GENE], [GENE], [GENE], [GENE]]",20,"[RO:0002434, RO:0002435, RO:0002434]"
3,3,"[[GENE], [GENE], [GENE], [GENE]]",20,"[RO:0002434, RO:0002325, RO:0002434]"
4,3,"[[GENE], [GENE], [GENE], [GENE]]",18,"[RO:0002434, RO:HOM0000011, RO:0002434]"
5,3,"[[GENE], [GENE], [GENE], [GENE]]",10,"[RO:0002434, RO:0002434, RO:HOM0000011]"
6,3,"[[GENE], [ANAT], [GENE], [GENE]]",8,"[RO:0002206, RO:0002206, RO:0002434]"
7,3,"[[GENE], [DISO], [GENE], [GENE]]",8,"[RO:0002200, RO:0002200, RO:0002434]"
8,3,"[[GENE], [GENE], [ANAT], [GENE]]",5,"[RO:0002434, RO:0002206, RO:0002206]"
9,3,"[[GENE], [PHYS], [GENE], [GENE]]",3,"[RO:0002331, RO:0002331, RO:0002434]"


* Are associated at expression level?
    * Are connected through TFs or expression genes? -imposing rna genes **OR** regulatory edges-

In [30]:
%%time
# Count the simple paths of 2 to 4 hops between the source gene (HGNC:17646;
# presumably NGLY1 -- confirm) and the target gene (HGNC:633; presumably AQP1
# -- confirm) with an unconstrained first hop.
# Filters: no node with one of the listed generic preflabels, and at least one
# edge whose reference_supporting_text mentions "freeze", "tftargets" or
# "msigdb" (any one of the three sources is enough).
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*freeze.*|.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results: the aggregate query yields one row with the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 2469159
CPU times: user 16.3 ms, sys: 4.11 ms, total: 20.4 ms
Wall time: 9min


**Metapaths:**

In [31]:
%%time
# Metapath breakdown of the paths with an unconstrained first hop that contain
# at least one edge mentioning "freeze", "tftargets" or "msigdb" in its
# provenance text (generic nodes excluded). Paths are grouped by edge-type
# sequence, node-label sequence and length; rows are ordered by length, then by
# path count (descending).
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*freeze.*|.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per metapath row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 27.8 ms, sys: 7.4 ms, total: 35.2 ms
Wall time: 9min 52s


In [32]:
# Display the metapath summary table built in the previous cell.
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,2,"[[GENE], [GENE], [GENE]]",7,"[RO:0002434, RO:0002434]"
1,3,"[[GENE], [GENE], [GENE], [GENE]]",5226,"[RO:0002434, RO:0002434, RO:0002434]"
2,3,"[[GENE], [GENE], [ANAT], [GENE]]",2569,"[RO:0002434, RO:0002206, RO:0002206]"
3,3,"[[GENE], [GENE], [PHYS], [GENE]]",936,"[RO:0002434, BFO:0000050, BFO:0000050]"
4,3,"[[GENE], [GENE], [PHYS], [GENE]]",504,"[RO:0002434, RO:0002331, RO:0002331]"
5,3,"[[GENE], [GENE], [GENE], [GENE]]",170,"[RO:0002434, RO:HOM0000011, RO:0002434]"
6,3,"[[GENE], [GENE], [PHYS], [GENE]]",110,"[RO:0002434, RO:0002327, RO:0002327]"
7,3,"[[GENE], [GENE], [GENE], [GENE]]",80,"[RO:0002434, RO:0002325, RO:0002434]"
8,3,"[[GENE], [GENE], [DISO], [GENE]]",64,"[RO:0002434, RO:0002607, RO:0002607]"
9,3,"[[GENE], [GENE], [GENE], [GENE]]",30,"[RO:0002434, RO:0002434, RO:HOM0000011]"


### Specific query template with > 1000 paths => results viz in a summary table
Query template to answer the connection at expression level.



* Looking for **gene interactors** between RNA genes and TFs of AQP1 ( L=3 )

In [35]:
%%time
# Fixed three-hop template, all edges of type `RO:0002434` and all nodes GENE:
#   source -> gene - gene -> target
# (the middle edge is matched in either direction). Source is HGNC:17646
# (presumably NGLY1 -- confirm) and target is HGNC:633 (presumably AQP1 --
# confirm).
# Filters: simple paths only, no generic nodes, first edge with "freeze"
# provenance, and at least one edge mentioning "tftargets" or "msigdb".
# Returns the number of matching paths.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results: the aggregate query yields one row with the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 2328
CPU times: user 2.45 ms, sys: 0 ns, total: 2.45 ms
Wall time: 2.81 s


In [36]:
%%time
# Same three-hop `RO:0002434` template and filters as the previous count, with
# the two intermediate genes bound to `rna` (second node) and `tf` (third node).
# Returns a single summary row: number of distinct `rna` nodes, number of
# distinct `tf` nodes, and number of distinct paths.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002434`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN count(distinct rna) as rnas, count(distinct tf) as tfs, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per returned row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Expressed_genes': record['rnas'], 
                  'TFs': record['tfs'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 2.94 ms, sys: 0 ns, total: 2.94 ms
Wall time: 2.86 s


In [37]:
# Display the summary counts (expressed genes, TFs, paths) built in the previous cell.
res_df

Unnamed: 0,Expressed_genes,Paths,TFs
0,894,2328,31


In [40]:
%%time
# Three-hop `RO:0002434` template (source -> rna - tf -> target) with the same
# filters: simple paths, no generic nodes, first edge with "freeze" provenance,
# at least one edge mentioning "tftargets" or "msigdb".
# Paths are grouped by the name of the `rna` node (second node of the path);
# rows are ordered by path count (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002434`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT rna.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per `rna` name, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 16.6 ms, sys: 0 ns, total: 16.6 ms
Wall time: 2.9 s


In [41]:
# Display the per-expressed-gene path counts built in the previous cell.
res_df

Unnamed: 0,Expressed_gene_name,Paths
0,MYC associated zinc finger protein,19
1,SMAD family member 3,18
2,"ELK1, ETS transcription factor",17
3,paired like homeodomain 2,17
4,melanogenesis associated transcription factor,17
5,transcription factor AP-2 gamma,17
6,signal transducer and activator of transcription 1,17
7,HIC ZBTB transcriptional repressor 1,15
8,activating transcription factor 3,15
9,heat shock transcription factor 2,13


In [42]:
%%time
# Three-hop `RO:0002434` template (source -> rna - tf -> target) with the same
# filters: simple paths, no generic nodes, first edge with "freeze" provenance,
# at least one edge mentioning "tftargets" or "msigdb".
# Paths are grouped by the name of the `tf` node (third node of the path, the
# one with the outgoing edge into the target); rows are ordered by path count
# (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002434`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per `tf` name, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'TF_name': record['tf.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.69 ms, sys: 0 ns, total: 3.69 ms
Wall time: 2.86 s


In [43]:
# Display the per-TF path counts built in the previous cell.
res_df

Unnamed: 0,Paths,TF_name
0,460,lymphoid enhancer binding factor 1
1,383,tumor protein p53
2,363,MYC associated zinc finger protein
3,282,transcription regulator
4,161,twin-arginine translocase TatA/TatE family subunit
5,133,myocyte enhancer factor 2A
6,124,TATA-box binding protein
7,114,mitochondrial elongation factor G- like protein
8,107,"Fos proto-oncogene, AP-1 transcription factor subunit"
9,70,transcription factor 4


In [47]:
%%time
# Three-hop `RO:0002434` template (source -> rna - tf -> target) with the same
# filters: simple paths, no generic nodes, first edge with "freeze" provenance,
# at least one edge mentioning "tftargets" or "msigdb".
# Paths are grouped by the provenance text (reference_supporting_text) of the
# three edges i1, i2 and i3; rows are ordered by path count (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:0002434`]-(tf:GENE)-[i3:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,i2,i3,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT i1.reference_supporting_text AS Prov1, i2.reference_supporting_text AS Prov2, i3.reference_supporting_text AS Prov3, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per provenance combination, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Provenance_i1': record['Prov1'], 
                  'Provenance_i2': record['Prov2'], 
                  'Provenance_i3': record['Prov3'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.44 ms, sys: 0 ns, total: 3.44 ms
Wall time: 3.01 s


In [48]:
# Display the path counts per edge-provenance combination built in the previous cell.
res_df

Unnamed: 0,Paths,Provenance_i1,Provenance_i2,Provenance_i3
0,1363,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source."
1,282,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source."
2,146,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the NEPH2012 dataset in ""tftargets"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source."
3,120,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source."
4,106,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the NEPH2012 dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.
5,74,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",This edge comes from the Monarch Knowledge Graph 2018.
6,69,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source."
7,40,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.","This edge comes from the TRED dataset in ""tftargets"" source."
8,30,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.
9,30,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the NEPH2012 dataset in ""tftargets"" source.","This edge comes from the TRED dataset in ""tftargets"" source."


In [49]:
%%time
# Three-hop `RO:0002434` template (source -> rna - tf -> target) with the same
# filters: simple paths, no generic nodes, first edge with "freeze" provenance,
# at least one edge mentioning "tftargets" or "msigdb".
# Paths are grouped by the full detail of each path shape: provenance of i1,
# name of `rna`, provenance of i2, name of `tf`, provenance of i3.
# Rows are ordered by path count (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:0002434`]-(tf:GENE)-[i3:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,i2,i3,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT i1.reference_supporting_text AS Prov1, rna.name, i2.reference_supporting_text AS Prov2, tf.name, i3.reference_supporting_text AS Prov3, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per returned row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Provenance_i1': record['Prov1'], 
                  'Expressed_gene_name': record['rna.name'],
                  'Provenance_i2': record['Prov2'],
                  'TF_name': record['tf.name'],
                  'Provenance_i3': record['Prov3'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 79.2 ms, sys: 2.91 ms, total: 82.1 ms
Wall time: 3 s


In [50]:
# Display the per-gene, per-TF, per-provenance path counts built in the previous cell.
res_df

Unnamed: 0,Expressed_gene_name,Paths,Provenance_i1,Provenance_i2,Provenance_i3,TF_name
0,proliferating cell nuclear antigen,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",TATA-box binding protein
1,lysine acetyltransferase 2B,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
2,StAR related lipid transfer domain containing 10,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
3,"argonaute 2, RISC catalytic component",2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
4,transcription factor AP-2 gamma,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,"This edge comes from the NEPH2012 dataset in ""tftargets"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source.",MYC associated zinc finger protein
5,USO1 vesicle transport factor,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
6,"ELK1, ETS transcription factor",2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",TATA-box binding protein
7,"ribonucleoprotein, PTB binding 1",2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
8,insulin like growth factor 2 mRNA binding protein 1,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
9,HIC ZBTB transcriptional repressor 1,2,This edge comes from the RNA-seq profile dataset extracted by the Freeze Lab 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53


In [51]:
%%time
# Three-hop `RO:0002434` template (source -> rna - tf -> target) with the same
# filters: simple paths, no generic nodes, first edge with "freeze" provenance,
# at least one edge mentioning "tftargets" or "msigdb".
# Paths are grouped by the middle interaction only: name of `rna`, provenance
# of the rna-tf edge (i2) and name of `tf`. Rows are ordered by path count
# (descending).
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:0002434`]-(tf:GENE)-[i3:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,i2,i3,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN DISTINCT rna.name, i2.reference_supporting_text AS Prov2, tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results: one dict per returned row, collected into a DataFrame
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'],
                  'Provenance_i2': record['Prov2'],
                  'TF_name': record['tf.name'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 59.9 ms, sys: 0 ns, total: 59.9 ms
Wall time: 2.96 s


In [52]:
# Display the path counts per (expressed gene, i2 provenance, TF) built in the previous cell.
res_df

Unnamed: 0,Expressed_gene_name,Paths,Provenance_i2,TF_name
0,SP100 nuclear antigen,2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53
1,aryl hydrocarbon receptor,2,This edge comes from the Monarch Knowledge Graph 2018.,TATA-box binding protein
2,ubiquitin specific peptidase 15,2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53
3,ATR serine/threonine kinase,2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53
4,cyclin D1,2,This edge comes from the Monarch Knowledge Graph 2018.,lymphoid enhancer binding factor 1
5,lysine acetyltransferase 2B,2,This edge comes from the Monarch Knowledge Graph 2018.,TATA-box binding protein
6,USO1 vesicle transport factor,2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53
7,transcription factor 7 like 1,2,This edge comes from the Monarch Knowledge Graph 2018.,lymphoid enhancer binding factor 1
8,protein arginine methyltransferase 1,2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53
9,"ENAH, actin regulator",2,This edge comes from the Monarch Knowledge Graph 2018.,tumor protein p53


### Specific query template with < 1000 paths => results viz graphically
Query template to answer the connection at expression level.

In [34]:
%%time
# Count the distinct GENE-only paths between the source gene (HGNC:17646) and
# the target gene (HGNC:633) that have the shape
#   source -[RO:0002434]-> gene -[RO:0002435]- gene -[RO:0002434]-> target
# Filters applied inside the Cypher:
#   * every node on the path is unique (no node is visited twice),
#   * no node carries one of the listed generic `preflabel` terms,
#   * the first edge (i1) has 'freeze' in its reference_supporting_text,
#   * at least one edge mentions 'tftargets' or 'msigdb' in its
#     reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002435`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 20
CPU times: user 0 ns, sys: 1.91 ms, total: 1.91 ms
Wall time: 1.19 s


We could also run the above query using `colocalizes with` (`RO:0002325`) in place of `genetically interacts with` (`RO:0002435`).

---
## Question 2
Use the `rna` and `tf` interactor genes from the paths above as input to look for NGLY1 phenotypes linked to them.

In [66]:
%%time
# Two-stage query.
# Stage 1: find the filtered source -> rna -> tf -> target GENE paths
#          (same filters as the path-count query: unique nodes, no generic
#          `preflabel` terms, 'freeze' on the first edge, at least one
#          'tftargets'/'msigdb' edge) and collect the ids of the `rna` and
#          `tf` nodes into one list, `genes`.
# Stage 2: count the distinct paths
#          (:DISO {id: 'DOID:0060728'}) --> (:DISO) -- (g:GENE)
#          whose gene `g` is in that list.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 21
CPU times: user 0 ns, sys: 2.03 ms, total: 2.03 ms
Wall time: 1.38 s


In [71]:
%%time
# Phenotype table for the `genetically interacts with` (RO:0002435) template.
# Stage 1 collects the ids of the `rna` and `tf` genes on the filtered
# source -> rna -> tf -> target paths; stage 2 keeps the DISO nodes whose id
# contains 'HP:' that sit between DOID:0060728 and one of those genes, and
# reports per phenotype the distinct gene names and the distinct gene count,
# ordered by that count (descending).
# Note: the `gene_id_list` column holds gene *names* (g.name), not ids.
query = """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """

# execute against the graph
result = runQuery(driver, query)

# one dict per returned row, then a single DataFrame construction
out_l = [
    {
        'phenotype_label': record['phenotype_label'],
        'phenotype_id': record['phenotype_id'],
        'gene_id_list': record['gene_id_list'],
        'genes': record['genes'],
    }
    for record in result
]
res_df = pd.DataFrame(out_l)

CPU times: user 2.45 ms, sys: 1.11 ms, total: 3.56 ms
Wall time: 1.45 s


In [72]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex, kinesin family member 2A, tumor protein p53]",3,HP:0001250,epileptic seizure
1,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex, kinesin family member 2A]",2,HP:0001263,Developmental disability
2,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex, kinesin family member 2A]",2,HP:0000252,microcephaly
3,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex, kinesin family member 2A]",2,HP:0001511,Intrauterine growth retardation
4,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex]",1,HP:0001249,intellectual disability
5,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex]",1,HP:0000648,optic atrophy
6,[tumor protein p53],1,HP:0002015,Premature spillage
7,[tumor protein p53],1,HP:0002910,Elevated transaminases
8,[tumor protein p53],1,HP:0001310,Dysmetria
9,"[STT3B, catalytic subunit of the oligosaccharyltransferase complex]",1,HP:0001272,Cerebellar atrophy


* interacts with

In [73]:
%%time
# Same two-stage phenotype path count as the `genetically interacts with`
# variant, except that the middle rna--tf edge here is `RO:0002434`
# (`interacts with`).
# Stage 1 collects the ids of the `rna` and `tf` genes on the filtered
# source -> rna -> tf -> target paths; stage 2 counts the distinct paths
# (:DISO {id: 'DOID:0060728'}) --> (:DISO) -- (g:GENE) whose gene is in that list.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002434`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `interacts with` (`RO:0002434`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1286
CPU times: user 2.59 ms, sys: 0 ns, total: 2.59 ms
Wall time: 16.4 s


In [74]:
%%time
# Phenotype table for the `interacts with` (RO:0002434) template.
# Stage 1 collects the ids of the `rna` and `tf` genes on the filtered
# source -> rna -> tf -> target paths; stage 2 keeps the DISO nodes whose id
# contains 'HP:' that sit between DOID:0060728 and one of those genes, and
# reports per phenotype the distinct gene names and the distinct gene count,
# ordered by that count (descending).
# Note: the `gene_id_list` column holds gene *names* (g.name), not ids.
query = """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002434`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'HGNC:17646' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_supporting_text) =~ '.*freeze.*' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """

# execute against the graph
result = runQuery(driver, query)

# one dict per returned row, then a single DataFrame construction
out_l = [
    {
        'phenotype_label': record['phenotype_label'],
        'phenotype_id': record['phenotype_id'],
        'gene_id_list': record['gene_id_list'],
        'genes': record['genes'],
    }
    for record in result
]
res_df = pd.DataFrame(out_l)

CPU times: user 8.12 ms, sys: 0 ns, total: 8.12 ms
Wall time: 15.2 s


In [75]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[glutamate ionotropic receptor AMPA type subunit 3, erythrocyte membrane protein band 4.1 like 1, receptor interacting serine/threonine kinase 4, WD repeat domain 11, DNA ligase 4, recombination signal binding protein for immunoglobulin kappa J region, immunoglobulin binding protein 1, serologically defined colon cancer antigen 8, heterogeneous nuclear ribonucleoprotein A1, AKT serine/threonine kinase 3, dopey family member 1, tetratricopeptide repeat domain 8, ATRX, chromatin remodeler, solute carrier family 25 member 22, GNAS complex locus, transducin beta like 1 X-linked receptor 1, SBDS, ribosome maturation factor, ribonucleotide reductase regulatory TP53 inducible subunit M2B, zinc finger E-box binding homeobox 2, TOP1 binding arginine/serine rich protein, heparan sulfate 6-O-sulfotransferase 1, OPA3, outer mitochondrial membrane lipid metabolism regulator, GATA binding protein 6, L1 cell adhesion molecule, phosphatidylinositol glycan anchor biosynthesis class O, elastin, midline 2, phosphoglycerate dehydrogenase, nectin cell adhesion molecule 1, ATPase H+ transporting V1 subunit E1, gap junction protein alpha 1, GLI family zinc finger 2, EGF domain specific O-linked N-acetylglucosamine transferase, twist family bHLH transcription factor 2, insulin like growth factor 1 receptor, SPARC related modular calcium binding 1, FA complementation group E, RWD domain containing 2A, RB binding protein 8, endonuclease, ephrin B1, ATR serine/threonine kinase, calcium/calmodulin dependent serine protein kinase, Meis homeobox 2, component of oligomeric golgi complex 6, glypican 6, retinoic acid receptor beta, acyl-CoA synthetase long chain family member 4, latent transforming growth factor beta binding protein 2, damage specific DNA binding protein 2, paired box 3, patched 1, tripartite motif containing 32, lamin A/C, MRE11 homolog, double strand break repair nuclease, prosaposin, FAST kinase domains 2, RAS p21 protein activator 2, ubiquitin protein ligase E3A, ERCC 
excision repair 6 like 2, SRY-box 11, RAD50 double strand break repair protein, integrin subunit alpha 7, paired like homeodomain 1, B cell CLL/lymphoma 11B, MAF bZIP transcription factor B, heterogeneous nuclear ribonucleoprotein A2/B1, glycerol kinase, p21 (RAC1) activated kinase 3, Fos proto-oncogene, AP-1 transcription factor subunit, enhancer of polycomb homolog 2, ETS2 repressor factor, glutamate decarboxylase 1, glutamate ionotropic receptor kainate type subunit 2, pyruvate carboxylase, ADP ribosylation factor guanine nucleotide exchange factor 1, eukaryotic translation initiation factor 2 alpha kinase 3, ribosomal protein S6 kinase A3, nibrin, zinc finger protein 148, DCC netrin 1 receptor, phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha, ADAMTS like 4, zinc finger and BTB domain containing 20, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, nucleus accumbens associated 1]",85,HP:0001249,intellectual disability
1,"[DNA ligase 4, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, pyruvate carboxylase, glutamate ionotropic receptor AMPA type subunit 3, TATA-box binding protein, GNAS complex locus, zinc finger and BTB domain containing 20, solute carrier family 25 member 22, component of oligomeric golgi complex 6, DCC netrin 1 receptor, echinoderm microtubule associated protein like 1, glutamate ionotropic receptor NMDA type subunit 2D, L1 cell adhesion molecule, GATA binding protein 6, calcium/calmodulin dependent serine protein kinase, WD repeat domain 11, ANKH inorganic pyrophosphate transport regulator, ATPase H+ transporting V1 subunit E1, brain derived neurotrophic factor, amyloid beta precursor protein, GLI family zinc finger 2, ATRX, chromatin remodeler, midline 2, RAN binding protein 2, ADAMTS like 4, programmed cell death 10, glutamate decarboxylase 1, glutamate ionotropic receptor kainate type subunit 2, thymidine kinase 2, mitochondrial, damage specific DNA binding protein 2, FAST kinase domains 2, nectin cell adhesion molecule 1, transducin beta like 1 X-linked receptor 1, amylo-alpha-1, 6-glucosidase, 4-alpha-glucanotransferase, ETS2 repressor factor, adenosine kinase, B cell CLL/lymphoma 11B, zinc finger E-box binding homeobox 2, gap junction protein alpha 1, family with sequence similarity 111 member A, tetratricopeptide repeat domain 8, vacuolar protein sorting 13 homolog A, glycerol kinase, ubiquitin protein ligase E3A, phosphatidylinositol glycan anchor biosynthesis class O, prosaposin, phosphoglycerate dehydrogenase, AKT serine/threonine kinase 3, MAF bZIP transcription factor B, ATR serine/threonine kinase, extracellular matrix protein 1, eukaryotic translation initiation factor 2 alpha kinase 3, ribosomal protein S6 kinase A3, kinesin family member 2A, tumor protein p53, AKT serine/threonine kinase 2, patched 1, RAS p21 protein activator 2, TOP1 binding arginine/serine rich protein, SRY-box 11, 
recombination signal binding protein for immunoglobulin kappa J region, ribonucleotide reductase regulatory TP53 inducible subunit M2B, p21 (RAC1) activated kinase 3, Janus kinase 2, glial cell derived neurotrophic factor, EGF domain specific O-linked N-acetylglucosamine transferase, matrix Gla protein, nucleus accumbens associated 1, WW domain containing adaptor with coiled-coil, transcription factor 4, heparan sulfate 6-O-sulfotransferase 1]",71,HP:0001250,epileptic seizure
2,"[component of oligomeric golgi complex 6, proliferating cell nuclear antigen, midline 1, twist family bHLH transcription factor 2, midline 2, matrix Gla protein, SBDS, ribosome maturation factor, WD repeat domain 11, carbamoyl-phosphate synthetase 2, aspartate transcarbamylase, and dihydroorotase, transcription factor 4, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, TOP1 binding arginine/serine rich protein, DCC netrin 1 receptor, ATPase H+ transporting V1 subunit E1, tripartite motif containing 32, FAST kinase domains 2, recombination signal binding protein for immunoglobulin kappa J region, WW domain containing adaptor with coiled-coil, calcium/calmodulin dependent serine protein kinase, GLI family zinc finger 2, AKT serine/threonine kinase 3, adenosine kinase, RAS p21 protein activator 2, RWD domain containing 2A, nucleus accumbens associated 1, paired like homeodomain 1, EGF domain specific O-linked N-acetylglucosamine transferase, lamin A/C, latent transforming growth factor beta binding protein 2, family with sequence similarity 111 member A, DNA ligase 4, MAF bZIP transcription factor B, ADAMTS like 4, pyruvate carboxylase, tetratricopeptide repeat domain 8, serologically defined colon cancer antigen 8, GATA binding protein 6, zinc finger protein 148, B cell CLL/lymphoma 11B, damage specific DNA binding protein 2, echinoderm microtubule associated protein like 1, collagen type V alpha 1 chain, ephrin B1, glycerol kinase, FA complementation group E, glutamate ionotropic receptor NMDA type subunit 2D, solute carrier family 25 member 22, GNAS complex locus, glutamate ionotropic receptor kainate type subunit 2, ATRX, chromatin remodeler, ADP ribosylation factor guanine nucleotide exchange factor 1, solute carrier family 6 member 9, prosaposin, patched 1, eukaryotic translation initiation factor 2 alpha kinase 3, dopey family member 1, glutamate decarboxylase 1, ST3 beta-galactoside 
alpha-2,3-sialyltransferase 5, kinesin family member 2A, zinc finger protein, FOG family member 2, RB transcriptional corepressor 1, Meis homeobox 2, transducin beta like 1 X-linked receptor 1, SRY-box 11, phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha, heparan sulfate 6-O-sulfotransferase 1, dedicator of cytokinesis 7, ubiquitin protein ligase E3A, ribosomal protein S6 kinase A3, F-box and WD repeat domain containing 7]",70,HP:0001263,Developmental disability
3,"[acyl-CoA synthetase long chain family member 4, RB binding protein 8, endonuclease, proliferating cell nuclear antigen, solute carrier family 25 member 22, zinc finger protein 148, calcium/calmodulin dependent serine protein kinase, phosphoglycerate dehydrogenase, solute carrier family 6 member 9, DNA ligase 4, GATA binding protein 6, ATR serine/threonine kinase, LDL receptor related protein 5, gap junction protein alpha 1, ribosomal protein S6 kinase A3, ribonucleotide reductase regulatory TP53 inducible subunit M2B, receptor interacting serine/threonine kinase 4, Meis homeobox 2, FA complementation group E, kinesin family member 2A, ST3 beta-galactoside alpha-2,3-sialyltransferase 5, component of oligomeric golgi complex 6, phosphatidylinositol glycan anchor biosynthesis class O, transducin beta like 1 X-linked receptor 1, ERCC excision repair 6 like 2, nibrin, glutamate ionotropic receptor NMDA type subunit 2D, fibulin 5, paired box 3, ATPase H+ transporting V1 subunit E1, nucleus accumbens associated 1, ATRX, chromatin remodeler, recombination signal binding protein for immunoglobulin kappa J region, zinc finger E-box binding homeobox 2, eukaryotic translation initiation factor 2 alpha kinase 3, L1 cell adhesion molecule, glutamate decarboxylase 1, enhancer of polycomb homolog 2, damage specific DNA binding protein 2, MRE11 homolog, double strand break repair nuclease, GLI family zinc finger 2, transcription factor 4, SRY-box 11, insulin like growth factor 1 receptor, EGF domain specific O-linked N-acetylglucosamine transferase, lamin A/C, MAF bZIP transcription factor B, ephrin B1, paired like homeodomain 1, EGF containing fibulin extracellular matrix protein 2, RAD50 double strand break repair protein, p21 (RAC1) activated kinase 3, DCC netrin 1 receptor, ubiquitin protein ligase E3A]",53,HP:0000252,microcephaly
4,"[SRY-box 11, RAS p21 protein activator 2, patched 1, RB binding protein 8, endonuclease, senataxin, DCC netrin 1 receptor, notch 2, SMAD family member 3, fibronectin 1, FA complementation group E, cytochrome P450 family 7 subfamily B member 1, interferon regulatory factor 6, transducin beta like 1 X-linked receptor 1, glutamate decarboxylase 1, fibulin 5, transcription factor 4, atlastin GTPase 1, ephrin B1, integrin subunit alpha 7, paired box 3, phosphoglycerate dehydrogenase, programmed cell death 10, nucleus accumbens associated 1, DNA ligase 4, phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha, serpin family H member 1, latent transforming growth factor beta binding protein 2, coiled-coil domain containing 8, lamin A/C, ATR serine/threonine kinase, ATPase H+ transporting V1 subunit E1, ribosomal protein S6 kinase A3, RWD domain containing 2A, dopey family member 1, ubiquitin protein ligase E3A, ERCC excision repair 6 like 2, SBDS, ribosome maturation factor, calcium/calmodulin dependent serine protein kinase, collagen type V alpha 1 chain, zinc finger and BTB domain containing 20, immunoglobulin binding protein 1, procollagen-lysine,2-oxoglutarate 5-dioxygenase 2, ATRX, chromatin remodeler]",43,HP:0002650,scoliosis
5,"[ADP ribosylation factor guanine nucleotide exchange factor 1, ribosomal protein S6 kinase A3, solute carrier family 24 member 1, tetratricopeptide repeat domain 8, twist family bHLH transcription factor 2, homeobox A13, RAS p21 protein activator 2, RB transcriptional corepressor 1, midline 2, gap junction protein alpha 1, tripartite motif containing 37, transcription factor 4, GNAS complex locus, paired like homeodomain 2, SRY-box 11, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, ETS2 repressor factor, glycerol kinase, transducin beta like 1 X-linked receptor 1, paired box 3, DCC netrin 1 receptor, WW domain containing adaptor with coiled-coil, FA complementation group E, zinc finger E-box binding homeobox 2, ATPase H+ transporting V1 subunit E1, melanogenesis associated transcription factor, damage specific DNA binding protein 2, patched 1, ADAMTS like 4, ubiquitin protein ligase E3A, EGF domain specific O-linked N-acetylglucosamine transferase, DNA ligase 4, MAF bZIP transcription factor B, L1 cell adhesion molecule, senataxin, laminin subunit beta 2, calcium/calmodulin dependent serine protein kinase, recombination signal binding protein for immunoglobulin kappa J region, ERCC excision repair 6 like 2, ATR serine/threonine kinase]",40,HP:0000486,strabismus
6,"[phosphorylase kinase regulatory subunit beta, vacuolar protein sorting 13 homolog A, tripartite motif containing 37, notch 2, LDL receptor related protein 5, amylo-alpha-1, 6-glucosidase, 4-alpha-glucanotransferase, beta-2-microglobulin, SBDS, ribosome maturation factor, Janus kinase 2, calreticulin, FAST kinase domains 2, cytochrome P450 family 7 subfamily B member 1, Fas cell surface death receptor, GATA binding protein 2, intraflagellar transport 80, prosaposin, GNAS complex locus, DNA ligase 4, AKT serine/threonine kinase 2, chromosome 15 open reading frame 41, nuclear factor, erythroid 2 like 2, eukaryotic translation initiation factor 2 alpha kinase 3, STEAP3 metalloreductase, ATP binding cassette subfamily A member 1, RAS p21 protein activator 2, lamin A/C, splicing factor 3b subunit 1, component of oligomeric golgi complex 6, Fos proto-oncogene, AP-1 transcription factor subunit, IKAROS family zinc finger 3, pyruvate carboxylase]",31,HP:0002240,hepatomegaly
7,"[ETS2 repressor factor, AKT serine/threonine kinase 3, glial cell derived neurotrophic factor, zinc finger E-box binding homeobox 2, transcription factor 4, ephrin B1, WD repeat domain 11, collagen type V alpha 1 chain, retinoic acid receptor beta, prosaposin, ubiquitin protein ligase E3A, solute carrier family 25 member 22, ribosomal protein S6 kinase A3, ribonucleotide reductase regulatory TP53 inducible subunit M2B, ADAMTS like 4, RAS p21 protein activator 2, ATRX, chromatin remodeler, vacuolar protein sorting 13 homolog A, brain derived neurotrophic factor, latent transforming growth factor beta binding protein 2, eukaryotic translation initiation factor 2 alpha kinase 3, cofilin 2, SRY-box 11, FAST kinase domains 2, heparan sulfate 6-O-sulfotransferase 1, tripartite motif containing 37, phosphorylase kinase regulatory subunit beta, integrin subunit alpha 7]",28,HP:0001252,Hypotonia
8,"[DCC netrin 1 receptor, NSE3 homolog, SMC5-SMC6 complex component, insulin like growth factor 1 receptor, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, elastin, L1 cell adhesion molecule, calcium/calmodulin dependent serine protein kinase, patched 1, zinc finger protein, FOG family member 2, RAS p21 protein activator 2, SRY-box 11, ubiquitin protein ligase E3A, GATA binding protein 6, latent transforming growth factor beta binding protein 2, paired like homeodomain 1, notch 2, RAN binding protein 2, prolyl endopeptidase like, RB transcriptional corepressor 1, solute carrier family 25 member 22, glycerol kinase, ERCC excision repair 6 like 2, ETS2 repressor factor, paired like homeodomain 2]",24,HP:0001999,Abnormal facial shape
9,"[solute carrier family 6 member 9, AKT serine/threonine kinase 3, damage specific DNA binding protein 2, ST3 beta-galactoside alpha-2,3-sialyltransferase 5, matrix Gla protein, neural precursor cell expressed, developmentally down-regulated 4-like, E3 ubiquitin protein ligase, SPARC related modular calcium binding 1, ETS2 repressor factor, gap junction protein alpha 1, DCC netrin 1 receptor, ATRX, chromatin remodeler, OPA3, outer mitochondrial membrane lipid metabolism regulator, granulin precursor, tetratricopeptide repeat domain 8, prosaposin, ribosomal protein S6 kinase A3, cytochrome P450 family 7 subfamily B member 1, ANTXR cell adhesion molecule 1, calcium/calmodulin dependent serine protein kinase, GNAS complex locus, TOP1 binding arginine/serine rich protein, LDL receptor related protein 5, solute carrier family 25 member 22, FAST kinase domains 2]",24,HP:0000648,optic atrophy


---

In [59]:
%%time
# Unfiltered variant of the phenotype path count: the first MATCH has no WHERE
# clause, so `rna` and `tf` ids are collected from every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern in the graph, not only those between a fixed source and target.
# NOTE(review): presumably intended as a whole-graph baseline — confirm, since
# scanning all such patterns is far more expensive than the filtered queries.
# The second MATCH counts the distinct paths
#   (:DISO {id: 'DOID:0060728'}) --> (:DISO) -- (g:GENE) with g.id in that list.
query = (
        """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1519
CPU times: user 6.66 ms, sys: 3.41 ms, total: 10.1 ms
Wall time: 4min 4s


In [61]:
%%time
# Unfiltered phenotype path count restricted to the `rna` position: the first
# MATCH has no WHERE clause, so ids are collected from the `rna` node of every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern in the graph. The second MATCH counts the distinct paths
#   (:DISO {id: 'DOID:0060728'}) --> (:DISO) -- (g:GENE) with g.id in that list.
query = (
        """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT rna.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1519
CPU times: user 2.45 ms, sys: 5.47 ms, total: 7.92 ms
Wall time: 2min 58s


In [62]:
%%time
# Unfiltered phenotype path count restricted to the `tf` position: the first
# MATCH has no WHERE clause, so ids are collected from the `tf` node of every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern in the graph. The second MATCH counts the distinct paths
#   (:DISO {id: 'DOID:0060728'}) --> (:DISO) -- (g:GENE) with g.id in that list.
query = (
        """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query: middle edge is `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1519
CPU times: user 5.42 ms, sys: 2.49 ms, total: 7.91 ms
Wall time: 2min 58s


In [63]:
%%time
# Unfiltered phenotype table: `rna` and `tf` ids come from every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern (no source/target restriction). For each DISO node whose id contains
# 'HP:' and that lies between DOID:0060728 and one of those genes, the query
# returns the phenotype id, its label and the number of distinct genes,
# ordered by that number (descending).
query = """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """

# execute against the graph
result = runQuery(driver, query)

# one dict per returned row, then a single DataFrame construction
out_l = [
    {
        'phenotype_label': record['phenotype_label'],
        'phenotype_id': record['phenotype_id'],
        'Genes': record['genes'],
    }
    for record in result
]
res_df = pd.DataFrame(out_l)

CPU times: user 7.08 ms, sys: 5.61 ms, total: 12.7 ms
Wall time: 4min 7s


In [64]:
res_df

Unnamed: 0,Genes,phenotype_id,phenotype_label
0,87,HP:0001263,Developmental disability
1,82,HP:0001249,intellectual disability
2,75,HP:0001250,epileptic seizure
3,60,HP:0000252,microcephaly
4,51,HP:0000486,strabismus
5,40,HP:0002650,scoliosis
6,40,HP:0000508,ptosis
7,39,HP:0001511,Intrauterine growth retardation
8,35,HP:0002240,hepatomegaly
9,34,HP:0001744,Splenomegaly


In [None]:
%%time
# Unfiltered phenotype table restricted to the `rna` position: ids are taken
# from the `rna` node of every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern (no source/target restriction). For each DISO node whose id contains
# 'HP:' and that lies between DOID:0060728 and one of those genes, the query
# returns the phenotype id, its label and the number of distinct genes.
query = (
        """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT rna.id) as genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id as phenotype_id, ph.preflabel as phenotype_label, count(DISTINCT g.id) as genes
        """
)

# run query
result = runQuery( driver, query )

# parse results
# The RETURN clause aliases its columns, so records must be read through the
# aliases (`phenotype_label`, `phenotype_id`); the un-aliased names
# `ph.preflabel` / `ph.id` are not keys of the record and raise KeyError.
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'Genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

In [None]:
res_df

In [None]:
%%time
# Unfiltered phenotype table restricted to the `tf` position: ids are taken
# from the `tf` node of every
#   GENE -[RO:0002434]-> rna -[RO:0002435]- tf -[RO:0002434]-> GENE
# pattern (no source/target restriction). For each DISO node whose id contains
# 'HP:' and that lies between DOID:0060728 and one of those genes, the query
# returns the phenotype id, its label and the number of distinct genes.
query = (
        """
        MATCH (source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:0002435`]-(tf:GENE)-[:`RO:0002434`]->(target:GENE)

        WITH COLLECT(DISTINCT tf.id) as genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id as phenotype_id, ph.preflabel as phenotype_label, count(DISTINCT g.id) as genes
        """
)

# run query
result = runQuery( driver, query )

# parse results
# The RETURN clause aliases its columns, so records must be read through the
# aliases (`phenotype_label`, `phenotype_id`); the un-aliased names
# `ph.preflabel` / `ph.id` are not keys of the record and raise KeyError.
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'Genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

In [None]:
res_df

---
# Without human expression data

### Specific query template with < 1000 paths => results viz graphically
Query template to answer the connection at expression level.

* in orthology relationship + genetically interacts with

In [54]:
%%time
# Count the distinct GENE-only paths from the source gene
# (FlyBase:FBgn0033050) to the target gene (HGNC:633) of the shape
#   source -[RO:0002434]-> gene -[RO:HOM0000017]- gene -[RO:0002435]- gene -[RO:0002434]-> target
# i.e. an orthology hop (`RO:HOM0000017`) followed by a
# `genetically interacts with` hop (`RO:0002435`).
# Filters applied inside the Cypher:
#   * every node on the path is unique,
#   * no node carries one of the listed generic `preflabel` terms,
#   * the first edge (i1) has 'pubmed/29346549' in its reference_uri,
#   * at least one edge mentions 'tftargets' or 'msigdb' in its
#     reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002435`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query: orthology hop + `genetically interacts with` (`RO:0002435`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 4
CPU times: user 1.98 ms, sys: 0 ns, total: 1.98 ms
Wall time: 1.22 s


* in 1 to 1 orthology + interacts with

In [57]:
%%time
# Count the distinct GENE-only paths from the source gene
# (FlyBase:FBgn0033050) to the target gene (HGNC:633) of the shape
#   source -[RO:0002434]-> gene -[RO:HOM0000020]- gene -[RO:0002434]- gene -[RO:0002434]-> target
# i.e. a 1-to-1 orthology hop (`RO:HOM0000020`) followed by an
# `interacts with` hop (`RO:0002434`).
# Filters applied inside the Cypher:
#   * every node on the path is unique,
#   * no node carries one of the listed generic `preflabel` terms,
#   * the first edge (i1) has 'pubmed/29346549' in its reference_uri,
#   * at least one edge mentions 'tftargets' or 'msigdb' in its
#     reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query: 1-to-1 orthology hop (`RO:HOM0000020`) + `interacts with` (`RO:0002434`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 71
CPU times: user 0 ns, sys: 2.16 ms, total: 2.16 ms
Wall time: 2.53 s


* in orthology relationship + interacts with

In [58]:
%%time
# Count the distinct GENE-only paths from the source gene
# (FlyBase:FBgn0033050) to the target gene (HGNC:633) of the shape
#   source -[RO:0002434]-> gene -[RO:HOM0000017]- gene -[RO:0002434]- gene -[RO:0002434]-> target
# i.e. an orthology hop (`RO:HOM0000017`) followed by an
# `interacts with` hop (`RO:0002434`).
# Filters applied inside the Cypher:
#   * every node on the path is unique,
#   * no node carries one of the listed generic `preflabel` terms,
#   * the first edge (i1) has 'pubmed/29346549' in its reference_uri,
#   * at least one edge mentions 'tftargets' or 'msigdb' in its
#     reference_supporting_text.
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query: orthology hop (`RO:HOM0000017`) + `interacts with` (`RO:0002434`)
result = runQuery( driver, query )

# parse results: the query returns a single row holding the path count
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 210
CPU times: user 1.41 ms, sys: 642 µs, total: 2.05 ms
Wall time: 2.33 s


---
## Question 2

In [76]:
%%time
# Cypher, two stages:
#   1. Match the same filtered paths as the orthology (`RO:HOM0000017`) + interacts-with
#      (`RO:0002434`) query from source FlyBase:FBgn0033050 to target HGNC:633, and gather
#      `rna.id` and `tf.id` values into the list `genes`.
#   2. Count the distinct paths (:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE) whose gene id
#      is in `genes`.
# NOTE(review): in the MATCH pattern `tf` is bound to the last relationship
# (`-[tf:`RO:0002434`]->`), not to a node, so `tf.id` reads a property of that edge.
# Presumably the preceding (:GENE) node was meant to be collected -- confirm before relying
# on the counts.
query = """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(rna:GENE)-[:`RO:0002434`]-(:GENE)-[tf:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """

# send the query to the graph database
result = runQuery(driver, query)

# the query returns a single aggregate column, `paths`; print it for each returned record
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 60
CPU times: user 1.35 ms, sys: 607 µs, total: 1.96 ms
Wall time: 2.69 s


In [77]:
%%time
# Cypher, two stages:
#   1. Match the filtered orthology (`RO:HOM0000017`) + interacts-with (`RO:0002434`) paths
#      from source FlyBase:FBgn0033050 to target HGNC:633 and gather `rna.id` and `tf.id`
#      values into the list `genes`.
#   2. For paths (:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE) with g.id in `genes` and
#      ph.id containing 'HP:', return one row per phenotype, ordered by the number of
#      distinct genes (descending).
# Returned columns:
#   phenotype_id, phenotype_label -- ph.id and ph.preflabel
#   gene_id_list                  -- COLLECT(DISTINCT g.name): despite the column name it
#                                    holds gene *names*, not ids
#   genes                         -- count(DISTINCT g.id)
# NOTE(review): in the MATCH pattern `tf` is bound to the last relationship
# (`-[tf:`RO:0002434`]->`), not to a node, so `tf.id` reads a property of that edge.
# Presumably the preceding (:GENE) node was meant to be collected -- confirm.
query = """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(rna:GENE)-[:`RO:0002434`]-(:GENE)-[tf:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """

# run query
result = runQuery(driver, query)

# parse results: one dict per returned row (i.e. per phenotype)
out_l = []
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})

# Pin the column order explicitly: without `columns=` the order of a frame built from a
# list of dicts depends on the pandas version, and an empty result would produce a frame
# with no columns at all.
res_df = pd.DataFrame(
    out_l,
    columns=['gene_id_list', 'genes', 'phenotype_id', 'phenotype_label'],
)

CPU times: user 730 µs, sys: 2.72 ms, total: 3.45 ms
Wall time: 2.68 s


In [78]:
# display the phenotype table (one row per phenotype, with its gene list and gene count)
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[ELOVL fatty acid elongase 4, membrane metalloendopeptidase, glutamate ionotropic receptor AMPA type subunit 3, heat shock protein family B (small) member 1]",4,HP:0001265,hyporeflexia
1,"[MYCN proto-oncogene, bHLH transcription factor, UDP-galactose-4-epimerase, ELOVL fatty acid elongase 4]",3,HP:0001263,Developmental disability
2,"[glutamate ionotropic receptor AMPA type subunit 3, UDP-galactose-4-epimerase, MYCN proto-oncogene, bHLH transcription factor]",3,HP:0001249,intellectual disability
3,"[glutamate ionotropic receptor AMPA type subunit 3, serpin family I member 1, MYCN proto-oncogene, bHLH transcription factor]",3,HP:0001336,Myoclonus
4,"[3-hydroxyacyl-CoA dehydratase 1, serpin family H member 1, endothelin converting enzyme like 1]",3,HP:0002650,scoliosis
5,"[lysozyme, UDP-galactose-4-epimerase]",2,HP:0001744,Splenomegaly
6,"[ELOVL fatty acid elongase 4, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0000252,microcephaly
7,"[lysozyme, UDP-galactose-4-epimerase]",2,HP:0002240,hepatomegaly
8,"[3-hydroxyacyl-CoA dehydratase 1, UDP-galactose-4-epimerase]",2,HP:0001252,Hypotonia
9,"[ELOVL fatty acid elongase 4, membrane metalloendopeptidase]",2,HP:0001272,Cerebellar atrophy
