# Cypher Queries for Determining Regulatory Paths
*Núria Queralt Rosinach, Andrew Su*

**Queries for the Neo4j guide online**

## Overview
NGLY1 - AQP1 **regulatory review** (*NGLY1 v3.2*)

## Servers 

    * Local: bolt://kylo.scripps.edu:7690
    * AWS: bolt://52.87.232.110:7689

### Imports

In [1]:
from neo4j.v1 import GraphDatabase, basic_auth
import pandas as pd
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)  # or 199

### Functions

In [2]:
def runQuery( driver, query ):
    '''
    This function runs the query onto the database and returns the result.
    in: cypher query string
    out: neo4j query result object
    '''
    
    with driver.session() as session:
        result = session.run('' + query + '')
        
    return result


def parseNode( node ):
    '''
    This function parses the information gathered in the node data structure object resulting after querying neo4j.
        in: node record neo4j object
        out: node as dict
    '''
    
    n = dict()
    n["idx"] = int(node.id)
    n["type"] = list(node.labels)[0]
    n["id"] = str(node.properties['id'])
    n["preflabel"] = str(node.properties['preflabel'])
    n["name"] = str(node.properties['name'])
    n["description"] = str(node.properties['description'])

    return n


def parsePath( path ):
    '''
    This function parsers the information gathered in the path data structure object resulting after querying neo4j.
        in: path record neo4j object
        out: path as dict
    '''
    
    out = {}
    out['Nodes'] = []
    for node in path['path'].nodes:
        n = {}
        n['idx'] = int(node.id)
        n['type'] = list(node.labels)[0]
        n['id'] = str(node.properties['id'])
        n['preflabel'] = str(node.properties['preflabel'])
        n['name'] = str(node.properties['name'])
        n['description'] = str(node.properties['description'])
        out['Nodes'].append(n)
    out['Edges'] = []
    for edge in path['path'].relationships:
        e = {}
        e['idx'] = int(edge.id)
        e['start_node'] = int(edge.start)
        e['end_node'] = int(edge.end)
        e['type'] = str(edge.type)
        e['property_label'] = str(edge.properties['property_label'])
        e['property_uri'] = str(edge.properties['property_uri'])
        e['reference_uri'] = str(edge.properties['reference_uri'])
        e['reference_date'] = str(edge.properties['reference_date'])
        e['reference_supporting_text'] = str(edge.properties['reference_supporting_text'])
        out['Edges'].append(e)
        
    return out

### Initialize neo4j

In [3]:
driver = GraphDatabase.driver("bolt://kylo.scripps.edu:7690", auth=basic_auth("neo4j", "xena"))

# NETWORK HYPOTHESES

### Open query to check if they are connected => metapaths
* Are associated at expression level?
    * Are connected through the human transcriptome? -imposing **ONLY** rna genes-

In [5]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 6670
CPU times: user 1.73 ms, sys: 232 µs, total: 1.97 ms
Wall time: 13.1 s


**Metapaths:**

In [6]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 7.79 ms, sys: 0 ns, total: 7.79 ms
Wall time: 13.2 s


In [7]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,3,"[[GENE], [GENE], [PHYS], [GENE]]",1,"[RO:0002434, None, RO:0002331]"
1,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",3150,"[RO:0002434, RO:0002206, RO:0002206, RO:0002434]"
2,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",1556,"[RO:0002434, RO:0002206, RO:0002206, RO:HOM0000017]"
3,4,"[[GENE], [GENE], [DISO], [GENE], [GENE]]",444,"[RO:0002434, RO:0002200, RO:0002200, RO:HOM0000017]"
4,4,"[[GENE], [GENE], [GENE], [ANAT], [GENE]]",236,"[RO:0002434, RO:HOM0000017, RO:0002206, RO:0002206]"
5,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",233,"[RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]"
6,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",224,"[RO:0002434, RO:0002434, RO:0002434, RO:HOM0000017]"
7,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",152,"[RO:0002434, RO:0002434, RO:HOM0000017, RO:0002434]"
8,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",114,"[RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]"
9,4,"[[GENE], [GENE], [GENE], [PHYS], [GENE]]",110,"[RO:0002434, RO:HOM0000017, RO:0002331, RO:0002331]"


* Are associated at expression level?
    * Are connected through the human transcriptome? -imposing rna genes **AND** regulatory edges-

In [8]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 177
CPU times: user 2.18 ms, sys: 265 µs, total: 2.44 ms
Wall time: 13.7 s


**Metapaths:**

In [9]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549' AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 234 µs, sys: 4 ms, total: 4.24 ms
Wall time: 13.6 s


In [10]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",66,"[RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]"
1,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",66,"[RO:0002434, RO:0002206, RO:0002206, RO:0002434]"
2,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",41,"[RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]"
3,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",4,"[RO:0002434, RO:HOM0000018, RO:0002434, RO:0002434]"


* Are associated at expression level?
    * Are connected through TFs? -imposing **ONLY** regulatory edges-

In [11]:
%%time
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1315
CPU times: user 1.97 ms, sys: 259 µs, total: 2.23 ms
Wall time: 15.6 s


**Metapaths:**

In [12]:
%%time
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.76 ms, sys: 634 µs, total: 5.4 ms
Wall time: 15.6 s


In [13]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,3,"[[GENE], [ANAT], [GENE], [GENE]]",1,"[RO:0002206, RO:0002206, RO:0002434]"
1,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",741,"[RO:0002206, RO:0002206, RO:0002434, RO:0002434]"
2,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",188,"[RO:HOM0000020, RO:0002434, RO:0002434, RO:0002434]"
3,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",90,"[RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]"
4,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",82,"[RO:0002434, RO:0002206, RO:0002206, RO:0002434]"
5,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",81,"[RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]"
6,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",30,"[RO:HOM0000020, RO:0002206, RO:0002206, RO:0002434]"
7,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",20,"[RO:0002206, RO:0002206, RO:0002325, RO:0002434]"
8,4,"[[GENE], [GENE], [DISO], [GENE], [GENE]]",16,"[RO:HOM0000020, RO:0002200, RO:0002200, RO:0002434]"
9,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",16,"[RO:HOM0000020, RO:0002434, RO:HOM0000011, RO:0002434]"


* Are associated at expression level?
    * Are connected through TFs or expression genes? -imposing rna genes **OR** regulatory edges-

In [14]:
%%time
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*pubmed/29346549.*|.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths  
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 1315
CPU times: user 2.06 ms, sys: 263 µs, total: 2.32 ms
Wall time: 15.8 s


**Metapaths:**

In [15]:
%%time
query = (
        """
        MATCH path=(source:GENE)--()-[*..3]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*pubmed/29346549.*|.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT extract (x in rels(path) | type(x)) as types, extract (n in nodes(path) | labels(n)) as labels, length(path) as mp_length, count(distinct path) as paths 
        
        ORDER BY mp_length, paths DESC   
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 1.89 ms, sys: 3.23 ms, total: 5.12 ms
Wall time: 15.9 s


In [16]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,3,"[[GENE], [ANAT], [GENE], [GENE]]",1,"[RO:0002206, RO:0002206, RO:0002434]"
1,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",741,"[RO:0002206, RO:0002206, RO:0002434, RO:0002434]"
2,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",188,"[RO:HOM0000020, RO:0002434, RO:0002434, RO:0002434]"
3,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",90,"[RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]"
4,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",82,"[RO:0002434, RO:0002206, RO:0002206, RO:0002434]"
5,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",81,"[RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]"
6,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",30,"[RO:HOM0000020, RO:0002206, RO:0002206, RO:0002434]"
7,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",20,"[RO:0002206, RO:0002206, RO:0002325, RO:0002434]"
8,4,"[[GENE], [GENE], [DISO], [GENE], [GENE]]",16,"[RO:HOM0000020, RO:0002200, RO:0002200, RO:0002434]"
9,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",16,"[RO:HOM0000020, RO:0002434, RO:HOM0000011, RO:0002434]"


---
### Specific query template with > 1000 paths => results viz in a summary table
Query template to answer the connection at expression level.

~TO INCLUDE IN:~

* How to query
    * Query1: Open + metapath table (3000 rna OR 188 reg)
    * Query2: template -> >1000 paths => table
    * Query3: refine Q2 template -> <1000 paths => graph



* Looking for **gene interactors** between RNA genes and TFs of AQP1 (L=4)
    * metapath: source: NGLY1 fly gene; TF -> AQP1; target:AQP1 
    * template:

In [17]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[]-()-[]-()-[]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 754
CPU times: user 2 ms, sys: 0 ns, total: 2 ms
Wall time: 767 ms


#### Show summary table: metapaths

In [18]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN DISTINCT extract(x in rels(path) | type(x)) AS types, extract(n in nodes(path) | labels(n)) AS labels, length(path) AS mp_length, count(distinct path) AS paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 5.5 ms, sys: 0 ns, total: 5.5 ms
Wall time: 987 ms


In [19]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",530,"[RO:0002206, RO:0002206, RO:0002434, RO:0002434]"
1,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",160,"[RO:HOM0000020, RO:0002434, RO:0002434, RO:0002434]"
2,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",82,"[RO:0002434, RO:0002206, RO:0002206, RO:0002434]"
3,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",78,"[RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]"
4,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",70,"[RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]"
5,4,"[[GENE], [GENE], [ANAT], [GENE], [GENE]]",30,"[RO:HOM0000020, RO:0002206, RO:0002206, RO:0002434]"
6,4,"[[GENE], [ANAT], [GENE], [GENE], [GENE]]",20,"[RO:0002206, RO:0002206, RO:0002325, RO:0002434]"
7,4,"[[GENE], [GENE], [DISO], [GENE], [GENE]]",16,"[RO:HOM0000020, RO:0002200, RO:0002200, RO:0002434]"
8,4,"[[GENE], [GENE], [GENE], [GENE], [GENE]]",8,"[RO:HOM0000020, RO:0002434, RO:HOM0000011, RO:0002434]"
9,4,"[[GENE], [PHYS], [GENE], [GENE], [GENE]]",8,"[None, RO:0002331, RO:0002434, RO:0002434]"


#### Show summary table: species

In [20]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN DISTINCT extract (n in nodes(path) | n.name) as labels, count(distinct path) as paths 
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 19.5 ms, sys: 0 ns, total: 19.5 ms
Wall time: 1 s


In [21]:
res_df

Unnamed: 0,Nodes,Paths
0,"[PNGase-like, uncharacterized protein, secondary oocyte, TATA-box binding protein, aquaporin 1 (Colton blood group)]",22
1,"[PNGase-like, N-glycanase 1, valosin containing protein, tumor protein p53, aquaporin 1 (Colton blood group)]",12
2,"[PNGase-like, N-glycanase 1, ribosomal protein S27a, tumor protein p53, aquaporin 1 (Colton blood group)]",8
3,"[PNGase-like, N-glycanase 1, carbonic anhydrase 9, tumor protein p53, aquaporin 1 (Colton blood group)]",8
4,"[PNGase-like, Rad23 nucleotide excision repair protein binds to XPC, RAD23 homolog A, nucleotide excision repair protein, tumor protein p53, aquaporin 1 (Colton blood group)]",8
5,"[PNGase-like, TER94, valosin containing protein, tumor protein p53, aquaporin 1 (Colton blood group)]",8
6,"[PNGase-like, N-glycanase 1, CCCTC-binding factor, IKAROS family zinc finger 2, aquaporin 1 (Colton blood group)]",8
7,"[PNGase-like, N-glycanase 1, epidermal growth factor receptor, tumor protein p53, aquaporin 1 (Colton blood group)]",8
8,"[PNGase-like, 26S proteasome regulatory complex, proteasome 26S subunit, ATPase 1, tumor protein p53, aquaporin 1 (Colton blood group)]",8
9,"[PNGase-like, ribosomal protein S27a, ribosomal protein S27a, tumor protein p53, aquaporin 1 (Colton blood group)]",8


In [22]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN DISTINCT extract (n in nodes(path) | n.name) as labels,
        
                        extract(x in rels(path) | x.property_label) AS types, 
                        
                        count(*) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'],
                  'Metapath': record['types'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 29.2 ms, sys: 0 ns, total: 29.2 ms
Wall time: 1.02 s


In [23]:
res_df

Unnamed: 0,Metapath,Nodes,Paths
0,"[interacts with, expressed in, expressed in, interacts with]","[PNGase-like, uncharacterized protein, secondary oocyte, TATA-box binding protein, aquaporin 1 (Colton blood group)]",22
1,"[in 1 to 1 orthology relationship with, interacts with, interacts with, interacts with]","[PNGase-like, N-glycanase 1, valosin containing protein, tumor protein p53, aquaporin 1 (Colton blood group)]",12
2,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]","[PNGase-like, TER94, valosin containing protein, tumor protein p53, aquaporin 1 (Colton blood group)]",8
3,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]","[PNGase-like, 26S proteasome regulatory complex, proteasome 26S subunit, ATPase 1, tumor protein p53, aquaporin 1 (Colton blood group)]",8
4,"[in 1 to 1 orthology relationship with, interacts with, interacts with, interacts with]","[PNGase-like, N-glycanase 1, proteasome 26S subunit, ATPase 1, tumor protein p53, aquaporin 1 (Colton blood group)]",8
5,"[interacts with, in orthology relationship with, interacts with, interacts with]","[PNGase-like, uncharacterized protein, RAD23 homolog A, nucleotide excision repair protein, tumor protein p53, aquaporin 1 (Colton blood group)]",8
6,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]","[PNGase-like, Rad23 nucleotide excision repair protein binds to XPC, RAD23 homolog A, nucleotide excision repair protein, tumor protein p53, aquaporin 1 (Colton blood group)]",8
7,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]","[PNGase-like, ribosomal protein S27a, ribosomal protein S27a, tumor protein p53, aquaporin 1 (Colton blood group)]",8
8,"[in 1 to 1 orthology relationship with, interacts with, interacts with, interacts with]","[PNGase-like, N-glycanase 1, ribosomal protein S27a, tumor protein p53, aquaporin 1 (Colton blood group)]",8
9,"[in 1 to 1 orthology relationship with, interacts with, interacts with, interacts with]","[PNGase-like, N-glycanase 1, carbonic anhydrase 9, tumor protein p53, aquaporin 1 (Colton blood group)]",8


#### Show summary table: specific species 

In [24]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN  DISTINCT g1.name, count(g1.id) AS g1_count
        
        ORDER BY g1_count DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'g1_name': record['g1.name'],
                  'g1_count': record['g1_count']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.4 ms, sys: 307 µs, total: 3.7 ms
Wall time: 985 ms


In [25]:
res_df

Unnamed: 0,g1_count,g1_name
0,283,secondary oocyte
1,270,oocyte
2,232,N-glycanase 1
3,61,uncharacterized protein
4,20,"MYC proto-oncogene, bHLH transcription factor"
5,18,ribosomal protein S27a
6,14,26S proteasome regulatory complex
7,13,Carbonic anhydrase 2
8,12,Rad23 nucleotide excision repair protein binds to XPC
9,10,Heat shock protein 68


In [26]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN  DISTINCT g2.name, count(g2.id) AS g2_count
        
        ORDER BY g2_count DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'g2_name': record['g2.name'],
                  'g2_count': record['g2_count']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 6.12 ms, sys: 186 µs, total: 6.3 ms
Wall time: 979 ms


In [27]:
res_df

Unnamed: 0,g2_count,g2_name
0,46,secondary oocyte
1,36,sperm
2,30,carbonic anhydrase 9
3,24,ribosomal protein S27a
4,24,"RAD23 homolog A, nucleotide excision repair protein"
5,20,valosin containing protein
6,18,CCCTC-binding factor
7,16,lysozyme
8,16,"RAD23 homolog B, nucleotide excision repair protein"
9,16,"protein tyrosine phosphatase, receptor type O"


In [28]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1]-(g1)-[i2]-(g2)-[i3]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,g1,g2,tf,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        RETURN  DISTINCT tf.name, count(tf.id) AS tf_count
        
        ORDER BY tf_count DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'tf_name': record['tf.name'],
                  'tf_count': record['tf_count']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.2 ms, sys: 0 ns, total: 3.2 ms
Wall time: 970 ms


In [29]:
res_df

Unnamed: 0,tf_count,tf_name
0,386,tumor protein p53
1,144,TATA-box binding protein
2,118,"Fos proto-oncogene, AP-1 transcription factor subunit"
3,83,transcription factor 4
4,80,lymphoid enhancer binding factor 1
5,47,MYC associated zinc finger protein
6,34,myocyte enhancer factor 2A
7,30,IKAROS family zinc finger 2
8,28,protein E12
9,24,GATA binding protein 6


---

* Looking for **gene interactors** between RNA genes and TFs of AQP1 (L=4)
    * metapath: rna: 3150 ( ANAT ) -> 233,114/152,56/24 ( ALL GENE INTERACTORS )
                   metapath: 
                   '4	[[GENE], [GENE], [ANAT], [GENE], [GENE]]
                   [RO:0002434, RO:0002206, RO:0002206, RO:0002434]': 3150 paths  
    * template:

In [30]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002206`]-(:ANAT)-[:`RO:0002206`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 3150
CPU times: user 1.51 ms, sys: 133 µs, total: 1.64 ms
Wall time: 487 ms


#### Show summary table

In [31]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002206`]-(:ANAT)-[:`RO:0002206`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN DISTINCT extract(x in rels(path) | x.property_label) AS types, extract(n in nodes(path) | n.name) AS labels, length(path) AS mp_length, count(distinct path) AS paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 36 ms, sys: 4.01 ms, total: 40 ms
Wall time: 604 ms


In [32]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[PNGase-like, uncharacterized protein, embryo, glycerol kinase 5, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
1,4,"[PNGase-like, uncharacterized protein, embryo, sedoheptulokinase, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
2,4,"[PNGase-like, uncharacterized protein, embryo, fibronectin type III and SPRY domain containing 2, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
3,4,"[PNGase-like, uncharacterized protein, embryo, aquaporin 11, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
4,4,"[PNGase-like, uncharacterized protein, embryo, midline 2, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
5,4,"[PNGase-like, uncharacterized protein, embryo, microtubule associated scaffold protein 2, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
6,4,"[PNGase-like, uncharacterized protein, embryo, regulator of G protein signaling 20, aquaporin 1 (Colton blood group)]",236,"[interacts with, expressed in, expressed in, interacts with]"
7,4,"[PNGase-like, uncharacterized protein, oocyte, lamin A/C, aquaporin 1 (Colton blood group)]",44,"[interacts with, expressed in, expressed in, interacts with]"
8,4,"[PNGase-like, uncharacterized protein, oocyte, centrosomal protein 44, aquaporin 1 (Colton blood group)]",44,"[interacts with, expressed in, expressed in, interacts with]"
9,4,"[PNGase-like, uncharacterized protein, secondary oocyte, TATA-box binding protein, aquaporin 1 (Colton blood group)]",20,"[interacts with, expressed in, expressed in, interacts with]"


    * refine template: rna -> 233 paths
                       metapath
                       '4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                       [RO:0002434, RO:HOM0000020, RO:0002434, RO:0002434]': 233 paths
                       OR
                       4	[[GENE], [GENE], [GENE], [GENE], [GENE]] 
                       [RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]: 114 paths

In [33]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 233
CPU times: user 1.74 ms, sys: 133 µs, total: 1.88 ms
Wall time: 732 ms


#### show graph [viz here table]

In [34]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN DISTINCT extract(x in rels(path) | x.property_label) AS types, extract(n in nodes(path) | n.name) AS labels, length(path) AS mp_length, count(distinct path) AS paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.94 ms, sys: 30 µs, total: 4.97 ms
Wall time: 744 ms


In [35]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, coiled-coil domain containing 85B, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
1,4,"[PNGase-like, 26S protease regulatory subunit 6A, proteasome 26S subunit, ATPase 3, TNF receptor associated factor 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
2,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, tripartite motif containing 23, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
3,4,"[PNGase-like, uncharacterized protein, proteasome assembly chaperone 2, transcription factor 4, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
4,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, IKAROS family zinc finger 3, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
5,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, TNF receptor associated factor 1, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
6,4,"[PNGase-like, refractory to sigma P, sequestosome 1, TNF receptor associated factor 1, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
7,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, transcription factor 4, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
8,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, potassium channel tetramerization domain containing 17, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"
9,4,"[PNGase-like, Proteasome alpha6 subunit, proteasome subunit alpha 1, microtubule associated scaffold protein 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, in 1 to 1 orthology relationship with, interacts with, interacts with]"


    * refine template: rna -> 152 paths
                       metapath:
                       '4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                       [RO:0002434, RO:0002434, RO:HOM0000017, RO:0002434]': 152 paths
                       OR
                       4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                       [RO:0002434, RO:0002434, RO:HOM0000020, RO:0002434]: 56 paths

In [36]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 152
CPU times: user 1.68 ms, sys: 127 µs, total: 1.81 ms
Wall time: 69.5 ms


#### show graph [viz here table]

In [37]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN DISTINCT extract(x in rels(path) | x.property_label) AS types, extract(n in nodes(path) | n.name) AS labels, length(path) AS mp_length, count(distinct path) AS paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.85 ms, sys: 64 µs, total: 4.91 ms
Wall time: 78.6 ms


In [38]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[PNGase-like, uncharacterized protein, Papilin, ADAMTS like 4, aquaporin 1 (Colton blood group)]",16,"[interacts with, interacts with, in orthology relationship with, interacts with]"
1,4,"[PNGase-like, Entomoglyceroporin 4, glycerol kinase 2, glycerol kinase, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
2,4,"[PNGase-like, MYC proto-oncogene, bHLH transcription factor, kayak, Fos proto-oncogene, AP-1 transcription factor subunit, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
3,4,"[PNGase-like, uncharacterized protein, sina homologue, siah E3 ubiquitin protein ligase 1, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
4,4,"[PNGase-like, Entomoglyceroporin 4, glycerol kinase 2, glycerol kinase 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
5,4,"[PNGase-like, uncharacterized protein, uncharacterized protein, glycerol kinase 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
6,4,"[PNGase-like, TweedleG, sina homologue, siah E3 ubiquitin protein ligase 1, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
7,4,"[PNGase-like, Glycogen phosphorylase, Papilin, ADAMTS like 4, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
8,4,"[PNGase-like, uncharacterized protein, Glycerol kinase 1, glycerol kinase 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"
9,4,"[PNGase-like, uncharacterized protein, Glycerol kinase 1, glycerol kinase, aquaporin 1 (Colton blood group)]",8,"[interacts with, interacts with, in orthology relationship with, interacts with]"


    * refine template: rna -> 24 paths
                       metapath:
                       '4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                       [RO:0002434, RO:0002435, RO:HOM0000017, RO:0002434]': 24 paths

In [39]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002435`]-(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 24
CPU times: user 1.65 ms, sys: 130 µs, total: 1.78 ms
Wall time: 218 ms


#### show graph [viz here table]

In [40]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:0002435`]-(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked

        WHERE size(nodes_marked) = 0 AND toLower(i1.reference_uri) contains 'pubmed/29346549'

        RETURN DISTINCT extract(x in rels(path) | x.property_label) AS types, extract(n in nodes(path) | n.name) AS labels, length(path) AS mp_length, count(distinct path) AS paths 
        
        ORDER BY mp_length, paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Nodes': record['labels'], 
                  'Relations': record['types'],
                  'Metapath length': record['mp_length'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.81 ms, sys: 0 ns, total: 3.81 ms
Wall time: 221 ms


In [41]:
res_df

Unnamed: 0,Metapath length,Nodes,Paths,Relations
0,4,"[PNGase-like, heat shock protein 83-1, seven in absentia, siah E3 ubiquitin protein ligase 1, aquaporin 1 (Colton blood group)]",8,"[interacts with, genetically interacts with, in orthology relationship with, interacts with]"
1,4,"[PNGase-like, putative movement protein, Glycerol kinase 1, glycerol kinase 2, aquaporin 1 (Colton blood group)]",8,"[interacts with, genetically interacts with, in orthology relationship with, interacts with]"
2,4,"[PNGase-like, putative movement protein, Glycerol kinase 1, glycerol kinase, aquaporin 1 (Colton blood group)]",8,"[interacts with, genetically interacts with, in orthology relationship with, interacts with]"


---

* Looking for **gene interactors** between RNA genes and TFs of AQP1 (L=4)
    * metapath: reg: 188 -> 90 (~Q1)
                   metapath: 
                   '4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                   [RO:HOM0000020, RO:0002434, RO:0002434, RO:0002434]': 188 paths  
    * template:

In [42]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 188
CPU times: user 1.8 ms, sys: 0 ns, total: 1.8 ms
Wall time: 883 ms


#### Show summary table

In [43]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:HOM0000020`]-(g1:GENE)-[i2:`RO:0002434`]-(g2:GENE)-[i3:`RO:0002434`]-(g3:GENE)-[i4:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,source,g1,g2,g3,target,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT source.name, 
        
                        i1.reference_supporting_text AS Prov1, 
         
                        g1.name,
                        
                        i2.reference_supporting_text AS Prov2, 
                        
                        g2.name,
        
                        i3.reference_supporting_text AS Prov3, 
                        
                        g3.name,
                        
                        i4.reference_supporting_text AS Prov4,
                        
                        target.name,
                        
                        count( path ) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'source_name': record['source.name'],
                  'Provenance_i1': record['Prov1'], 
                  'gene1_name': record['g1.name'],
                  'Provenance_i2': record['Prov2'], 
                  'gene2_name': record['g2.name'],
                  'Provenance_i3': record['Prov3'], 
                  'gene3_name': record['g3.name'],
                  'Provenance_i4': record['Prov4'],
                  'target_name': record['target.name'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 2.86 ms, sys: 3.82 ms, total: 6.68 ms
Wall time: 847 ms


In [44]:
res_df

Unnamed: 0,Paths,Provenance_i1,Provenance_i2,Provenance_i3,Provenance_i4,gene1_name,gene2_name,gene3_name,source_name,target_name
0,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",N-glycanase 1,ribosomal protein S27a,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
1,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.,N-glycanase 1,amyloid beta precursor protein,"Fos proto-oncogene, AP-1 transcription factor subunit",PNGase-like,aquaporin 1 (Colton blood group)
2,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",N-glycanase 1,carbonic anhydrase 9,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
3,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",N-glycanase 1,"proteasome 26S subunit, ATPase 1",tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
4,8,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,N-glycanase 1,CCCTC-binding factor,lamin A/C,PNGase-like,aquaporin 1 (Colton blood group)
5,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",N-glycanase 1,"RAD23 homolog A, nucleotide excision repair protein",tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
6,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",N-glycanase 1,valosin containing protein,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
7,8,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,N-glycanase 1,zinc finger protein 263,EGF containing fibulin extracellular matrix protein 2,PNGase-like,aquaporin 1 (Colton blood group)
8,4,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source.",N-glycanase 1,leucine rich repeat and Ig domain containing 1,lymphoid enhancer binding factor 1,PNGase-like,aquaporin 1 (Colton blood group)
9,4,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source.",N-glycanase 1,"protein tyrosine phosphatase, receptor type O",lymphoid enhancer binding factor 1,PNGase-like,aquaporin 1 (Colton blood group)


    * refine template: reg -> 90 paths
                       metapath (~Q1)
                       '4	[[GENE], [GENE], [GENE], [GENE], [GENE]]
                       [RO:0002434, RO:HOM0000017, RO:0002434, RO:0002434]': 90 paths

In [45]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 90
CPU times: user 1.88 ms, sys: 152 µs, total: 2.04 ms
Wall time: 733 ms


#### show graph [viz here table]

In [46]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]-(g1:GENE)-[i2:`RO:HOM0000017`]-(g2:GENE)-[i3:`RO:0002434`]-(g3:GENE)-[i4:`RO:0002434`]-(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,source,g1,g2,g3,target,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0

        RETURN DISTINCT source.name, 
        
                        i1.reference_supporting_text AS Prov1, 
         
                        g1.name,
                        
                        i2.reference_supporting_text AS Prov2, 
                        
                        g2.name,
        
                        i3.reference_supporting_text AS Prov3, 
                        
                        g3.name,
                        
                        i4.reference_supporting_text AS Prov4,
                        
                        target.name,
                        
                        count( path ) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'source_name': record['source.name'],
                  'Provenance_i1': record['Prov1'], 
                  'gene1_name': record['g1.name'],
                  'Provenance_i2': record['Prov2'], 
                  'gene2_name': record['g2.name'],
                  'Provenance_i3': record['Prov3'], 
                  'gene3_name': record['g3.name'],
                  'Provenance_i4': record['Prov4'],
                  'target_name': record['target.name'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.68 ms, sys: 54 µs, total: 4.74 ms
Wall time: 748 ms


In [47]:
res_df

Unnamed: 0,Paths,Provenance_i1,Provenance_i2,Provenance_i3,Provenance_i4,gene1_name,gene2_name,gene3_name,source_name,target_name
0,8,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",uncharacterized protein,"RAD23 homolog A, nucleotide excision repair protein",tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
1,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",threonine efflux protein,lysozyme,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
2,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",Serpin 77Bb,serpin family B member 5,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
3,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",LysC,lysozyme,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
4,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",Lysozyme D,lysozyme,tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)
5,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",This edge comes from the Monarch Knowledge Graph 2018.,uncharacterized protein,membrane metalloendopeptidase,transcription factor 4,PNGase-like,aquaporin 1 (Colton blood group)
6,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",This edge comes from the Monarch Knowledge Graph 2018.,uncharacterized protein,periostin,transcription factor 4,PNGase-like,aquaporin 1 (Colton blood group)
7,4,This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.","This edge comes from the C3:TFT dataset in ""msigdb"" source.",uncharacterized protein,"RAD23 homolog B, nucleotide excision repair protein",MYC associated zinc finger protein,PNGase-like,aquaporin 1 (Colton blood group)
8,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the ENCODE_ENCFF001UUQ dataset in ""tftargets"" source.",This edge comes from the Monarch Knowledge Graph 2018.,Serpin 77Bb,serpin family E member 1,"Fos proto-oncogene, AP-1 transcription factor subunit",PNGase-like,aquaporin 1 (Colton blood group)
9,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.","MYC proto-oncogene, bHLH transcription factor","MYCN proto-oncogene, bHLH transcription factor",tumor protein p53,PNGase-like,aquaporin 1 (Colton blood group)


---

### Specific query template with < 1000 paths => results viz graphically and interesting summary table (4 guide)
Query template to answer the connection at expression level.

#### Looking for **gene interactors** between RNA genes and TFs of AQP1 ( L=4 )
* Ensure RNA and TF gene positions with conditions:
    * I am imposing rna and reg rel-type and directionality on i1 and i4 to ensure rna and tf identities
    * I am imposing i2: {1 to 1 ortho; RO:HOM0000020} rel which is more restrictive than {in ortho; RO:HOM0000017}

In [48]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 19
CPU times: user 1.31 ms, sys: 105 µs, total: 1.41 ms
Wall time: 132 ms


#### Show edges' provenance

In [49]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,i2,i3,i4,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT i1.reference_supporting_text AS Prov1, 
         
                        i2.reference_supporting_text AS Prov2, 
        
                        i3.reference_supporting_text AS Prov3, 
                        
                        i4.reference_supporting_text AS Prov4,
                        
                        count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Provenance_i1': record['Prov1'], 
                  'Provenance_i2': record['Prov2'], 
                  'Provenance_i3': record['Prov3'], 
                  'Provenance_i4': record['Prov4'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.85 ms, sys: 0 ns, total: 3.85 ms
Wall time: 87.9 ms


In [50]:
res_df

Unnamed: 0,Paths,Provenance_i1,Provenance_i2,Provenance_i3,Provenance_i4
0,15,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source."
1,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source."


In [51]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,i1,i2,i3,i4,rna,ortholog_rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT i1.reference_supporting_text AS Prov1, 
        
                        rna.name,
         
                        i2.reference_supporting_text AS Prov2, 
                        
                        ortholog_rna.name,
        
                        i3.reference_supporting_text AS Prov3, 
                        
                        tf.name,
                        
                        i4.reference_supporting_text AS Prov4,
                        
                        count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Provenance_i1': record['Prov1'], 
                  'Expressed_gene_name': record['rna.name'],
                  'Provenance_i2': record['Prov2'],
                  'Expressed_ortholog_name': record['ortholog_rna.name'],
                  'Provenance_i3': record['Prov3'],
                  'TF_name': record['tf.name'],
                  'Provenance_i4': record['Prov4'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 722 µs, sys: 3.75 ms, total: 4.47 ms
Wall time: 92.2 ms


In [52]:
res_df

Unnamed: 0,Expressed_gene_name,Expressed_ortholog_name,Paths,Provenance_i1,Provenance_i2,Provenance_i3,Provenance_i4,TF_name
0,26S protease regulatory subunit 6A,"proteasome 26S subunit, ATPase 3",4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
1,Proteasome alpha6 subunit,proteasome subunit alpha 1,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
2,Proteasome alpha3 subunit,proteasome subunit alpha 4,4,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
3,"MYC proto-oncogene, bHLH transcription factor","MYC proto-oncogene, bHLH transcription factor",2,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",lymphoid enhancer binding factor 1
4,"MYC proto-oncogene, bHLH transcription factor","MYC proto-oncogene, bHLH transcription factor",2,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the C3:TFT dataset in ""msigdb"" source.",TATA-box binding protein
5,refractory to sigma P,sequestosome 1,2,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53
6,"MYC proto-oncogene, bHLH transcription factor","MYC proto-oncogene, bHLH transcription factor",1,"To understand how loss of NGLY1 contributes to disease, we developed a Drosophila model of NGLY1 deficiency. Loss of NGLY1 function resulted in developmental delay and lethality. We used RNAseq to determine which processes are misregulated in the absence of NGLY1.",This edge comes from the Monarch Knowledge Graph 2018.,This edge comes from the Monarch Knowledge Graph 2018.,"This edge comes from the TRED dataset in ""tftargets"" source.",tumor protein p53


#### Show interesting summary table with rna and tf data

In [53]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN count(distinct rna) as rnas, count(distinct tf) as tfs, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Expressed_genes': record['rnas'], 
                  'TFs': record['tfs'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 2.92 ms, sys: 235 µs, total: 3.15 ms
Wall time: 91 ms


In [54]:
res_df

Unnamed: 0,Expressed_genes,Paths,TFs
0,5,19,3


In [55]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT rna.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 0 ns, sys: 3.71 ms, total: 3.71 ms
Wall time: 90.2 ms


In [56]:
res_df

Unnamed: 0,Expressed_gene_name,Paths
0,"MYC proto-oncogene, bHLH transcription factor",5
1,Proteasome alpha6 subunit,4
2,Proteasome alpha3 subunit,4
3,26S protease regulatory subunit 6A,4
4,refractory to sigma P,2


In [57]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'TF_name': record['tf.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 2.98 ms, sys: 0 ns, total: 2.98 ms
Wall time: 87.2 ms


In [58]:
res_df

Unnamed: 0,Paths,TF_name
0,15,tumor protein p53
1,2,TATA-box binding protein
2,2,lymphoid enhancer binding factor 1


In [59]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT rna.name, tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'], 
                  'TF_name': record['tf.name'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 90 ms


In [60]:
res_df

Unnamed: 0,Expressed_gene_name,Paths,TF_name
0,Proteasome alpha3 subunit,4,tumor protein p53
1,26S protease regulatory subunit 6A,4,tumor protein p53
2,Proteasome alpha6 subunit,4,tumor protein p53
3,refractory to sigma P,2,tumor protein p53
4,"MYC proto-oncogene, bHLH transcription factor",2,TATA-box binding protein
5,"MYC proto-oncogene, bHLH transcription factor",2,lymphoid enhancer binding factor 1
6,"MYC proto-oncogene, bHLH transcription factor",1,tumor protein p53


#### The same query, but with less restrictive orthology condition on i2: {in ortho; HOM0000017}

In [61]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000017`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT rna.name, tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'], 
                  'TF_name': record['tf.name'],
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 70.2 ms


In [62]:
res_df

Unnamed: 0,Expressed_gene_name,Paths,TF_name
0,Serpin 77Bb,6,tumor protein p53
1,Lysozyme B,4,tumor protein p53
2,Lysozyme D,4,tumor protein p53
3,"MYC proto-oncogene, bHLH transcription factor",4,tumor protein p53
4,LysC,4,tumor protein p53
5,Heat shock protein 68,4,tumor protein p53
6,Carbonic anhydrase 2,4,lymphoid enhancer binding factor 1
7,Carbonic anhydrase 2,4,tumor protein p53
8,threonine efflux protein,4,tumor protein p53
9,Carbonic anhydrase 2,2,protein E12


This less restrictive query returns 42 paths. Bit diff info regarding rna and tf genes. But, for the guide maybe better the other (prior) one.

---
## Question 2
Input rna + tf genes interactors to look for NGLY1 phenotypes
* 1 to 1 orthology

In [63]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'      

        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 30
CPU times: user 1.57 ms, sys: 126 µs, total: 1.7 ms
Wall time: 181 ms


In [64]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000020`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'      

        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.39 ms, sys: 30 µs, total: 4.42 ms
Wall time: 187 ms


In [65]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[sequestosome 1, MYC proto-oncogene, bHLH transcription factor, tumor protein p53, TATA-box binding protein]",4,HP:0002015,Premature spillage
1,"[tumor protein p53, TATA-box binding protein, sequestosome 1]",3,HP:0001310,Dysmetria
2,"[TATA-box binding protein, sequestosome 1]",2,HP:0001336,Myoclonus
3,"[sequestosome 1, TATA-box binding protein]",2,HP:0002072,choreatic disease
4,"[TATA-box binding protein, sequestosome 1]",2,HP:0001272,Cerebellar atrophy
5,"[sequestosome 1, TATA-box binding protein]",2,HP:0001332,dystonia
6,"[tumor protein p53, TATA-box binding protein]",2,HP:0001250,epileptic seizure
7,[sequestosome 1],1,HP:0000657,Oculomotor apraxia
8,[sequestosome 1],1,HP:0009830,peripheral nervous system disease
9,[tumor protein p53],1,HP:0002910,Elevated transaminases


* in orthology

In [66]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000017`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'   

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(:DISO)--(g:GENE)
        
        WHERE g.id in genes
        
        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 6
CPU times: user 1.65 ms, sys: 131 µs, total: 1.78 ms
Wall time: 214 ms


In [67]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[i2:`RO:HOM0000017`]-(ortholog_rna:GENE)-[i3:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        AND toLower(i4.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'      

        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT tf.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 0 ns, sys: 4.06 ms, total: 4.06 ms
Wall time: 209 ms


In [68]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,[tumor protein p53],1,HP:0002015,Premature spillage
1,[tumor protein p53],1,HP:0002910,Elevated transaminases
2,[tumor protein p53],1,HP:0001310,Dysmetria
3,[tumor protein p53],1,HP:0001250,epileptic seizure


In orthology relationship, which is less restrictive, gives less phenotypic results. So, this is not interesting because we are loosing precision and information quality.

---
# Further analysis of Q1, without rule on i4 to impose tf position

### Specific query template with < 1000 paths => results viz graphically
Query template to answer the connection at expression level.

* i1:in orthology relationship + interacts with (no rule on i4)

In [69]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 54
CPU times: user 1.27 ms, sys: 103 µs, total: 1.38 ms
Wall time: 120 ms


* i1:in 1 to 1 orthology + interacts with (no rule on i4) + rna **AND** regulatory edges (rule)

In [70]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 

        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 30
CPU times: user 2.33 ms, sys: 188 µs, total: 2.52 ms
Wall time: 88.2 ms


    * interesting summary table: tf and expressed genes
        * expressed genes summary

In [71]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(:GENE)-[:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 

        RETURN DISTINCT rna.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'Expressed_gene_name': record['rna.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 0 ns, sys: 2.79 ms, total: 2.79 ms
Wall time: 84.4 ms


In [72]:
res_df

Unnamed: 0,Expressed_gene_name,Paths
0,"MYC proto-oncogene, bHLH transcription factor",14
1,Proteasome alpha6 subunit,4
2,Proteasome alpha3 subunit,4
3,26S protease regulatory subunit 6A,4
4,refractory to sigma P,2
5,uncharacterized protein,2


        * summary of TFs:  I HAVE TO MAKE A RULE ON i4!!! 30 => 19 paths

In [73]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:HOM0000020`]-(:GENE)-[:`RO:0002434`]-(tf:GENE)-[i4:`RO:0002434`]->(target:GENE)

        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,tf,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0 
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549'
        
        AND i4.reference_supporting_text =~ '.*tftargets.*|.*msigdb.*'

        RETURN DISTINCT tf.name, count(distinct path) as paths
        
        ORDER BY paths DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'TF_name': record['tf.name'], 
                  'Paths': record['paths']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 0 ns, sys: 3.41 ms, total: 3.41 ms
Wall time: 84.9 ms


In [74]:
res_df

Unnamed: 0,Paths,TF_name
0,15,tumor protein p53
1,2,TATA-box binding protein
2,2,lymphoid enhancer binding factor 1


---
## Question 2

* Explore Q2 without imposing node position for the TF

In [75]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 

        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN count(distinct path) as paths
        """
)

# run query
result = runQuery( driver, query )

# parse results
for record in result:
    print('Paths: {}'.format(record['paths']))

Paths: 42
CPU times: user 0 ns, sys: 3.02 ms, total: 3.02 ms
Wall time: 258 ms


In [76]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 5.33 ms, sys: 0 ns, total: 5.33 ms
Wall time: 236 ms


In [77]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[Fos proto-oncogene, AP-1 transcription factor subunit, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001249,intellectual disability
1,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0001263,Developmental disability
2,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0000252,microcephaly
3,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0002910,Elevated transaminases
4,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001310,Dysmetria
5,"[lysozyme, Fos proto-oncogene, AP-1 transcription factor subunit]",2,HP:0002240,hepatomegaly
6,"[transcription factor 4, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0000463,Anteverted nares
7,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
8,"[MYCN proto-oncogene, bHLH transcription factor]",1,HP:0001336,Myoclonus
9,[lysozyme],1,HP:0001744,Splenomegaly


* collecting only reg genes

In [78]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.62 ms, sys: 0 ns, total: 3.62 ms
Wall time: 150 ms


In [79]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
1,"[Fos proto-oncogene, AP-1 transcription factor subunit]",1,HP:0001249,intellectual disability
2,[transcription factor 4],1,HP:0001263,Developmental disability
3,[tumor protein p53],1,HP:0002015,Premature spillage
4,[transcription factor 4],1,HP:0000252,microcephaly
5,"[Fos proto-oncogene, AP-1 transcription factor subunit]",1,HP:0009830,peripheral nervous system disease
6,[transcription factor 4],1,HP:0002650,scoliosis
7,[tumor protein p53],1,HP:0002910,Elevated transaminases
8,[transcription factor 4],1,HP:0002019,constipation
9,[tumor protein p53],1,HP:0001310,Dysmetria


* only mondo in the match

In [80]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes   
        
        MATCH path=(:DISO {id: 'MONDO:0014109'})-->(ph:DISO)--(g:GENE)
           
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 2.54 ms, sys: 209 µs, total: 2.75 ms
Wall time: 149 ms


In [81]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0001263,Developmental disability
1,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0000252,microcephaly
2,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0002910,Elevated transaminases
3,"[lysozyme, Fos proto-oncogene, AP-1 transcription factor subunit]",2,HP:0002240,hepatomegaly
4,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
5,"[Fos proto-oncogene, AP-1 transcription factor subunit]",1,HP:0000975,Hyperhidrosis
6,[transcription factor 4],1,HP:0002650,scoliosis
7,[transcription factor 4],1,HP:0200055,Small hand
8,[transcription factor 4],1,HP:0001252,Hypotonia
9,"[MYCN proto-oncogene, bHLH transcription factor]",1,HP:0001945,Fever


* including mondo in the match

In [82]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        OPTIONAL MATCH (:DISO {id: 'MONDO:0014109'})-->(ph:DISO)--(g:GENE)
        
        //WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 255 ms


In [83]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[Fos proto-oncogene, AP-1 transcription factor subunit, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001249,intellectual disability
1,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0001263,Developmental disability
2,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0000252,microcephaly
3,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0002910,Elevated transaminases
4,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001310,Dysmetria
5,"[lysozyme, Fos proto-oncogene, AP-1 transcription factor subunit]",2,HP:0002240,hepatomegaly
6,"[transcription factor 4, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0000463,Anteverted nares
7,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
8,"[MYCN proto-oncogene, bHLH transcription factor]",1,HP:0001336,Myoclonus
9,[lysozyme],1,HP:0001744,Splenomegaly


* including fly rna genes

In [84]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes AND ph.id CONTAINS 'HP:'
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 4.38 ms, sys: 0 ns, total: 4.38 ms
Wall time: 322 ms


In [85]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[Fos proto-oncogene, AP-1 transcription factor subunit, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001249,intellectual disability
1,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0001263,Developmental disability
2,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0000252,microcephaly
3,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0002910,Elevated transaminases
4,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001310,Dysmetria
5,"[lysozyme, Fos proto-oncogene, AP-1 transcription factor subunit]",2,HP:0002240,hepatomegaly
6,"[transcription factor 4, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0000463,Anteverted nares
7,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
8,"[MYCN proto-oncogene, bHLH transcription factor]",1,HP:0001336,Myoclonus
9,[lysozyme],1,HP:0001744,Splenomegaly


* including fly rna genes and all DISO (no rule)

In [86]:
%%time
query = (
        """
        MATCH path=(source:GENE)-[i1:`RO:0002434`]->(rna:GENE)-[:`RO:HOM0000017`]-(ortholog_rna:GENE)-[:`RO:0002434`]-(reg:GENE)-[:`RO:0002434`]->(target:GENE)
        
        WHERE source.id = 'FlyBase:FBgn0033050' AND target.id = 'HGNC:633' AND ALL(x IN nodes(path) WHERE single(y IN nodes(path) WHERE y = x))

        WITH path,rna,ortholog_rna,reg,

        [n IN nodes(path) WHERE n.preflabel IN ['cytoplasm','cytosol','nucleus','metabolism','membrane','protein binding','visible','viable','phenotype']] AS nodes_marked,

        [r IN relationships(path) WHERE toLower(r.reference_supporting_text) =~ '.*tftargets.*|.*msigdb.*'] AS interactions

        WHERE size(nodes_marked) = 0 AND size(interactions) <> 0      
        
        AND toLower(i1.reference_uri) contains 'pubmed/29346549' 
        
        WITH COLLECT(DISTINCT rna.id) + COLLECT(DISTINCT ortholog_rna.id) + COLLECT(DISTINCT reg.id) AS genes
        
        MATCH path=(:DISO {id: 'DOID:0060728'})-->(ph:DISO)--(g:GENE)
        
        WHERE g.id in genes 
        
        RETURN DISTINCT ph.id AS phenotype_id, ph.preflabel AS phenotype_label, COLLECT(DISTINCT g.name) AS gene_id_list, count(DISTINCT g.id) AS genes
        
        ORDER BY genes DESC
        """
)

# run query
result = runQuery( driver, query )

# parse results
out_l = list()
for record in result:
    out_l.append({'phenotype_label': record['phenotype_label'],
                  'phenotype_id': record['phenotype_id'],
                  'gene_id_list': record['gene_id_list'],
                  'genes': record['genes']})
    
res_df = pd.DataFrame(out_l)

CPU times: user 3.21 ms, sys: 0 ns, total: 3.21 ms
Wall time: 316 ms


In [87]:
res_df

Unnamed: 0,gene_id_list,genes,phenotype_id,phenotype_label
0,"[Fos proto-oncogene, AP-1 transcription factor subunit, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001249,intellectual disability
1,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0001263,Developmental disability
2,"[MYCN proto-oncogene, bHLH transcription factor, transcription factor 4]",2,HP:0000252,microcephaly
3,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0002910,Elevated transaminases
4,"[tumor protein p53, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0001310,Dysmetria
5,"[lysozyme, Fos proto-oncogene, AP-1 transcription factor subunit]",2,HP:0002240,hepatomegaly
6,"[transcription factor 4, MYCN proto-oncogene, bHLH transcription factor]",2,HP:0000463,Anteverted nares
7,"[tumor protein p53, transcription factor 4]",2,HP:0001250,epileptic seizure
8,"[MYCN proto-oncogene, bHLH transcription factor]",1,HP:0001336,Myoclonus
9,[lysozyme],1,HP:0001744,Splenomegaly
