In [None]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Load modules

   * [pandas](https://pandas.pydata.org/)
   * [python-igraph](http://igraph.org/python/)
   * [py2neo](http://py2neo.org/2.0/intro.html)
   * [visJS2jupyter](https://ucsd-ccbb.github.io/visJS2jupyter/)

In [None]:
import pandas as pd
import igraph as ig
from py2neo import Graph, Node, Relationship
import visJS2jupyter.visJS_module

# The graph
See http://localhost:7474/browser/ for the neo4j interface. 

The graph has the following structure:

<p align="center">
    <span style="color:red">(BABY)</span>
    <span style="color:blue">   –[SAMPLELINK]&rarr;   </span>
    <span style="color:red">(SAMPLE)</span>
    <span style="color:blue">   –[PROTEINLINK]&rarr;   </span>
    <span style="color:red">(PROTEIN)</span>
    <span style="color:blue">   –[FUNCTIONLINK]&rarr;   </span>
    <span style="color:red">(REACTION)</span>
</p>

and: 

<p align="center">
    <span style="color:red"> (COMPOUND) </span>
    <span style="color:blue">   –[SUBSTRATE]&rarr;   </span>
    <span style="color:red">(REACTION)</span>
    <span style="color:blue">   –[PRODUCT]&rarr;   </span>
    <span style="color:red"> (COMPOUND) </span>
</p>

where red are nodes and blue are edges. 

---

The attributes for each of the above objects are as follows: 
* **BABY**
  * id: *string*, unique
  * NAME: *string*, same as id
  * NEC: *string*
  * NEC_DIAGNOSIS_DOL: *int*	
  * BIRTH_AGE: *int*	
  * FEEDING: *string*	
  * DELIVERY: *string*
  * BIRTH_WEIGHT: *float*
  * INFECTION: *string*
  * INFECTION_DIAGNOSIS_DOL: *list*
  * SEX: *string*
  * ANTIBIOTIC_TREATMENT: *list*


* **SAMPLE**
  * id: *string*, unique \<"BABY"\>.\<"DAY"\>.\<"SAMPLE"\>
  * NAME: *string*, same as id
  * DAY: *int*
  * GESTATION_WEEK: *int*
  * BABY: *int*
  * NUM_REPS: *int*


* **PROTEIN**
  * id: *string*, unique
  * NAME: *string*, same as id
  * CLUSTER: *string*
  * GENUS: *string* 
  * SPECIES: *string*
  * TAXA: *string*
  * KO: *list*
  

* **REACTION** 
  * id: *string*, unique
  * NAME: *string*
  * SYNONYMS: *list*
  * EC_NUMBERS: *list*
  * EQUATION: *string*
  * NAME_EQUATION: *string*
  * PATHWAY: *list*
  * DIRECTION_FROM_METACYC: *string*


* **COMPOUND**
  * id: *string*, unique
  * NAME: *string*
  * SYNONYMS: *list*
  * FORMULA: *string*
  * PATHWAY: *list*
  * BRITE_HIERARCHY: *list*


* –[**SAMPLELINK**]&rarr;



* –[**PROTEINLINK**]&rarr;
  * Q_VALUE: *float*
  * NSAF: *float*


* –[**FUNCTIONLINK**]&rarr;
  * KO: *string*



* –[**PRODUCT**]&rarr;
  * STOICHIOMETRY: *string*


* –[**SUBSTRATE**]&rarr;
  * STOICHIOMETRY: *string*

---
# Query graph

The graph can be queried using CYPHER in two ways:

## 1. py2neo
Write CYPHER statements and receive a cursor back.

## 2. GUI
Use the CYPHER input at the database browser GUI.  


---
## REMEMBER: 

* `n.id`  != `ID(n)`

  `n.id` is the unique ID given to a compound, reaction, etc by us. 
  
  `ID(n)` is a unique integer ID given to each object in the neo4j graph by neo4j. 


* A `cursor` created by a `graph.run` command is 

   >*"forward-only", meaning that navigation starts before the first record and may proceed only in a forward direction.*

# Access the graph through py2neo

In [None]:
# Connect to the Neo4j server at hostname "neo4j"; every `graph.run(...)`
# call in this notebook goes through this handle.
graph = Graph(host="neo4j")

# Example CYPHER queries

These queries can be pasted into the neo4j interface or run through py2neo.

In [None]:
# Reactions with acetate (C00033) as substrate
query = '''MATCH (c:COMPOUND)-[:SUBSTRATE]->(r:REACTION) 
           WHERE c.id = 'C00033' 
           RETURN r.id AS reaction, c.id AS compound
           LIMIT 5'''
print(query)

In [None]:
# find reactions linked to certain pathways
pathway_list = ['rn00710', 'rn00791', 'rn00770']
query = '''MATCH (r:REACTION) 
           WHERE any(x IN r.PATHWAY WHERE x IN %s) 
           RETURN r 
           LIMIT 5'''%pathway_list
print(query)

## Run by submitting a query to py2neo

In [None]:
# Reactions and Proteins linked to acetate (C00033)
query = '''MATCH (c:COMPOUND)-[]-(r:REACTION)<-[:FUNCTIONLINK]-(p:PROTEIN) 
           WHERE c.id = 'C00033' 
           RETURN r.id AS reaction, p.id as protein
           LIMIT 5'''
print(query)

In [None]:
cursor = graph.run(query)
for record in cursor:
    print(record['reaction'], '\t', record['protein'])

# Use baby metadata to select samples

## Samples from NEC babies


In [None]:
query = '''MATCH (b:BABY)-[:SAMPLELINK]->(s:SAMPLE)
           WHERE b.NEC = 'Yes' 
           RETURN s.id as sample
           LIMIT 5'''
print(query)

In [None]:
cursor = graph.run(query)
for record in cursor:
    print(record['sample'])

## Baby 70,  Day 20

In [None]:
# Ids of the samples linked to baby '70' whose DAY equals '20' (at most 5 rows).
# NOTE(review): DAY is compared with the string '20', while the attribute list
# in this notebook describes SAMPLE.DAY as an int -- confirm the stored type.
query = '''MATCH (b:BABY)-[:SAMPLELINK]->(s:SAMPLE)
           WHERE b.id = '70' 
           AND s.DAY = '20'
           RETURN s.id as sample
           LIMIT 5'''
print(query)

In [None]:
cursor = graph.run(query)
for record in cursor:
    print(record['sample'])

## All reactions for all samples on day 20

In [None]:
# Ids of the samples whose DAY equals '20' (at most 5 rows for the example).
# NOTE(review): DAY is compared with the string '20', while the attribute list
# in this notebook describes SAMPLE.DAY as an int -- confirm the stored type.
query = '''MATCH (s:SAMPLE)
           WHERE s.DAY = '20'
           RETURN s.id as sample
           LIMIT 5'''
print(query)

In [None]:
cursor = graph.run(query)
sample_list = [record["sample"] for record in cursor]
sample_list

In the following queries, using `{x}` in the query allows py2neo to replace it
with some argument; in this case we will replace it with the above list of sample ids. 

Note the line `LIMIT 10`. It is there only so that the example code runs quickly; don't include it in a real call. 

In [None]:
# Proteins and reactions reachable from the samples whose ids are passed in as
# parameter `x`. `s.id` is a single string, so a plain membership test is used
# (the same form `pull_graph` uses) instead of iterating over it with `any(...)`.
query = '''MATCH (s:SAMPLE)-[:PROTEINLINK]->(p:PROTEIN)-[:FUNCTIONLINK]->(r:REACTION)
           WHERE s.id IN {x}
           RETURN p.id as protein, r.id as reaction
           LIMIT 10 '''
print(query)

In [None]:
cursor = graph.run(query, x=sample_list)
for p in cursor:
    print(p['protein'], p["reaction"])

In [None]:
# Now find links to reactions producing acetate (C00033), restricted to the
# samples whose ids are passed in as parameter `x`. `s.id` is a single string,
# so it is matched with a plain `IN` test rather than wrapped in `any(...)`.
query = '''MATCH (s:SAMPLE)-[:PROTEINLINK]->(p:PROTEIN)-[f:FUNCTIONLINK]->(r:REACTION)-[:PRODUCT]->(c:COMPOUND)
           WHERE c.id = 'C00033' 
           AND s.id IN {x}
           RETURN r.id AS reaction, p.id as protein, f.KO AS KO
           LIMIT 25'''
print(query)

In [None]:
# duplicates are due to multiple proteins linking to the same reactions
cursor = graph.run(query, x=sample_list)
for record in cursor:
    print(record['reaction'], record['protein'], record['KO'])

In [None]:
# dump the cursor this time into a dataframe
# and remove duplicates
cursor = graph.run(query, x=sample_list)
df = pd.DataFrame(cursor.data()).drop_duplicates()
df.head()

# Turn a query result into an igraph graph

In [None]:
# Build an undirected protein--reaction graph from the query result.
# TupleList takes the first two items of every row as the edge endpoints, and
# `df` also holds a KO column, so the endpoint columns are selected by name
# instead of relying on the order in which pandas happened to lay them out.
# `reindex(columns=...)` also copes with an empty result (no columns at all).
g = ig.Graph.TupleList(df.reindex(columns=['protein', 'reaction']).values, directed=False)

In [None]:
g.vcount(), g.ecount()

In [None]:
layout = g.layout("kk")

# Pull graphs based on sample list

Given a list of samples, and list of compounds, pull out graph of reactions to compounds. 

In [None]:
def pull_graph(sample_list, compounds_list):
    """Build an undirected reaction--compound graph for a set of samples.

    Three queries are run through the module-level py2neo ``graph``:

    1. find the reactions that are linked (through a protein) to a sample in
       ``sample_list`` and are connected to a compound in ``compounds_list``;
    2. fetch every compound one hop away from those reactions;
    3. fetch the first label and the ``NAME`` of every node in the result.

    Parameters
    ----------
    sample_list : list
        SAMPLE ``id`` values to start from.
    compounds_list : list
        COMPOUND ``id`` values the reactions must be connected to.

    Returns
    -------
    igraph.Graph
        Undirected graph. Each vertex carries ``name`` (the node ``id``),
        ``type`` (first label of the node) and ``display_name`` (its ``NAME``).
        When no reaction matches, an empty graph without these attributes is
        returned.

    Raises
    ------
    KeyError
        If a vertex id is not returned by the attribute query.
    """
    query = '''MATCH (s:SAMPLE)-[:PROTEINLINK]->(:PROTEIN)-[:FUNCTIONLINK]->(r:REACTION)-[]-(c:COMPOUND)
               WHERE s.id in {x}
               AND c.id in {y}
               RETURN r.id AS reaction'''

    cursor = graph.run(query, x=sample_list, y=compounds_list)
    # Many proteins map to the same reaction; keep each reaction id once.
    reactions = list({record["reaction"] for record in cursor})

    # Nothing matched: the follow-up queries would only yield empty frames
    # (and the attribute lookup below would fail), so stop here.
    if not reactions:
        return ig.Graph()

    # one hop from the reactions
    query = '''MATCH (c:COMPOUND)-[]-(r:REACTION)
               WHERE r.id IN {x}
               RETURN r.id AS reaction, c.id as compound'''

    cursor = graph.run(query, x=reactions)
    df_hop_from_reaction = pd.DataFrame(cursor.data())
    if df_hop_from_reaction.empty:
        return ig.Graph()

    # Select the endpoint columns by name so the edge definition does not
    # depend on the column order pandas chose for the records.
    edge_rows = df_hop_from_reaction[['reaction', 'compound']].itertuples(index=False, name=None)
    g = ig.Graph.TupleList(edge_rows, directed=False)

    # Get the "type" and "display_name" of each of the nodes
    all_node_ids = g.vs['name']

    query = '''MATCH (n)
               WHERE n.id IN {x}
               RETURN n.id AS id, LABELS(n)[0] AS type, n.NAME as display_name '''

    cursor = graph.run(query, x=all_node_ids)
    vertex_attributes = pd.DataFrame(cursor.data()).set_index('id')

    # If several nodes share an id, a label lookup would return several rows;
    # keep the first so every vertex gets exactly one scalar per attribute.
    vertex_attributes = vertex_attributes[~vertex_attributes.index.duplicated(keep='first')]

    # Align the attribute rows with the vertex order and assign them in bulk.
    vertex_attributes = vertex_attributes.loc[all_node_ids]
    g.vs['type'] = vertex_attributes['type'].tolist()
    g.vs['display_name'] = vertex_attributes['display_name'].tolist()

    return g

In [None]:
# Read the compound ids from the first column of the first sheet (no header row).
# The raw frame gets its own name so that `compounds_list` is only ever a list,
# and empty cells are dropped before the surrounding whitespace is stripped.
compounds_raw = pd.read_excel("../data/CompoundsOfInterest.xlsx", header=None, sheet_name=0)
compounds_list = compounds_raw[0].dropna().astype(str).str.strip().tolist()

In [None]:
print(sample_list)
print(compounds_list)

In [None]:
g = pull_graph(sample_list, compounds_list)

In [None]:
g = g.simplify()

In [None]:
g.vcount(), g.ecount()

# Visualize a graph using visJS2jupyter

According to https://github.com/ucsd-ccbb/visJS2jupyter/issues/21 and https://github.com/ucsd-ccbb/visJS2jupyter/issues/22, a workaround is needed to plot inline. 
Instead, write the plot to an HTML file and open that file externally. 

In [None]:
def prepare_plot_igraph(g, 
                        layout='fruchterman_reingold', 
                        positions=None, 
                        scale=100, 
                        edge_cmap=plt.cm.tab20c, 
                        vertex_cmap=plt.cm.tab20):
    """Convert an igraph graph into node and edge dict lists for plotting.

    Parameters
    ----------
    g : igraph.Graph
        Graph whose vertices have the attributes ``name``, ``display_name``
        and ``type``.
    layout : str
        Name of the igraph layout to compute when ``positions`` is not given.
    positions : sequence of coordinate pairs, optional
        Precomputed vertex positions, indexed by vertex index.
    scale : number
        Factor applied to both coordinates of every position.
    edge_cmap : matplotlib colormap
        Currently unused; kept so existing calls keep working.
    vertex_cmap : matplotlib colormap
        Colormap from which one colour per distinct vertex ``type`` is drawn.

    Returns
    -------
    (list of dict, list of dict)
        ``nodes_dict`` with keys ``id``, ``name``, ``x``, ``y``, ``degree``
        and ``color`` (one entry per vertex, in vertex-index order), and
        ``edges_dict`` with keys ``source`` and ``target`` holding vertex
        indices.
    """
    # calculate layout if needed
    if positions is None:
        positions = g.layout(layout)

    # color vertices
    # The distinct types are sorted so that each type is given the same colour
    # on every run; iterating a bare set of strings has no stable order.
    vertex_types = sorted(set(g.vs['type']), key=str)
    n_types = len(vertex_types)
    vertex_cmap_values = vertex_cmap(np.arange(n_types) / max(n_types, 1), alpha=1)
    d_vertex = {att: matplotlib.colors.rgb2hex(row)
                for att, row in zip(vertex_types, vertex_cmap_values)}

    # Degrees of all vertices, computed once and indexed by vertex index.
    degrees = g.degree()

    nodes_dict = [{"id": v['name'],
                   "name": v['display_name'],
                   "x": positions[v.index][0]*scale,
                   "y": positions[v.index][1]*scale,
                   "degree": degrees[v.index],
                   "color": d_vertex[v['type']]
                  } for v in g.vs]

    edges_dict = [{"source": e.source,
                   "target": e.target
                  } for e in g.es]

    return nodes_dict, edges_dict

In [None]:
nodes_dict, edges_dict = prepare_plot_igraph(g, scale=10)

In [None]:
# plot network
# Render the node/edge dicts with visJS2jupyter and keep the `.data` of the
# returned object in `html`. Nodes are sized by their 'degree' entry and
# labelled with their 'name' entry.
# NOTE(review): `edge_label_field='type'` names a key that the edge dicts built
# by `prepare_plot_igraph` do not contain (they only have 'source'/'target');
# confirm how visjs_network treats a missing label field.
html = visJS2jupyter.visJS_module.visjs_network(nodes_dict,
                                         edges_dict, 
                                         node_size_multiplier=30,
                                         node_size_field='degree',
                                         edge_label_field='type',
                                         node_label_field='name',
                                         edge_width=5, 
                                         edge_arrow_to=False,
                                         edge_color_highlight='#8A324E',
                                         edge_color_hover='red',
                                         graph_id=0).data

In [None]:
# Write the generated page to disk so it can be opened in a browser.
# The encoding is given explicitly so that non-ASCII characters in the page
# are written the same way whatever the platform's default encoding is.
with open("example_graph.html", "w", encoding="utf-8") as out:
    out.write(html)