In [None]:
# @name
# @description
# @author
# @date

# Description

This is the notebook for the creation of the first review network and derived hypotheses. 

* Using intermediate variables from workflow objects: in this workflow, the variables produced by one step are used directly by the next step. 


* Review network: From the Monarch knowledge graph, we built a network seeded by 8 nodes, retrieving their explicit relationships and all the relationships among all of these nodes. Seed nodes:

    - 'MONDO:0014109', # NGLY1 deficiency
    - 'HGNC:17646', # NGLY1 human gene
    - 'HGNC:633', # AQP1 human gene
    - 'MGI:103201', # AQP1 mouse gene
    - 'HGNC:7781', # NRF1 human gene* Ginger: known as NFE2L1. http://biogps.org/#goto=genereport&id=4779
    - 'HGNC:24622', # ENGASE human gene
    - 'HGNC:636', # AQP3 human gene
    - 'HGNC:19940' # AQP11 human gene
    

* Connecting paths: query templates.

In [1]:
#import time
import os

import transcriptomics, regulation, curation, monarch, graph, neo4jlib, hypothesis, summary, utils

## Edges library
### Review edges to integrate into the knowledge graph
#### import transcriptomics
We retrieved edges from RNA-seq transcriptomics profiles using the transcriptomics module:

    - Experimental data sets: from Chow et al. paper [pmid:29346549] (NGLY1 deficiency model on fruit fly)

In [None]:
%%time
# Build the transcriptomics network. Each step consumes the result of the previous one.
# NOTE: `data` and `data_edges` are notebook-global names that later cells overwrite.
# prepare data to graph schema
# input table; the directory name points at the Chow et al. 2018 fly data set mentioned above
csv_path = '~/workspace/ngly1-graph/regulation/ngly1-fly-chow-2018/data/supp_table_1.csv'
data = transcriptomics.read_data(csv_path)
clean_data = transcriptomics.clean_data(data)
data_edges = transcriptomics.prepare_data_edges(clean_data)
rna_network = transcriptomics.prepare_rna_edges(data_edges)

# build network with graph schema
# rna_edges / rna_nodes are reused further down when the full graph is assembled
rna_edges = transcriptomics.build_edges(rna_network)
rna_nodes = transcriptomics.build_nodes(rna_network)

The network is returned both as CSV files in graph/ and as a digital object.

In [None]:
# container types of the transcriptomics edges and nodes
print('type edges:', type(rna_edges))
print('type nodes:', type(rna_nodes), end='\n\n')

# number of records in each container
print('len edges:', len(rna_edges))
print('len nodes:', len(rna_nodes), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', rna_edges[0].keys())
print('attribute nodes:', rna_nodes[0].keys())

#### import regulation

In [None]:
%%time
# prepare msigdb data
gmt_path = '/home/nuria/workspace/ngly1-graph/regulation/msigdb/data/c3.tft.v6.1.entrez.gmt'
regulation.prepare_msigdb_data(gmt_path)

# prepare individual networks
data = regulation.load_tf_gene_edges()
dicts = regulation.get_gene_id_normalization_dictionaries(data)
data_edges = regulation.prepare_data_edges(data, dicts)

# prepare regulation network
reg_network = regulation.prepare_regulation_edges(data_edges)

# build network with graph schema
reg_edges = regulation.build_edges(reg_network)
reg_nodes = regulation.build_nodes(reg_network)

The network is returned both as a CSV file in graph/ and as a digital object.

In [None]:
# container types of the regulation edges and nodes
print('type edges:', type(reg_edges))
print('type nodes:', type(reg_nodes), end='\n\n')

# number of records in each container
print('len edges:', len(reg_edges))
print('len nodes:', len(reg_nodes), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', reg_edges[0].keys())
print('attribute nodes:', reg_nodes[0].keys())

#### import curation

In [None]:
#%%time
# NOTE: this whole cell is disabled (every line is commented out). It is kept as
# the wikibase-based alternative to the active curation cell that follows.
## version for wikibase (data will come from wikibase dump neo4j CSV)
## prepare curated edges and nodes
#csv_path = '/home/nuria/workspace/ngly1-graph/regulation/graph/curated_v20180118'
#edges_df, nodes_df = curation.read_data(csv_path)
#curated_graph_df = curation.prepare_curated_edges(edges_df)
#curated_graph_nodes_df = curation.prepare_curated_nodes(nodes_df)

## build edges and nodes files with graph schema
#curation_edges = curation.build_edges(curated_graph_df)
#curation_nodes = curation.build_nodes(curated_graph_nodes_df)

In [None]:
%%time
# graph v3.2
# read network from drive and concat all curated statements
curation_edges, curation_nodes = read_network(version='v20180118')

# prepare data edges and nodes
data_edges = prepare_data_edges(curation_edges)
data_nodes = prepare_data_nodes(curation_nodes)

# prepare curated edges and nodes
curated_network = prepare_curated_edges(data_edges)
curated_concepts = prepare_curated_nodes(data_nodes)


# build edges and nodes files
curation_edges = build_edges(curated_network)
curation_nodes = build_nodes(curated_concepts)

The network is returned both as a CSV file and as a digital object.

In [None]:
# container types of the curation edges and nodes
print('type edges:', type(curation_edges))
print('type nodes:', type(curation_nodes), end='\n\n')

# number of records in each container
print('len edges:', len(curation_edges))
print('len nodes:', len(curation_nodes), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', curation_edges[0].keys())
print('attribute nodes:', curation_nodes[0].keys())

#### import monarch
We retrieved edges from Monarch using the monarch module:

    - From the 8 seed nodes we retrieved the first shell of neighbours
    - From all seed and first-shell nodes we retrieved the edges among them

In [None]:
%%time
# prepare data to graph schema
# seed nodes
seedList = [ 
    'MONDO:0014109', # NGLY1 deficiency
    'HGNC:17646', # NGLY1 human gene
    'HGNC:633', # AQP1 human gene
    'MGI:103201', # AQP1 mouse gene
    'HGNC:7781', # NRF1 human gene* Ginger: known as NFE2L1. http://biogps.org/#goto=genereport&id=4779
    'HGNC:24622', # ENGASE human gene
    'HGNC:636', # AQP3 human gene
    'HGNC:19940' # AQP11 human gene
] 

# get first shell of neighbours
neighboursList = monarch.get_neighbours_list(seedList)
print(len(neighboursList))

# introduce animal model ortho-phenotypes for seed and 1st shell neighbors
seed_orthophenoList = monarch.get_orthopheno_list(seedList)
print(len(seed_orthophenoList))
neighbours_orthophenoList = monarch.get_orthopheno_list(neighboursList)
print(len(neighbours_orthophenoList))

# network nodes: seed + 1shell + ortholog-phentoype
geneList = sum([seedList,
                neighboursList,
                seed_orthophenoList,
                neighbours_orthophenoList], 
               [])
print('genelist: ',len(geneList))

# get Monarch network
monarch_network = monarch.extract_edges(geneList)
print('network: ',len(monarch_network))

# save edges
monarch.print_network(monarch_network, 'monarch_connections')

# build network with graph schema #!!!#
monarch_edges = monarch.build_edges(monarch_network)
monarch_nodes = monarch.build_nodes(monarch_network)

In [None]:
# container types of the Monarch edges and nodes
print('type edges:', type(monarch_edges))
print('type nodes:', type(monarch_nodes), end='\n\n')

# number of records in each container
print('len edges:', len(monarch_edges))
print('len nodes:', len(monarch_nodes), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', monarch_edges[0].keys())
print('attribute nodes:', monarch_nodes[0].keys())

The network is returned both as a CSV file and as a digital object.

## Graph library
### Create the review knowledge graph
#### import graph

Tasks:

* Load Networks and calculate graph nodes
* Monarch graph connectivity
* Build graph

In [None]:
%%time
# Assemble the review knowledge graph from the four edge sources built in the
# cells above (curation, Monarch, transcriptomics, regulation).
# load networks and calculate graph nodes
# graph.graph_nodes returns two values: the node list used for the Monarch query
# below, and the regulation edges that are later passed to graph.build_edges
# (presumably the regulation edges restricted to graph nodes — TODO confirm).
graph_nodes_list, reg_graph_edges = graph.graph_nodes(
    curation=curation_edges,
    monarch=monarch_edges,
    transcriptomics=rna_edges,
    regulation=reg_edges
)

# monarch graph connectivity
# get Monarch edges
# second Monarch extraction, this time over the graph-wide node list
monarch_network_graph = monarch.extract_edges(graph_nodes_list)
print('network: ',len(monarch_network_graph))

# save network
monarch.print_network(monarch_network_graph, 'monarch_connections_graph')

# build Monarch graph network
monarch_graph_edges = monarch.build_edges(monarch_network_graph)
monarch_graph_nodes = monarch.build_nodes(monarch_network_graph)

# build graph
# edges use the graph-wide Monarch edges and the regulation edges returned by graph.graph_nodes
edges = graph.build_edges(
    curation=curation_edges,
    monarch=monarch_graph_edges,
    transcriptomics=rna_edges,
    regulation=reg_graph_edges
)
# nodes are built from the merged edges plus the per-source node collections
# NOTE(review): regulation nodes come from the full reg_nodes, not from a
# counterpart of reg_graph_edges — confirm this is intended
nodes = graph.build_nodes(
    statements=edges,
    curation=curation_nodes,
    monarch=monarch_graph_nodes,
    transcriptomics=rna_nodes,
    regulation=reg_nodes
)

In [None]:
# container types of the merged graph edges and nodes
print('type edges:', type(edges))
print('type nodes:', type(nodes), end='\n\n')

# number of records in each container
print('len edges:', len(edges))
print('len nodes:', len(nodes), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', edges[0].keys())
print('attribute nodes:', nodes[0].keys())

## Neo4jlib library
### Import the graph into Neo4j graph database
#### import neo4jlib

In [None]:
%%time
# Export the merged graph (edges / nodes from the graph cell) and import it into Neo4j.
# import to graph interface, by now neo4j
## get edges and files for neo4j
# convert the edge and node collections with utils.get_dataframe, then derive
# the statements (from edges) and concepts (from nodes) that are written out below
edges_df = utils.get_dataframe(edges)
nodes_df = utils.get_dataframe(nodes)
statements = neo4jlib.get_statements(edges_df)
concepts = neo4jlib.get_concepts(nodes_df)
print('statements: ', len(statements))
print('concepts: ',len(concepts))

## import the graph into neo4j
# save files into neo4j import dir
# path is relative to the notebook's working directory and pins one Neo4j release
neo4j_path = './neo4j-community-3.0.3'
neo4jlib.save_neo4j_files(statements, neo4j_path, file_type = 'statements')
neo4jlib.save_neo4j_files(concepts, neo4j_path, file_type = 'concepts')

# import graph into neo4j
neo4jlib.do_import(neo4j_path)

In [None]:
# container types of the Neo4j statements (edges) and concepts (nodes)
print('type edges:', type(statements))
print('type nodes:', type(concepts), end='\n\n')

# number of records in each container
print('len edges:', len(statements))
print('len nodes:', len(concepts), end='\n\n')

# field names, taken from the first record of each container
print('attribute edges:', statements[0].keys())
print('attribute nodes:', concepts[0].keys())

## hypothesis-generation library
### Query the graph for mechanistic explanation, then summarize the extracted paths
#### import hypothesis, summary

### Orthopheno query with general nodes/relations removed

In [None]:
%%time
# get orthopheno paths
seed = list([
        'HGNC:17646',  # NGLY1 human gene
        'HGNC:633'  # AQP1 human gene
])
hypothesis.query(seed,queryname='ngly1_aqp1',port='7687') #http_port= 7470; bolt_port=7680

In [None]:
%%time
# get orthopheno paths
seed = list([
        'HGNC:17646',  # NGLY1 human gene
        'HGNC:633'  # AQP1 human gene
])
hypothesis.query(seed, queryname='ngly1_aqp1', pwdegree='1000', phdegree='1000', port='7687')

In [None]:
%%time
import hypothesis
# get orthopheno paths
seed = list([
        'HGNC:17646',  # NGLY1 human gene
        'HGNC:633'  # AQP1 human gene
])
hypothesis.open_query(seed,queryname='ngly1_aqp1',port='7687')

In [None]:
%%time
# get summary
data = summary.path_load('./hypothesis/query_ngly1_aqp1_paths_v2019-03-09')

#parse data for summarization
data_parsed = list()
#funcs = [summary.metapaths, summary.nodes, summary.node_types, summary.edges, summary.edge_types]
for query in data:
    query_parsed = summary.query_parser(query)
    #metapath(query_parsed)
    #map(lambda x: x(query_parsed), funcs)
    data_parsed.append(query_parsed)
summary.metapaths(data_parsed)
summary.nodes(data_parsed)
summary.node_types(data_parsed)
summary.edges(data_parsed)
summary.edge_types(data_parsed)
#for query in data_parsed:
#    map(lambda x: x(query), funcs)