In [8]:
# @name graph_nodes
# @description notebook to build the graph nodes, needed input to calculate the monarch connections for the graph (pre-graph)
# @author Núria Queralt Rosinach
# @date 01-17-2019

In [9]:
# to do:
#       * review 'name' manually, especially for gene orthologs

In [10]:
import sys, glob, os
import pandas as pd
from biothings_client import get_client
import datetime

# put the notebook directory first on the module search path
sys.path.insert(0, './')

# version tag of the curated network snapshot; used below as the sub-folder name under ./graph/
version = 'v20180118'

# run date; used below to stamp the names of the files written by this notebook
today = datetime.date.today()

## Concatenate
1. import graph edges and nodes into the graph folder
2. check format
3. concat(edges and nodes): first curated > monarch > regulation
* drop duplicated rows and concepts in edges/nodes files
4. save graph edges and nodes
5. format for neo4j statements and concepts files
* save neo4j files
6. import graph into neo4j

### Edges

In [11]:
%%bash
# import graph edges into the graph folder
# curated, monarch orthopheno connections, g2p (curated, monarch)
# ngly1 v2 (animal model) corrected: ../curation/kylo/neo4j/networks/v20180118
# by script: workspace/ngly1-graph/curation/kylo/neo4j/networks/concatenate_network_files.ipynb
# Corrections: curated/papers edges/nodes files replaced "-" by "_" to concatenate them. g2p curated/monarch edges
# lam_nodes: two phenotypes withdrawn; glcnac_nodes: two CHEM added
# Edges/Nodes:
#             * monarch: v20171128, statements/concepts
#             * curated: 5 nodes, 2 papers
#             * g2p: statements, network (curated)/monarch
# Genes: NCBIGene

# stop at the first failing command: without this, a failed `cp` or `cd`
# would let the `rm -f` lines below run in whatever directory we happen to be in
set -e

mkdir -p graph
cp -r ../curation/kylo/neo4j/networks/v20180118 graph/.
cd graph/v20180118
# remove the files that must not be picked up by the *_edges.tsv / *_nodes.tsv globs
rm -f curated_statements.tsv ngly1_statements.tsv ngly1_concepts.tsv
rm -f g2p_edges_network.tsv g2p_edges_monarch.tsv
rm -f monarch_edges_v2017-11-28.tsv monarch_nodes_v2017-11-28.tsv
cd ../..
# checked that all have graph format like regulation and rna edges/nodes.
# graph nodes: should query biothings to incorporate name (and for monarch: alias and summary)

In [12]:
# working directory: ./graph/<version>
path = os.getcwd() + "/graph/" + version

## curated network
print('\nPreparing curated network...')
# concat all curated statements in the network
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of every file derived from this frame reproducible
df_l = []
for file in sorted(glob.glob('{}/*_edges.tsv'.format(path))):
    with open(file, 'r') as f:
        df_l.append(pd.read_table(f))

# join="inner": keep only the columns shared by all curated edge files
curated_df = pd.concat(df_l, ignore_index=True, join="inner")

# ID curie normalization: curation to Monarch ID
def _curated_edge_id_to_monarch(raw_id):
    """Rewrite one curated subject/object identifier.

    * any id whose text contains 'HGVS' becomes 'ClinVarVariant:50962'
    * any id whose text contains 'Reactome' gets that substring replaced by 'REACT'
    * every other id is converted to str and stripped of surrounding whitespace
    """
    text = str(raw_id)
    if 'HGVS' in text:
        return 'ClinVarVariant:50962'
    if 'Reactome' in text:
        return raw_id.replace('Reactome', 'REACT')
    return text.strip()

# subject_id first, then object_id
for id_column in ('subject_id', 'object_id'):
    curated_df[id_column] = curated_df[id_column].apply(_curated_edge_id_to_monarch)

# uniform columns bw monarch and curated file formats
curated_edge_columns = [
    'subject_id', 'property_id', 'object_id',
    'reference_uri', 'reference_supporting_text', 'reference_date',
    'property_label', 'property_description', 'property_uri',
]
curated_df = curated_df[curated_edge_columns]

# drop row duplicates (row count is printed before and after)
print('\nDrop duplicated rows...')
print(len(curated_df))
curated_df = curated_df.drop_duplicates()
print(len(curated_df))

# save curated edges; missing values are written as 'NA'
curated_edges_file = '{}/curated_edges_v{}.csv'.format(path,today)
curated_df.fillna('NA').to_csv(curated_edges_file, index=False)
print(curated_df.shape)
print(curated_df.columns)

# ID conversion: from ngly1 curated network to monarch graph
# script: http://localhost:8888/notebooks/workspace/ngly1-graph/regulation/curated.ipynb
# HUMAN GENES: NCBIGene to HGNC ID using biothings
# DISEASES: DO, OMIM, Orphanet IDs to MONDO ID manually
# and add edges disease id to mondo id in the graph (exact match)
# and add new mondo id nodes parsing the mondo owl ontology to extract node attributes
print('\nID conversion: from ngly1 curated network to monarch graph...')
# Genes #
print('\nMapping genes to HGNC ID...')

# concepts: every id that appears as subject or object of a curated edge
concept_dct = dict()
for subject_curie, object_curie in zip(curated_df['subject_id'], curated_df['object_id']):
    concept_dct[subject_curie] = 1
    concept_dct[object_curie] = 1

# api input: local ids of NCBIGene curies; disease curies are only collected for reporting
entrez_ids = set()
diseases = set()
for curie in concept_dct:
    if ':' not in curie:
        continue
    curie_parts = curie.split(':')
    namespace = curie_parts[0].lower()
    if 'ncbigene' in namespace:
        entrez_ids.add(curie_parts[1])
    elif any(tag in namespace for tag in ('doid', 'omim', 'orphanet')):
        diseases.add(curie)
entrez = list(entrez_ids)

# api call: ask mygene for the HGNC field of every entrez gene id
mg = get_client('gene')
df = mg.querymany(entrez, scopes = 'entrezgene', fields='HGNC', size=1, as_dataframe=True)

# build dictionary {entrez id: HGNC value}; the query term is the index of the returned frame
ids = df.reset_index()
ids = ids.rename(columns={'query': 'entrez'}).copy()
entrez2hgnc_dct = {gene_id: hgnc for gene_id, hgnc in zip(ids.entrez, ids.HGNC)}

# map to hgnc
# specific non human ncbi gene ids in the curated set
non_human_gene_dct = {
    'NCBIGene:173028': 'WormBase:WBGene00010160',
    'NCBIGene:11826': 'MGI:103201',
}

def _ncbigene_to_graph_id(curie):
    """Return the graph identifier for one curated subject/object curie.

    Curies whose prefix is not NCBIGene are returned untouched. The prefix is
    compared case-insensitively, as it is when the query list `entrez` is built
    and when the curated nodes are mapped, so edges and nodes end up with the
    same ids.
    NCBIGene ids with an HGNC value in `entrez2hgnc_dct` become 'HGNC:<value>';
    otherwise the two known non-human genes are rewritten and everything else
    is kept as is.
    """
    if ':' not in curie:
        return curie
    curie_parts = curie.split(':')
    if 'ncbigene' not in curie_parts[0].lower():
        return curie
    # .get: an id that was never queried simply stays unmapped instead of raising KeyError
    hgnc = entrez2hgnc_dct.get(curie_parts[1])
    if hgnc is not None and str(hgnc) != 'nan':
        # human ncbi gene ids with HGNC ID
        return 'HGNC:' + str(hgnc)
    return non_human_gene_dct.get(curie, curie)

# same rule for both ends of every edge; the index is renumbered 0..n-1
curated_df = curated_df.assign(
    subject_id=curated_df['subject_id'].apply(_ncbigene_to_graph_id),
    object_id=curated_df['object_id'].apply(_ncbigene_to_graph_id),
).reset_index(drop=True)

# Diseases #
print('\nMapping diseases to MONDO ID...')
print('List of curated diseases:',diseases)
# add edges
# manually: dict diseases to mondo
d2m = {
       'OMIM:223900': 'MONDO:0009131', 
       'DOID:2476': 'MONDO:0019064', 
       'Orphanet:869': 'MONDO:0009279', 
       'DOID:11589': 'MONDO:0009131', 
       'OMIM:614653': 'MONDO:0013839', 
       'OMIM:615510': 'MONDO:0014219', 
       'Orphanet:314381': 'MONDO:0013839', 
       'DOID:10595': 'MONDO:0015626', 
       'OMIM:608984': 'MONDO:0012166', 
       'DOID:5212': 'MONDO:0015286', 
       'OMIM:615273': 'MONDO:0014109', 
       'DOID:0060308': 'MONDO:0019502', 
       'DOID:0060728': 'MONDO:0014109', 
       'OMIM:231550': 'MONDO:0009279', 
       'DOID:0050602': 'MONDO:0009279'
}

# add equivalentTo MONDO edges: one skos:exactMatch edge per (disease id, MONDO id) pair.
# The loop variable is deliberately NOT called `mondo`: that name is the alias of the
# mondo_class module imported further down, and a leaked loop variable would
# replace the module with a string whenever this cell is run again.
edges_l = list()
for disease_id, mondo_id in d2m.items():
    edge = dict()
    edge['subject_id'] = disease_id
    edge['object_id'] = mondo_id
    edge['property_id'] = 'skos:exactMatch'
    edge['property_label'] = 'exact match'
    edge['property_description'] = 'NA'
    edge['property_uri'] = 'NA'
    edge['reference_uri'] = 'https://monarchinitiative.org/disease/'+mondo_id
    edge['reference_supporting_text'] = 'Manual extraction from Monarch Knowledge Graph.'
    edge['reference_date'] = '2018-04'
    edges_l.append(edge)

d2m_edges_df = pd.DataFrame(edges_l)
# join="inner": only the columns present in both frames are kept
curated_df = pd.concat([curated_df,d2m_edges_df], ignore_index=True, join="inner")
print(curated_df.shape)

# add g2p network
print('\nAdding gene to protein network...')
# biothings api + dictionaries
# api input: local ids of every UniProt curie among the curated concepts
uniprot = list({
    curie.split(':')[1]
    for curie in concept_dct
    if ':' in curie and 'uniprot' in curie.split(':')[0].lower()
})

# api call
mg = get_client('gene')
df = mg.querymany(uniprot, scopes = 'uniprot', fields='HGNC', size=1, as_dataframe=True)

# build dictionary {uniprot id: HGNC value}
ids = df.reset_index().rename(columns={'query': 'uniprot'}).copy()
# if the returned frame has no HGNC column, treat every protein as unmapped
hgnc_values = ids['HGNC'] if 'HGNC' in ids.columns else [float('nan')] * len(ids)
uniprot2hgnc_dct = dict(zip(ids.uniprot, hgnc_values))

# add equivalentTo edges: gene (HGNC) --has gene product--> protein (UniProt)
# (loop variables are named so that they do not overwrite the `uniprot` query list)
edges_l = list()
for uniprot_id, hgnc_id in uniprot2hgnc_dct.items():
    # proteins without an HGNC gene get no edge
    if str(hgnc_id) == 'nan':
        continue
    edge = dict()
    edge['subject_id'] = 'HGNC:'+str(hgnc_id)
    edge['object_id'] = 'UniProt:'+uniprot_id
    edge['property_id'] = 'RO:0002205'
    edge['property_label'] = 'has gene product'
    edge['property_description'] = 'NA'
    edge['property_uri'] = 'NA'
    edge['reference_uri'] = 'http://mygene.info/clients/'
    edge['reference_supporting_text'] = 'Automatic extraction via the python client for mygene.info services.'
    edge['reference_date'] = today
    edges_l.append(edge)

# explicit columns: an empty edge list must still give a frame with the edge columns,
# otherwise the inner-join concat below would leave curated_df without any column
p2g_edge_columns = ['subject_id', 'property_id', 'object_id', 'reference_uri',
                    'reference_supporting_text', 'reference_date', 'property_label',
                    'property_description', 'property_uri']
p2g_edges_df = pd.DataFrame(edges_l, columns=p2g_edge_columns)
print(p2g_edges_df.shape)
print(p2g_edges_df.columns)
curated_df = pd.concat([curated_df,p2g_edges_df], ignore_index=True, join="inner")
print(curated_df.shape)

# drop g2p duplicates
print('\nDrop duplicated gene-protein relations...')
# Only `has gene product` edges are deduplicated here: among those, rows that repeat
# the same subject, property, object and label are removed. All other rows are left alone.
# Labels are compared as text so that a missing label counts as "not g2p" instead of raising.
is_g2p = curated_df['property_label'].astype(str).str.contains('has gene product', regex=False)
g2p_repeated = curated_df[is_g2p].duplicated(
    subset=['subject_id', 'property_id', 'object_id', 'property_label'],
    keep='first',  # keep first: keep the g2p manually added
)
# no helper column is added to curated_df, so the file saved below holds only edge columns
curated_df = curated_df.drop(g2p_repeated[g2p_repeated].index)

# save curated normalized to graph edges
curated_df.fillna('NA').to_csv('{}/curated_graph_edges_v{}.csv'.format(path,today), index=False)
print(curated_df.shape)
print(curated_df.columns)

## monarch network
print('\nPreparing Monarch network...')
# previous location: os.getcwd() + "/../connectivity/1shell-animalModel-hgnc/add-connections-to-net"
path = os.getcwd() + "/../monarch/1shell-animal/add-connections-to-net"
monarch_edges_file = '{}/monarch_edges_v2019-01-16.tsv'.format(path)
monarch_df = pd.read_csv(monarch_edges_file, sep='\t')
print(monarch_df.shape)
print(monarch_df.columns)

## transcriptomics network
print('\nPreparing transcriptomics network...')
path = os.getcwd() + "/graph"
rna_edges_file = '{}/rna_edges_v2019-01-17.csv'.format(path)
rna = pd.read_csv(rna_edges_file)
print(rna.shape)
print(rna.columns)

## regulation network
print('\nPreparing tf-gene network...')
path = os.getcwd() + "/graph"
tf_edges_file = '{}/regulation_edges_v2019-01-17.csv'.format(path)
tf = pd.read_csv(tf_edges_file)
print(tf.shape)
print(tf.columns)

# concat 1) curated 2) monarch 3) RNA-seq edges
# (order matters: on duplicates the first occurrence is the one kept)
print('\nConcatenating into a graph...')
edge_sources = [curated_df, monarch_df, rna]
statements = pd.concat(edge_sources, ignore_index=True, join="inner")
print(statements.shape)

# drop row duplicates
print('\nDrop duplicated rows...')
statements = statements.drop_duplicates(keep='first')
print(statements.shape)

## merge graph & tf
print('\nMerging tf-gene network to the graph...')
# Goal: keep every tf edge that touches the graph, i.e. whose subject OR object
# is a subject or object of `statements`.
# This was previously obtained with four inner merges (graph subject/object x tf
# subject/object), keeping only the tf columns and dropping the duplicated rows.
# Those merges are many-to-many, so they materialise one row per (graph edge, tf edge)
# pair just to throw the graph side away. A membership test selects the same set of
# tf rows without that intermediate blow-up (rows now come out in tf order).
tf_edge_columns = ['subject_id', 'property_id', 'object_id', 'reference_uri',
                   'reference_supporting_text', 'reference_date', 'property_label',
                   'property_description', 'property_uri']

# every identifier used as subject or object in the graph
graph_node_ids = pd.concat([statements['subject_id'], statements['object_id']]).unique()

# tf edges with at least one end in the graph
tf_touches_graph = tf['subject_id'].isin(graph_node_ids) | tf['object_id'].isin(graph_node_ids)

## merged edges, to be concatenated to statements (<= curated+monarch+rna)
merged = tf.loc[tf_touches_graph, tf_edge_columns]

# drop duplicates
merged = merged.drop_duplicates().reset_index(drop=True)
print(merged.shape)

# save the tf edges selected for the graph; missing values are written as 'NA'
print('\nSaving tf merged edges...')
path = os.getcwd() + "/graph"
regulation_edges_file = '{}/regulation_graph_edges_v{}.csv'.format(path,today)
merged.fillna('NA').to_csv(regulation_edges_file, index=False)

# append them to the graph statements
statements = pd.concat([statements, merged], ignore_index=True, join="inner")
print(statements.shape)

# remove repeated rows, keeping the first occurrence
print('\nDrop duplicated rows...')
statements = statements.drop_duplicates(keep='first')
print(statements.shape)

# add property_uri for those without but with a curie property_id annotated
curie_dct = {
    'ro': 'http://purl.obolibrary.org/obo/',
    'bfo': 'http://purl.obolibrary.org/obo/',
    'geno': 'http://purl.obolibrary.org/obo/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'skos': 'http://www.w3.org/2004/02/skos/core#',
    'pato': 'http://purl.obolibrary.org/obo/',
    'sio': 'http://semanticscience.org/resource/',
    'pmid': 'https://www.ncbi.nlm.nih.gov/pubmed/',
    'encode': 'https://www.encodeproject.org/search/?searchTerm='
}
# namespaces whose term URIs repeat the prefix, written as <PREFIX>_<local id>
obo_style_prefixes = {'ro', 'bfo', 'geno', 'pato', 'sio'}

def _property_uri_from_curie(property_id):
    """Expand a `prefix:local` property curie into a full URI.

    OBO-style namespaces give `<namespace><PREFIX>_<local>` (RO:0002205 ->
    .../obo/RO_0002205). For every other namespace only the local part is
    appended (skos:exactMatch -> .../skos/core#exactMatch); keeping the prefix
    there, as was done before, produces a URI that does not exist.
    Raises KeyError when the prefix is not a key of `curie_dct`.
    """
    prefix, local_id = property_id.split(':', 1)
    namespace_uri = curie_dct[prefix.lower()]
    if prefix.lower() in obo_style_prefixes:
        return namespace_uri + property_id.replace(':','_')
    return namespace_uri + local_id

property_uris = list()
for property_id, current_uri in zip(statements['property_id'], statements['property_uri']):
    if ':' in str(current_uri):
        # already a uri: keep it
        property_uri = current_uri
    elif ':' in str(property_id):
        try:
            property_uri = _property_uri_from_curie(property_id)
        except KeyError:
            property_uri = None
            print('There is a reference curie with and unrecognized namespace:', property_id)
    else:
        # neither a uri nor a curie to derive it from
        property_uri = None
    property_uris.append(property_uri)
statements['property_uri'] = property_uris
    
# write the final edge table; columns are put in the common edge order and
# missing values are written as 'NA'
print('\nSaving final graph...')
path = os.getcwd() + "/graph"
graph_edge_columns = [
    'subject_id', 'property_id', 'object_id',
    'reference_uri', 'reference_supporting_text', 'reference_date',
    'property_label', 'property_description', 'property_uri',
]
statements = statements[graph_edge_columns]
print(statements.shape)
print(statements.columns)
graph_edges_file = '{}/graph_pre_monarch_connectivity_edges_v{}.csv'.format(path,today)
statements.fillna('NA').to_csv(graph_edges_file, index=False)


Preparing curated network...

Drop duplicated rows...
322
321
(321, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')

ID conversion: from ngly1 curated network to monarch graph...

Mapping genes to HGNC ID...
querying 1-18...done.
Finished.

Mapping diseases to MONDO ID...
List of curated diseases: {'Orphanet:314381', 'OMIM:223900', 'DOID:11589', 'OMIM:614653', 'DOID:0060308', 'Orphanet:869', 'DOID:0060728', 'DOID:5212', 'OMIM:608984', 'OMIM:615510', 'DOID:2476', 'OMIM:231550', 'DOID:0050602', 'DOID:10595', 'OMIM:615273'}
(336, 9)

Adding gene to protein network...
querying 1-43...done.
Finished.
(42, 9)
Index(['object_id', 'property_description', 'property_id', 'property_label',
       'property_uri', 'reference_date', 'reference_supporting_text',
       'reference_uri', 'subject_id'],
      dtype='object')
(378, 9)

Drop du

  interactivity=interactivity, compiler=compiler, result=result)


(197267, 9)
Index(['object_id', 'property_description', 'property_id', 'property_label',
       'property_uri', 'reference_date', 'reference_supporting_text',
       'reference_uri', 'subject_id'],
      dtype='object')

Concatenating into a graph...
(33463, 9)

Drop duplicated rows...
(33463, 9)

Merging tf-gene network to the graph...
(9723, 9)

Saving tf merged edges...
(43186, 9)

Drop duplicated rows...
(43186, 9)

Saving final graph...
(43186, 9)
Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')


### Nodes
* Graph nodes: add name, synonyms (alias), and description (summary) from BioThings, filling them in when the value is None or NaN

In [23]:
#sys.path.insert(0,'/home/nuria/soft/utils3/ontologies')
sys.path.insert(0,'./utils')
import mondo_class as mondo

In [24]:
# working directory: ./graph/<version>
path = os.getcwd() + "/graph/" + version

## curated nodes
print('\nPreparing curated nodes...')
# concat all curated concepts in the network
# glob returns matches in arbitrary, filesystem-dependent order. Further down only the
# first row per node id is kept, so the file order decides which curated annotation
# survives when an id appears in several files; sorting makes that choice reproducible
df_l = []
for file in sorted(glob.glob('{}/*_nodes.tsv'.format(path))):
    with open(file, 'r') as f:
        df_l.append(pd.read_table(f))

# join="inner": keep only the columns shared by all curated node files
curated_df = pd.concat(df_l, ignore_index=True, join="inner")

# ID curie normalization: curation to Monarch ID
def _curated_node_id_to_monarch(raw_id):
    """Rewrite one curated node identifier.

    * any id whose text contains 'HGVS' becomes 'ClinVarVariant:50962'
    * any id whose text contains 'Reactome' gets that substring replaced by 'REACT'
    * every other id is converted to str and stripped of surrounding whitespace
    """
    text = str(raw_id)
    if 'HGVS' in text:
        return 'ClinVarVariant:50962'
    if 'Reactome' in text:
        return raw_id.replace('Reactome', 'REACT')
    return text.strip()

curated_df['id'] = curated_df['id'].apply(_curated_node_id_to_monarch)

# uniform columns bw monarch and curated file formats
curated_node_columns = ['id', 'semantic_groups', 'preflabel', 'synonyms', 'description']
curated_df = curated_df[curated_node_columns]

# drop duplicates
curated_df = curated_df.drop_duplicates()

# save curated nodes; missing values are written as 'NA'
curated_nodes_file = '{}/curated_nodes_v{}.csv'.format(path,today)
curated_df.fillna('NA').to_csv(curated_nodes_file, index=False)

# ID conversion: from ngly1 curated network to monarch graph
# script: http://localhost:8888/notebooks/workspace/ngly1-graph/regulation/curated.ipynb
# HUMAN GENES: NCBIGene to HGNC ID using biothings
# DISEASES: DO, OMIM, Orphanet IDs to MONDO ID manually
# and add edges disease id to mondo id in the graph (exact match)
# and add new mondo id nodes parsing the mondo owl ontology to extract node attributes
print('\nID conversion: from ngly1 curated network to monarch graph...')
# Genes #
print('\nMapping genes to HGNC ID...')

# api input: local ids of all NCBIGene node ids (prefix compared case-insensitively)
entrez = list({
    node_id.split(':')[1]
    for node_id in curated_df['id']
    if ':' in node_id and 'ncbigene' in node_id.split(':')[0].lower()
})

# api call: ask mygene for the HGNC field of every entrez gene id
mg = get_client('gene')
df = mg.querymany(entrez, scopes = 'entrezgene', fields='HGNC', size=1, as_dataframe=True)

# build dictionary {entrez id: HGNC value}
ids = df.reset_index()
ids = ids.rename(columns={'query': 'entrez'}).copy()
entrez2hgnc_dct = {gene_id: hgnc for gene_id, hgnc in zip(ids.entrez, ids.HGNC)}

# map to hgnc: rebuild the frame row by row with the converted id
lines = []
for _, node in curated_df.iterrows():
    node_id = node['id']
    if ':' in node_id and 'ncbigene' in node_id.split(':')[0].lower():
        hgnc = entrez2hgnc_dct[node_id.split(':')[1]]
        if str(hgnc) != 'nan':
            # human ncbi gene ids with HGNC ID
            node['id'] = "HGNC:"+hgnc
        # specific non human ncbi gene ids in the curated set
        elif node_id == 'NCBIGene:173028':
            node['id'] = 'WormBase:WBGene00010160'
        elif node_id == 'NCBIGene:11826':
            node['id'] = 'MGI:103201'
    lines.append(node)
curated_df = pd.DataFrame.from_records(lines)

# Diseases #
print('\nAdding diseases described by the MONDO ontology...')
# import mondo owl terms
owl_f = os.getcwd() + '/../ontologies/mondo.owl'
tm = mondo.term(owl_f)
# extract metadata from the mondo ontology, one node per distinct MONDO id of d2m.
# The loop variable must not be called `mondo`: that would rebind the imported
# module to a string and make `mondo.term(...)` above fail the next time this cell runs.
# dict.fromkeys drops repeated MONDO ids (several source ids share one) keeping first-seen order.
nodes_l = list()
for mondo_id in dict.fromkeys(d2m.values()):
    mondo_term = tm.get_metadata_per_id(id=mondo_id)
    node = dict()
    node['id'] = mondo_term['id']
    node['semantic_groups'] = 'DISO'
    node['preflabel'] = mondo_term['label']
    node['synonyms'] = mondo_term['synonyms']
    node['description'] = mondo_term['definition']
    nodes_l.append(node)

# add disease nodes to curated_df
d2m_nodes_df = pd.DataFrame(nodes_l)
d2m_nodes_df = d2m_nodes_df.drop_duplicates()
# join="inner": only the columns present in both frames are kept
curated_df = pd.concat([curated_df,d2m_nodes_df], ignore_index=True, join="inner")

# biothings: annotate name to genes
print('\nAdding BioThings annotation: gene names...')
# input: (preflabel) symbol,alias
# note: the preflabel of every curated node is sent, whatever its semantic group
symbols = list(curated_df['preflabel'])
print('symbols:', len(symbols))

# query biothings
mg = get_client('gene')
df = mg.querymany(symbols, scopes = 'symbol,alias', fields='name', size=1, as_dataframe=True)

# dictionary: {symbol:name}
ids = df.reset_index()
ids = ids.rename(columns={'query':'symbol'})
curated_s2n = dict(zip(ids.symbol, ids.name))

# name = value returned for the preflabel; a preflabel that is not a key of the
# dictionary keeps the preflabel itself as name
curated_df['name'] = curated_df.preflabel.apply(lambda label: curated_s2n.get(label, label))

# save curated nodes; missing values are written as 'NA'
curated_graph_nodes_file = '{}/curated_graph_nodes_v{}.csv'.format(path,today)
curated_df.fillna('NA').to_csv(curated_graph_nodes_file, index=False)
print(curated_df.shape)
print(curated_df.columns)

## monarch nodes
print('\nPreparing Monarch nodes...')
path = os.getcwd() + "/../monarch/1shell-animal/add-connections-to-net"
monarch_nodes_file = '{}/monarch_nodes_v2019-01-16.tsv'.format(path)
monarch_df = pd.read_csv(monarch_nodes_file, sep='\t')

# biothings: annotate name,synonyms,description to genes
print('\nAdding BioThings annotation: gene name, synonyms, description...')
# input: (preflabel) symbol,alias -- only nodes whose semantic group mentions GENE
symbols = list()
for groups, label in zip(monarch_df['semantic_groups'], monarch_df['preflabel']):
    if isinstance(groups, list):
        # one entry per group of the list that mentions GENE
        symbols.extend(label for group in groups if 'GENE' in group)
    elif 'GENE' in groups:
        symbols.append(label)
print('symbols:', len(symbols))

# query biothings
mg = get_client('gene')
df = mg.querymany(symbols, scopes = 'symbol,alias', fields='name,alias,summary', size=1, as_dataframe=True)

def _na_if_missing(value):
    """Return 'NA' when the value prints as 'nan', the value itself otherwise."""
    return 'NA' if str(value) == 'nan' else value

# dictionaries keyed by the queried symbol: name, synonyms (alias), description (summary)
ids = df.reset_index()
ids = ids.rename(columns={'query':'symbol'})
ids['synonyms'] = ids.alias.apply(_na_if_missing)
ids['description'] = ids.summary.apply(_na_if_missing)
monarch_s2n = dict(zip(ids.symbol, ids.name))
monarch_s2s = dict(zip(ids.symbol, ids.synonyms))
monarch_s2d = dict(zip(ids.symbol, ids.description))

# name falls back to the preflabel itself; synonyms and description fall back to 'NA'
monarch_df['name'] = monarch_df.preflabel.apply(lambda label: monarch_s2n.get(label, label))
monarch_df['synonyms'] = monarch_df.preflabel.apply(lambda label: monarch_s2s.get(label, 'NA'))
monarch_df['description'] = monarch_df.preflabel.apply(lambda label: monarch_s2d.get(label, 'NA'))
print(monarch_df.shape)
print(monarch_df.columns)

## rna nodes
print('\nPreparing transcriptomics nodes...')
path = os.getcwd() + "/graph"
rna_nodes_file = '{}/rna_nodes_v2019-01-17.csv'.format(path)
rna_df = pd.read_csv(rna_nodes_file)
print(rna_df.shape)
print(rna_df.columns)

## tf nodes: i only want tf merged nodes annotated
print('\nPreparing tf-gene nodes...')
path = os.getcwd() + "/graph"
tf_nodes_file = '{}/regulation_nodes_v2019-01-17.csv'.format(path)
tf_df = pd.read_csv(tf_nodes_file)
print(tf_df.shape)
print(tf_df.columns)

## Annotating curated gene nodes from P-G edges
print('\nPreparing encoding genes from ngly1 curated network...')
# biothings api + dictionaries
print('\nAdding BioThings annotation: gene symbol, name, synonyms, description...')
# api input: local ids of the UniProt nodes in the curated set
uniprot = {
    node_id.split(':')[1]
    for node_id in curated_df['id']
    if ':' in node_id and 'uniprot' in node_id.split(':')[0].lower()
}

# api call
mg = get_client('gene')
df = mg.querymany(uniprot, scopes = 'uniprot', fields='HGNC,symbol,name,alias,summary', size=1, as_dataframe=True)

# build a list of nodes as list of dict, i.e a df, where a dict is a node
nodes_l = list()
for _, concept in df.iterrows():
    # hits without an HGNC value give no gene node
    if str(concept['HGNC']) == 'nan':
        continue
    alias = concept['alias']
    nodes_l.append({
        'id': 'HGNC:'+concept['HGNC'],
        'semantic_groups': 'GENE',
        'preflabel': concept['symbol'],
        'name': concept['name'],
        # a list of aliases is flattened into one '|' separated string
        'synonyms': '|'.join(list(alias)) if isinstance(alias, list) else alias,
        'description': concept['summary'],
    })

# structure as dataframe, missing values as 'NA', without repeated rows
p2g_nodes_df = pd.DataFrame(nodes_l).fillna('NA')
p2g_nodes_df = p2g_nodes_df.drop_duplicates()

## Annotating nodes in the graph
print('\nAnnotating nodes in the graph...')
# extracting nodes in the graph: every distinct subject or object id of the edges
st_nodes_l = pd.concat([statements.subject_id,statements.object_id], ignore_index=True).drop_duplicates()
st_nodes_df = pd.DataFrame({'id': st_nodes_l})
print(st_nodes_df.shape)

# annotating nodes: from each annotation source keep only the nodes present in the graph
curated_nodes = curated_df.merge(st_nodes_df, how='inner', on='id')
monarch_nodes = monarch_df.merge(st_nodes_df, how='inner', on='id')
rna_nodes = rna_df.merge(st_nodes_df, how='inner', on='id')
regulation_nodes = tf_df.merge(st_nodes_df, how='inner', on='id')
p2g_nodes = p2g_nodes_df.merge(st_nodes_df, how='inner', on='id')

# concat all, (importantly, concatenate first curated concepts with extended definitions)
print('\nConcatenating all nodes...')
annotation_sources = [curated_nodes, monarch_nodes, rna_nodes, regulation_nodes, p2g_nodes]
nodes = pd.concat(annotation_sources, ignore_index=True, join="inner")
print(nodes.shape)

# drop duplicated rows
print('\nDrop duplicated rows...')
# list-valued synonyms are flattened to a '|' separated string first
nodes['synonyms'] = nodes.synonyms.apply(lambda syn: str('|'.join(syn)) if isinstance(syn,list) else syn)
nodes = nodes.drop_duplicates(keep='first')
print(nodes.shape)

# drop duplicated nodes (keep first row (the curated), remove others (monarch))
print('\nDrop duplicated nodes...')
nodes = nodes.drop_duplicates(subset=['id'],keep='first')
print(nodes.shape)

# check: the number of distinct annotated ids must equal the number of distinct graph ids
if len(set(nodes.id)) == len(set(st_nodes_df.id)):
    print('\nAll graph nodes are annotated.')
else:
    print('\nThere is a problem in the annotation of nodes.\nThe number of annotated nodes is different than the number of nodes in the graph.')
    print('Curated nodes not in the graph: {}'.format(set(curated_df.id)-set(curated_nodes.id)))
    print('Monarch nodes not in the graph: {}'.format(set(monarch_df.id)-set(monarch_nodes.id)))
    print('RNA-seq nodes not in the graph: {}'.format(set(rna_df.id)-set(rna_nodes.id)))
# reported in both cases, as a count only
print('Regulation nodes not in the graph: {}'.format(len(set(tf_df.id)-set(regulation_nodes.id))))
          
## biothings
# add attributes

# all genes/proteins => add entrez|uniprot

# save graph nodes: fixed column order, list-valued synonyms flattened to a
# '|' separated string, missing values written as 'NA'
print('\nSaving final graph...')
path = os.getcwd() + "/graph"
graph_node_columns = ['id','semantic_groups','preflabel','synonyms','name','description']
nodes = nodes[graph_node_columns].assign(
    synonyms=lambda frame: frame.synonyms.apply(
        lambda syn: str('|'.join(syn)) if isinstance(syn,list) else syn
    )
)
print(nodes.shape)
print(nodes.columns)
graph_nodes_file = '{}/graph_pre_monarch_connectivity_nodes_v{}.csv'.format(path,today)
nodes.fillna('NA').to_csv(graph_nodes_file, index=False)


Preparing curated nodes...

ID conversion: from ngly1 curated network to monarch graph...

Mapping genes to HGNC ID...
querying 1-18...done.
Finished.

Adding diseases described by the MONDO ontology...

Adding BioThings annotation: gene names...
symbols: 298
querying 1-298...done.
Finished.
38 input query terms found dup hits:
	[('or', 5), ('NGLY1', 3), ('of', 16), ('1', 14), ('by', 9), ('MRS', 9), ('CSF', 8), ('acid', 4), ('B
591 input query terms found no hit:
	['NGLY1-deficiency', 'misfolded', 'incompletely', 'synthesized', 'protein', 'catabolic', 'process', 
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
(298, 6)
Index(['id', 'semantic_groups', 'preflabel', 'synonyms', 'description',
       'name'],
      dtype='object')

Preparing Monarch nodes...

Adding BioThings annotation: gene name, synonyms, description...
symbols: 1596
querying 1-1000...done.
querying 1001-1596...done.
Finished.
289 input query terms found dup hits:
	[('Aqp7', 2), ('Gk