In [4]:
# @name curated
# @description notebook to deal with curated edges
# @author Núria Queralt Rosinach
# @date 04-19-2018

In [5]:
# to do:
#       

In [6]:
import os
# Graph release tag; the curated input tables are read from ./graph/<version>/.
version = "v20180118"
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('./graph/{}'.format(version), exist_ok=True)
import pandas as pd
from biothings_client import get_client

## ID ngly1 network - Monarch graph conversion
1. import ngly1 curated edges and nodes
2. HUMAN GENES: convert NCBIGene IDs to HGNC IDs using biothings
3. DISEASES: map DO, OMIM and Orphanet IDs to MONDO IDs manually, and add edges from each disease ID to its MONDO ID in the graph
4. add new MONDO nodes

### import ngly1 curated edges and nodes

In [7]:
# Load the curated edge table for the configured graph version and preview it.
curated_edges_path = './graph/{}/curated_edges.csv'.format(version)
edges_df = pd.read_csv(curated_edges_path)
edges_df.head(2)

Unnamed: 0,subject_id,property_id,object_id,reference_uri,reference_supporting_text,reference_date,property_label,property_description,property_uri
0,NCBIGene:358,RO:0002331,KEGG-path:map04976,,,,involved in,,
1,NCBIGene:358,RO:0002331,KEGG-path:map04964,,,,involved in,,


In [8]:
# Load the curated node table for the configured graph version and preview it.
curated_nodes_path = './graph/{}/curated_nodes.csv'.format(version)
nodes_df = pd.read_csv(curated_nodes_path)
nodes_df.head(2)

Unnamed: 0,id,semantic_groups,preflabel,synonyms,description
0,NCBIGene:358,GENE,AQP1,CO|AQP-CHIP|AQP1|aquaporin 1 (Colton blood gro...,protein-coding gene in the species Homo sapiens
1,KEGG-path:map04976,PHYS,Bile secretion,,Biochemical pathway


### Genes

In [9]:
# Collect every distinct concept id that appears in the curated edges.
# A dict (value always 1) is used as an ordered set: for each edge the subject
# is registered first, then the object.
concept_dct = dict()
for subject_id, object_id in zip(edges_df['subject_id'], edges_df['object_id']):
    concept_dct[subject_id] = 1
    concept_dct[object_id] = 1
# number of distinct concepts
len(concept_dct)

267

In [10]:
# Split the concept ids by CURIE prefix:
#   - NCBIGene ids -> local part goes to `entrez` (input of the gene id query)
#   - DOID / OMIM / Orphanet ids -> full CURIE goes to `diseases`
# Ids without a ':' separator are ignored.
entrez = list()
diseases = set()
disease_prefixes = ('doid', 'omim', 'orphanet')
for concept_id in concept_dct:
    if ':' not in concept_id:
        continue
    curie_parts = concept_id.split(':')
    prefix = curie_parts[0].lower()
    if 'ncbigene' in prefix:
        entrez.append(curie_parts[1])
    elif any(tag in prefix for tag in disease_prefixes):
        diseases.add(concept_id)
# gene count before and after de-duplication, then the disease ids found
print(len(entrez))
entrez = list(set(entrez))
print(len(entrez))
print(len(diseases))
print(diseases)

18
18
15
{'DOID:11589', 'DOID:2476', 'DOID:5212', 'DOID:0060728', 'OMIM:615273', 'Orphanet:314381', 'OMIM:615510', 'OMIM:223900', 'OMIM:608984', 'DOID:0050602', 'Orphanet:869', 'OMIM:231550', 'DOID:10595', 'DOID:0060308', 'OMIM:614653'}


In [11]:
# Query the BioThings gene client: look each Entrez gene id up by the
# 'entrezgene' scope and request only the HGNC field, returned as a DataFrame
# indexed by the queried id.
mg = get_client('gene')
df = mg.querymany(
    entrez,
    scopes='entrezgene',
    fields='HGNC',
    size=1,
    as_dataframe=True,
)
df.head(2)

querying 1-18...done.
Finished.


Unnamed: 0_level_0,HGNC,_id,_score
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
343,642,343,1.55
359,634,359,1.55


In [12]:
# Flatten the query result so the queried id becomes a regular 'entrez' column.
ids = df.reset_index()
ids = ids.rename(columns={'query': 'entrez'}).copy()
# Entrez id -> HGNC id lookup. Ids without an HGNC hit keep NaN as their value
# (no placeholder is substituted), as the first print below shows.
entrez2hgnc_dct = dict(zip(ids['entrez'], ids['HGNC']))
print(entrez2hgnc_dct['173028'])
entrez2hgnc_dct['358']

nan


'633'

In [13]:
# map to hgnc
def _ncbigene_to_hgnc(curie, entrez2hgnc):
    """Return the HGNC CURIE for an NCBIGene CURIE, or the input unchanged.

    Parameters
    ----------
    curie : object
        Node identifier taken from an edge. Only strings of the form
        '<prefix>:<local id>' whose prefix contains 'ncbigene' (compared
        case-insensitively, like the id collection step that feeds the gene
        query) are converted.
    entrez2hgnc : dict
        Entrez gene id (str) -> HGNC id; the value is NaN when no HGNC id
        was returned for that gene.

    Returns
    -------
    object
        'HGNC:<id>' when a mapping exists; otherwise `curie` as received
        (non-string ids, other prefixes, unknown or unmapped genes).
    """
    if not isinstance(curie, str) or ':' not in curie:
        return curie
    curie_parts = curie.split(':')
    if 'ncbigene' not in curie_parts[0].lower():
        return curie
    hgnc_id = entrez2hgnc.get(curie_parts[1])
    # missing key or NaN value: keep the NCBIGene id rather than failing
    if hgnc_id is None or (isinstance(hgnc_id, float) and pd.isnull(hgnc_id)):
        return curie
    return "HGNC:" + str(hgnc_id)


# Work on a copy so the curated edges loaded from disk stay untouched, and apply
# the same conversion to both id columns instead of duplicating the logic.
edges = edges_df.copy()
for id_column in ('subject_id', 'object_id'):
    edges[id_column] = edges[id_column].map(
        lambda curie: _ncbigene_to_hgnc(curie, entrez2hgnc_dct)
    )
edges.head(2)

Unnamed: 0,subject_id,property_id,object_id,reference_uri,reference_supporting_text,reference_date,property_label,property_description,property_uri
0,HGNC:633,RO:0002331,KEGG-path:map04976,,,,involved in,,
1,HGNC:633,RO:0002331,KEGG-path:map04964,,,,involved in,,


### diseases
#### add edges

In [14]:
# Manually curated mapping: source disease id (OMIM / DOID / Orphanet) -> MONDO id.
# Several source ids can point to the same MONDO term. Entry order is kept as
# curated because later cells iterate this dict in order.
d2m = {
    'OMIM:223900': 'MONDO:0009131',      # Riley-Day syndrome
    'DOID:2476': 'MONDO:0019064',        # hereditary spastic paraplegia
    'Orphanet:869': 'MONDO:0009279',     # triple-a syndrome
    'DOID:11589': 'MONDO:0009131',       # Riley-Day syndrome
    'OMIM:614653': 'MONDO:0013839',
    'OMIM:615510': 'MONDO:0014219',
    'Orphanet:314381': 'MONDO:0013839',
    'DOID:10595': 'MONDO:0015626',       # Charcot-Marie-tooth disease
    'OMIM:608984': 'MONDO:0012166',      # ataxia, sensory, 1, autosomal dominant
    'DOID:5212': 'MONDO:0015286',        # congenital disorder of glycosylation
    'OMIM:615273': 'MONDO:0014109',      # NGLY1-deficiency
    'DOID:0060308': 'MONDO:0019502',
    'DOID:0060728': 'MONDO:0014109',     # NGLY1-deficiency
    'OMIM:231550': 'MONDO:0009279',      # triple-a syndrome
    'DOID:0050602': 'MONDO:0009279',     # triple-a syndrome
}

In [15]:
# list the edge columns; the MONDO mapping edges built in the next cell use these same field names
print(edges.columns)

Index(['subject_id', 'property_id', 'object_id', 'reference_uri',
       'reference_supporting_text', 'reference_date', 'property_label',
       'property_description', 'property_uri'],
      dtype='object')


In [16]:
# Build one skos:exactMatch edge per (source disease id -> MONDO id) pair.
# Fields with no curated value are filled with the literal string 'NA'.
edges_l = [
    {
        'subject_id': disease_id,
        'object_id': mondo_id,
        'property_id': 'skos:exactMatch',
        'property_label': 'exact match',
        'property_description': 'NA',
        'property_uri': 'NA',
        'reference_uri': 'NA',
        'reference_supporting_text': 'NA',
        'reference_date': 'NA',
    }
    for disease_id, mondo_id in d2m.items()
]

d2m_edges_df = pd.DataFrame(edges_l)
d2m_edges_df

Unnamed: 0,object_id,property_description,property_id,property_label,property_uri,reference_date,reference_supporting_text,reference_uri,subject_id
0,MONDO:0009131,,skos:exactMatch,exact match,,,,,OMIM:223900
1,MONDO:0019064,,skos:exactMatch,exact match,,,,,DOID:2476
2,MONDO:0009279,,skos:exactMatch,exact match,,,,,Orphanet:869
3,MONDO:0009131,,skos:exactMatch,exact match,,,,,DOID:11589
4,MONDO:0013839,,skos:exactMatch,exact match,,,,,OMIM:614653
5,MONDO:0014219,,skos:exactMatch,exact match,,,,,OMIM:615510
6,MONDO:0013839,,skos:exactMatch,exact match,,,,,Orphanet:314381
7,MONDO:0015626,,skos:exactMatch,exact match,,,,,DOID:10595
8,MONDO:0012166,,skos:exactMatch,exact match,,,,,OMIM:608984
9,MONDO:0015286,,skos:exactMatch,exact match,,,,,DOID:5212


In [17]:
# Append the disease -> MONDO mapping edges to the HGNC-normalized edges.
# join="inner" keeps only the columns present in both frames; the index is rebuilt.
# NOTE: `edges` is rebound here, so re-running this cell on its own appends the
# mapping rows a second time.
edge_frames = [edges, d2m_edges_df]
edges = pd.concat(edge_frames, ignore_index=True, join="inner")
edges.tail(2)

Unnamed: 0,subject_id,property_id,object_id,reference_uri,reference_supporting_text,reference_date,property_label,property_description,property_uri
331,OMIM:615273,skos:exactMatch,MONDO:0014109,,,,exact match,,
332,DOID:0060308,skos:exactMatch,MONDO:0019502,,,,exact match,,
333,DOID:0060728,skos:exactMatch,MONDO:0014109,,,,exact match,,
334,OMIM:231550,skos:exactMatch,MONDO:0009279,,,,exact match,,
335,DOID:0050602,skos:exactMatch,MONDO:0009279,,,,exact match,,


#### add nodes

In [18]:
# inspect the node table columns (the MONDO nodes added below reuse these field names)
print(nodes_df.columns)

Index(['id', 'semantic_groups', 'preflabel', 'synonyms', 'description'], dtype='object')


In [19]:
# Put the directory holding the local ontology helpers first on the import path.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine unless it is adapted — consider making it configurable.
import sys
sys.path.insert(0,'/home/nuria/workspace/utils3/ontologies')
# presumably resolved through the path inserted above; aliased as `mondo` for the cells below
import mondo_class as mondo

In [20]:
# Smoke-test the mondo helper: build the term accessor from the local OWL file
# and print the metadata of one known term.
# NOTE(review): absolute, machine-specific path to the ontology file.
owl_f = '/home/nuria/workspace/ngly1-graph/ontologies/mondo.owl'
tm = mondo.term(owl_f)
sample_term = tm.get_metadata_per_id(id='MONDO:0015286')
print(sample_term)

{'id': 'MONDO:0015286', 'iri': 'http://purl.obolibrary.org/obo/MONDO_0015286', 'label': 'congenital disorder of glycosylation', 'synonyms': 'CDG|Carbohydrate deficient glycoprotein syndrome|carbohydrate-deficient glycoprotein syndrome', 'definition': 'Congenital disorder of glycosylation (CDG) is a fast growing group of inborn errors of metabolism characterized by defective activity of enzymes that participate in glycosylation (modification of proteins and other macromolecules by adding and processing of oligosaccharide side chains). CDG is comprised of phenotypically diverse disorders affecting multiple systems including the central nervous system, muscle function, immunity, endocrine system, and coagulation. The numerous entities in this group are subdivided, based on the synthetic pathway affected, into disorder of protein N-glycosylation, disorder of protein O-glycosylation, disorder of multiple glycosylation, and disorder of glycosphingolipid and glycosylphosphatidylinositol ancho

In [32]:
# extract metadata from the mondo ontology
# The loop variable is deliberately not named `mondo`: that name is the alias of
# the imported mondo_class module, and rebinding it to a string would make any
# later `mondo.term(...)` call fail when cells are re-run.
nodes_l = list()
for mondo_id in d2m.values():
    mondo_term = tm.get_metadata_per_id(id=mondo_id)
    # one node record per mapping entry, using the node table's field names
    nodes_l.append({
        'id': mondo_term['id'],
        'semantic_groups': 'DISO',
        'preflabel': mondo_term['label'],
        'synonyms': mondo_term['synonyms'],
        'description': mondo_term['definition'],
    })

# Several source disease ids map to the same MONDO term, which yields identical
# rows; keep the first occurrence of each (returns a new frame, no in-place edit).
d2m_nodes_df = pd.DataFrame(nodes_l).drop_duplicates()
d2m_nodes_df

Unnamed: 0,description,id,preflabel,semantic_groups,synonyms
0,A congenital disorder caused by mutations in t...,MONDO:0009131,Riley-Day syndrome,DISO,Familial Dysautonomia|HSAN 3|HSAN III|HSAN3|He...
1,Hereditary spastic paraplegias (HSP) comprise ...,MONDO:0019064,hereditary spastic paraplegia,DISO,Familial spastic paraplegia|French settlement ...
2,Triple A syndrome is a very rare multisystem d...,MONDO:0009279,triple-a syndrome,DISO,2A syndrome|3A syndrome|4A syndrome|AAA syndro...
4,,MONDO:0013839,hereditary sensory and autonomic neuropathy ty...,DISO,Familial dysautonomia with contractures|HSAN6|...
5,,MONDO:0014219,"alacrima, achalasia, and mental retardation sy...",DISO,
7,An inherited degenerative disorder involving t...,MONDO:0015626,Charcot-Marie-tooth disease,DISO,CMT|CMT - Charcot-Marie-Tooth disease|Charcot ...
8,,MONDO:0012166,"ataxia, sensory, 1, autosomal dominant",DISO,
9,Congenital disorder of glycosylation (CDG) is ...,MONDO:0015286,congenital disorder of glycosylation,DISO,CDG|Carbohydrate deficient glycoprotein syndro...
10,A rare autosomal recessive inherited disorder ...,MONDO:0014109,NGLY1-deficiency,DISO,NGLY1 deficiency|NGLY1-CDDG|congenital disorde...
11,Autosomal recessive form of non-syndromic inte...,MONDO:0019502,autosomal recessive non-syndromic intellectual...,DISO,AR-NSID|NS-ARID|autosomal recessive mental ret...


In [33]:
# Append the new MONDO disease nodes to the curated nodes.
# join="inner" keeps only the columns present in both frames; the index is rebuilt.
node_frames = [nodes_df, d2m_nodes_df]
nodes = pd.concat(node_frames, ignore_index=True, join="inner")
nodes.tail(2)

Unnamed: 0,id,semantic_groups,preflabel,synonyms,description
293,MONDO:0015626,DISO,Charcot-Marie-tooth disease,CMT|CMT - Charcot-Marie-Tooth disease|Charcot ...,An inherited degenerative disorder involving t...
294,MONDO:0012166,DISO,"ataxia, sensory, 1, autosomal dominant",,
295,MONDO:0015286,DISO,congenital disorder of glycosylation,CDG|Carbohydrate deficient glycoprotein syndro...,Congenital disorder of glycosylation (CDG) is ...
296,MONDO:0014109,DISO,NGLY1-deficiency,NGLY1 deficiency|NGLY1-CDDG|congenital disorde...,A rare autosomal recessive inherited disorder ...
297,MONDO:0019502,DISO,autosomal recessive non-syndromic intellectual...,AR-NSID|NS-ARID|autosomal recessive mental ret...,Autosomal recessive form of non-syndromic inte...
