# Expansion of nodes using Monarch APIs
* https://api.monarchinitiative.org/api/#/
* https://scigraph-ontology.monarchinitiative.org/scigraph/docs/#/

In [2]:
import pandas as pd
import requests
import json
from pandas.io.json import json_normalize

In [3]:
# input disease
disease = 'OMIM:615273'

# output files
path = 'ngly1-cddg-expansion/ngly1_cddg'

## Graph queries (SciGraph)

In [4]:
# api address
api = 'https://scigraph-ontology.monarchinitiative.org/scigraph'
endpoint = '/graph'

In [5]:
# get neighbors (JSON content)
r = requests.get('{}{}/neighbors/{}'.format(api,endpoint,disease))
r.headers

{'Connection': 'keep-alive', 'Transfer-Encoding': 'chunked', 'Cache-Control': 'no-transform, max-age=7200', 'Vary': 'Accept, Accept-Encoding', 'Content-Type': 'application/json', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Content-Encoding': 'gzip', 'Date': 'Thu, 18 May 2017 17:25:07 GMT'}

In [6]:
# Read results 
r.json()

{'edges': [{'meta': {'isDefinedBy': ['http://purl.obolibrary.org/obo/upheno/monarch.owl'],
    'lbl': ['subClassOf']},
   'obj': 'Orphanet:183763',
   'pred': 'subClassOf',
   'sub': 'OMIM:615273'},
  {'meta': {'equivalentOriginalNodeSource': ['http://purl.obolibrary.org/obo/UMLS_C3808991'],
    'isDefinedBy': ['http://purl.obolibrary.org/obo/upheno/monarch.owl'],
    'lbl': ['subClassOf']},
   'obj': 'Orphanet:91088',
   'pred': 'subClassOf',
   'sub': 'OMIM:615273'},
  {'meta': {'equivalentOriginalNodeTarget': ['http://purl.obolibrary.org/obo/UMLS_C3808991'],
    'isDefinedBy': ['http://purl.obolibrary.org/obo/upheno/monarch.owl'],
    'lbl': ['equivalentClass']},
   'obj': 'OMIM:615273',
   'pred': 'equivalentClass',
   'sub': 'DOID:0060728'},
  {'meta': {'equivalentOriginalNodeSource': ['http://purl.obolibrary.org/obo/UMLS_C3808991'],
    'lbl': ['isDefinedBy']},
   'obj': 'OBO:upheno/monarch.owl',
   'pred': 'isDefinedBy',
   'sub': 'OMIM:615273'},
  {'meta': {'equivalentOriginalN

In [7]:
with open('{}_neighbors.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [8]:
neighbors_df = json_normalize(r.json(), 'nodes')
neighbors_df.head()

Unnamed: 0,id,lbl,meta
0,OBO:upheno/monarch.owl,,"{'types': ['Ontology', 'cliqueLeader', 'Node']}"
1,Orphanet:404454,,"{'category': ['disease'], 'types': ['disease',..."
2,DOID:0050570,congenital disorder of glycosylation type I,{'http://www.geneontology.org/formats/oboInOwl...
3,DOID:0060728,,"{'category': ['disease'], 'types': ['disease',..."
4,OMIM:615273,NGLY1-deficiency,{'http://www.geneontology.org/formats/oboInOwl...


In [9]:
# Analyze neighbours' types
neighbors_df.id.unique()
print('nodes: {}'.format(len(neighbors_df.id.unique())))

nodes: 9


In [10]:
neighbors_df['node_type'] = neighbors_df.id.apply(lambda x: x.split(':')[0])
neighbors_df.node_type.value_counts()

Orphanet    4
DOID        2
OMIM        1
OBO         1
UMLS        1
Name: node_type, dtype: int64

In [11]:
# conclusion: neighbors are xref (Orphanet, DO, umls) and provenance

In [12]:
# Filters
# Filter by interaction_type. BUT what are the strings per interaction_type???

In [13]:
# get reachable nodes (JSON content)
r = requests.get('{}{}/reachablefrom/{}'.format(api,endpoint,disease))
r.headers

{'Connection': 'keep-alive', 'Content-Length': '854', 'Cache-Control': 'no-transform, max-age=7200', 'Vary': 'Accept, Accept-Encoding', 'Content-Type': 'application/json', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Content-Encoding': 'gzip', 'Date': 'Thu, 18 May 2017 17:25:09 GMT'}

In [14]:
# Read results 
r.json()
reach_df = json_normalize(r.json(), 'nodes')
reach_df.head()

Unnamed: 0,id,lbl,meta
0,OBO:upheno/monarch.owl,,"{'types': ['Ontology', 'cliqueLeader', 'Node']}"
1,Orphanet:404454,,"{'category': ['disease'], 'types': ['disease',..."
2,DOID:0050570,congenital disorder of glycosylation type I,{'http://www.geneontology.org/formats/oboInOwl...
3,Orphanet:183763,Rare genetic intellectual disability with deve...,{'http://www.geneontology.org/formats/oboInOwl...
4,Orphanet:377788,,"{'types': ['cliqueLeader', 'Node']}"


In [15]:
reach_df.id.unique()
print('nodes: {}'.format(len(reach_df.id.unique())))

nodes: 7


In [16]:
reach_df['node_type'] = reach_df.id.apply(lambda x: x.split(':')[0])
reach_df.node_type.value_counts()

Orphanet    4
DOID        1
OBO         1
UMLS        1
Name: node_type, dtype: int64

In [17]:
neighbors_df[neighbors_df.id.str.startswith('UMLS')]

Unnamed: 0,id,lbl,meta,node_type
7,UMLS:C3808991,,"{'category': ['disease'], 'types': ['disease',...",UMLS


In [18]:
# conclusion: reachables are xref

## Edge Queries (Monarch)

In [19]:
# api address
api = 'https://api.monarchinitiative.org/api'
endpoint = '/bioentity'

In [20]:
# get gene info
r = requests.get('{}{}/disease/{}'.format(api,endpoint,disease))
#r = requests.get('https://api.monarchinitiative.org/api/bioentity/gene/%s/phenotypes/'%gene, headers={'Accept':'application/json'})
r.headers
r.json()

{'foo': 'bar'}

In [21]:
with open('{}_id.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [22]:
# get disease-anatomy
r = requests.get('{}{}/disease/{}/anatomy/'.format(api,endpoint,disease))
r.json()

{'evidence_graph': {'edges': None, 'nodes': None},
 'evidence_types': None,
 'id': None,
 'object': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'object_extension': None,
 'provided_by': None,
 'publications': None,
 'qualifiers': None,
 'relation': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'types': None},
 'slim': None,
 'subject': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'subject_extension': None,
 'type': None}

In [23]:
# get disease-function
r = requests.get('{}{}/disease/{}/function/'.format(api, endpoint, disease))
r.json()

{'evidence_graph': {'edges': None, 'nodes': None},
 'evidence_types': None,
 'id': None,
 'object': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'object_extension': None,
 'provided_by': None,
 'publications': None,
 'qualifiers': None,
 'relation': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'types': None},
 'slim': None,
 'subject': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'subject_extension': None,
 'type': None}

In [24]:
# get disease-gene
r = requests.get('{}{}/disease/{}/genes/'.format(api,endpoint,disease))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))
r.json()

Number of nodes associated are 1


{'associations': [{'evidence_graph': {'edges': [{'obj': ':.well-known/genid/genoGM25347',
      'pred': 'GENO:0000222',
      'sub': ':.well-known/genid/person-3200-1'},
     {'obj': 'NCBIGene:55768',
      'pred': 'GENO:0000418',
      'sub': 'ClinVarVariant:126423'},
     {'obj': ':.well-known/genid/610661.0004-610661.0005',
      'pred': 'GENO:0000382',
      'sub': ':.well-known/genid/genoGM25990'},
     {'obj': 'OMIM:615273',
      'pred': 'GENO:0000840',
      'sub': 'ClinVarVariant:50962'},
     {'obj': 'OMIM:615273',
      'pred': 'RO:0002200',
      'sub': ':.well-known/genid/person-3200-1'},
     {'obj': 'OMIM:615273',
      'pred': 'RO:0002200',
      'sub': ':.well-known/genid/OMIM610661-615273VL'},
     {'obj': ':.well-known/genid/person-3200-1',
      'pred': 'OBAN:association_has_subject',
      'sub': 'MONARCH:42d05e731935a651eaf999c2aeb046987f01cca7'},
     {'obj': 'OMIM:615273',
      'pred': 'OBAN:association_has_object',
      'sub': 'MONARCH:a78544aa0feaec55807c289

In [25]:
with open('{}_genes.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [26]:
r_dict = r.json()
gda_df = json_normalize(r_dict, 'objects')
gda_df.columns = ['gene_id']
gda_df.head(2)

Unnamed: 0,gene_id
0,NCBIGene:55768


In [27]:
gda_df.to_csv('{}_disease_gene.tsv'.format(path), sep='\t', index=False, header=True)

In [28]:
# get disease-phenotype
r = requests.get('{}{}/disease/{}/phenotypes/'.format(api,endpoint,disease))
nassociations = len(r.json()['associations'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 8


In [29]:
with open('{}_phenotypes.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [30]:
gph_df = json_normalize(r.json(), 'objects')
gph_df.columns = ['phenotype_id']
gph_df.head(2)

Unnamed: 0,phenotype_id
0,HP:0001252
1,HP:0001263


In [31]:
gph_df.to_csv('{}_disease_phenotype.tsv'.format(path), sep='\t', index=False, header=True)

In [32]:
# get disease-substance
r = requests.get('{}{}/disease/{}/substance/'.format(api,endpoint,disease))
r.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Query Wikidata for Knowldege.Bio

In [33]:
# api address:
api = 'https://query.wikidata.org/sparql'

In [127]:
def generate_table(header, results):
    df = {}
    for res_d in results:
        for head in header:
            df[head] = []
        
    for res_d in results:
        for head in header:
            try:
                value = res_d[head]['value']
            except:
                value = 'NA'
            if value.startswith('http'):
                namespace, value = value.rsplit('/', 1)
            aux = df[head]
            aux.append(value)
            df[head] = aux
            
    try:
        results_df = pd.DataFrame.from_dict(df)
    except e:
        print(e)
        print(df)
        
    results_df = results_df[header]
    return results_df

In [34]:
# get doid: 
query = """SELECT DISTINCT ?doid ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P31 wd:Q12136 }
  UNION
  {?item wdt:P279 wd:Q12136 .}
  ?item wdt:P699 ?doid .
  values ?doid {"DOID:0060728"}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?doid ?itemLabel ?itemDesc"""

In [35]:
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
r.json()

{'head': {'vars': ['doid', 'item', 'itemLabel', 'altLabel', 'itemDesc']},
 'results': {'bindings': [{'altLabel': {'type': 'literal',
     'value': 'Alacrimia-choreoathetosis-liver dysfunction syndrome|congenital disorder of deglycosylation|congenital disorder of glycosylation type Iv|deficiency of N-glycanase 1|NGLY1-CDDG'},
    'doid': {'type': 'literal', 'value': 'DOID:0060728'},
    'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q28024539'},
    'itemDesc': {'type': 'literal',
     'value': 'Human disease',
     'xml:lang': 'en'},
    'itemLabel': {'type': 'literal',
     'value': 'NGLY1-deficiency',
     'xml:lang': 'en'}}]}}

In [130]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df

Unnamed: 0,doid,item,itemLabel,altLabel,itemDesc
0,DOID:0060728,Q28024539,NGLY1-deficiency,Alacrimia-choreoathetosis-liver dysfunction sy...,Human disease


In [131]:
df.to_csv('{}_doid_concepts_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [132]:
# get NCBIGene:
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {"55768"}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [133]:
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'})
r.json()

{'head': {'vars': ['id', 'item', 'itemLabel', 'altLabel', 'itemDesc']},
 'results': {'bindings': [{'altLabel': {'type': 'literal',
     'value': 'CDDG|CDG1V|N-glycanase 1|NGLY1|PNG1|PNGase'},
    'id': {'type': 'literal', 'value': '55768'},
    'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q18042037'},
    'itemDesc': {'type': 'literal',
     'value': 'protein-coding gene in the species Homo sapiens',
     'xml:lang': 'en'},
    'itemLabel': {'type': 'literal', 'value': 'NGLY1', 'xml:lang': 'en'}}]}}

In [134]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,55768,Q18042037,NGLY1,CDDG|CDG1V|N-glycanase 1|NGLY1|PNG1|PNGase,protein-coding gene in the species Homo sapiens


In [135]:
df.to_csv('{}_ncbigene_concepts_kb.tsv'.format(path), sep='\t', index=False, header=True)