In [1]:
import json
import os
import sys

import pandas as pd
import requests
from pandas.io.json import json_normalize

# Expansion of nodes using Monarch APIs
* https://api.monarchinitiative.org/api/#/
* https://scigraph-ontology.monarchinitiative.org/scigraph/docs/#/

In [2]:
# input gene
gene = 'NCBIGene:358'

# output files: every result file is written as '<path>_<suffix>'
path = 'aqp1-human-expansion/aqp1_human'
# Create the output directory up front so the later open()/to_csv() calls
# do not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

## Graph queries (SciGraph)

In [3]:
# api address
api = 'https://scigraph-ontology.monarchinitiative.org/scigraph'
endpoint = '/graph'

In [4]:
# get neighbors (JSON content)
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of at a later r.json().
r = requests.get('{}{}/neighbors/{}'.format(api, endpoint, gene), timeout=60)
r.raise_for_status()
r.headers

{'Content-Type': 'application/json', 'Cache-Control': 'no-transform, max-age=7200', 'Connection': 'keep-alive', 'Vary': 'Accept, Accept-Encoding', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Transfer-Encoding': 'chunked', 'Date': 'Thu, 25 May 2017 01:50:43 GMT', 'Content-Encoding': 'gzip'}

In [5]:
# Read results 
r.json()

{'edges': [{'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:358',
   'pred': 'IAO:0000136',
   'sub': 'PMID:19306058'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:358',
   'pred': 'IAO:0000136',
   'sub': 'PMID:9405233'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:358',
   'pred': 'IAO:0000136',
   'sub': 'PMID:22334691'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': ['http://purl.obolibrary.org/obo/IAO_0000136']},
   'obj': 'NCBIGene:358',
   'pred': 'IAO:0000136',
   'sub': 'PMID:17894331'},
  {'meta': {'isDefinedBy': ['https://data.monarchinitiative.org/ttl/ncbigene.ttl'],
    'lbl': [

In [6]:
with open('{}_neighbors.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [7]:
neighbors_df = json_normalize(r.json(), 'nodes')
neighbors_df.head()

Unnamed: 0,id,lbl,meta
0,PMID:21612401,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
1,PMID:16596446,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
2,PMID:17012249,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
3,PMID:26074259,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
4,PMID:17273788,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."


In [8]:
neighbors_df[neighbors_df.id.str.startswith('NCBIGene')]
# ncbi:1273 is replaced by ncbi:358 (discontinued)

Unnamed: 0,id,lbl,meta
126,NCBIGene:358,AQP1,{'synonym': ['channel-like integral membrane p...
140,NCBIGene:1273,CO,{'http://www.w3.org/2000/01/rdf-schema#label':...


In [9]:
# Analyze neighbours' types
# Count the distinct node ids directly; the bare unique() call that used to
# precede the print was evaluated and discarded.
print('nodes: {}'.format(neighbors_df.id.nunique(dropna=False)))

nodes: 288


In [10]:
# Tag every node with the text that precedes the first ':' of its id,
# then tally how many nodes fall under each prefix.
neighbors_df['node_type'] = neighbors_df.id.map(lambda node_id: node_id.partition(':')[0])
neighbors_df.node_type.value_counts()

PMID           279
NCBIGene         2
OMIM             1
CHR              1
SO               1
HGNC             1
NCBITaxon        1
ENSEMBL          1
MonarchData      1
Name: node_type, dtype: int64

In [11]:
# conclusion: neighbors are taxon, chr position, so, xref (ensembl, hgnc, omim) and provenance

In [12]:
# Filters
# TODO: filter by interaction_type -- still need to find out which string values interaction_type accepts.

In [13]:
# get reachable nodes (JSON content)
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of at a later r.json().
r = requests.get('{}{}/reachablefrom/{}'.format(api, endpoint, gene), timeout=60)
r.raise_for_status()
r.headers

{'Content-Type': 'application/json', 'Cache-Control': 'no-transform, max-age=7200', 'Connection': 'keep-alive', 'Vary': 'Accept, Accept-Encoding', 'Server': 'nginx/1.10.0 (Ubuntu)', 'Date': 'Thu, 25 May 2017 01:50:47 GMT', 'Content-Length': '447', 'Content-Encoding': 'gzip'}

In [14]:
# Read results
# Flatten the 'nodes' list of the response into a table. (A stray bare
# r.json() call that parsed the body and discarded it was removed.)
reach_df = json_normalize(r.json(), 'nodes')
reach_df.head()

Unnamed: 0,id,lbl,meta
0,MonarchData:ncbigene.ttl,,"{'types': ['Ontology', 'cliqueLeader', 'Node']}"
1,CHR:9606chr7p14.3,,"{'types': ['cliqueLeader', 'Node', 'NamedIndiv..."
2,NCBITaxon:9606,Homo sapiens,"{'synonym': ['man', 'humans', 'human'], 'http:..."
3,SO:0001217,protein_coding_gene,"{'synonym': ['protein coding gene'], 'http://w..."


In [15]:
# Count the distinct reachable node ids; the bare unique() call that used to
# precede the print was evaluated and discarded.
print('nodes: {}'.format(reach_df.id.nunique(dropna=False)))

nodes: 4


In [16]:
reach_df['node_type'] = reach_df.id.apply(lambda x: x.split(':')[0])
reach_df.node_type.value_counts()

SO             1
CHR            1
MonarchData    1
NCBITaxon      1
Name: node_type, dtype: int64

In [17]:
# conclusion: reachables are taxon, chr, so

## Edge Queries (Monarch)

In [18]:
# api address
api = 'https://api.monarchinitiative.org/api'
endpoint = '/bioentity'

In [19]:
# get gene info
# Fail fast on HTTP errors / stalled connections; the dead r.headers statement
# and the commented-out alternative request were removed.
r = requests.get('{}{}/gene/{}'.format(api, endpoint, gene), timeout=60)
r.raise_for_status()
r.json()

{'categories': ['gene', 'sequence feature'],
 'chromosome': {'categories': None,
  'consider': None,
  'deprecated': None,
  'description': None,
  'id': None,
  'label': None,
  'replaced_by': None,
  'synonyms': None,
  'taxon': {'id': None, 'label': None},
  'types': None,
  'xrefs': None},
 'consider': None,
 'deprecated': None,
 'description': None,
 'disease_associations': [{'evidence_graph': {'edges': [{'obj': 'ECO:0000033',
      'pred': 'RO:0002558',
      'sub': 'MONARCH:8e8f5c993a9ac478fb84f162b39dfdd2b5cf56fe'},
     {'obj': 'DOID:899',
      'pred': 'RO:0002607',
      'sub': ':.well-known/genid/NCBIGene358-MESHD015529VL'},
     {'obj': 'NCBIGene:358',
      'pred': 'GENO:0000418',
      'sub': ':.well-known/genid/NCBIGene358-MESHD015529VL'},
     {'obj': ':.well-known/genid/NCBIGene358-MESHD015529VL',
      'pred': 'OBAN:association_has_subject',
      'sub': 'MONARCH:8e8f5c993a9ac478fb84f162b39dfdd2b5cf56fe'},
     {'obj': 'PMID:18988797',
      'pred': 'dc:source',
    

In [20]:
with open('{}_id.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [21]:
# get gene-gene interactions
# Fail fast on HTTP errors so the count below is never computed from an error payload.
r = requests.get('{}{}/gene/{}/interactions/'.format(api, endpoint, gene), timeout=60)
r.raise_for_status()
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 42


In [22]:
with open('{}_interactions.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [23]:
ggi_df = json_normalize(r.json(), 'objects')
ggi_df.columns = ['gene_id']
ggi_df.head(2)

Unnamed: 0,gene_id
0,NCBIGene:100996717
1,NCBIGene:10188


In [24]:
ggi_df.to_csv('{}_ggi.tsv'.format(path), sep='\t', index=False, header=True)

In [25]:
# get gene-phenotype
r = requests.get('{}{}/gene/{}/phenotypes/'.format(api,endpoint,gene))
nassociations = len(r.json()['associations'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 0


In [26]:
# Get gene-disease
r = requests.get('{}{}/gene/{}/diseases/'.format(api,endpoint,gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 13


In [27]:
with open('{}_diseases.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [28]:
r_dict = r.json()
gda_df = json_normalize(r_dict, 'objects')
gda_df.columns = ['disease_id']
gda_df.head(2)

Unnamed: 0,disease_id
0,DOID:0060164
1,DOID:10763


In [29]:
gda_df.to_csv('{}_gene_disease.tsv'.format(path), sep='\t', index=False, header=True)

In [30]:
# get gene-function
r = requests.get('{}{}/gene/{}/function/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))
r.json()

Number of nodes associated are 0


{'associations': [],
 'compact_associations': None,
 'facet_counts': {'isa_partof_closure': {}, 'taxon_label': {}},
 'facet_pivot': None,
 'numFound': None,
 'objects': [],
 'start': None}

In [31]:
# get gene-expressedInAnatomy
r = requests.get('{}{}/gene/{}/expressed/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 20


In [32]:
with open('{}_expressed.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [33]:
r_data = r.json()
gaa_df = json_normalize(r_data, 'objects')
gaa_df.columns = ['anatomy_id']
gaa_df.head(2)

Unnamed: 0,anatomy_id
0,UBERON:0000007
1,UBERON:0000057


In [34]:
gaa_df.to_csv('{}_gene_anatomy.tsv'.format(path), sep='\t', index=False, header=True)

In [35]:
# get gene-pub
r = requests.get('{}{}/gene/{}/pubs/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 0


In [36]:
# get gene-homolog
r = requests.get('{}{}/gene/{}/homologs/'.format(api, endpoint, gene))
nassociations = len(r.json()['objects'])
print('Number of nodes associated are {}'.format(nassociations))

Number of nodes associated are 49


In [37]:
with open('{}_homologs.json'.format(path), 'w') as f:
    json.dump(r.json(), f, sort_keys=True, indent=4)

In [38]:
r_dict = r.json()
gha_df = json_normalize(r_dict, 'objects')
gha_df.columns = ['homolog_id']
gha_df.head(2)

Unnamed: 0,homolog_id
0,FlyBase:FBgn0015872
1,FlyBase:FBgn0033635


In [39]:
gha_df.to_csv('{}_gene_homolog.tsv'.format(path), sep='\t', index=False, header=True)

## Query Wikidata for Knowledge.Bio

In [40]:
# api address:
api = 'https://query.wikidata.org/sparql'

In [41]:
def generate_table(header, results):
    """Tabulate SPARQL JSON result bindings as a DataFrame.

    Parameters
    ----------
    header : list of str
        Variable names to extract; they become the columns, in this order.
    results : list of dict
        Result bindings; each maps a variable name to a dict with a 'value' key.

    Returns
    -------
    pandas.DataFrame
        One row per binding. A variable that is absent from a binding is
        stored as the string 'NA'. Values starting with 'http' are reduced to
        the text after their last '/'. An empty ``results`` list yields an
        empty frame that still carries the ``header`` columns.
    """
    columns = {head: [] for head in header}

    for res_d in results:
        for head in header:
            try:
                value = res_d[head]['value']
            except (KeyError, TypeError):
                # Variable not bound in this row: keep a placeholder.
                value = 'NA'
            if value.startswith('http'):
                # Keep only the trailing path segment of the URI.
                value = value.rsplit('/', 1)[-1]
            columns[head].append(value)

    # Passing columns= fixes the column order and keeps the columns even when
    # there are no rows at all.
    return pd.DataFrame(columns, columns=list(header))

In [42]:
# get NCBIGene:
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {"358"}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [43]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [44]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,358,Q14905238,AQP1,CO|AQP-CHIP|AQP1|aquaporin 1 (Colton blood gro...,protein-coding gene in the species Homo sapiens


In [45]:
df.to_csv('{}_subject_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [46]:
# get NCBIGene (ggi):
# get input_list
# Read back the interactor table this notebook wrote to '<path>_ggi.tsv'
# instead of a machine-specific absolute path, so the cell runs anywhere.
input_df = pd.read_table('{}_ggi.tsv'.format(path))
# Quote the part after ':' of each gene_id for the SPARQL VALUES clause.
input_df['id'] = input_df.gene_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"100996717" "10188" "102466755" "10253" "10488" "11007" "11043" "123722" "125115" "158405" "22806" "22807" "23281" "2353" "26575" "284001" "285622" "30008" "339834" "358" "373" "386675" "386682" "3881" "3884" "4188" "54507" "54793" "55118" "64651" "64753" "6477" "6925" "7185" "7186" "79734" "80817" "83755" "83899" "85291" "85376" "8601"'

In [47]:
# query
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [48]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [49]:
# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['gene_id']].copy()
input_df['id'] = input_df.gene_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['gene_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,gene_id,item,itemLabel,altLabel,itemDesc
0,NCBIGene:100996717,Q20764587,LOC100996717,LOC100996717|notch homolog 2 N-terminal-like p...,protein-coding gene in the species Homo sapiens
1,NCBIGene:10188,Q18035104,TNK2,ACK|ACK-1|ACK1|p21cdc42Hs|TNK2|tyrosine kinase...,protein-coding gene in the species Homo sapiens


In [50]:
output_df.to_csv('{}_ncbi_interactors_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [51]:
# get gene-disease

# get input_list
# Read back the table this notebook wrote to '<path>_gene_disease.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_disease.tsv'.format(path))
# Keep only ids with the 'DOID:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.disease_id.str.startswith('DOID:')].copy()
# The full id (prefix included) is quoted for the SPARQL VALUES clause.
input_df['id'] = input_df.disease_id.apply(lambda x: '"' + str(x) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"DOID:0060164" "DOID:10763" "DOID:14115" "DOID:7998" "DOID:8850" "DOID:899"'

In [52]:
# get doid:
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?doid ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P31 wd:Q12136 } # P31 = instanceOf
  UNION
  {?item wdt:P279 wd:Q12136 .} # P279 = subClassOf
  ?item wdt:P699 ?doid . # P699 = DO ID
  values ?doid {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?doid ?itemLabel ?itemDesc"""

In [53]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [54]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,doid,item,itemLabel,altLabel,itemDesc
0,DOID:899,Q2027937,choledochal cyst,choledochal cyst|Congenital choledochal cyst|C...,congenital disorder of digestive system
1,DOID:14115,Q1128440,toxic shock syndrome,TSS|toxic shock|Toxic shock syndrome|Toxic Sho...,Human disease


In [55]:
# merge input with response
# The response is keyed by 'doid', so the key column is renamed on a separate
# frame; input_df itself is left untouched, which keeps this cell re-runnable
# (renaming input_df in place made a second run fail on 'disease_id').
doid_keys_df = input_df[['disease_id']].rename(columns={'disease_id': 'doid'})
output_df = doid_keys_df.merge(df, how="left", on="doid")
output_df = output_df[['doid', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df = output_df.rename(columns={'doid': 'disease_id'})
output_df.head(2)

Unnamed: 0,disease_id,item,itemLabel,altLabel,itemDesc
0,DOID:0060164,Q3297888,pain disorder,,
1,DOID:10763,Q41861,hypertension,High blood pressure (& [essential hypertension...,high blood pressure & essential hypertension


In [56]:
doid_df = output_df

In [57]:
# get mesh
# get input_list
# Read back the table this notebook wrote to '<path>_gene_disease.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_disease.tsv'.format(path))
# Keep only ids with the 'MESH:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.disease_id.str.startswith('MESH:')].copy()
input_df['id'] = input_df.disease_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"D002277" "D003528" "D004489" "D008325" "D013119" "D015674"'

In [58]:
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  ?item wdt:P486 ?id . # P486 = MESH ID
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [59]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [60]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,D002277,Q7577496,spindle cell carcinoma,sarcomatoid carcinoma|spindle cell carcinoma (...,A squamous cell carcinoma that is usually comp...
1,D015674,Q30015729,Animal Mammary Neoplasms,Animal Mammary Neoplasm,Tumors or cancer of the MAMMARY GLAND in anima...


In [61]:
# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['disease_id']].copy()
input_df['id'] = input_df.disease_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['disease_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,disease_id,item,itemLabel,altLabel,itemDesc
0,MESH:D002277,Q7577496,spindle cell carcinoma,sarcomatoid carcinoma|spindle cell carcinoma (...,A squamous cell carcinoma that is usually comp...
1,MESH:D002277,Q33525,carcinoma,epithelioma|malignant Epithelioma,A cell type cancer that has_material_basis_in ...


In [62]:
mesh_df = output_df

In [63]:
# get omim
# get input_list
# Read back the table this notebook wrote to '<path>_gene_disease.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_disease.tsv'.format(path))
# Keep only ids with the 'OMIM:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.disease_id.str.startswith('OMIM:')].copy()
input_df['id'] = input_df.disease_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"263200"'

In [64]:
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  ?item wdt:P492 ?id . # P492 = OMIM ID
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [65]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [66]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,263200,Q1044327,Caroli disease,,


In [67]:
# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['disease_id']].copy()
input_df['id'] = input_df.disease_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['disease_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head()

Unnamed: 0,disease_id,item,itemLabel,altLabel,itemDesc
0,OMIM:263200,Q1044327,Caroli disease,,


In [68]:
omim_df = output_df

In [69]:
disease_df = pd.concat([doid_df,mesh_df,omim_df])
disease_df

Unnamed: 0,disease_id,item,itemLabel,altLabel,itemDesc
0,DOID:0060164,Q3297888,pain disorder,,
1,DOID:10763,Q41861,hypertension,High blood pressure (& [essential hypertension...,high blood pressure & essential hypertension
2,DOID:14115,Q1128440,toxic shock syndrome,TSS|toxic shock|Toxic shock syndrome|Toxic Sho...,Human disease
3,DOID:7998,Q16499,hyperthyroidism,hyperthyreosis|overactive thyroid,A thyroid gland disease that involves an over ...
4,DOID:8850,Q18558026,salivary gland cancer,malignant neoplasm of salivary gland|malignant...,An oral cavity cancer that is located_in the s...
5,DOID:899,Q2027937,choledochal cyst,choledochal cyst|Congenital choledochal cyst|C...,congenital disorder of digestive system
0,MESH:D002277,Q7577496,spindle cell carcinoma,sarcomatoid carcinoma|spindle cell carcinoma (...,A squamous cell carcinoma that is usually comp...
1,MESH:D002277,Q33525,carcinoma,epithelioma|malignant Epithelioma,A cell type cancer that has_material_basis_in ...
2,MESH:D003528,Q18556510,salivary gland adenoid cystic carcinoma,adenoid cystic cancer|adenoid cystic carcinoma...,Human disease
3,MESH:D003528,Q356005,Adenoid cystic carcinoma,,


In [70]:
disease_df.to_csv('{}_diseases_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [71]:
# get uberon
# get input_list
# Read back the table this notebook wrote to '<path>_gene_anatomy.tsv'
# instead of a machine-specific absolute path, so the cell runs anywhere.
input_df = pd.read_table('{}_gene_anatomy.tsv'.format(path))
# Quote the part after ':' of each anatomy_id for the SPARQL VALUES clause.
input_df['id'] = input_df.anatomy_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"0000007" "0000057" "0000059" "0000399" "0001154" "0001162" "0001385" "0001388" "0001831" "0001976" "0002021" "0002082" "0002113" "0002127" "0002190" "0002384" "0004358" "0004359" "0004720" "0007318"'

In [72]:
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  ?item wdt:P1554 ?id . # P1554 = UBERON ID
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [73]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [74]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,2190,Q30015788,subcutaneous adipose tissue,,
1,2082,Q30015780,cardiac ventricle,,


In [75]:
# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['anatomy_id']].copy()
input_df['id'] = input_df.anatomy_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['anatomy_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,anatomy_id,item,itemLabel,altLabel,itemDesc
0,UBERON:0000007,Q156871,pituitary gland,hypophysis,endocrine gland
1,UBERON:0000057,Q9386,urethra,,


In [76]:
output_df.to_csv('{}_anatomies_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [77]:
# get orthologs (ncbigene)
# get input_list
# Read back the table this notebook wrote to '<path>_gene_homolog.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_homolog.tsv'.format(path))
# Keep only ids with the 'NCBIGene:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.homolog_id.str.startswith('NCBIGene:')].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"100069916" "100567659" "106997467" "25240" "282653" "343" "359" "361" "362" "363" "403732" "420384" "4284" "463330" "559284" "816186" "817123" "818255" "818293" "818294" "818487" "819204" "819564" "820870" "822259" "823898" "824510" "824647" "824862" "825316" "827446" "827956" "828051" "828439" "829662" "831947" "834794" "836187" "838359" "839235" "841648" "843653"'

In [78]:
# query
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

In [79]:
# Fail here on an HTTP error (rejected or timed-out query) rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

In [80]:
header_l = r.json()['head']['vars']
results_l = r.json()['results']['bindings']
df = generate_table(header_l, results_l)
df.head(2)

Unnamed: 0,id,item,itemLabel,altLabel,itemDesc
0,817123,Q30016643,TIP4;1,,
1,361,Q14902502,AQP4,AQP4|aquaporin 4|HMIWC2|MIWC|WCH4,protein-coding gene in the species Homo sapiens


In [81]:
# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['homolog_id']].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,NCBIGene:100069916,Q30016192,Aqp1,,
1,NCBIGene:100567659,Q30016565,Aqp1,,


In [82]:
ncbigene_df = output_df

In [83]:
# get orthologs (fly)
# get input_list
# Read back the table this notebook wrote to '<path>_gene_homolog.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_homolog.tsv'.format(path))
# Keep only ids with the 'FlyBase:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.homolog_id.str.startswith('FlyBase:')].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"FBgn0015872" "FBgn0033635" "FBgn0034882" "FBgn0034884"'

In [84]:
# query
# P3852 is the FlyBase gene id property (the in-query comment used to say "ncbi gene").
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P3852 ?id .} # flybase gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

# Fail here on an HTTP error rather than at r.json() below.
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# .copy() avoids SettingWithCopyWarning when the 'id' key column is added.
input_df = input_df[['homolog_id']].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
fly_df = output_df
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,FlyBase:FBgn0015872,Q29717328,Drip,I|DRIP|CG9023 gene product from transcript CG9...,protein-coding gene in the species Drosophila ...
1,FlyBase:FBgn0033635,Q29717311,Prip,CG7777 gene product from transcript CG7777-RB|...,protein-coding gene in the species Drosophila ...


In [85]:
# get orthologs (mgi)
# get input_list
# Read back the table this notebook wrote to '<path>_gene_homolog.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_homolog.tsv'.format(path))
# Keep only ids with the 'MGI:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.homolog_id.str.startswith('MGI:')].copy()
# The full id (prefix included) is quoted for the SPARQL VALUES clause.
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"MGI:103201"'

In [86]:
# query
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P671 ?id .} # mgi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

# Fail here on an HTTP error rather than at r.json() below.
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# The join key is the full homolog_id here; .copy() avoids
# SettingWithCopyWarning when the 'id' key column is added.
input_df = input_df[['homolog_id']].copy()
input_df['id'] = input_df.homolog_id
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
mgi_df = output_df
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,MGI:103201,Q14905239,Aqp1,Aqp1|CHIP28|aquaporin 1,protein-coding gene in the species Mus musculus


In [87]:
# get orthologs (zfin)
# get input_list
# Read back the table this notebook wrote to '<path>_gene_homolog.tsv'
# instead of a machine-specific absolute path.
input_df = pd.read_table('{}_gene_homolog.tsv'.format(path))
# Keep only ids with the 'ZFIN:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.homolog_id.str.startswith('ZFIN:')].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"ZDB-GENE-030131-7764" "ZDB-GENE-100408-2"'

In [88]:
# query
# P3870 is the ZFIN gene id property (the in-query comment used to say "ncbi gene").
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P3870 ?id .} # zfin gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

# Fail here on an HTTP error rather than at r.json() below.
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()

# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# .copy() avoids SettingWithCopyWarning when the 'id' key column is added.
input_df = input_df[['homolog_id']].copy()
input_df['id'] = input_df.homolog_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['homolog_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
zfin_df = output_df
output_df.head(2)

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,ZFIN:ZDB-GENE-030131-7764,Q29763978,aqp1a.1,"aquaporin 1a (Colton blood group), tandem dupl...",protein-coding gene in the species Danio rerio
1,ZFIN:ZDB-GENE-100408-2,Q30016640,aqp1l,,


In [89]:
output_df = pd.concat([fly_df,mgi_df,ncbigene_df,zfin_df])
output_df

Unnamed: 0,homolog_id,item,itemLabel,altLabel,itemDesc
0,FlyBase:FBgn0015872,Q29717328,Drip,I|DRIP|CG9023 gene product from transcript CG9...,protein-coding gene in the species Drosophila ...
1,FlyBase:FBgn0033635,Q29717311,Prip,CG7777 gene product from transcript CG7777-RB|...,protein-coding gene in the species Drosophila ...
2,FlyBase:FBgn0034882,Q29723074,Eglp1,Dmel_CG5398|Entomoglyceroporin 1|anon-WO014051...,protein-coding gene in the species Drosophila ...
3,FlyBase:FBgn0034884,Q29723261,Eglp3,Dmel_CG17662|Entomoglyceroporin 3|Aqp17662|CG1...,protein-coding gene in the species Drosophila ...
0,MGI:103201,Q14905239,Aqp1,Aqp1|CHIP28|aquaporin 1,protein-coding gene in the species Mus musculus
0,NCBIGene:100069916,Q30016192,Aqp1,,
1,NCBIGene:100567659,Q30016565,Aqp1,,
2,NCBIGene:106997467,Q30016602,Aqp1,,
3,NCBIGene:25240,Q24420581,Aqp1,Aqp1|CHIP28|aquaporin 1,protein-coding gene in the species Rattus norv...
4,NCBIGene:282653,Q30016633,Aqp1,,


In [90]:
#output_df = pd.concat([ncbigene_df,fly_df, mgi_df, zfin_df])
output_df.to_csv('{}_homologs_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [91]:
# get orthologs (all id mapped to ncbigene)
# get input_list
# NOTE(review): this file is not written anywhere in the visible notebook and the
# path is machine-specific -- presumably produced by a separate mapping step;
# confirm and consider deriving it from `path`.
input_df = pd.read_table('/home/nuria/workspace/ngly1_hg/aqp1_human_expansion/aqp1_human_gene_homolog_ncbigene.tsv')
# Keep only ids with the 'NCBIGene:' prefix; .copy() makes the subset independent
# so the column assignment below does not raise SettingWithCopyWarning.
input_df = input_df[input_df.homolog_ncbigene_id.str.startswith('NCBIGene:')].copy()
input_df['id'] = input_df.homolog_ncbigene_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
input_s = ' '.join(input_l)
input_s

'"36236" "36237" "37736" "37738" "11826" "100069916" "100567659" "106997467" "25240" "282653" "343" "359" "361" "362" "363" "403732" "420384" "4284" "463330" "559284" "816186" "817123" "818255" "818293" "818294" "818487" "819204" "819564" "820870" "822259" "823898" "824510" "824647" "824862" "825316" "827446" "827956" "828051" "828439" "829662" "831947" "834794" "836187" "838359" "839235" "841648" "843653" "335821"'

In [92]:
# query
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P351 ?id .} # ncbi gene
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

# Fail here on an HTTP error rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()
#r.json()

In [93]:
# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['homolog_ncbigene_id']].copy()
input_df['id'] = input_df.homolog_ncbigene_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['homolog_ncbigene_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,homolog_ncbigene_id,item,itemLabel,altLabel,itemDesc
0,NCBIGene:36236,Q29717328,Drip,I|DRIP|CG9023 gene product from transcript CG9...,protein-coding gene in the species Drosophila ...
1,NCBIGene:36237,Q29717311,Prip,CG7777 gene product from transcript CG7777-RB|...,protein-coding gene in the species Drosophila ...


In [94]:
# no concat 
output_df.to_csv('{}_homologs_ncbigene_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [96]:
# get paralogs
# get the input list of proteins
# NOTE(review): this file is not written anywhere in the visible notebook and the
# path is machine-specific -- presumably produced by a separate step; confirm and
# consider deriving it from `path`.
input_df = pd.read_table('/home/nuria/workspace/ngly1_hg/aqp1_human_expansion/aqp1_human_paralogs.tsv')
# Wrap the part after ':' of each protein_id in double quotes.
input_df['id'] = input_df.protein_id.apply(lambda x: '"' + str(x.split(':')[1]) + '"')
input_l = list(input_df['id'])
# Space-separated string of the quoted ids.
input_s = ' '.join(input_l)
input_s

'"P29972" "O94778" "P41181" "P55087" "P55064" "Q13520" "P30301"'

In [97]:
# query
# Aliases and description are fetched in separate OPTIONAL blocks so that an
# item lacking English aliases still gets its English description (and vice versa).
query = """SELECT DISTINCT ?id ?item ?itemLabel (group_concat(distinct ?itemaltLabel; separator="|") as ?altLabel) ?itemDesc
WHERE
{
  {?item wdt:P352 ?id .} # uniprot id
  values ?id {""" + input_s + """}
  OPTIONAL{
  ?item skos:altLabel ?itemaltLabel .
    FILTER(LANG(?itemaltLabel) = "en")
  }
  OPTIONAL{
  ?item schema:description ?itemDesc .
    FILTER(LANG(?itemDesc) = "en")
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
group by ?item ?id ?itemLabel ?itemDesc"""

# Fail here on an HTTP error rather than at a later r.json().
r = requests.post(api, data={'query': query}, headers={'Accept':'application/sparql-results+json'}, timeout=120)
r.raise_for_status()
#r.json()

In [101]:
# Parse the SPARQL response once and tabulate it.
response_d = r.json()
header_l = response_d['head']['vars']
results_l = response_d['results']['bindings']
df = generate_table(header_l, results_l)

# merge input with response
# .copy() gives an independent frame, so adding 'id' does not raise
# SettingWithCopyWarning; the join key is spelled out instead of inferred.
input_df = input_df[['protein_id']].copy()
input_df['id'] = input_df.protein_id.apply(lambda x: x.split(':')[1])
output_df = input_df.merge(df, how="left", on="id")
output_df = output_df[['protein_id', 'item', 'itemLabel', 'altLabel', 'itemDesc']]
output_df.head(2)

Unnamed: 0,protein_id,item,itemLabel,altLabel,itemDesc
0,UniProt:P29972,Q2083074,Aquaporin 1 (Colton blood group),AQP1|Aquaporin-CHIP|Water channel protein for ...,mammalian protein found in Homo sapiens
1,UniProt:O94778,Q21106460,Aquaporin 8,AQP8|aquaporin-8,mammalian protein found in Homo sapiens


In [102]:
output_df.to_csv('{}_paralogs_concept_kb.tsv'.format(path), sep='\t', index=False, header=True)

In [95]:
# get taxon, chr, so