# Query WikiData to get Biomedical Entities

We will get the nodes (and later some edges) for our biomedical graph from WikiData.

In [1]:
import pandas as pd
from pathlib import Path
from data_tools.df_processing import char_combine_iter
from data_tools.wiki import node_query_pipeline

In [2]:
# Collects the result of each per-entity-type query below; the entries are
# combined into a single table near the end of the notebook.
nodes = []

# Diseases

In [3]:
# SPARQL for disease nodes: selects each item, its label, and whichever of the
# optional cross-reference identifiers it carries.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """ SELECT DISTINCT ?disease ?diseaseLabel ?umlscui ?snomed_ct ?doid ?mesh ?mondo ?omim ?orpha
        WHERE {

          # Initial typing for Disease 
          # Either instance of Disease or has a Disease Ontology ID
          {?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.

          OPTIONAL {?disease wdt:P2892 ?umlscui .}
          OPTIONAL {?disease wdt:P5806 ?snomed_ct. }
          OPTIONAL {?disease wdt:P699 ?doid. }
          OPTIONAL {?disease wdt:P486 ?mesh. }
          OPTIONAL {?disease wdt:P5270 ?mondo. }
          OPTIONAL {?disease wdt:P492 ?omim. }
          OPTIONAL {?disease wdt:P1550 ?orpha. }

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [4]:
# Query variable -> prefix used for that identifier in the resulting `xrefs` column.
dis_curi_map = dict(
    umlscui='UMLS',
    snomed_ct='SNOMED',
    mesh='MESH',
    doid='DOID',
    mondo='MONDO',
    omim='OMIM',
    orpha='ORPHA',
)

# Run the disease query, keep the result for the final concatenation, and preview it.
res = node_query_pipeline(q, dis_curi_map, 'disease')
nodes.append(res)
nodes[0].head()

Unnamed: 0,id,name,label,xrefs
0,Q1002195,autosomal recessive limb-girdle muscular dystr...,Disease,DOID:0110297|MONDO:0012248|OMIM:609308|UMLS:C1...
1,Q1003534,bulbar syndrome,Disease,
2,Q1004647,bullous pemphigoid,Disease,DOID:8506|MESH:D010391|MONDO:0019082|UMLS:C003...
3,Q1016288,buried bumper syndrome,Disease,
4,Q1016605,Burkitt lymphoma,Disease,DOID:8584|MESH:D002051|OMIM:113970|UMLS:C0006413


# Compounds

In [5]:
# SPARQL for compound nodes: selects each item typed as wd:Q11173, its label,
# and whichever of the optional cross-reference identifiers it carries.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?compound ?compoundLabel ?kegg_drug ?chebi ?drugbank_id ?umlscui ?chembl_id ?unii ?ikey ?pubchem_cid ?rxnorm ?mesh_supplemental_record_ui ?mesh_descriptor_ui
        WHERE {

          # Initial typing for Compound
          ?compound wdt:P31 wd:Q11173 .

          OPTIONAL { ?compound wdt:P665 ?kegg_drug .}
          OPTIONAL { ?compound wdt:P683 ?chebi .}
          OPTIONAL { ?compound wdt:P715 ?drugbank_id .}
          OPTIONAL { ?compound wdt:P2892 ?umlscui .}
          OPTIONAL { ?compound wdt:P592 ?chembl_id .}
          OPTIONAL { ?compound wdt:P652 ?unii .}
          OPTIONAL { ?compound wdt:P3350 ?ikey .}
          OPTIONAL { ?compound wdt:P662 ?pubchem_cid .}
          OPTIONAL { ?compound wdt:P3345 ?rxnorm .}
          OPTIONAL { ?compound wdt:P6680 ?mesh_supplemental_record_ui .}
          OPTIONAL { ?compound wdt:P486 ?mesh_descriptor_ui .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [6]:
# Query variable -> prefix used for that identifier in the resulting `xrefs` column.
# Both MeSH variables share the same 'MESH' prefix.
chem_curi_map = dict(
    unii='UNII',
    rxnorm='RxCUI',
    drugbank_id='DB',
    umlscui='UMLS',
    chebi='CHEBI',
    chembl_id='CHEMBL',
    kegg_drug='KEGG',
    ikey='IKEY',
    pubchem_cid='PCID',
    mesh_supplemental_record_ui='MESH',
    mesh_descriptor_ui='MESH',
)

# Run the compound query, keep the result for the final concatenation, and preview it.
res = node_query_pipeline(q, chem_curi_map, 'compound')
nodes.append(res)
nodes[1].head()

Unnamed: 0,id,name,label,xrefs
0,Q1002152,fluoroiodomethane,Compound,PCID:13981373
1,Q1002165,fluticasone,Compound,CHEBI:5134|CHEMBL:CHEMBL1201396|KEGG:C07815|KE...
2,Q1003185,fosfestrol,Compound,CHEBI:4532|CHEMBL:CHEMBL1200598|KEGG:C08145|KE...
3,Q1003526,(S)-(+)-bulbocapnine,Compound,CHEBI:3211|CHEMBL:CHEMBL157912|KEGG:C09367|PCI...
4,Q1007138,gabaculine,Compound,CHEBI:29585|CHEMBL:CHEMBL1394922|PCID:3445


# Phenotype

In [7]:
# SPARQL for phenotype nodes: items that are either typed as wd:Q169872 or carry
# a P3841 value, with their label and optional cross-reference identifiers.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?phenotype ?phenotypeLabel ?hpo ?mesh ?omim ?snomed
        WHERE {

          # Initial typing for phenotype
          {?phenotype wdt:P31 wd:Q169872.}UNION{?phenotype wdt:P3841 ?hpo}

          # Xrefs associated with phenotypes
          OPTIONAL {?phenotype wdt:P3841 ?hpo .}
          OPTIONAL {?phenotype wdt:P486 ?mesh . }
          OPTIONAL {?phenotype wdt:P492 ?omim . }
          OPTIONAL {?phenotype wdt:P5806 ?snomed . }

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [8]:
# Run the phenotype query (query variable -> xref prefix given inline),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(mesh='MESH', omim='OMIM', hpo='HP', snomed='SNOMED'),
    'phenotype',
)
nodes.append(res)
nodes[2].head()

Unnamed: 0,id,name,label,xrefs
0,Q1016605,Burkitt lymphoma,Phenotype,HP:0030080|MESH:D002051|OMIM:113970
1,Q101971,wart,Phenotype,HP:0200043|MESH:D014860
2,Q101991,inflammation,Phenotype,MESH:D007249
3,Q10265550,acid–base imbalance,Phenotype,HP:0004360|MESH:D000137
4,Q1027995,pyloric stenosis,Phenotype,HP:0002021|MESH:D011707


# Gene

Genes are too numerous and will require filtering to a single taxon in order for the query to finish successfully.

For now we will only extract human genes, but in the future we will do the same for infectious taxa.

In [9]:
# SPARQL template for gene nodes, restricted to one taxon through the `{tax}`
# placeholder. Literal SPARQL braces are doubled so that str.format leaves them intact.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?gene ?geneLabel ?entrez ?symbol ?hgnc ?omim ?ensembl
        WHERE {{

          # Initial typing for Gene
          ?gene wdt:P31 wd:Q7187.
          ?gene wdt:P703 wd:{tax}.

          OPTIONAL{{?gene wdt:P351 ?entrez .}}
          OPTIONAL{{?gene wdt:P353 ?symbol .}}
          OPTIONAL{{?gene wdt:P354 ?hgnc .}}
          OPTIONAL{{?gene wdt:P492 ?omim .}}
          OPTIONAL{{?gene wdt:P594 ?ensembl .}}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
        }}"""

# Wikidata item used as the human taxon; also reused by the protein query below.
human_tax_wd_id = 'Q15978631'
q = q.format(tax=human_tax_wd_id)

In [10]:
# Query variable -> prefix used for that identifier in the resulting `xrefs` column.
gene_curi_map = dict(
    entrez='NCBIGene',
    symbol='SYM',
    hgnc='HGNC',
    omim='OMIM',
    ensembl='ENSG',
)

# Run the gene query, keep the result for the final concatenation, and preview it.
res = node_query_pipeline(q, gene_curi_map, 'gene')
nodes.append(res)
nodes[3].head()

Unnamed: 0,id,name,label,xrefs
0,Q1022703,ZMPSTE24,Gene,ENSG:ENSG00000084073|HGNC:12877|NCBIGene:10269...
1,Q1145906,HADHA,Gene,ENSG:ENSG00000084754|HGNC:4801|NCBIGene:3030|O...
2,Q1161264,ETFDH,Gene,ENSG:ENSG00000171503|HGNC:3483|NCBIGene:2110|O...
3,Q1181014,DNM1,Gene,ENSG:ENSG00000106976|HGNC:2972|NCBIGene:1759|O...
4,Q11868650,SPEF2,Gene,ENSG:ENSG00000152582|HGNC:26293|NCBIGene:79925...


# Protein

In [11]:
# SPARQL template for protein nodes, restricted to one taxon through the `{tax}`
# placeholder. Literal SPARQL braces are doubled so that str.format leaves them intact.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?protein ?proteinLabel ?uniprot
        WHERE {{

          # Initial typing for Protein
          ?protein wdt:P31 wd:Q8054.
          ?protein wdt:P703 wd:{tax}.

          OPTIONAL{{?protein wdt:P352 ?uniprot .}}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
        }}"""
# Uses the taxon id defined in the gene query cell.
q = q.format(tax=human_tax_wd_id)

In [12]:
# Run the protein query (single xref variable, prefixed 'UniProt'),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(uniprot='UniProt'),
    'protein',
)
nodes.append(res)
nodes[4].head()

Unnamed: 0,id,name,label,xrefs
0,Q1024612,C-X-C motif chemokine receptor 5,Protein,UniProt:P32302
1,Q1032902,"Mucin 16, cell surface associated",Protein,UniProt:Q8WXI7
2,Q1056532,CD44 molecule (Indian blood group),Protein,UniProt:P16070
3,Q1058190,CD9 molecule,Protein,UniProt:P21926
4,Q1069599,C-C motif chemokine ligand 17,Protein,UniProt:Q92583


# Pathway

In [13]:
# SPARQL for pathway nodes: items typed as wd:Q4915012, their label, and the
# optional ?react / ?wpid identifiers.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?pathway ?pathwayLabel ?react ?wpid
        WHERE {

          # Initial typing for Pathway
          ?pathway wdt:P31 wd:Q4915012 .

          OPTIONAL{?pathway wdt:P3937 ?react .}
          OPTIONAL{?pathway wdt:P2410 ?wpid .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [14]:
# Run the pathway query (query variable -> xref prefix given inline),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(react='REACT', wpid='WP'),
    'pathway',
)
nodes.append(res)
nodes[5].head()

Unnamed: 0,id,name,label,xrefs
0,Q14859896,cellular amino acid metabolic process,Pathway,
1,Q14864262,protein metabolic process,Pathway,
2,Q14866112,apoptotic signaling pathway,Pathway,
3,Q1502576,gene regulatory network,Pathway,
4,Q210973,cell signaling,Pathway,


# Molecular Function

In [15]:
# SPARQL for molecular function nodes: items typed as wd:Q14860489 that also
# carry a P686 value (bound to ?goid, so it is required rather than optional).
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?molecular_function ?molecular_functionLabel ?goid
        WHERE {

          # Initial typing for molecular Function
          ?molecular_function wdt:P31 wd:Q14860489 .
          ?molecular_function wdt:P686 ?goid

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [16]:
# Run the molecular function query (single xref variable, prefixed 'GO'),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(goid='GO'),
    'molecular_function',
)
nodes.append(res)
nodes[6].head()

Unnamed: 0,id,name,label,xrefs
0,Q1012651,ribonuclease P activity,Molecular Function,GO:0004526
1,Q13667380,metal ion binding,Molecular Function,GO:0046872
2,Q13667398,lipoprotein particle receptor binding,Molecular Function,GO:0070325
3,Q14326094,protein serine/threonine kinase activity,Molecular Function,GO:0004674
4,Q14326101,serine-type peptidase activity,Molecular Function,GO:0008236


# Biological Process

In [17]:
# SPARQL for biological process nodes: items typed as wd:Q2996394 that also
# carry a P686 value (bound to ?goid, so it is required rather than optional).
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?biological_process ?biological_processLabel ?goid
        WHERE {

          # Initial typing for Biological Process
          ?biological_process wdt:P31 wd:Q2996394 .
          ?biological_process wdt:P686 ?goid

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [18]:
# Run the biological process query (single xref variable, prefixed 'GO'),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(goid='GO'),
    'biological_process',
)
nodes.append(res)
nodes[7].head()

Unnamed: 0,id,name,label,xrefs
0,Q1057,metabolism,Biological Process,GO:0008152
1,Q105726,urination,Biological Process,GO:0060073
2,Q1067506,vasoconstriction,Biological Process,GO:0042310
3,Q1068809,cell cycle checkpoint,Biological Process,GO:0000075
4,Q10746904,intramembranous ossification,Biological Process,GO:0001957


# Cellular Component

In [19]:
# SPARQL for cellular component nodes: items typed as wd:Q5058355 that also
# carry a P686 value (bound to ?goid, so it is required rather than optional).
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?cellular_component ?cellular_componentLabel ?goid
    WHERE {

      # Initial typing for Cellular Component
      ?cellular_component wdt:P31 wd:Q5058355 .
      ?cellular_component wdt:P686 ?goid

      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
    }"""

In [20]:
# Run the cellular component query (single xref variable, prefixed 'GO'),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(goid='GO'),
    'cellular_component',
)
nodes.append(res)
nodes[8].head()

Unnamed: 0,id,name,label,xrefs
0,Q1051872,caveola,Cellular Component,GO:0005901
1,Q1065756,melanosome,Cellular Component,GO:0042470
2,Q1134046,plasmodesma,Cellular Component,GO:0009506
3,Q1136436,costamere,Cellular Component,GO:0043034
4,Q1140236,mitochondrial crista,Cellular Component,GO:0030061


# Anatomy

In [21]:
# SPARQL for anatomy nodes: any item carrying a P1554 value (bound to ?uberon),
# with its label and an optional ?mesh identifier. No P31 typing is applied here.
# `[AUTO_LANGUAGE]` is the label service's language placeholder; `en` is the fallback.
q = """SELECT DISTINCT ?anatomy ?anatomyLabel ?uberon ?mesh
        WHERE {

          # Anatomical Structures
          ?anatomy wdt:P1554 ?uberon
          
          OPTIONAL{?anatomy wdt:P486 ?mesh .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [22]:
# Run the anatomy query (query variable -> xref prefix given inline),
# keep the result for the final concatenation, and preview it.
res = node_query_pipeline(
    q,
    dict(uberon='UBERON', mesh='MESH'),
    'anatomy',
)
nodes.append(res)
nodes[9].head()

Unnamed: 0,id,name,label,xrefs
0,Q1001337,Mesencephalic nucleus of trigeminal nerve,Anatomy,UBERON:0001718
1,Q1002789,posterior ethmoidal foramen,Anatomy,UBERON:0018654
2,Q1003805,Nucleus ambiguus,Anatomy,UBERON:0001719
3,Q101004,aorta,Anatomy,MESH:D001011|UBERON:0000947
4,Q1029907,stomach,Anatomy,MESH:D013270|UBERON:0000945


# Put them all together

In [23]:
# Stack the per-type results into one table with a fresh 0..n-1 index.
# The isinstance guard keeps this cell re-runnable: after the first run `nodes`
# is already the combined table, and handing that to pd.concat raises a TypeError.
if isinstance(nodes, list):
    nodes = pd.concat(nodes, sort=False, ignore_index=True)

# Total number of rows across all entity types
len(nodes)

419648

In [24]:
# Count of distinct ids; lower than the row count when an item appears under
# more than one label.
nodes['id'].nunique(dropna=True)

418518

In [25]:
# Every row whose id occurs more than once (keep=False flags all copies),
# ordered by id; show the first 50.
(
    nodes
    .loc[nodes['id'].duplicated(keep=False)]
    .sort_values(by='id')
    .head(n=50)
)

Unnamed: 0,id,name,label,xrefs
4,Q1016605,Burkitt lymphoma,Disease,DOID:8584|MESH:D002051|OMIM:113970|UMLS:C0006413
281782,Q1016605,Burkitt lymphoma,Phenotype,HP:0030080|MESH:D002051|OMIM:113970
281783,Q101971,wart,Phenotype,HP:0200043|MESH:D014860
9,Q101971,wart,Disease,MESH:D014860|UMLS:C3665596
20,Q1027995,pyloric stenosis,Disease,DOID:12639|MESH:D011707|UMLS:C0034194
281786,Q1027995,pyloric stenosis,Phenotype,HP:0002021|MESH:D011707
32,Q1036696,hypothermia,Disease,MESH:D007035|UMLS:C0020672
281791,Q1036696,hypothermia,Phenotype,HP:0002045|MESH:D007035
281792,Q10434599,bradyopsia,Phenotype,HP:0030511|MESH:C564243|OMIM:608415
36,Q10434599,bradyopsia,Disease,DOID:0050335|MESH:C564243|MONDO:0012033|OMIM:6...


In [26]:
# Same duplicated-id view as above, ordered by id; show the last 50.
(
    nodes
    .loc[nodes['id'].duplicated(keep=False)]
    .sort_values(by='id')
    .tail(n=50)
)

Unnamed: 0,id,name,label,xrefs
283694,Q943897,gastroparesis,Phenotype,HP:0002578|MESH:D018589|OMIM:MTHU005170
15370,Q943897,gastroparesis,Disease,DOID:11914|MESH:D018589|MONDO:0006769|OMIM:MTH...
283697,Q945238,peripheral neuropathy,Phenotype,HP:0009830|MESH:D010523
15374,Q945238,peripheral neuropathy,Disease,DOID:574|MESH:D010523|MONDO:0020126|UMLS:C0031...
283698,Q947813,encephalomalacia,Phenotype,HP:0040197|MESH:D004678
15378,Q947813,encephalomalacia,Disease,DOID:2034|MESH:D004678|MONDO:0006741|UMLS:C001...
15383,Q950591,scotoma,Disease,DOID:9335|MESH:D012607|MONDO:0004758|UMLS:C002...
283700,Q950591,scotoma,Phenotype,HP:0000575|MESH:D012607
15384,Q950838,germinoma,Disease,DOID:3304|MESH:D018237|MONDO:0002598|UMLS:C020...
283701,Q950838,germinoma,Phenotype,HP:0100620|MESH:D018237


In [27]:
# Rows per node label, most frequent first
nodes['label'].value_counts(sort=True, ascending=False)

Compound              266348
Gene                   58919
Biological Process     30290
Protein                25578
Disease                15434
Molecular Function     11316
Cellular Component      4281
Pathway                 3026
Anatomy                 2508
Phenotype               1948
Name: label, dtype: int64

## Save

In [28]:
# Notebook-specific output folder: ../2_pipeline/<this_name>/out
this_name = '01a_WikiData_Nodes'
out_dir = Path('../2_pipeline/') / this_name / 'out'

# Create the folder together with any missing parents; no error if it already exists
out_dir.mkdir(parents=True, exist_ok=True)

# Write the combined node table, leaving out the positional index
nodes.to_csv(out_dir / 'nodes.csv', index=False)