In [1]:
import pandas as pd
from pathlib import Path
from biothings_client import get_client
from metapaths.tools import obo_tools as ot
from wikidataintegrator.wdi_core import WDItemEngine
from tools.processing import strip_chars, expand_col_on_char, combine_group_rows_on_char
from tools.biothings_processing import get_dotval
from hetnet_ml.src import graph_tools as gt

In [2]:
# Name of this pipeline step; it determines the step's output directory.
name = '07_Taxonomic_cause_of_disease'
out_dir = Path('../2_pipeline').joinpath(name, 'out').resolve()
# exist_ok=True creates the directory (and parents) only when needed, without a
# separate exists() check that could race with another process.
out_dir.mkdir(parents=True, exist_ok=True)

# Taxa that cause disease

First we will load a table of names of infectious taxa and the diseases they cause.  The information in this table was [adapted from Wikipedia](https://en.wikipedia.org/wiki/List_of_infectious_diseases).

In [3]:
# Manually curated table of infectious agents and the diseases they cause.
tax_dis = pd.read_csv('../0_data/manual/infectious-taxa.csv')

In [4]:
tax_dis.head(2)

Unnamed: 0,infectious_agent,common_name
0,Acinetobacter baumannii,Acinetobacter infections
1,Actinomyces israelii,Actinomycosis


In [5]:
# Trim leading/trailing whitespace from the values of every column.
tax_dis = tax_dis.apply(lambda col: col.str.strip())

## Map taxa using NCBITaxonomy ontology

Must resort to string matching from the start...

In [6]:
# Output directory of the download step, which holds the NCBITaxon OBO file.
data_dir = Path('../2_pipeline/00_download_data/out/').resolve()
# Parse the ontology into a table of nodes (the preview below shows id, name, xref, ...).
tax_items = ot.get_ontology_nodes(data_dir.joinpath('ncbitaxon.obo'))

In [7]:
tax_items.head(2)

Unnamed: 0,alt_id,id,name,xref,id_src
0,,NCBITaxon:1,root,GC_ID:1,NCBITaxon
1,,NCBITaxon:10,Cellvibrio,GC_ID:11|PMID:12710603|PMID:24105943,NCBITaxon


In [8]:
# Normalised taxon name (lowercased, then passed through strip_chars) used for matching.
tax_items['strp_name'] = tax_items['name'].str.lower().apply(strip_chars)
# Lookup from normalised name to ontology ID; if two taxa share a normalised
# name, the later row wins.
name_to_id = dict(zip(tax_items['strp_name'], tax_items['id']))

In [9]:
# Build the normalised agent name used for string matching against the ontology.
tax_dis['strp_name'] = tax_dis['infectious_agent'].str.lower()
# Keep only the text before a trailing " (...)" qualifier. The pattern is a raw
# string so that `\(` reaches the regex engine as an escaped parenthesis instead
# of being an invalid Python escape sequence.
tax_dis['strp_name'] = tax_dis['strp_name'].str.split(r' \(', expand=True)[0]
tax_dis['strp_name'] = tax_dis['strp_name'].apply(strip_chars)

In [10]:
# Look up each normalised agent name; names with no exact match become NaN.
tax_dis['tax_id'] = tax_dis['strp_name'].map(name_to_id)
# Non-null counts per column show how many agents were mapped.
tax_dis.count()

infectious_agent    277
common_name         277
strp_name           277
tax_id              191
dtype: int64

In [11]:
# Boolean mask of rows whose agent name had no exact match in the ontology.
idx = tax_dis['tax_id'].isnull()
print(idx.sum())
tax_dis[idx]

86


Unnamed: 0,infectious_agent,common_name,strp_name,tax_id
3,Propionibacterium propionicus,Actinomycosis,propionibacteriumpropionicus,
5,HIV,AIDS (Acquired immunodeficiency syndrome),hiv,
7,Anaplasma species,Anaplasmosis,anaplasmaspecies,
12,Junin virus,Argentine hemorrhagic fever,juninvirus,
18,multiple bacteria,Bacterial meningitis,multiplebacteria,
19,multiple bacteria,Bacterial pneumonia,multiplebacteria,
20,List of bacterial vaginosis microbiota,Bacterial vaginosis,listofbacterialvaginosismicrobiota,
22,Balantidium coli,Balantidiasis,balantidiumcoli,
25,BK virus,BK virus infection,bkvirus,
29,Machupo virus,Bolivian hemorrhagic fever,machupovirus,


## Disease mappings

Map to diseases already in the existing node set (`nodes_all.csv`) via string matching.

In [12]:
# Node table written by an earlier pipeline step; every column is read as a string.
nodes = pd.read_csv('../2_pipeline/07_Filter_non_human_annotations/out/nodes_all.csv', dtype=str)

In [13]:
# Name -> identifier lookup for the Disease nodes that already exist.
disease_name_map = nodes[nodes[':LABEL'] == 'Disease'].set_index('name')['identifier:ID'].to_dict()

# Same lookup keyed by the normalised (strip_chars + lowercase) disease name.
strp_d_name = {strip_chars(k).lower(): v for k, v in disease_name_map.items()}

# Re-use `strp_name`, now holding the normalised disease name instead of the agent name.
tax_dis['strp_name'] = tax_dis['common_name'].str.lower()
# Keep only the text before a trailing " (...)" qualifier. The pattern is a raw
# string so that `\(` reaches the regex engine as an escaped parenthesis instead
# of being an invalid Python escape sequence.
tax_dis['strp_name'] = tax_dis['strp_name'].str.split(r' \(', expand=True)[0]
tax_dis['strp_name'] = tax_dis['strp_name'].apply(strip_chars)

In [14]:
# Exact match of the normalised disease name against the existing Disease nodes.
# NOTE(review): the column is called mesh_id, but it holds whatever identifier the
# node table uses — confirm these are all MeSH IDs.
tax_dis['mesh_id'] = tax_dis['strp_name'].map(strp_d_name)

In [15]:
# A row counts as "found" only when both the taxon and the disease were mapped;
# every other row goes to the "not found" table for further querying.
id_cols = ['tax_id', 'mesh_id']
has_missing_id = tax_dis[id_cols].isnull().any(axis=1)
tax_dis_found = tax_dis[~has_missing_id].copy()
tax_dis_notfound = tax_dis[has_missing_id].copy()

In [16]:
# Spot-check one mapped row; the fixed seed keeps the displayed row stable across re-runs.
tax_dis_found.sample(random_state=0)

Unnamed: 0,infectious_agent,common_name,strp_name,tax_id,mesh_id
9,Anisakis,Anisakiasis,anisakiasis,NCBITaxon:6268,MESH:D017129


### Querying wikidata

Wikidata also has some Taxon to Disease items already in the database. We will extract these with a query.

In [17]:
# SPARQL query for Wikidata: selects items (?dis) with their optional ?mesh_id and
# ?doid values, together with a taxon item (?tax) linked to them in either direction
# (P828 on ?dis, or P1542 on ?tax) that carries a ?tax_id value.
# NOTE(review): the meaning of each property/item ID (P31/Q12136, P486, P699, P828,
# P1542, P685) is inferred from the variable names — confirm against Wikidata.
wd_query = """SELECT Distinct ?dis ?disLabel ?mesh_id ?doid ?taxLabel ?tax_id
WHERE 
{
  ?dis wdt:P31 wd:Q12136.
  OPTIONAL{?dis wdt:P486 ?mesh_id}.
  OPTIONAL{?dis wdt:P699 ?doid}.
  {?dis wdt:P828 ?tax} UNION {?tax wdt:P1542 ?dis}.
  ?tax wdt:P685 ?tax_id.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}"""

In [18]:
# Run the query, prefix the identifiers so they match the rest of the pipeline,
# discard rows that have neither a DOID nor a MeSH ID, and align the column
# names with those of `tax_dis`.
qres = (
    WDItemEngine.execute_sparql_query(wd_query, as_dataframe=True)
    .assign(mesh_id=lambda df: 'MESH:' + df['mesh_id'],
            tax_id=lambda df: 'NCBITaxon:' + df['tax_id'])
    .drop('dis', axis=1)
    .dropna(subset=['doid', 'mesh_id'], how='all')
    .rename(columns={'disLabel': 'common_name', 'taxLabel': 'infectious_agent'})
)
qres.head(5)

Unnamed: 0,common_name,doid,mesh_id,infectious_agent,tax_id
0,balantidiasis,DOID:12386,MESH:D001447,Balantidium coli,NCBITaxon:71585
1,influenza,DOID:8469,MESH:D007251,influenza virus,NCBITaxon:11308
2,West Nile fever,DOID:2366,MESH:D014901,West Nile virus,NCBITaxon:11082
3,West Nile fever,DOID:4121,MESH:D014901,West Nile virus,NCBITaxon:11082
4,contagious pustular dermatitis,DOID:8771,MESH:D004474,Parapoxvirus ovis,NCBITaxon:10258


In [19]:
len(qres)

219

In [20]:
# Combine the Wikidata pairs with the string-matched pairs. Wikidata rows come
# first, so they are the ones kept when a pair occurs in both sources.
tax_dis_found = pd.concat([qres, tax_dis_found], sort=False)

# De-duplicate on (disease, taxon). drop_duplicates/duplicated treat NaN values as
# equal, so keying on mesh_id alone would merge every DOID-only disease of a taxon
# into a single row; fall back to the DOID when there is no MeSH ID.
pair_key = pd.DataFrame({
    'dis_id': tax_dis_found['mesh_id'].fillna(tax_dis_found['doid']).values,
    'tax_id': tax_dis_found['tax_id'].values,
})
# Positional mask (.values) because the concatenated index contains repeated labels.
tax_dis_found = tax_dis_found[~pair_key.duplicated().values].copy()
len(tax_dis_found)

258

## biothings_client for querying string

We can also use the biothings client to query strings for ones we have missed.

In [21]:
def remove_paren(s):
    """Return the part of ``s`` before the first '(' with surrounding whitespace removed.

    When ``s`` contains no '(' the whole string is returned, stripped.
    """
    head, _, _ = s.partition('(')
    return head.strip()

def get_inner_paren(s):
    """Return the text between the first '(' in ``s`` and the next ')' after it.

    The closing parenthesis is searched for only after the opening one, so a
    stray ')' earlier in the string is ignored. If the parenthesis is never
    closed, everything after the '(' is returned.

    Raises:
        ValueError: if ``s`` contains no '('.
    """
    start = s.index('(') + 1
    end = s.find(')', start)
    if end == -1:
        # Unclosed parenthesis: treat the remainder of the string as the content.
        return s[start:]
    return s[start:end]

def pipe_combine_paren(s):
    """Rewrite ``"name (alias)"`` as ``"name|alias"``.

    Strings without a '(' are returned unchanged.
    """
    if '(' in s:
        return '|'.join([remove_paren(s), get_inner_paren(s)])
    return s
        

In [22]:
# Turn "name (alias)" into "name|alias" and give every alternative its own row,
# first for the disease names and then for the infectious agents.
for paren_col in ('common_name', 'infectious_agent'):
    tax_dis_notfound[paren_col] = tax_dis_notfound[paren_col].apply(pipe_combine_paren)
    tax_dis_notfound = expand_col_on_char(tax_dis_notfound, paren_col, '|')

# Lowercased copies of the names, used as query strings.
tax_dis_notfound['dis_qname'] = tax_dis_notfound['common_name'].str.lower()
tax_dis_notfound['tax_qname'] = tax_dis_notfound['infectious_agent'].str.lower()

# Only query for the identifiers that are still missing.
needs_disease = tax_dis_notfound['mesh_id'].isnull()
needs_taxon = tax_dis_notfound['tax_id'].isnull()
dis_to_query = tax_dis_notfound.loc[needs_disease, 'dis_qname'].unique().tolist()
tax_to_query = tax_dis_notfound.loc[needs_taxon, 'tax_qname'].unique().tolist()

In [23]:
# Wrap every term in double quotes (workaround for a biothings_client bug).
dis_to_query = list(map('"{}"'.format, dis_to_query))
dis_to_query[:4]

['"african sleeping sickness"',
 '"african trypanosomiasis"',
 '"aids"',
 '"acquired immunodeficiency syndrome"']

In [24]:
# biothings client for the 'disease' API.
md = get_client('disease')

In [25]:
# See potential fields for names...
[k for k in md.get_fields() if 'name' in k or 'label' in k]

['ctd.bp_related_to_disease.go_name',
 'ctd.cc_related_to_disease.go_name',
 'ctd.chemical_related_to_disease.chemical_name',
 'ctd.mf_related_to_disease.go_name',
 'ctd.pathway_related_to_disease.pathway_name',
 'disgenet.genes_related_to_disease.gene_name',
 'disgenet.xrefs.disease_name',
 'hpo.disease_name',
 'mondo.label']

In [26]:
# Fields the query strings are matched against (three of the name/label fields listed above).
scopes=['mondo.label', 'hpo.disease_name','disgenet.xrefs.disease_name']
# Fields returned for each hit: identifier/cross-reference fields plus the searched name fields.
fields=['ctd.mesh','ctd.omim','mondo.xrefs','disgenet.xrefs', 'hpo.omim']+scopes

In [27]:
# Query all unmapped disease names against the selected name fields.
res = md.querymany(dis_to_query, scopes=scopes, fields=fields)

# Keep only hits whose MONDO label matches the query exactly after normalisation.
exact_mondo_res = [r for r in res if strip_chars(r['query']) ==
                   strip_chars(r.get('mondo', dict()).get('label', '').lower())]

# Same for the HPO disease name.
exact_hpo_res = [r for r in res if strip_chars(r['query']) ==
                 strip_chars(r.get('hpo', dict()).get('disease_name', '').lower())]

# DisGeNET may return a single name, a list of names, or nothing at all.
exact_disgenet_res = []

for r in res:
    # Not called `name`: that variable holds this notebook's step name.
    disgenet_name = get_dotval(r, 'disgenet.xrefs.disease_name')
    query_key = strip_chars(r['query']).lower()

    if isinstance(disgenet_name, list):
        if query_key in [strip_chars(n).lower() for n in disgenet_name]:
            exact_disgenet_res.append(r)

    elif pd.isnull(disgenet_name):
        continue

    elif query_key == strip_chars(disgenet_name).lower():
        exact_disgenet_res.append(r)


querying 1-157...done.
Finished.
36 input query terms found dup hits:
	[('african trypanosomiasis', 2), ('aids', 7), ('acquired immunodeficiency syndrome', 2), ('amoebiasi
85 input query terms found no hit:
	['african sleeping sickness', 'arcanobacterium haemolyticum infection', 'astrovirus infection', 'bac
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [28]:
# Column accumulators for the exact MONDO matches.
names, mesh_ids, doids, omim_ids = [], [], [], []

for hit in exact_mondo_res:
    mesh, doid, omim = [get_dotval(hit, path) for path in
                        ('mondo.xrefs.mesh', 'mondo.xrefs.doid', 'mondo.xrefs.omim')]

    # Fall back to the CTD identifiers when MONDO has no cross-reference.
    mesh = get_dotval(hit, 'ctd.mesh') if pd.isnull(mesh) else mesh
    omim = get_dotval(hit, 'ctd.omim') if pd.isnull(omim) else omim

    names.append(hit['query'])
    mesh_ids.append(mesh)
    doids.append(doid)
    omim_ids.append(omim)

mondo_res = pd.DataFrame({'name': names, 'mesh_id': mesh_ids, 'doid': doids, 'omim_id': omim_ids})
# Prefix the bare identifiers; missing values stay NaN.
for id_col, prefix in (('mesh_id', 'MESH:'), ('omim_id', 'OMIM:')):
    mondo_res[id_col] = prefix + mondo_res[id_col]
mondo_res.head(5)

Unnamed: 0,name,mesh_id,doid,omim_id
0,african trypanosomiasis,MESH:D014353,,
1,aids,MESH:D000163,DOID:635,
2,argentine hemorrhagic fever,,DOID:0050194,
3,bacterial meningitis,MESH:D016920,DOID:9470,
4,bacterial pneumonia,MESH:D018410,DOID:874,


In [29]:
mondo_res.dropna(subset=['omim_id'])

Unnamed: 0,name,mesh_id,doid,omim_id
19,fatal familial insomnia,MESH:D034062,DOID:0050433,OMIM:600072


In [30]:
# Column accumulators for the exact HPO matches.
names, mesh_ids, omim_ids = [], [], []

for hit in exact_hpo_res:
    omim = get_dotval(hit, 'hpo.omim')
    mesh = get_dotval(hit, 'ctd.mesh')

    # Fall back to the CTD OMIM identifier when HPO has none.
    omim = get_dotval(hit, 'ctd.omim') if pd.isnull(omim) else omim

    names.append(hit['query'])
    mesh_ids.append(mesh)
    omim_ids.append(omim)

hpo_res = pd.DataFrame({'name': names, 'mesh_id': mesh_ids, 'omim_id': omim_ids})
# Prefix the bare identifiers; missing values stay NaN.
for id_col, prefix in (('mesh_id', 'MESH:'), ('omim_id', 'OMIM:')):
    hpo_res[id_col] = prefix + hpo_res[id_col]
hpo_res.head(5)

Unnamed: 0,name,mesh_id,omim_id
0,creutzfeldt–jakob disease,,OMIM:123400
1,crimean-congo hemorrhagic fever,MESH:D006479,
2,dengue fever,MESH:D003715,
3,ebola hemorrhagic fever,MESH:D019142,
4,fatal familial insomnia,MESH:D034062,OMIM:600072


In [31]:
def add_prefix_list_or_string(items, prefix):
    """Prepend ``prefix`` to one identifier or to each identifier in a list.

    Args:
        items: a string, a list of strings, or any other value (e.g. NaN/None).
        prefix: text to put in front of every identifier.

    Returns:
        For a list, the de-duplicated, prefixed identifiers joined with '|' in
        sorted order; for a string, the prefixed string; any other value is
        returned unchanged.
    """
    if isinstance(items, list):
        # sorted(): plain set iteration order for strings varies between
        # interpreter runs, which would make the joined value irreproducible.
        return '|'.join([prefix + i for i in sorted(set(items))])
    elif isinstance(items, str):
        return prefix + items
    else:
        return items

In [32]:
# Column accumulators for the exact DisGeNET matches.
names, mesh_ids, doids, omim_ids = [], [], [], []

for hit in exact_disgenet_res:
    mesh = get_dotval(hit, 'disgenet.xrefs.mesh')
    doid = get_dotval(hit, 'disgenet.xrefs.doid')
    omim = get_dotval(hit, 'disgenet.xrefs.omim', list_ok=False)

    # Fall back to CTD for missing identifiers; a list of MeSH IDs is never "missing".
    if type(mesh) is not list and pd.isnull(mesh):
        mesh = get_dotval(hit, 'ctd.mesh')
    if pd.isnull(omim):
        omim = get_dotval(hit, 'ctd.omim')

    names.append(hit['query'])
    # MeSH and DOID values may be lists, so they are prefixed (and joined) per value.
    mesh_ids.append(add_prefix_list_or_string(mesh, 'MESH:'))
    doids.append(add_prefix_list_or_string(doid, 'DOID:'))
    omim_ids.append(omim)

disgenet_res = pd.DataFrame({'name': names, 'mesh_id': mesh_ids, 'doid': doids, 'omim_id': omim_ids})
disgenet_res['omim_id'] = 'OMIM:' + disgenet_res['omim_id']
disgenet_res

Unnamed: 0,name,mesh_id,doid,omim_id
0,african trypanosomiasis,MESH:D014353,DOID:10113,
1,african trypanosomiasis,MESH:D014353,DOID:10113,
2,acquired immunodeficiency syndrome,MESH:D000163,DOID:635,
3,acquired immunodeficiency syndrome,MESH:D000163,DOID:635,
4,bacterial vaginosis,MESH:D016585,DOID:3901,
5,creutzfeldt–jakob disease,MESH:D007562,DOID:11949,OMIM:123400
6,creutzfeldt–jakob disease,MESH:C565143|MESH:D007562,DOID:11949,OMIM:123400
7,dengue fever,MESH:D003715|MESH:D019595,DOID:12206|DOID:9682,
8,fatal familial insomnia,MESH:D034062,DOID:0050433,OMIM:600072
9,helicobacter pylori infection,,,


In [33]:
# Stack the exact matches from the three sources; a disease name can appear more than once.
found_dis = pd.concat([mondo_res, hpo_res, disgenet_res], sort=False, ignore_index=False)

In [34]:
# Merge rows that share a name using the project helper (presumably joining differing
# values with a separator character — confirm in tools.processing), then turn empty
# strings into NaN.
found_dis = combine_group_rows_on_char(found_dis, 'name').replace('', float('nan'))

In [35]:
found_dis

Unnamed: 0,name,mesh_id,doid,omim_id
0,acquired immunodeficiency syndrome,MESH:D000163,DOID:635,
1,african trypanosomiasis,MESH:D014353,DOID:10113,
2,aids,MESH:D000163,DOID:635,
3,argentine hemorrhagic fever,,DOID:0050194,
4,bacterial meningitis,MESH:D016920,DOID:9470,
5,bacterial pneumonia,MESH:D018410,DOID:874,
6,bacterial vaginosis,MESH:D016585,DOID:3901|DOID:3385,
7,bartonellosis,MESH:D001474,DOID:11102,
8,black piedra,MESH:D010854,DOID:12711,
9,bolivian hemorrhagic fever,,DOID:0050195,


### Query MyTaxon.info

In [36]:
# Wrap every taxon term in double quotes, as was done for the disease queries.
tax_to_query = list(map('"{}"'.format, tax_to_query))
tax_to_query[:4]

['"propionibacterium propionicus"',
 '"hiv"',
 '"anaplasma species"',
 '"junin virus"']

In [37]:
# biothings client for the 'taxon' API.
mt = get_client('taxon')

In [38]:
[k for k in mt.get_fields() if 'name' in k or 'label' in k]

['common_name', 'genbank_common_name', 'scientific_name', 'uniprot_name']

In [39]:
# Search every taxon field whose key contains 'name', and return those fields plus
# the taxid. Taxon-specific variable names leave the disease query's `scopes` and
# `fields` intact, so the disease query cell can still be re-run afterwards.
tax_scopes = [k for k in mt.get_fields() if 'name' in k]
tax_fields = ['taxid'] + tax_scopes

res = mt.querymany(tax_to_query, scopes=tax_scopes, fields=tax_fields, size=100000)

querying 1-78...done.
Finished.
33 input query terms found dup hits:
	[('hiv', 1000), ('junin virus', 3), ('bk virus', 5), ('varicella zoster virus', 3), ('clostridium di
33 input query terms found no hit:
	['propionibacterium propionicus', 'anaplasma species', 'multiple bacteria', 'list of bacterial vagin
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [40]:
# Keep only hits where one of the name fields equals the query after normalisation;
# hits without the field compare against the normalised empty string.
# NOTE(review): .lower() is applied after strip_chars on two lines and before it on
# the other two — confirm strip_chars is case-insensitive so these are equivalent.
exact_sname = [r for r in res if strip_chars(r.get('scientific_name', '')).lower() == strip_chars(r['query'])]
exact_cname = [r for r in res if strip_chars(r.get('common_name', '').lower()) == strip_chars(r['query'])]
exact_uname = [r for r in res if strip_chars(r.get('uniprot_name', '')).lower() == strip_chars(r['query'])]
exact_gname = [r for r in res if strip_chars(r.get('genbank_common_name', '').lower()) == strip_chars(r['query'])]

In [41]:
# Query string -> prefixed taxon ID. When a query has several exact hits the last
# one processed wins, so later lists in this concatenation override earlier ones.
tax_map = {
    hit['query']: 'NCBITaxon:' + str(hit['taxid'])
    for hit in exact_sname + exact_cname + exact_uname + exact_gname
}

In [42]:
# Fill only the still-missing taxon IDs with the exact matches from the taxon query.
tax_dis_notfound['tax_id'] = tax_dis_notfound['tax_id'].fillna(tax_dis_notfound['tax_qname'].map(tax_map))

In [43]:
# Attach the queried disease identifiers by lowercased disease name. Both frames have
# a mesh_id column, so the merge produces mesh_id_x (existing) and mesh_id_y (queried).
tax_dis_notfound = tax_dis_notfound.merge(found_dis, left_on='dis_qname', right_on='name', how='left')
tax_dis_notfound.head(2)

Unnamed: 0,infectious_agent,common_name,strp_name,tax_id,mesh_id_x,dis_qname,tax_qname,name,mesh_id_y,doid,omim_id
0,Propionibacterium propionicus,Actinomycosis,actinomycosis,,MESH:D000196,actinomycosis,propionibacterium propionicus,,,,
1,Trypanosoma brucei,African sleeping sickness,africansleepingsickness,NCBITaxon:5691,,african sleeping sickness,trypanosoma brucei,,,,


In [44]:
# Prefer the existing MeSH ID, fall back to the queried one, and collapse the two
# merge columns back into a single `mesh_id`.
tax_dis_notfound = (
    tax_dis_notfound
    .assign(mesh_id_x=lambda df: df['mesh_id_x'].fillna(df['mesh_id_y']))
    .drop('mesh_id_y', axis=1)
    .rename(columns={'mesh_id_x': 'mesh_id'})
)
tax_dis_notfound.head(2)

Unnamed: 0,infectious_agent,common_name,strp_name,tax_id,mesh_id,dis_qname,tax_qname,name,doid,omim_id
0,Propionibacterium propionicus,Actinomycosis,actinomycosis,,MESH:D000196,actinomycosis,propionibacterium propionicus,,,
1,Trypanosoma brucei,African sleeping sickness,africansleepingsickness,NCBITaxon:5691,,african sleeping sickness,trypanosoma brucei,,,


In [45]:
tax_dis_notfound.count()

infectious_agent    253
common_name         253
strp_name           253
tax_id              138
mesh_id             113
dis_qname           253
tax_qname           253
name                 66
doid                 56
omim_id               2
dtype: int64

In [46]:
# A disease is unmapped when it has none of the three identifier types.
dis_id_cols = ['doid', 'mesh_id', 'omim_id']
null_dis = tax_dis_notfound[dis_id_cols].isnull().all(axis=1)
null_tax = tax_dis_notfound['tax_id'].isnull()

# Rows still lacking a disease identifier, a taxon identifier, or both.
missing_info = tax_dis_notfound[null_dis | null_tax]

In [47]:
len(missing_info)

208

In [48]:
# Choose the primary disease ID (MeSH, then OMIM, then DOID), keep only fully mapped
# pairs, split '|'-joined IDs into one row each and drop repeated (taxon, disease) pairs.
tax_dis_notfound = (
    tax_dis_notfound
    .assign(dis_id=lambda df: df['mesh_id'].fillna(df['omim_id']).fillna(df['doid']))
    .dropna(subset=['dis_id', 'tax_id'])
    .pipe(expand_col_on_char, 'dis_id', '|')
    .drop_duplicates(subset=['tax_id', 'dis_id'])
)

In [49]:
# Primary disease ID for the already-mapped pairs: MeSH when present, otherwise DOID.
tax_dis_found['dis_id'] = tax_dis_found['mesh_id'].fillna(tax_dis_found['doid']) 


In [50]:
# Combine both sets of pairs and keep the first occurrence of each (taxon, disease) pair.
tax_dis_final = pd.concat([tax_dis_found, tax_dis_notfound], sort=False, ignore_index=True).drop_duplicates(subset=['tax_id', 'dis_id'])
len(tax_dis_final)

288

In [51]:
def _alt_disease_ids(row):
    """Return the row's disease identifiers other than ``dis_id``, joined with '|'.

    The mesh_id/doid/omim_id values may themselves be '|'-joined lists while
    ``dis_id`` holds a single identifier, so each value is split before it is
    compared; otherwise the primary ID would reappear inside its own alt list.
    Repeated identifiers are kept once, in first-seen order.
    """
    alt_ids = []
    for joined in row[['mesh_id', 'doid', 'omim_id']].dropna():
        for one_id in joined.split('|'):
            if one_id != row['dis_id'] and one_id not in alt_ids:
                alt_ids.append(one_id)
    return '|'.join(alt_ids)


# Rows without any alternative identifier get NaN instead of an empty string.
tax_dis_final['alt_disease_ids'] = (tax_dis_final.apply(_alt_disease_ids, axis=1)
                                    .replace('', float('nan')))
tax_dis_final.head(2)

Unnamed: 0,common_name,doid,mesh_id,infectious_agent,tax_id,strp_name,dis_id,dis_qname,tax_qname,name,omim_id,alt_disease_ids
0,balantidiasis,DOID:12386,MESH:D001447,Balantidium coli,NCBITaxon:71585,,MESH:D001447,,,,,DOID:12386
1,influenza,DOID:8469,MESH:D007251,influenza virus,NCBITaxon:11308,,MESH:D007251,,,,,DOID:8469


In [52]:
# One node per distinct taxon in the final pairs, named from the ontology table.
tax_name_map = tax_items.set_index('id')['name']
tax_nodes = pd.DataFrame({'id': tax_dis_final['tax_id'].unique()})
tax_nodes = tax_nodes.assign(name=tax_nodes['id'].map(tax_name_map), label='Taxonomy')

tax_nodes.head()

Unnamed: 0,id,name,label
0,NCBITaxon:71585,Balantioides coli,Taxonomy
1,NCBITaxon:11308,Orthomyxoviridae,Taxonomy
2,NCBITaxon:11082,West Nile virus,Taxonomy
3,NCBITaxon:10258,Orf virus,Taxonomy
4,NCBITaxon:10239,Viruses,Taxonomy


In [53]:
# Diseases whose ID is not among the existing Disease nodes become new nodes.
dis_ids = set(disease_name_map.values())
dis_nodes = (tax_dis_final.query('dis_id not in @dis_ids')[['dis_id', 'common_name', 'alt_disease_ids']]
                .rename(columns={'dis_id': 'id', 'common_name': 'name'}).drop_duplicates())
# str.title() also capitalises the letter after an apostrophe ("Carrion'S Disease"),
# so put the possessive "'s" back into lowercase afterwards.
dis_nodes['name'] = dis_nodes['name'].str.title().str.replace(r"'S\b", "'s", regex=True)
dis_nodes['label'] = 'Disease'
dis_nodes = gt.order_cols(dis_nodes)
dis_nodes.head()

Unnamed: 0,id,name,label,alt_disease_ids
6,DOID:14421,Brugia Malayi Filariasis,Disease,
65,MESH:C536125,Nocardiosis,Disease,DOID:2312
77,DOID:0050398,Carrion'S Disease,Disease,
103,DOID:13310,Diphtheritic Peritonitis,Disease,
104,DOID:0050481,Endemic Typhus,Disease,


In [54]:
# One taxon -> disease edge per final pair, with constant type/source/evidence columns.
# NOTE(review): every edge is labelled 'Wikipedia', although some pairs were taken
# from the Wikidata query — confirm this is the intended provenance.
edges = (
    tax_dis_final[['tax_id', 'dis_id']]
    .rename(columns={'tax_id': 'start_id', 'dis_id': 'end_id'})
    .assign(type='Causes_TcD', source='Wikipedia', evidence='curated')
)
edges.head(10)

Unnamed: 0,start_id,end_id,type,source,evidence
0,NCBITaxon:71585,MESH:D001447,Causes_TcD,Wikipedia,curated
1,NCBITaxon:11308,MESH:D007251,Causes_TcD,Wikipedia,curated
2,NCBITaxon:11082,MESH:D014901,Causes_TcD,Wikipedia,curated
3,NCBITaxon:10258,MESH:D004474,Causes_TcD,Wikipedia,curated
4,NCBITaxon:10239,MESH:D011024,Causes_TcD,Wikipedia,curated
5,NCBITaxon:11089,MESH:D015004,Causes_TcD,Wikipedia,curated
6,NCBITaxon:6279,DOID:14421,Causes_TcD,Wikipedia,curated
7,NCBITaxon:186538,MESH:D019142,Causes_TcD,Wikipedia,curated
8,NCBITaxon:5693,MESH:D014355,Causes_TcD,Wikipedia,curated
9,NCBITaxon:10279,MESH:D008976,Causes_TcD,Wikipedia,curated


In [56]:
# Write the new taxon and disease nodes, and the taxon -> disease edges, to this step's
# output directory.
# NOTE(review): this rebinds `nodes`, which earlier held the table read from
# nodes_all.csv; re-running the disease-mapping cell after this one would read this
# frame instead.
nodes = pd.concat([tax_nodes, dis_nodes], sort=False)
nodes.to_csv(out_dir.joinpath('new_nodes.csv'), index=False)
edges.to_csv(out_dir.joinpath('edges.csv'), index=False)