# Cleaning up some DOID info

It was noticed that some Disease Ontology IDs have a cross-reference to another Disease Ontology ID. Each of them should have its own separate entity at this point, so we need to figure out which identifier is the correct one and merge all instances of the false one into it.  We will do this by taking the true name for each identifier directly from DO, then seeing whether the node's ID or its cross-reference maps to the same name as the node's name.

In [1]:
import pandas as pd
from pathlib import Path
from data_tools.wiki import get_curi_xrefs
from data_tools.df_processing import expand_col_on_char, combine_group_cols_on_char

import obonet
import networkx

  from tqdm.autonotebook import tqdm


In [2]:
# Name of this pipeline step; it is used verbatim as the output folder name.
this_name = '10b_Diseae_Phenotype_Cleanup_preliminary_DOID_cleaning'

# Outputs are written to ../2_pipeline/<this_name>/out, created on demand.
out_dir = Path('../2_pipeline').resolve() / this_name / 'out'
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# The network to clean comes from the 'out' folder of this earlier step.
nw_name = '09_GAUSS_Geneset_to_Phenotype'
nw_dir = Path('../2_pipeline').resolve() / nw_name / 'out'

In [4]:
# Load the node and edge tables of the input network, reading every column
# as a string.
nodes = pd.read_csv(nw_dir / 'nodes.csv', dtype=str)
edges = pd.read_csv(nw_dir / 'edges.csv', dtype=str)

In [5]:
# Disease Ontology OBO file, read straight from the master branch on GitHub,
# so the parsed content (and the node count displayed below) depends on when
# this cell is run.
doid = 'https://github.com/DiseaseOntology/HumanDiseaseOntology/raw/master/src/ontology/doid.obo'
# NOTE(review): obonet is believed to skip obsolete terms by default, which
# would explain the ids that have no DO name further down -- confirm.
graph = obonet.read_obo(doid)
len(graph)

10334

In [6]:
# DO identifier -> DO term name (None when a term carries no 'name' attribute).
id_to_name = dict(graph.nodes(data='name'))

In [7]:
# Network node id -> node name as stored in the network's node table.
nwid_to_name = dict(zip(nodes['id'], nodes['name']))

In [8]:
# Restrict the node table to the nodes labelled "Disease".
diseases = nodes[nodes['label'] == "Disease"]

In [9]:
# Annotate the DOID cross-reference table returned by get_curi_xrefs with:
#   id_name   - DO name of the node's own id
#   xref_name - DO name of the cross-referenced id
#   nw_name   - name the node carries in the network
doid_xref = get_curi_xrefs(diseases, 'DOID').assign(
    id_name=lambda df: df['id'].map(id_to_name),
    xref_name=lambda df: df['xrefs'].map(id_to_name),
    nw_name=lambda df: df['id'].map(nwid_to_name),
)

# Keep only the rows whose cross-reference differs from the node's own id.
mismatch_doid_xref = doid_xref.loc[doid_xref['id'].ne(doid_xref['xrefs'])].copy()

In [10]:
# Unique ids of all Disease nodes, used below inside query() filters.
disease_ids = diseases['id'].unique()

In [11]:
# Number of rows whose DOID cross-reference differs from the node's own id.
len(mismatch_doid_xref)

122

In [12]:
# Mismatched rows whose cross-reference is itself the id of a Disease node.
mismatch_doid_xref.query('xrefs in @disease_ids')

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name
160,DOID:0040098,DOID:14482,pemphigus gestationis,pemphigoid gestationis,herpes gestationis
210,DOID:0050047,DOID:0050053,Flinders Island spotted fever,,Flinders Island spotted fever
220,DOID:0050053,DOID:0050047,,Flinders Island spotted fever,Flinders Island spotted fever
295,DOID:0050119,DOID:2365,,West Nile encephalitis,West Nile encephalitis
538,DOID:0050287,DOID:4271,,microsporidiosis,microsporidiosis
...,...,...,...,...,...
44010,DOID:9498,DOID:5870,pulmonary eosinophilia,eosinophilic pneumonia,eosinophilic pneumonia
44182,DOID:9678,DOID:9681,,cervical incompetence,cervical incompetence
44186,DOID:9681,DOID:9678,cervical incompetence,,cervical incompetence
44275,DOID:9829,DOID:0060001,,withdrawal disorder,withdrawal syndrome


These are all duplicated ids that should be merged...

In [13]:
# ids whose DO name agrees with the name the node carries in the network.
keep_id = mismatch_doid_xref.loc[
    mismatch_doid_xref['id_name'] == mismatch_doid_xref['nw_name'], 'id'
].unique()

# Show the rows where both the id and its cross-reference are such ids.
mismatch_doid_xref[
    mismatch_doid_xref['id'].isin(keep_id) & mismatch_doid_xref['xrefs'].isin(keep_id)
]

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name
10779,DOID:11132,DOID:2883,prostatic hypertrophy,prostatic adenoma,prostatic hypertrophy
14121,DOID:1595,DOID:1470,endogenous depression,major depressive disorder,endogenous depression
14123,DOID:1595,DOID:2848,endogenous depression,melancholia,endogenous depression
15545,DOID:2848,DOID:1470,melancholia,major depressive disorder,melancholia
15546,DOID:2848,DOID:1595,melancholia,endogenous depression,melancholia
30122,DOID:0050783,DOID:0050784,secondary progressive multiple sclerosis,primary progressive multiple sclerosis,secondary progressive multiple sclerosis
30123,DOID:0050783,DOID:0050785,secondary progressive multiple sclerosis,progressive relapsing multiple sclerosis,secondary progressive multiple sclerosis
30129,DOID:0050784,DOID:0050783,primary progressive multiple sclerosis,secondary progressive multiple sclerosis,primary progressive multiple sclerosis
30131,DOID:0050784,DOID:0050785,primary progressive multiple sclerosis,progressive relapsing multiple sclerosis,primary progressive multiple sclerosis
30137,DOID:0050785,DOID:0050783,progressive relapsing multiple sclerosis,secondary progressive multiple sclerosis,progressive relapsing multiple sclerosis


In [14]:
# Pull out the disease rows whose id was selected above and remember their
# index labels so they can be removed from `diseases` afterwards.
fix_disease = diseases[diseases['id'].isin(keep_id)]
fix_idx = fix_disease.index

# Split the '|'-separated xrefs column.
fix_disease = expand_col_on_char(fix_disease, 'xrefs', '|')

# Preview the rows that cross-reference a *different* id that is also kept.
fix_disease[fix_disease['xrefs'].isin(keep_id) & fix_disease['id'].ne(fix_disease['xrefs'])]

Unnamed: 0,id,name,label,xrefs,source
24,DOID:11132,prostatic hypertrophy,Disease,DOID:2883,
34,DOID:1595,endogenous depression,Disease,DOID:1470,
36,DOID:1595,endogenous depression,Disease,DOID:2848,
45,DOID:2848,melancholia,Disease,DOID:1470,
46,DOID:2848,melancholia,Disease,DOID:1595,
68,DOID:0050783,secondary progressive multiple sclerosis,Disease,DOID:0050784,
69,DOID:0050783,secondary progressive multiple sclerosis,Disease,DOID:0050785,
75,DOID:0050784,primary progressive multiple sclerosis,Disease,DOID:0050783,
77,DOID:0050784,primary progressive multiple sclerosis,Disease,DOID:0050785,
83,DOID:0050785,progressive relapsing multiple sclerosis,Disease,DOID:0050783,


In [15]:
# Remove the cross-references that point at another kept id, then collapse
# the remaining xrefs back to one row per disease id.
xref_is_other_kept_id = (
    fix_disease['xrefs'].isin(keep_id) & fix_disease['id'].ne(fix_disease['xrefs'])
)
drop_idx = fix_disease.index[xref_is_other_kept_id]
fix_disease = fix_disease.drop(drop_idx)

fix_disease = combine_group_cols_on_char(fix_disease, ['id'], ['xrefs'], sort=True, prog=False)

In [16]:
# Swap the original rows (by the index labels saved earlier) for their
# cleaned versions and renumber the index.
diseases = pd.concat(
    [diseases.drop(fix_idx), fix_disease],
    sort=False,
    ignore_index=True,
)

In [17]:
# Rebuild the annotated cross-reference table from the updated disease rows.
doid_xref = get_curi_xrefs(diseases, 'DOID').assign(
    id_name=lambda df: df['id'].map(id_to_name),
    xref_name=lambda df: df['xrefs'].map(id_to_name),
    nw_name=lambda df: df['id'].map(nwid_to_name),
)

mismatch_doid_xref = doid_xref.loc[doid_xref['id'].ne(doid_xref['xrefs'])].copy()

# ids with at least one row where neither the id nor the xref has a DO name.
either = mismatch_doid_xref.loc[
    mismatch_doid_xref['id_name'].isnull() & mismatch_doid_xref['xref_name'].isnull(),
    'id',
].unique()

mismatch_doid_xref[mismatch_doid_xref['id'].isin(either)]

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name
9701,DOID:10059,DOID:10111,,,Congenital or acquired abnormality of vagina c...
9725,DOID:10111,DOID:10059,,,Congenital or acquired abnormality of vagina c...
10710,DOID:11099,DOID:1709,,,primary Rickettsiaceae infectious disease
11799,DOID:122,DOID:284,,,abdominal cancer
12140,DOID:12647,DOID:9940,,,congenital or acquired abnormality of vulva co...
12853,DOID:13469,DOID:5000,,,secondary cutaneous syphilis
14216,DOID:1709,DOID:11099,,,rickettsiosis
15423,DOID:2778,DOID:2779,,,"tumors of body of uterus, with delivery"
15424,DOID:2779,DOID:2778,,,"tumors of body of uterus, with delivery"
15498,DOID:284,DOID:122,,,abdominal cancer


In [18]:
# Distinct ids having at least one row where neither side has a DO name.
set(either)

{'DOID:0050341',
 'DOID:10059',
 'DOID:10111',
 'DOID:11092',
 'DOID:11099',
 'DOID:122',
 'DOID:12647',
 'DOID:13469',
 'DOID:1466',
 'DOID:1709',
 'DOID:2313',
 'DOID:2778',
 'DOID:2779',
 'DOID:284',
 'DOID:5000',
 'DOID:9940'}

In [19]:
# keep_id:   ids that have a DO name while their cross-reference has none.
# keep_xref: cross-references that have a DO name while the node's id has none.
keep_id = mismatch_doid_xref[~mismatch_doid_xref['id_name'].isnull() & mismatch_doid_xref['xref_name'].isnull()]['id'].unique()
keep_xref = mismatch_doid_xref[mismatch_doid_xref['id_name'].isnull() & ~mismatch_doid_xref['xref_name'].isnull()]['xrefs'].unique()

# The cells that follow filter on `xrefs in @keep_id`, which is only valid
# when both selections describe the same set of identifiers. The ontology is
# read from a moving branch, so enforce the invariant instead of only
# displaying it.
assert set(keep_id) == set(keep_xref), (
    'keep_id and keep_xref disagree: '
    f'{sorted(set(keep_id) ^ set(keep_xref))}'
)

set(keep_id) == set(keep_xref)

True

In [20]:
# Rows whose cross-reference is one of the ids selected in keep_id.
mismatch_doid_xref.query('xrefs in @keep_id')

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name
217,DOID:0050053,DOID:0050047,,Flinders Island spotted fever,Flinders Island spotted fever
292,DOID:0050119,DOID:2365,,West Nile encephalitis,West Nile encephalitis
535,DOID:0050287,DOID:4271,,microsporidiosis,microsporidiosis
1881,DOID:0060006,DOID:0090012,,severe combined immunodeficiency with sensitiv...,artemis deficiency
2092,DOID:0060120,DOID:0060094,,bone benign neoplasm,bone benign neoplasm
2220,DOID:0060171,DOID:0080422,,Dravet syndrome,Dravet syndrome
4117,DOID:0070076,DOID:0050880,,Koolen de Vries syndrome,Koolen de Vries syndrome
15602,DOID:2947,DOID:9682,,yellow fever,yellow fever
15884,DOID:3166,DOID:0060888,,transient myeloproliferative syndrome,transient myeloproliferative syndrome
16547,DOID:3680,DOID:11934,,head and neck cancer,head and neck cancer


In [21]:
# Build the old-id -> surviving-id lookup, lowest priority first; each later
# update overrides values of keys that are already present:
#   1. every kept id maps to itself,
#   2. an id whose cross-reference is a kept id maps to that cross-reference,
#   3. a cross-reference of a kept id maps to that kept id.
old_to_new = {_id: _id for _id in keep_id}
old_to_new.update(
    mismatch_doid_xref.query('xrefs in @keep_id').set_index('id')['xrefs'].to_dict()
)
old_to_new.update(
    mismatch_doid_xref.query('id in @keep_id').set_index('xrefs')['id'].to_dict()
)

# Resolve each row through its id first, then through its cross-reference.
mismatch_doid_xref['new_id'] = (
    mismatch_doid_xref['id'].map(old_to_new)
    .fillna(mismatch_doid_xref['xrefs'].map(old_to_new))
)

# Rows that could not be resolved yet.
unmap_subset = mismatch_doid_xref[mismatch_doid_xref['new_id'].isnull()]
unmap_subset

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name,new_id
160,DOID:0040098,DOID:14482,pemphigus gestationis,pemphigoid gestationis,herpes gestationis,
9701,DOID:10059,DOID:10111,,,Congenital or acquired abnormality of vagina c...,
9725,DOID:10111,DOID:10059,,,Congenital or acquired abnormality of vagina c...,
10710,DOID:11099,DOID:1709,,,primary Rickettsiaceae infectious disease,
11799,DOID:122,DOID:284,,,abdominal cancer,
12140,DOID:12647,DOID:9940,,,congenital or acquired abnormality of vulva co...,
12853,DOID:13469,DOID:5000,,,secondary cutaneous syphilis,
13785,DOID:14482,DOID:0040098,pemphigoid gestationis,pemphigus gestationis,herpes gestationis,
14216,DOID:1709,DOID:11099,,,rickettsiosis,
15423,DOID:2778,DOID:2779,,,"tumors of body of uterus, with delivery",


In [22]:
# Number of mismatch rows that still have no new_id.
len(unmap_subset)

28

In [23]:
# Unmapped rows whose network name occurs exactly once in the unmapped subset
# (keep=False discards every row of a duplicated name).
unmap_subset.drop_duplicates(subset='nw_name', keep=False)

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name,new_id
10710,DOID:11099,DOID:1709,,,primary Rickettsiaceae infectious disease,
14216,DOID:1709,DOID:11099,,,rickettsiosis,
19580,DOID:5603,DOID:0050523,T-cell acute lymphoblastic leukemia,adult T-cell leukemia,acute T cell leukemia,
29508,DOID:0050523,DOID:5603,adult T-cell leukemia,T-cell acute lymphoblastic leukemia,Adult T-cell leukemia/lymphoma,


The two ids in each of these two pairs have very similar names, so we'll map them by hand.

In [24]:
# Hand-curated decisions for the two near-identical pairs shown above.
manual_map = {
    'DOID:11099': 'DOID:1709',
    'DOID:1709': 'DOID:1709',
    'DOID:0050523': 'DOID:0050523',
    'DOID:5603': 'DOID:0050523',
}

# Entries already present in old_to_new take precedence over the manual ones.
old_to_new = {**manual_map, **old_to_new}

In [25]:
# Re-resolve every row with the extended lookup: by id first, then by xref.
mismatch_doid_xref['new_id'] = (
    mismatch_doid_xref['id'].map(old_to_new)
    .fillna(mismatch_doid_xref['xrefs'].map(old_to_new))
)

# Whatever is still unresolved after the manual additions.
unmap_subset = mismatch_doid_xref.loc[mismatch_doid_xref['new_id'].isnull()]
unmap_subset

Unnamed: 0,id,xrefs,id_name,xref_name,nw_name,new_id
160,DOID:0040098,DOID:14482,pemphigus gestationis,pemphigoid gestationis,herpes gestationis,
9701,DOID:10059,DOID:10111,,,Congenital or acquired abnormality of vagina c...,
9725,DOID:10111,DOID:10059,,,Congenital or acquired abnormality of vagina c...,
11799,DOID:122,DOID:284,,,abdominal cancer,
12140,DOID:12647,DOID:9940,,,congenital or acquired abnormality of vulva co...,
12853,DOID:13469,DOID:5000,,,secondary cutaneous syphilis,
13785,DOID:14482,DOID:0040098,pemphigoid gestationis,pemphigus gestationis,herpes gestationis,
15423,DOID:2778,DOID:2779,,,"tumors of body of uterus, with delivery",
15424,DOID:2779,DOID:2778,,,"tumors of body of uterus, with delivery",
15498,DOID:284,DOID:122,,,abdominal cancer,


In [26]:
# Keep the first unmapped row for each network name, then collect the ids and
# the cross-references appearing on those rows.
first_per_name = unmap_subset.drop_duplicates(subset='nw_name')
dup_ids = first_per_name['id'].unique()
dup_xref = first_per_name['xrefs'].unique()

In [27]:
# Count of distinct identifiers appearing as id or xref in the unmapped rows.
len(set(unmap_subset[['id', 'xrefs']].stack()))

24

In [28]:
# Number of unmapped rows whose id is one of the cross-references in dup_xref.
len(unmap_subset.query('id in @dup_xref'))

12

In [29]:
# Extend the lookup with the remaining duplicates, lowest priority first:
#   1. each id in dup_ids maps to itself,
#   2. a cross-reference of such an id maps to that id,
#   3. everything already in old_to_new overrides the two above.
merged_map = {_id: _id for _id in dup_ids}
merged_map.update(unmap_subset.query('id in @dup_ids').set_index('xrefs')['id'].to_dict())
merged_map.update(old_to_new)
old_to_new = merged_map

# Resolve every row again (id first, then xref) and show non-null counts.
mismatch_doid_xref['new_id'] = (
    mismatch_doid_xref['id'].map(old_to_new)
    .fillna(mismatch_doid_xref['xrefs'].map(old_to_new))
)
mismatch_doid_xref.count()

id           108
xrefs        108
id_name       54
xref_name     54
nw_name      108
new_id       108
dtype: int64

In [30]:
# Number of distinct new_id values across the mismatch rows.
len(mismatch_doid_xref['new_id'].drop_duplicates())

51

In [31]:
# Take every disease row whose id appears in the mismatch table out of
# `diseases`; the rows are rewritten and added back in the following cells.
to_query = mismatch_doid_xref['id'].unique()
to_fix = diseases[diseases['id'].isin(to_query)]
fix_idx = to_fix.index

diseases = diseases.drop(fix_idx)

In [32]:
# Split the '|'-separated xrefs column before regrouping under the new ids.
to_fix = expand_col_on_char(to_fix, 'xrefs', '|')

# Rewrite each id to its surviving id. An id that is missing from old_to_new
# keeps its original value (the same fallback the edge endpoints use) instead
# of turning into NaN, which would put the node at risk of being lost when
# the rows are grouped by id below.
to_fix['id'] = to_fix['id'].map(old_to_new).fillna(to_fix['id'])
fixed = combine_group_cols_on_char(to_fix, ['id'], ['xrefs'], sort=True, prog=False)

len(fixed)

51

In [33]:
# Append the merged disease rows to the untouched ones and renumber the index.
diseases = pd.concat([diseases, fixed], sort=False, ignore_index=True)

In [34]:
# Every identifier that has an entry in the lookup (including self-mappings).
fixed_ids = list(old_to_new)

In [35]:
# Separate the edges that touch a remapped identifier on either end; they are
# rewritten and re-added in the following cells.
touches_fixed = edges['start_id'].isin(fixed_ids) | edges['end_id'].isin(fixed_ids)
fix_edges = edges[touches_fixed].copy()
fix_idx = fix_edges.index

edges = edges.drop(fix_idx)

In [36]:
# Rewrite both endpoints through the lookup; an endpoint without an entry
# keeps its original identifier.
for endpoint in ('start_id', 'end_id'):
    fix_edges[endpoint] = fix_edges[endpoint].map(old_to_new).fillna(fix_edges[endpoint])

In [37]:
# Collapse the remapped edges that now share the same (start_id, end_id, type)
# and put them back with the untouched edges.
# NOTE(review): remapping can turn an edge between two merged ids into a
# self-loop (start_id == end_id); nothing here removes those -- confirm this
# is intended.
fix_edges = combine_group_cols_on_char(fix_edges, ['start_id', 'end_id', 'type'], sort=True, prog=False)

edges = pd.concat([edges, fix_edges], ignore_index=True, sort=False)

In [38]:
# Persist the old-id -> surviving-id lookup. The (old, new) pairs come from
# items(), so each key stays with its value and the frame is built from a
# plain list rather than from dict view objects.
id_map = pd.DataFrame(list(old_to_new.items()), columns=['id_old', 'id'])
id_map.to_csv(out_dir.joinpath('old_to_new_map.csv'), index=False)

In [40]:
# Rebuild the node table: all non-Disease nodes plus the cleaned Disease rows.
non_disease_nodes = nodes[nodes['label'] != "Disease"]
nodes = pd.concat([non_disease_nodes, diseases], sort=False, ignore_index=True)

In [39]:
# Write the cleaned network into this step's output folder.
nodes.to_csv(out_dir / 'nodes.csv', index=False)
edges.to_csv(out_dir / 'edges.csv', index=False)