# De-duplication

It has come to light that some diseases and phenotypes that share the same name have more than one identifier. These should be merged into one unique concept.

In [1]:
import pandas as pd
from pathlib import Path
from data_tools import df_processing as dfp
from data_tools import graphs as gt

  from tqdm.autonotebook import tqdm


In [2]:
# Output directories of the two upstream pipeline steps this notebook reads from:
# nw_dir supplies nodes.csv / edges.csv, owl_dir supplies DO_node_info.csv.
nw_name = '10c_Diseae_Phenotype_Cleanup_do_hp_node_merging'
owl_name = '10a_Disease_Pheno_Cleanup_parse_owls'

nw_dir = Path('../2_pipeline').resolve() / nw_name / 'out'
owl_dir = nw_dir.parent.parent / owl_name / 'out'

In [3]:
# Output directory for this notebook; created (with parents) if it does not exist yet.
this_name = '10d_Diseae_Phenotype_Cleanup_deduplication'
out_dir = Path('../2_pipeline').resolve() / this_name / 'out'
out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Disease Ontology node table written by the OWL-parsing step.
do_info = pd.read_csv(owl_dir / 'DO_node_info.csv')

In [5]:
# Load the network tables; every column is read as a string.
nodes, edges = (
    pd.read_csv(nw_dir / csv_name, dtype=str)
    for csv_name in ('nodes.csv', 'edges.csv')
)

In [6]:
# Snapshot of id -> name taken before the names are lower-cased below.
id_to_name = dict(zip(nodes['id'], nodes['name']))

In [7]:
# Lower-case every node name so the duplicate-name detection below is case-insensitive.
nodes = nodes.assign(name=nodes['name'].str.lower())

In [8]:
# Spot-check a single node after lower-casing.
nodes[nodes['id'] == 'DOID:4952']

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
322395,DOID:4952,postpoliomyelitis syndrome,Disease,DOID:4952|GARD:4454|ICD10CM:G14|MESH:D016262|M...,,Postpolio syndrome,,


# Issue 1: Duplicated Names...

In [9]:
# Work on an independent copy holding only Disease and Phenotype nodes.
pheno_dis = nodes[nodes['label'].isin(['Phenotype', 'Disease'])].copy()

Do we mostly have disease-disease duplications, or Disease-Phenotype duplications?

In [10]:
# For every name carried by more than one Disease/Phenotype node, collect the labels
# of the nodes that share it.
# The labels are sorted inside each tuple so that the equality tests used later
# (e.g. comb_type == ('Disease', 'Phenotype')) do not depend on the row order of
# nodes.csv; unsorted, a ('Phenotype', 'Disease') pair would silently be skipped.
dup_name_rows = pheno_dis[pheno_dis['name'].duplicated(keep=False)]
comb_type = dup_name_rows.groupby('name')['label'].apply(lambda labels: tuple(sorted(labels)))
comb_type.value_counts()

(Disease, Phenotype)                                                                                                                                                                                    234
(Disease, Disease)                                                                                                                                                                                      233
(Phenotype, Phenotype)                                                                                                                                                                                   23
(Disease, Disease, Phenotype)                                                                                                                                                                            15
(Disease, Phenotype, Phenotype)                                                                                                                                                         

In [11]:
# Check which of the two 'absence epilepsy' DOIDs is present in the current DO release.
do_info[do_info['id'].isin(['DOID:0070309', 'DOID:50701'])]

Unnamed: 0,id,name,synonyms,xrefs,alt_ids,subsets
2172,DOID:0070309,absence epilepsy,,MESH:D004832,,


## Disease-Disease Duplications

In [12]:
# Names shared by exactly two Disease nodes.
dual_dis = comb_type[comb_type == ('Disease', 'Disease')].index.tolist()
pheno_dis[pheno_dis['name'].isin(dual_dis)].sort_values('name').head(6)

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
313735,DOID:0050600,abcd syndrome,Disease,DOID:0050600|OMIM:600501,,"ABCDS|albinism, black lock, cell migration dis...",,
325581,MESH:C535334,abcd syndrome,Disease,,,,,
322546,DOID:50701,absence epilepsy,Disease,MESH:D004832,,,,
315266,DOID:0070309,absence epilepsy,Disease,DOID:0070309|MESH:D004832,,,,
325684,MESH:C536129,achromatopsia 3,Disease,,,,,
316068,DOID:0110008,achromatopsia 3,Disease,DOID:0110008|GARD:9650|MONDO:0009875|OMIM:2623...,,ACHM1|ACHM3|Pingelapese blindness|RMCH1|rod mo...,,


### Dual DOID...

In [13]:
# Flag nodes by identifier namespace (Disease Ontology / Human Phenotype Ontology).
for flag_col, id_prefix in (('do', 'DOID:'), ('hp', 'HP:')):
    pheno_dis[flag_col] = pheno_dis['id'].str.startswith(id_prefix)
# CURIE prefix: everything before the first ':' of the identifier.
pheno_dis['curi'] = pheno_dis['id'].map(lambda curie: curie.partition(':')[0])

# Per duplicated disease name, count how many of its nodes carry a DOID.
dual_dis_rows = pheno_dis[pheno_dis['name'].isin(dual_dis)]
num_do = dual_dis_rows.groupby('name')['do'].sum()
dual_doid = num_do[num_do.gt(1)].index.tolist()

pheno_dis[pheno_dis['name'].isin(dual_doid)].sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
315266,DOID:0070309,absence epilepsy,Disease,DOID:0070309|MESH:D004832,,,,,True,False,DOID
322546,DOID:50701,absence epilepsy,Disease,MESH:D004832,,,,,True,False,DOID
322532,DOID:50605,acrodermatitis enteropathica,Disease,MESH:C538178,,,,,True,False,DOID
313740,DOID:0050605,acrodermatitis enteropathica,Disease,DOID:0050605|GARD:5723|MESH:C538178|MONDO:0008...,,,,,True,False,DOID
322536,DOID:50631,allan-herndon-dudley syndrome,Disease,MESH:D038901,,,,,True,False,DOID
...,...,...,...,...,...,...,...,...,...,...,...
313326,DOID:0050175,tick-borne encephalitis,Disease,DOID:0050175|GARD:5216|ICD10CM:A84.1|ICD9CM:06...,,Central European encephalitis|Far Eastern TBE|...,DOID:0050176|DOID:10251|DOID:10252,tick-borne_infectious_disease,True,False,DOID
313415,DOID:0050266,tungiasis,Disease,DOID:0050266|GARD:393|MESH:D058285|MONDO:00194...,,,,,True,False,DOID
322473,DOID:50266,tungiasis,Disease,MESH:D058267|MESH:D058285,,,,,True,False,DOID
322524,DOID:50562,west syndrome,Disease,MESH:D013036,,,,,True,False,DOID


In [14]:
# DOIDs present in the parsed DO file are treated as current; all others as expired.
current_doids = do_info['id'].unique()

in_dual_doid = pheno_dis['name'].isin(dual_doid)
id_is_current = pheno_dis['id'].isin(current_doids)
dual_do_current = pheno_dis[in_dual_doid & id_is_current]
dual_do_expired = pheno_dis[in_dual_doid & ~id_is_current]

# Merge map: 'id' is the identifier to keep, 'id_old' the one to fold into it.
mrg_map = pd.merge(
    dual_do_current[['id', 'name']],
    dual_do_expired[['id', 'name']],
    on='name',
    suffixes=('', '_old'),
)

In [15]:
# Names of every DOID-identified node whose id is absent from the current DO file.
expired_do_mask = pheno_dis['do'] & ~pheno_dis['id'].isin(current_doids)
depricate_names = pheno_dis.loc[expired_do_mask, 'name'].unique()
len(depricate_names)

170

In [16]:
# Pair each expired DOID with a same-named node whose id is in the current DO file.
id_in_current_do = pheno_dis['id'].isin(current_doids)
some_current_do = pheno_dis[pheno_dis['name'].isin(depricate_names) & id_in_current_do]
some_expired_do = pheno_dis[pheno_dis['do'] & ~id_in_current_do]

mrg_map2 = pd.merge(
    some_current_do[['id', 'name']],
    some_expired_do[['id', 'name']],
    on='name',
    suffixes=('', '_old'),
)
mrg_map2

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
59,DOID:10955,strongyloidiasis,DOID:50009
60,DOID:162,cancer,DOID:60072
61,DOID:2377,multiple sclerosis,DOID:50785
62,DOID:5603,t-cell acute lymphoblastic leukemia,DOID:50523


In [17]:
# Append the new pairs to the running merge map and remove exact duplicate rows.
mrg_map = pd.concat([mrg_map, mrg_map2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

### DOID + Other

In [18]:
# Duplicated disease names where exactly one of the nodes has a DOID.
one_doid = num_do[num_do.eq(1)].index.tolist()
pheno_dis[pheno_dis['name'].isin(one_doid)].sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
313735,DOID:0050600,abcd syndrome,Disease,DOID:0050600|OMIM:600501,,"ABCDS|albinism, black lock, cell migration dis...",,,True,False,DOID
325581,MESH:C535334,abcd syndrome,Disease,,,,,,False,False,MESH
316068,DOID:0110008,achromatopsia 3,Disease,DOID:0110008|GARD:9650|MONDO:0009875|OMIM:2623...,,ACHM1|ACHM3|Pingelapese blindness|RMCH1|rod mo...,,,True,False,DOID
325684,MESH:C536129,achromatopsia 3,Disease,,,,,,False,False,MESH
313761,DOID:0050629,aicardi-goutieres syndrome,Disease,DOID:0050629|GARD:575|ICD10CM:G31.8|MONDO:0009...,,Cree encephalitis,,DO_rare_slim,True,False,DOID
...,...,...,...,...,...,...,...,...,...,...,...
326192,MESH:C565346,tuberous sclerosis 1,Disease,,,,,,False,False,MESH
326260,MESH:C566021,tuberous sclerosis 2,Disease,,,,,,False,False,MESH
315610,DOID:0080325,tuberous sclerosis 2,Disease,DOID:0080325|OMIM:613254,,,,,True,False,DOID
325748,MESH:C536709,winchester syndrome,Disease,,,,,,False,False,MESH


In [19]:
# Within each single-DOID name, keep the DOID node and fold the other node into it.
in_one_doid = pheno_dis['name'].isin(one_doid)
keep = pheno_dis[in_one_doid & pheno_dis['do']]
drop = pheno_dis[in_one_doid & ~pheno_dis['do']]

mrg_map2 = pd.merge(keep[['id', 'name']], drop[['id', 'name']], on='name', suffixes=('', '_old'))

In [20]:
# Add the DOID-vs-other pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg_map2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

In [21]:
# Display the merge map accumulated so far (id = kept identifier, id_old = merged-away identifier).
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
134,DOID:50421,streptococcal infections,MESH:D013290
135,DOID:50597,schistosoma mansoni infection,MESH:D012550
136,DOID:5746,ovarian serous cystadenocarcinoma,WD:Q53673344
137,DOID:8158,complement component 5 deficiency,MESH:C537005


### 2 non-DOID

In [22]:
# Duplicated disease names where neither node has a DOID.
no_doid = num_do[num_do.eq(0)].index.tolist()
pheno_dis[pheno_dis['name'].isin(no_doid)].sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
328066,MONDO:0014420,acid-labile subunit deficiency,Disease,MONDO:0014420|OMIM:615961|UMLS:C3900122,,,,,False,False,MONDO
328802,OMIM:615961,acid-labile subunit deficiency,Disease,,,,,,False,False,OMIM
328857,OMIM:616459,al-raqad syndrome,Disease,,,,,,False,False,OMIM
328126,MONDO:0014648,al-raqad syndrome,Disease,MONDO:0014648|OMIM:616459|UMLS:C4085595,,,,,False,False,MONDO
328024,MONDO:0014219,"alacrima, achalasia, and mental retardation sy...",Disease,MONDO:0014219|OMIM:615510|UMLS:C3809738,,,,,False,False,MONDO
...,...,...,...,...,...,...,...,...,...,...,...
327740,MONDO:0011518,wiedemann-steiner syndrome,Disease,MONDO:0011518|OMIM:605130|UMLS:C1854630|UMLS:C...,,,,,False,False,MONDO
328985,OMIM:617321,yao syndrome,Disease,,,,,,False,False,OMIM
328280,MONDO:0015019,yao syndrome,Disease,MONDO:0015019|OMIM:617321|UMLS:C4310620,,,,,False,False,MONDO
328233,MONDO:0014936,zttk syndrome,Disease,MONDO:0014936|OMIM:617140|UMLS:C4310696,,,,,False,False,MONDO


In [23]:
# Independent copy of the DOID-free duplicates, then tally the prefix combination per name.
dual_non_doid = pheno_dis[pheno_dis['name'].isin(no_doid)].copy()

prefix_pairs = dual_non_doid.groupby('name')['curi'].apply(tuple)
prefix_pairs.value_counts()

(MONDO, OMIM)     71
(UMLS, UMLS)       9
(MONDO, MONDO)     9
(MESH, OMIM)       5
(MESH, MONDO)      4
(MESH, MESH)       2
(MESH, WD)         2
(UMLS, WD)         1
(MESH, UMLS)       1
Name: curi, dtype: int64

In [24]:
# Distinct identifier prefixes that occur among the DOID-free duplicates.
pd.unique(dual_non_doid['curi'])

array(['MESH', 'MONDO', 'OMIM', 'UMLS', 'WD'], dtype=object)

Let's use this order of preference:

    MESH > UMLS > OMIM > MONDO > WD
    
This is pretty much order of easiest to map to other ontologies, to most difficult.
    
In cases of conflict (both identifiers come from the same source), the choice does not really matter; the identifier that sorts first is kept.

In [25]:
# Preference rank per identifier prefix; a lower rank is preferred.
curi_order = {
    prefix: rank
    for rank, prefix in enumerate(['MESH', 'UMLS', 'OMIM', 'MONDO', 'WD'])
}

dual_non_doid['curi_rank'] = dual_non_doid['curi'].map(curi_order)

In [26]:
# Per name: keep the best-ranked node (ties broken by id) and drop the worst-ranked one.
rank_cols = ['curi_rank', 'id']
keep = dual_non_doid.sort_values(rank_cols).drop_duplicates(subset=['name'])
drop = dual_non_doid.sort_values(rank_cols, ascending=False).drop_duplicates(subset=['name'])

mrg_map2 = pd.merge(keep[['id', 'name']], drop[['id', 'name']], on='name', suffixes=('', '_old'))

In [27]:
# Display the keep/drop pairs chosen for the duplicates that have no DOID.
mrg_map2

Unnamed: 0,id,name,id_old
0,MESH:C535705,mandibuloacral dysplasia with type a lipodystr...,OMIM:248370
1,MESH:C535706,mandibuloacral dysplasia with type b lipodystr...,OMIM:608612
2,MESH:C562935,myopathy with abnormal lipid metabolism,MONDO:0009703
3,MESH:C563481,otofaciocervical syndrome,MONDO:0008163
4,MESH:C563985,melanoma-pancreatic cancer syndrome,MONDO:0011713
...,...,...,...
99,MONDO:0011937,exfoliative ichthyosis,MONDO:0017339
100,MONDO:0014315,mitchell-riley syndrome,MONDO:0017400
101,MONDO:0017049,boylan dew greco syndrome,MONDO:0022025
102,MONDO:0021201,skin infection,MONDO:0024294


In [28]:
# Add the non-DOID pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg_map2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
238,MONDO:0011937,exfoliative ichthyosis,MONDO:0017339
239,MONDO:0014315,mitchell-riley syndrome,MONDO:0017400
240,MONDO:0017049,boylan dew greco syndrome,MONDO:0022025
241,MONDO:0021201,skin infection,MONDO:0024294


## Phenotype-Phenotype Duplications

Should be easy...

In [29]:
# Names shared by exactly two Phenotype nodes.
dual_pheno = comb_type[comb_type == ('Phenotype', 'Phenotype')].index.tolist()
pheno_dis[pheno_dis['name'].isin(dual_pheno)].sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
584982,WD:Q143877,cogwheel rigidity,Phenotype,,,,,,False,False,WD
580469,HP:0002396,cogwheel rigidity,Phenotype,MESH:D009127|SNOMED:55630000|UMLS:C0151564,,,,,False,True,HP
585020,WD:Q21110084,dark urine,Phenotype,,,,,,False,False,WD
584571,HP:0040319,dark urine,Phenotype,,,,,,False,True,HP
585023,WD:Q21110117,decreased pulmonary function,Phenotype,,,,,,False,False,WD
581903,HP:0005952,decreased pulmonary function,Phenotype,MESH:D012131,,,,,False,True,HP
584840,HP:0410019,epigastric pain,Phenotype,,,,,,False,True,HP
585180,WD:Q3589142,epigastric pain,Phenotype,,,,,,False,False,WD
585089,WD:Q21119928,facial edema,Phenotype,,,,,,False,False,WD
579379,HP:0000282,facial edema,Phenotype,SNOMED:445088006|UMLS:C0542571,,,,,False,True,HP


Nicely, all of these are HPO vs. something else, so it is easy to choose the HPO versions.

In [30]:
# Independent copy of the phenotype-phenotype duplicates.
pheno_conflict = pheno_dis[pheno_dis['name'].isin(dual_pheno)].copy()

In [31]:
# Recompute the HPO flag from the CURIE prefix.
pheno_conflict['hp'] = pheno_conflict['curi'].eq('HP')

In [32]:
def create_map(keep, drop):
    """Pair kept and dropped nodes that share a name.

    Both frames need 'id' and 'name' columns. Returns the inner join on 'name'
    with columns 'id' (from ``keep``), 'name' and 'id_old' (from ``drop``).
    """
    kept_cols = keep[['id', 'name']]
    dropped_cols = drop[['id', 'name']]
    return pd.merge(kept_cols, dropped_cols, on='name', suffixes=('', '_old'))

In [33]:
# Keep the HPO node of each pair and fold the non-HPO node into it.
keep = pheno_conflict[pheno_conflict['hp']]
drop = pheno_conflict[~pheno_conflict['hp']]

mrg_map2 = create_map(keep, drop)

In [34]:
# Display the HPO-vs-other pairs (id = HPO identifier kept, id_old = identifier merged away).
mrg_map2

Unnamed: 0,id,name,id_old
0,HP:0000282,facial edema,WD:Q21119928
1,HP:0000465,webbed neck,WD:Q3539327
2,HP:0000543,optic disc pallor,WD:Q7098794
3,HP:0000711,restlessness,WD:Q21118175
4,HP:0000736,short attention span,WD:Q21515325
5,HP:0001347,hyperreflexia,WD:Q1429154
6,HP:0001350,slurred speech,WD:Q21117191
7,HP:0001508,failure to thrive,MESH:D005183
8,HP:0002098,respiratory distress,WD:Q7315912
9,HP:0002179,opisthotonus,WD:Q1756015


In [35]:
# Add the phenotype-phenotype pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg_map2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
260,HP:0031473,hostility,MESH:D006791
261,HP:0040264,jaw pain,WD:Q21119540
262,HP:0040319,dark urine,WD:Q21110084
263,HP:0410019,epigastric pain,WD:Q3589142


## Disease-Phenotype Conflicts

In [36]:
# Names shared by one Disease node and one Phenotype node.
het_names = comb_type[comb_type == ('Disease', 'Phenotype')].index.tolist()
hetero_conflict = pheno_dis[pheno_dis['name'].isin(het_names)].copy()
hetero_conflict.sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
314456,DOID:0060336,3-methylglutaconic aciduria,Disease,DOID:0060336|ICD10CM:E71.111|MESH:C579867|MOND...,,,,DO_rare_slim,True,False,DOID
581096,HP:0003535,3-methylglutaconic aciduria,Phenotype,MESH:C579867|OMIM:250950,,,,,False,True,HP
324301,DOID:7693,abdominal aortic aneurysm,Disease,DOID:7693|EFO:0004214|GARD:9181|MESH:D017544|M...,,"AORTIC ANEURYSM, FAMILIAL ABDOMINAL 1",,NCIthesaurus,True,False,DOID
581596,HP:0005112,abdominal aortic aneurysm,Phenotype,UMLS:C4025248,,,,,False,True,HP
583569,HP:0011224,ablepharon,Phenotype,SNOMED:13401001|SNOMED:708541009|UMLS:C0266574...,,,,,False,True,HP
...,...,...,...,...,...,...,...,...,...,...,...
323376,DOID:60674,ventricular arrhythmia,Disease,MESH:C536334|MESH:D017180,,,,,True,False,DOID
582135,HP:0006562,viral hepatitis,Phenotype,SNOMED:3738000|UMLS:C0042721,,,,,False,True,HP
320294,DOID:1884,viral hepatitis,Disease,DOID:1884|MESH:D006524|MESH:D006525|MONDO:0025...,,Viral hepatitis with hepatic coma|animal viral...,DOID:12548|DOID:1331|DOID:1885,,True,False,DOID
314356,DOID:0060236,xanthinuria,Disease,DOID:0060236|MESH:C562584|MESH:C566358|MONDO:0...,,xanthine dehydrogenase deficiency|xanthine oxi...,,DO_FlyBase_slim|DO_rare_slim,True,False,DOID


In [37]:
# Store the DOID flag as an integer so per-name sums below are counts.
hetero_conflict = hetero_conflict.assign(do=hetero_conflict['do'].astype(int))

In [38]:
# How many names have 0 / 1 DOID-identified nodes?
doid_nodes_per_name = hetero_conflict.groupby('name')['do'].sum()
doid_nodes_per_name.value_counts()

1    178
0     56
Name: do, dtype: int64

In [39]:
# Names in the Disease/Phenotype conflicts for which no node has a DOID.
doid_count_by_name = hetero_conflict.groupby('name')['do'].sum()
no_do_conf = doid_count_by_name[doid_count_by_name.eq(0)].index.tolist()

hetero_conflict[hetero_conflict['name'].isin(no_do_conf)].sort_values('name').head(30)

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
583569,HP:0011224,ablepharon,Phenotype,SNOMED:13401001|SNOMED:708541009|UMLS:C0266574...,,,,,0,True,HP
329682,WD:Q322370,ablepharon,Disease,,,,,,0,False,WD
583830,HP:0012050,anasarca,Phenotype,MESH:D004487|SNOMED:16740003|SNOMED:442433009|...,,,,,0,True,HP
329834,WD:Q486485,anasarca,Disease,,,,,,0,False,WD
581470,HP:0004823,anisopoikilocytosis,Phenotype,UMLS:C2675920,,,,,0,True,HP
329240,UMLS:C2675920,anisopoikilocytosis,Disease,UMLS:C2675920,,,,,0,False,UMLS
328431,MONDO:0019780,anotia,Disease,MESH:C537772|MONDO:0019780|OMIM:600674|UMLS:C0...,,,,,0,False,MONDO
583252,HP:0009892,anotia,Phenotype,MESH:D065817|SNOMED:57436000|UMLS:C0702139|UML...,,,,,0,True,HP
584195,HP:0025478,atrial standstill,Phenotype,,,,,,0,True,HP
326075,MESH:C563984,atrial standstill,Disease,,,,,,0,False,MESH


In [40]:
# For the DOID-free names, tally how many nodes per name are HPO terms.
no_do_rows = hetero_conflict[hetero_conflict['name'].isin(no_do_conf)]
no_do_rows.groupby('name')['hp'].sum().value_counts()

True    56
Name: hp, dtype: int64

So if there's no DOID we can convert it to an HPO and call it a Phenotype, otherwise, we can convert it to a DOID and call it a disease

In [41]:
# Pass 1: where a DOID exists, keep the DOID node and drop its same-named partner.
keep = hetero_conflict[hetero_conflict['do'] == 1]
keep_names = keep['name'].unique()
drop = hetero_conflict[(hetero_conflict['do'] == 0) & hetero_conflict['name'].isin(keep_names)]

mrg2 = create_map(keep, drop)

# Pass 2: for the remaining names, keep the HPO node and drop its partner.
keep = hetero_conflict[(hetero_conflict['hp'] == 1) & ~hetero_conflict['name'].isin(keep_names)]
keep_names = keep['name'].unique()
drop = hetero_conflict[(hetero_conflict['hp'] == 0) & hetero_conflict['name'].isin(keep_names)]

mrg3 = create_map(keep, drop)

In [42]:
# Number of pairs resolved in favour of the DOID node.
len(mrg2)

178

In [43]:
# Number of pairs resolved in favour of the HPO node.
len(mrg3)

56

In [44]:
# Sanity check: every conflicting name is covered by exactly one of the two maps.
hetero_conflict['name'].nunique() == len(mrg2) + len(mrg3)

True

In [45]:
# Add both Disease/Phenotype maps to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2, mrg3], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
494,HP:0031295,left atrial enlargement,UMLS:C0238705
495,HP:0031814,palilalia,WD:Q1757666
496,HP:0100578,lipoatrophy,WD:Q3500973
497,HP:0100806,sepsis,MESH:D016470


## 3 or more IDs per name

In [46]:
# Number of nodes per duplicated name; show the names with three or more nodes.
shared_name_rows = pheno_dis[pheno_dis['name'].duplicated(keep=False)]
size = shared_name_rows.groupby('name').size()
size[size.gt(2)].sort_values(ascending=False)

name
prostate cancer                    18
hyperinsulinemic hypoglycemia       9
primary congenital glaucoma         4
muscle spasm                        4
pain                                4
vitamin b12 deficiency              3
ductal carcinoma in situ            3
glucose intolerance                 3
glomerulosclerosis                  3
dyscalculia                         3
congenital muscular dystrophy       3
insomnia                            3
congenital adrenal hyperplasia      3
chronic myelomonocytic leukemia     3
ankylosis                           3
amenorrhea                          3
hypophosphatemia                    3
lattice corneal dystrophy           3
lamb-shaffer syndrome               3
visual agnosia                      3
lung abscess                        3
meconium ileus                      3
microcephaly                        3
osteoblastoma                       3
pheochromocytoma                    3
scoliosis                           3
stomach

In [47]:
# Label combinations among the names that have exactly three nodes.
triple_names = size[size.eq(3)].index.tolist()
comb_type.loc[triple_names].value_counts()

(Disease, Disease, Phenotype)      15
(Disease, Phenotype, Phenotype)     9
(Disease, Disease, Disease)         1
Name: label, dtype: int64

### 2 pheno, 1 disease

In [48]:
# Names carried by one Disease node and two Phenotype nodes.
this_names = comb_type[comb_type == ('Disease', 'Phenotype', 'Phenotype')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()

In [49]:
# Display the current conflict set ordered by name.
curr_conflict.sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
584893,MESH:D000568,amenorrhea,Phenotype,MESH:D000568,,,,,False,False,MESH
579298,HP:0000141,amenorrhea,Phenotype,UMLS:C2219717,,,,,False,True,HP
319698,DOID:13938,amenorrhea,Disease,DOID:13938|ICD10CM:N91.2|ICD9CM:626.0|MESH:D00...,,absence of menstruation|amenia,,NCIthesaurus,True,False,DOID
584894,MESH:D000844,ankylosis,Phenotype,MESH:D000844,,,,,False,False,MESH
584388,HP:0031013,ankylosis,Phenotype,,,,,,False,True,HP
320532,DOID:227,ankylosis,Disease,DOID:227|ICD10CM:M24.6|ICD9CM:718.5|MESH:D0008...,,,,,True,False,DOID
580486,HP:0002442,dyscalculia,Phenotype,SNOMED:47916000|UMLS:C1411876|UMLS:C4280576,,,,,False,True,HP
584934,MESH:D060705,dyscalculia,Phenotype,MESH:D060705,,,,,False,False,MESH
319054,DOID:12568,dyscalculia,Disease,DOID:12568|MESH:D060705|MONDO:0001552|UMLS:C08...,,Mathematics disorder|disorder of arithmetical ...,,,True,False,DOID
318052,DOID:10603,glucose intolerance,Disease,DOID:10603|ICD10CM:R73.03|MESH:D018149|MONDO:0...,,Glucose: intolerance|Glucose: malabsorption|Ma...,,NCIthesaurus,True,False,DOID


In [50]:
# Per name: sum of the DOID flag over its nodes.
curr_conflict['do'].groupby(curr_conflict['name']).sum()

name
amenorrhea                   True
ankylosis                    True
dyscalculia                  True
glucose intolerance          True
lattice corneal dystrophy    True
microcephaly                 True
scoliosis                    True
stomach cancer               True
visual agnosia               True
Name: do, dtype: bool

All have their disease annotated as a DOID... we'll set them to diseases.

In [51]:
# Keep the DOID node of each name; every other node with that name is merged into it.
keep = curr_conflict[curr_conflict['do'].eq(True)]
keep_names = keep['name'].unique()
drop = curr_conflict[curr_conflict['do'].eq(False) & curr_conflict['name'].isin(keep_names)]

mrg2 = create_map(keep, drop)

In [52]:
# Add the one-disease/two-phenotype pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
512,DOID:13938,amenorrhea,MESH:D000568
513,DOID:227,ankylosis,HP:0031013
514,DOID:227,ankylosis,MESH:D000844
515,DOID:8943,lattice corneal dystrophy,HP:0001149


In [53]:
# (distinct merged-away ids, distinct kept ids)
tuple(mrg_map[id_col].nunique() for id_col in ('id_old', 'id'))

(517, 508)

### 2 Disease + Pheno

In [54]:
# Names carried by two Disease nodes and one Phenotype node.
this_names = comb_type[comb_type == ('Disease', 'Disease', 'Phenotype')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()

In [55]:
# Per name: number of DOID-identified nodes.
curr_conflict['do'].groupby(curr_conflict['name']).sum()

name
allergy                            2.0
chronic myelomonocytic leukemia    2.0
congenital adrenal hyperplasia     2.0
congenital muscular dystrophy      2.0
ductal carcinoma in situ           2.0
glomerulosclerosis                 2.0
hypophosphatemia                   2.0
insomnia                           1.0
lung abscess                       2.0
meconium ileus                     0.0
osteoblastoma                      2.0
pheochromocytoma                   2.0
torticollis                        1.0
trigonocephaly                     0.0
vitamin b12 deficiency             2.0
Name: do, dtype: float64

Those with 2 DOIDs we'll treat like a DO-DO conflict above. For those with 1, we'll take the DOID. For those with 0, we'll see if there's an HPO ID and take that; otherwise we'll do something else.

#### Dual DO

In [56]:
# Names in this conflict set with more than one DOID, split by whether each node's
# id is in the current DO file (the second frame also holds the non-DOID nodes).
num_do = curr_conflict.groupby('name')['do'].sum()
dual_doid = num_do[num_do.gt(1)].index.tolist()

name_in_dual = curr_conflict['name'].isin(dual_doid)
id_in_current = curr_conflict['id'].isin(current_doids)
dual_do_current = curr_conflict[name_in_dual & id_in_current]
dual_do_expired = curr_conflict[name_in_dual & ~id_in_current]

In [57]:
# Distinct names on the current side vs. on the expired/other side.
tuple(frame['name'].nunique() for frame in (dual_do_current, dual_do_expired))

(10, 11)

One had 2 doids, neither current....

In [58]:
# Rows whose name has no node with a current DOID.
found_names = dual_do_current['name'].unique()
dual_do_expired[~dual_do_expired['name'].isin(found_names)]

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
323303,DOID:60056,allergy,Disease,MESH:D000707|MESH:D006967|MESH:D065631,,,,,True,False,DOID
323362,DOID:60496,allergy,Disease,MESH:D006967|MESH:D012141,,,,,True,False,DOID
583983,HP:0012393,allergy,Phenotype,MESH:D006967|SNOMED:419076005|UMLS:C1527304,,,,,False,True,HP


Does not seem to be correct.... All 3 instances have the same MESH Xref, though: `MESH:D006967`

In [59]:
# Expand the '|'-separated xrefs column of the DO table with the project helper,
# then look up which DO entry carries the shared MESH xref.
do_xref_rows = dfp.expand_col_on_char(do_info, 'xrefs', '|', True)[['id', 'name', 'xrefs']]
xref_res = do_xref_rows.query('xrefs == "MESH:D006967"')
xref_res

Unnamed: 0,id,name,xrefs
14964,DOID:1205,allergic disease,MESH:D006967


In [60]:
# Pull out the rows whose name has no current DOID ...
to_rename = dual_do_expired[~dual_do_expired['name'].isin(found_names)].copy()
# ... remove them from the expired set ...
dual_do_expired = dual_do_expired.drop(to_rename.index)
# ... and give them the name of the DO entry found via the shared xref, so that
# the name-based join can pair them with it.
to_rename['name'] = xref_res['name'].iloc[0]

In [61]:
# Display the rows that remain to be merged into a current DOID, ordered by name.
dual_do_expired.sort_values('name')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
322505,DOID:50458,chronic myelomonocytic leukemia,Disease,MESH:D015477,,,,,True,False,DOID
583960,HP:0012325,chronic myelomonocytic leukemia,Phenotype,MESH:D015477,,,,,False,True,HP
322575,DOID:50811,congenital adrenal hyperplasia,Disease,MESH:D000312,,,,,True,False,DOID
582775,HP:0008258,congenital adrenal hyperplasia,Phenotype,MESH:D000312|SNOMED:237751000|UMLS:C0001627,,,,,False,True,HP
322521,DOID:50557,congenital muscular dystrophy,Disease,MESH:D009136,,,,,True,False,DOID
581192,HP:0003741,congenital muscular dystrophy,Phenotype,SNOMED:240059009|UMLS:C0699743,,,,,False,True,HP
584239,HP:0030075,ductal carcinoma in situ,Phenotype,MESH:D002285,,,,,False,True,HP
323308,DOID:60074,ductal carcinoma in situ,Disease,MESH:D002285,,,,,True,False,DOID
322589,DOID:50851,glomerulosclerosis,Disease,MESH:D005923,,,,,True,False,DOID
579276,HP:0000096,glomerulosclerosis,Phenotype,SNOMED:197661001|SNOMED:82646005|UMLS:C0178664,,,,,False,True,HP


In [62]:
# mrg2: current DOID <- expired/other nodes sharing its name.
# mrg3: DO entry found through the xref lookup <- the renamed rows.
mrg2, mrg3 = (
    create_map(dual_do_current, dual_do_expired),
    create_map(xref_res, to_rename),
)

In [63]:
# Add both maps to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2, mrg3], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map.tail(20)

Unnamed: 0,id,name,id_old
510,DOID:12568,dyscalculia,MESH:D060705
511,DOID:13938,amenorrhea,HP:0000141
512,DOID:13938,amenorrhea,MESH:D000568
513,DOID:227,ankylosis,HP:0031013
514,DOID:227,ankylosis,MESH:D000844
515,DOID:8943,lattice corneal dystrophy,HP:0001149
516,DOID:8943,lattice corneal dystrophy,MESH:C537935
518,DOID:0050336,hypophosphatemia,HP:0002148
520,DOID:0050557,congenital muscular dystrophy,HP:0003741
522,DOID:0050731,vitamin b12 deficiency,HP:0100502


#### Single DO

In [64]:
# Names with exactly one DOID: keep that node, merge the others into it.
sing_doid = num_do[num_do.eq(1)].index.tolist()
in_sing_doid = curr_conflict['name'].isin(sing_doid)
keep = curr_conflict[in_sing_doid & curr_conflict['do']]
drop = curr_conflict[in_sing_doid & ~curr_conflict['do']]

mrg2 = create_map(keep, drop)
mrg2

Unnamed: 0,id,name,id_old
0,DOID:50433,insomnia,MESH:D012892
1,DOID:50433,insomnia,HP:0100785
2,DOID:50840,torticollis,MESH:D014103
3,DOID:50840,torticollis,HP:0000473


In [65]:
# Add the single-DOID pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

#### No DO, but hpo

In [66]:
# Names without any DOID: keep the HPO node, merge the others into it.
no_doid = num_do[num_do.lt(1)].index.tolist()
in_no_doid = curr_conflict['name'].isin(no_doid)
keep = curr_conflict[in_no_doid & curr_conflict['hp']]
drop = curr_conflict[in_no_doid & ~curr_conflict['hp']]

mrg2 = create_map(keep, drop)
mrg2

Unnamed: 0,id,name,id_old
0,HP:0000243,trigonocephaly,MONDO:0000156
1,HP:0000243,trigonocephaly,MONDO:0013774
2,HP:0004401,meconium ileus,OMIM:614665
3,HP:0004401,meconium ileus,UMLS:C0546982


In [67]:
# Add the HPO-based pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

### 3 Disease

In [68]:
# Names carried by three Disease nodes.
this_names = comb_type[comb_type == ('Disease', 'Disease', 'Disease')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()

In [69]:
# Display the three-disease conflict rows.
curr_conflict

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,do,hp,curi
328161,MONDO:0014778,lamb-shaffer syndrome,Disease,MONDO:0014778|MONDO:0017782|OMIM:616803|UMLS:C...,,,,,False,False,MONDO
328346,MONDO:0017782,lamb-shaffer syndrome,Disease,MONDO:0014778|MONDO:0017782|OMIM:616803|UMLS:C...,,,,,False,False,MONDO
328883,OMIM:616803,lamb-shaffer syndrome,Disease,,,,,,False,False,OMIM


In [70]:
# Rank each node by the prefix preference defined in curi_order (lower is preferred).
curr_conflict = curr_conflict.assign(order=curr_conflict['curi'].map(curi_order))

In [71]:
# Keep the best-ranked node per name; every other node becomes a merge source.
ranked_conflict = curr_conflict.sort_values('order')
keep = ranked_conflict.drop_duplicates(subset=['name'], keep='first')
keep_ids = keep['id'].unique()

drop = curr_conflict[~curr_conflict['id'].isin(keep_ids)]

# Check that all names have been accounted for in the keep dataframe
assert keep['name'].nunique() == drop['name'].nunique()

In [72]:
# Pair the kept node with each dropped node of the same name and display the result.
mrg2 = create_map(keep, drop)
mrg2

Unnamed: 0,id,name,id_old
0,OMIM:616803,lamb-shaffer syndrome,MONDO:0014778
1,OMIM:616803,lamb-shaffer syndrome,MONDO:0017782


In [73]:
# Add the three-disease pairs to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

### 4 conflicts

In [74]:
# Label combinations among the names that have exactly four nodes.
quad_names = size[size.eq(4)].index.tolist()
comb_type.loc[quad_names].value_counts()

(Disease, Disease, Disease, Phenotype)        2
(Disease, Phenotype, Phenotype, Phenotype)    1
Name: label, dtype: int64

In [75]:
# One Disease + three Phenotype nodes: keep the DOID node, merge the rest into it.
this_names = comb_type[comb_type == ('Disease', 'Phenotype', 'Phenotype', 'Phenotype')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()

keep = curr_conflict[curr_conflict['do']]
drop = curr_conflict[~curr_conflict['do']]

mrg2 = create_map(keep, drop)
mrg2

Unnamed: 0,id,name,id_old
0,DOID:0050593,primary congenital glaucoma,HP:0008007
1,DOID:0050593,primary congenital glaucoma,OMIM:613085
2,DOID:0050593,primary congenital glaucoma,OMIM:613086


In [76]:
# Three Disease + one Phenotype nodes: sum of the DOID flag per name.
this_names = comb_type[comb_type == ('Disease', 'Disease', 'Disease', 'Phenotype')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()
curr_conflict['do'].groupby(curr_conflict['name']).sum()

name
muscle spasm    False
pain             True
Name: do, dtype: bool

In [77]:
# Same conflict set: sum of the HPO flag per name.
this_names = comb_type[comb_type == ('Disease', 'Disease', 'Disease', 'Phenotype')].index.tolist()
curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()
curr_conflict['hp'].groupby(curr_conflict['name']).sum()

name
muscle spasm    True
pain            True
Name: hp, dtype: bool

Pain will be a disease, and muscle spasm a phenotype.

In [78]:
# Prefer the DOID node; for names without one, fall back to the HPO node.
keep = curr_conflict[curr_conflict['do']]
keep_names = keep['name'].unique()

keep2 = curr_conflict[curr_conflict['hp'] & ~curr_conflict['name'].isin(keep_names)]
keep = pd.concat([keep, keep2], sort=False)

keep_ids = keep['id'].unique()

# Every node that was not selected above is merged into the selected one.
drop = curr_conflict[~curr_conflict['id'].isin(keep_ids)]

mrg3 = create_map(keep, drop)
mrg3

Unnamed: 0,id,name,id_old
0,DOID:60145,pain,MESH:D000758
1,DOID:60145,pain,MESH:D000759
2,DOID:60145,pain,HP:0012531
3,HP:0003394,muscle spasm,MESH:D007442
4,HP:0003394,muscle spasm,MESH:D009119
5,HP:0003394,muscle spasm,MESH:D014691


In [79]:
# Add both four-node maps to the running merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2, mrg3], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()
mrg_map

Unnamed: 0,id,name,id_old
0,DOID:0050052,rocky mountain spotted fever,DOID:50052
1,DOID:0050073,invasive aspergillosis,DOID:50073
2,DOID:0050147,otomycosis,DOID:50147
3,DOID:0050156,idiopathic pulmonary fibrosis,DOID:50156
4,DOID:0050169,cutaneous lupus erythematosus,DOID:50169
...,...,...,...
544,DOID:60145,pain,MESH:D000759
545,DOID:60145,pain,HP:0012531
546,HP:0003394,muscle spasm,MESH:D007442
547,HP:0003394,muscle spasm,MESH:D009119


### More than 4

In [80]:
# Names with more than four nodes: sum of the DOID flag per name.
this_names = size[size.gt(4)].index.tolist()

curr_conflict = pheno_dis[pheno_dis['name'].isin(this_names)].copy()
curr_conflict['do'].groupby(curr_conflict['name']).sum()

name
hyperinsulinemic hypoglycemia    True
prostate cancer                  True
Name: do, dtype: bool

In [81]:
# Keep the DOID node for each name and merge every non-DOID node into it.
keep = curr_conflict[curr_conflict['do']]
drop = curr_conflict[~curr_conflict['do']]

mrg2 = create_map(keep, drop)

In [82]:
# Add the last batch of pairs to the merge map, dropping exact duplicates.
mrg_map = pd.concat([mrg_map, mrg2], sort=False, ignore_index=True)
mrg_map = mrg_map.drop_duplicates()

In [83]:
# Summary: total distinct identifiers involved, and how many remain after merging.
n_kept_ids = mrg_map['id'].nunique()
n_merged_away_ids = mrg_map['id_old'].nunique()
'Merged {} Disease and Phenotype entities into {}.'.format(n_kept_ids + n_merged_away_ids, n_kept_ids)

'Merged 1093 Disease and Phenotype entities into 519.'

### Final checks before merge

In [84]:
old_ids = set(mrg_map['id_old'])
new_ids = set(mrg_map['id'])

overlap = old_ids & new_ids
len(overlap)

0

In [85]:
mrg_ids = list(old_ids | new_ids)

to_keep = nodes.query('id not in @mrg_ids')
to_mrg = nodes.query('id in @mrg_ids')

len(to_mrg)

1093

In [86]:
# Split the nodes involved in a merge into survivors (`new_ids`) and absorbed nodes (`old_ids`)
keep = to_mrg.query('id in @new_ids')
drop = to_mrg.query('id in @old_ids')

# Re-key every absorbed node onto the id that replaces it; its own id is kept as `id_old`
drop = drop.rename(columns={'id': 'id_old'}).merge(mrg_map[['id', 'id_old']], on='id_old', how='left')

# Record the absorbed id as an xref of the survivor. When the absorbed node had no
# xrefs the concatenation is NaN, so fall back to the old id on its own.
drop['xrefs'] = drop['id_old'] + '|' + drop['xrefs']
drop['xrefs'] = drop['xrefs'].fillna(drop['id_old'])
drop = drop.drop('id_old', axis=1)

# NOTE(review): presumably collapses rows sharing an `id` and joins the listed
# columns on '|' -- confirm against data_tools.df_processing
fixed_nodes = dfp.combine_group_cols_on_char(pd.concat([keep, drop], sort=False), ['id'], 
                                             [c for c in keep.columns if c not in ['id', 'name', 'label']], 
                                             sort=True, prog=False)

fixed_nodes['label'].value_counts()

Disease      438
Phenotype     81
Name: label, dtype: int64

In [87]:
nodes = pd.concat([to_keep, fixed_nodes], sort=False).sort_values(['label', 'id']).reset_index(drop=True)
nodes

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
0,UBERON:0000002,cervix,Anatomy,MESH:D002584|UBERON:0000002,,,,
1,UBERON:0000004,human nose,Anatomy,MESH:D009666|UBERON:0000004,,,,
2,UBERON:0000005,chemosensory organ,Anatomy,,,,,
3,UBERON:0000006,islet of langerhans,Anatomy,MESH:D007515|UBERON:0000006,,,,
4,UBERON:0000007,pituitary gland,Anatomy,MESH:D010902|UBERON:0000007,,,,
...,...,...,...,...,...,...,...,...
792407,WD:Q9094469,ssdna virus,Taxon,,,,,
792408,WD:Q9094478,positive-sense single-stranded rna virus,Taxon,,,,,
792409,WD:Q9285327,negative-sense single strand rna virus,Taxon,,,,,
792410,WD:Q934657,warble fly,Taxon,,,,,


In [88]:
edges = gt.re_id_edges(edges, mrg_map, old_id_col='id_old', new_id_col='id')
print('{:,}'.format(len(edges)))
edges = dfp.combine_group_cols_on_char(edges, ['start_id', 'type', 'end_id'], sort=True, prog=True)
print('{:,}'.format(len(edges)))

9,420,646


HBox(children=(FloatProgress(value=0.0, description='total_progress', max=6.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='dsrc_type', max=131808.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='comp_type', max=131808.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='p_val', max=131808.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='source', max=131808.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='license', max=131808.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='pmids', max=131808.0, style=ProgressStyle(description_wid…



9,281,977


# Issue 2: Xrefs to other nodes

X-refs in our network should mean 'concepts that have been subsumed by this node'. Therefore, an X-ref should never refer to the identifier of another node in the network.  

However, because some external identifier vocabularies can be more general than what's in our network, multiple nodes containing the same external xref are possible, as long as the xref's CURI is different from that of the primary identifier.

In [89]:
xref_map = dfp.expand_col_on_char(nodes, 'xrefs', '|', True)

In [90]:
xref_map = xref_map.query('id != xrefs')
xref_map

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
0,UBERON:0000002,cervix,Anatomy,MESH:D002584,,,,
2,UBERON:0000004,human nose,Anatomy,MESH:D009666,,,,
4,UBERON:0000006,islet of langerhans,Anatomy,MESH:D007515,,,,
6,UBERON:0000007,pituitary gland,Anatomy,MESH:D010902,,,,
9,UBERON:0000009,submucosa,Anatomy,UBERON:0004925,,,,
...,...,...,...,...,...,...,...,...
1542642,NCBITaxon:1960534,influenza b virus,Taxon,NCBITaxon:682049,,,,
1546362,NCBITaxon:407754,influenza a virus,Taxon,NCBITaxon:11320,,,,
1548828,NCBITaxon:682049,influenza b virus,Taxon,NCBITaxon:11520,,,,
1548829,NCBITaxon:682049,influenza b virus,Taxon,NCBITaxon:1600125,,,,


## Potential problem a: duplicated xrefs from the same CURI

In [91]:
node_ids = edges[['start_id', 'end_id']].stack().unique()

In [92]:
# Rows whose xref value appears on more than one row of `xref_map`
duped_xref = xref_map[xref_map['xrefs'].duplicated(keep=False)].sort_values('xrefs').copy()
# CURI = the prefix in front of the first ':' of an identifier
duped_xref['id_curi'] = duped_xref['id'].apply(lambda s: s.split(':')[0])
duped_xref['xref_curi'] = duped_xref['xrefs'].apply(lambda s: s.split(':')[0])

# Duplicated xrefs from the node's own vocabulary, restricted to nodes that appear in an edge
duped_xref.query('id_curi == xref_curi and id in @node_ids')['label'].value_counts()

Protein     2033
Disease       21
Taxon         12
Compound       1
Name: label, dtype: int64

In [93]:
combo = gt.combine_nodes_and_edges(nodes, edges)

def qq(start_id, end_id=None, cols=None):
    """
    Quick look-up of edges in the module-level `combo` frame.

    :param start_id: identifier to search for.
    :param end_id: optional second identifier. When given, only edges running
        from `start_id` to `end_id` are returned; otherwise every edge that has
        `start_id` at either end is returned.
    :param cols: columns to return; a standard descriptive subset when omitted.
    :return: the matching rows of `combo`, restricted to `cols`.
    """
    default_cols = ['start_id', 'end_id', 'start_name', 'type', 'end_name',
                    'start_label', 'end_label', 'source', 'dsrc_type']
    selected = default_cols if cols is None else cols

    # No partner given: match the identifier on either side of the edge
    if end_id is None:
        return combo.query('start_id == @start_id or end_id == @start_id')[selected]

    return combo.query('start_id == @start_id and end_id == @end_id')[selected]

#### Compounds

We did a good job of wrangling compounds in notebooks 7... so there shouldn't be any issue here...

In [94]:
duped_xref.query('id_curi == xref_curi and label == "Compound"')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi
173431,CHEBI:48432,ile(5)-angiotensin ii,Compound,CHEBI:2719,,,,,CHEBI,CHEBI


In [95]:
duped_xref.query('xrefs == "CHEBI:2719"')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi
173431,CHEBI:48432,ile(5)-angiotensin ii,Compound,CHEBI:2719,,,,,CHEBI,CHEBI
1221996,NCBIGene:183,agt,Gene,CHEBI:2719,,,,,NCBIGene,CHEBI


Appears to be a gene and chemical version of the same protein substance...

#### Diseases

We've done a lot of work on diseases; these are small, but bigger than we'd like.

In [96]:
duped_xref.query('id_curi == xref_curi and label == "Disease" and id in @node_ids')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi
820763,DOID:0050785,progressive relapsing multiple sclerosis,Disease,DOID:0050783,,PRMS|Progressive-relapsing MS,,,DOID,DOID
820753,DOID:0050784,primary progressive multiple sclerosis,Disease,DOID:0050783,,PPMS|Primary-progressive MS,,,DOID,DOID
820764,DOID:0050785,progressive relapsing multiple sclerosis,Disease,DOID:0050784,,PRMS|Progressive-relapsing MS,,,DOID,DOID
820743,DOID:0050783,secondary progressive multiple sclerosis,Disease,DOID:0050784,,SPMS|Secondary-progressive MS,,,DOID,DOID
820755,DOID:0050784,primary progressive multiple sclerosis,Disease,DOID:0050785,,PPMS|Primary-progressive MS,,,DOID,DOID
820744,DOID:0050783,secondary progressive multiple sclerosis,Disease,DOID:0050785,,SPMS|Secondary-progressive MS,,,DOID,DOID
826703,DOID:0060859,salmonellosis,Disease,DOID:11092,,Salmonella infection,,,DOID,DOID
849121,DOID:12785,diabetic polyneuropathy,Disease,DOID:11503,,Diabetes mellitus with polyneuropathy|Diabetic...,,,DOID,DOID
845078,DOID:11503,diabetic autonomic neuropathy,Disease,DOID:12785,,Diabetic autonomic neuropathy,,NCIthesaurus,DOID,DOID
826704,DOID:0060859,salmonellosis,Disease,DOID:1466,,Salmonella infection,,,DOID,DOID


In [97]:
duped_xref.query('id_curi == xref_curi and label == "Disease" and id in @node_ids and id not in @current_doids')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi


All these seem fine... mostly mis-mappings of the xref, or mapping to obsolete terms... I'm going to ignore it

### Taxon

In [98]:
duped_xref.query('id_curi == xref_curi and label == "Taxon" and id in @node_ids')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi
1541035,NCBITaxon:1600125,influenza b virus,Taxon,NCBITaxon:11520,,,,,NCBITaxon,NCBITaxon
1542639,NCBITaxon:1960534,influenza b virus,Taxon,NCBITaxon:11520,,,,,NCBITaxon,NCBITaxon
1548828,NCBITaxon:682049,influenza b virus,Taxon,NCBITaxon:11520,,,,,NCBITaxon,NCBITaxon
1542640,NCBITaxon:1960534,influenza b virus,Taxon,NCBITaxon:1600125,,,,,NCBITaxon,NCBITaxon
1538751,NCBITaxon:11520,influenza b virus,Taxon,NCBITaxon:1600125,,,,,NCBITaxon,NCBITaxon
1548829,NCBITaxon:682049,influenza b virus,Taxon,NCBITaxon:1600125,,,,,NCBITaxon,NCBITaxon
1538752,NCBITaxon:11520,influenza b virus,Taxon,NCBITaxon:1960534,,,,,NCBITaxon,NCBITaxon
1541037,NCBITaxon:1600125,influenza b virus,Taxon,NCBITaxon:1960534,,,,,NCBITaxon,NCBITaxon
1548830,NCBITaxon:682049,influenza b virus,Taxon,NCBITaxon:1960534,,,,,NCBITaxon,NCBITaxon
1542642,NCBITaxon:1960534,influenza b virus,Taxon,NCBITaxon:682049,,,,,NCBITaxon,NCBITaxon


Some of these are a problem and highlight our second, more important issue... Xrefs that map to other node items...

In this case it's all one taxon, influenza b virus. So we'll go with the canonical id for that, `11520`.

In [99]:
keep_id = 'NCBITaxon:11520'

# Every identifier (primary id or xref) involved in a same-CURI Taxon duplication ...
taxon_dupes = duped_xref.query('id_curi == xref_curi and label == "Taxon" and id in @node_ids')
involved_ids = taxon_dupes[['id', 'xrefs']].stack().unique()
# ... except the one we keep, is folded into `keep_id`
subsumed_ids = list(filter(lambda sid: sid != keep_id, involved_ids))

new_mrg_map = pd.DataFrame({'id': [keep_id] * len(subsumed_ids), 'id_old': subsumed_ids})
new_mrg_map

Unnamed: 0,id,id_old
0,NCBITaxon:11520,NCBITaxon:1600125
1,NCBITaxon:11520,NCBITaxon:1960534
2,NCBITaxon:11520,NCBITaxon:682049


In [100]:
duped_xref.query('id_curi == xref_curi and label == "Protein" and id in @node_ids').sort_values('xrefs')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets,id_curi,xref_curi
1438130,UniProt:A0A068W9M3,egrg_000749900.3,Protein,UniProt:A0A068W6U3,,,,,UniProt,UniProt
1437684,UniProt:A0A068W7A3,egrg_000749900.1,Protein,UniProt:A0A068W6U3,,,,,UniProt,UniProt
1438131,UniProt:A0A068W9M3,egrg_000749900.3,Protein,UniProt:A0A068W7A3,,,,,UniProt,UniProt
1437595,UniProt:A0A068W6U3,egrg_000749900.2,Protein,UniProt:A0A068W7A3,,,,,UniProt,UniProt
1437596,UniProt:A0A068W6U3,egrg_000749900.2,Protein,UniProt:A0A068W9M3,,,,,UniProt,UniProt
...,...,...,...,...,...,...,...,...,...,...
1537861,UniProt:V9PWX7,smp_089320.1,Protein,UniProt:V9PWX8,,,,,UniProt,UniProt
1518046,UniProt:Q6TMX9,nicotinic acetylcholine receptor,Protein,UniProt:V9TII8,,,,,UniProt,UniProt
1474677,UniProt:G4VN55,nicotinic acetylcholine receptor,Protein,UniProt:V9TII8,,,,,UniProt,UniProt
1474674,UniProt:G4VN54,nicotinic acetylcholine receptor non alpha,Protein,UniProt:V9TMX9,,,,,UniProt,UniProt


Too many proteins to handle right now, and most of them also fall under Issue B below...

## B: Nodes with xrefs to other nodes

In [101]:
# Self-join of `xref_map`: the left side is one row per node id (suffix `_id`), the right
# side is every (node, xref) row (suffix `_xref`), matched where the right-hand xref equals
# the left-hand id. Each resulting row therefore reads: node `id_xref` lists `id_id`,
# the primary identifier of another node, among its xrefs.
xref_mrg = pd.merge(xref_map.drop_duplicates(subset=['id']).drop('xrefs', axis=1), xref_map, left_on=['id'], right_on=['xrefs'], how='inner', suffixes=['_id', '_xref'])

In [102]:
xref_mrg['id_curi'] = xref_mrg['id_id'].apply(lambda s: s.split(':')[0])
xref_mrg['xref_curi'] = xref_mrg['id_xref'].apply(lambda s: s.split(':')[0])

In [103]:
xref_mrg['label_id'].value_counts()

Protein      5451
Disease       116
Anatomy        56
Phenotype      18
Taxon          14
Gene           10
Compound        2
Name: label_id, dtype: int64

In [104]:
xref_mrg.query('label_id == "Disease"')

Unnamed: 0,id_id,name_id,label_id,source_id,synonyms_id,alt_ids_id,subsets_id,id_xref,name_xref,label_xref,xrefs,source_xref,synonyms_xref,alt_ids_xref,subsets_xref,id_curi,xref_curi
58,DOID:0040098,pemphigus gestationis,Disease,,,,DO_IEDB_slim,DOID:14482,pemphigoid gestationis,Disease,DOID:0040098,,Gestational herpes|herpes gestationis,,NCIthesaurus,DOID,DOID
59,DOID:0050047,flinders island spotted fever,Disease,,Thai tick typhus,DOID:0050048,gram-negative_bacterial_infectious_disease|tic...,DOID:0050053,obsolete rickettsia honei spotted fever,Disease,DOID:0050047,,,,gram-negative_bacterial_infectious_disease|tic...,DOID,DOID
60,DOID:0050053,obsolete rickettsia honei spotted fever,Disease,,,,gram-negative_bacterial_infectious_disease|tic...,DOID:0050047,flinders island spotted fever,Disease,DOID:0050053,,Thai tick typhus,DOID:0050048,gram-negative_bacterial_infectious_disease|tic...,DOID,DOID
61,DOID:0050119,obsolete west nile virus neurological syndrome,Disease,,WNND|WNNS|West nile neuroinvasive disease,,zoonotic_infectious_disease,DOID:2365,west nile encephalitis,Disease,DOID:0050119,,West Nile Fever with encephalitis|West Nile fe...,,zoonotic_infectious_disease,DOID,DOID
62,DOID:0050134,cutaneous mycosis,Disease,,,,,DOID:1563,dermatomycosis,Disease,DOID:0050134,,,,,DOID,DOID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,MESH:D002828,choristoma,Disease,,,,,HP:0002282,gray matter heterotopia,Phenotype,MESH:D002828,,,,,MESH,HP
170,MESH:D009410,nerve degeneration,Disease,,,,,DOID:1289,neurodegenerative disease,Disease,MESH:D009410,,Neurodegenerative disease|degenerative disease,DOID:4874,DO_FlyBase_slim|NCIthesaurus,MESH,DOID
171,MESH:D058267,pulicosis,Disease,,,,,DOID:0050266,tungiasis,Disease,MESH:D058267,,,,,MESH,DOID
172,MESH:D060050,stable angina,Disease,,,,,DOID:60164,pain relief,Disease,MESH:D060050,,,,,MESH,DOID


In [105]:
xref_mrg.query('label_id == "Disease"')[['id_curi', 'xref_curi']].apply(tuple, axis=1).value_counts()

(DOID, DOID)    110
(MESH, DOID)      4
(UMLS, HP)        1
(MESH, HP)        1
dtype: int64

DOID, DOID should just remain and have the offending xref removed....

In [106]:
def remove_xref(xref_str, xref_to_remove, sep_char='|'):
    """
    Removes a single xref from a string of xrefs separated by `sep_char`.

    The string is split on `sep_char` and only entries exactly equal to
    `xref_to_remove` are dropped. An identifier that merely contains the removed
    one as a substring (e.g. 'DOID:1234' when removing 'DOID:123') is left intact.

    For example 'DOID:0040098|DOID:0050053|DOID:0050119' separated by `sep_char`='|'
    with xref_to_remove='DOID:0050053' becomes:
      'DOID:0040098|DOID:0050119'

    :param xref_str: str, the `sep_char`-joined xrefs. Anything that is not a
        string (e.g. NaN for a node without xrefs) is treated as "no xrefs".
    :param xref_to_remove: str, the xref to take out.
    :param sep_char: str, the separator between xrefs.
    :return: str with the remaining xrefs joined by `sep_char`, or float NaN
        when nothing remains.
    """
    # Missing value: there is nothing to remove from
    if not isinstance(xref_str, str):
        return float('nan')

    # Compare whole entries; empty entries (doubled or dangling separators) are discarded
    remaining = [x for x in xref_str.split(sep_char) if x and x != xref_to_remove]

    # only xref
    if not remaining:
        return float('nan')

    return sep_char.join(remaining)
    

In [107]:
combo_type = xref_mrg.query('label_id == "Disease"')[['id_curi', 'xref_curi']].apply(tuple, axis=1)
dual_do_idx = combo_type[combo_type == ('DOID', 'DOID')].index

xref_mrg.loc[dual_do_idx, 'name_id'].str.startswith('obsolete').sum()

48

A lot are obsolete, so we'll fix those and merge them...

In [108]:
obsolete_idx = xref_mrg.loc[dual_do_idx][xref_mrg.loc[dual_do_idx, 'name_id'].str.startswith('obsolete')].index

overlap = set(xref_mrg.loc[obsolete_idx]['id_id']) & set(xref_mrg.loc[obsolete_idx]['id_xref'])

In [109]:
lens = {}
for oid in overlap:
    lens[oid] = len(qq(oid))

In [110]:
lens

{'DOID:284': 0,
 'DOID:2779': 0,
 'DOID:0050341': 3518,
 'DOID:5000': 0,
 'DOID:1709': 42,
 'DOID:13469': 0,
 'DOID:12647': 2,
 'DOID:10059': 0,
 'DOID:1466': 0,
 'DOID:2778': 1,
 'DOID:11099': 0,
 'DOID:2313': 0,
 'DOID:9940': 0,
 'DOID:10111': 0,
 'DOID:122': 0,
 'DOID:11092': 0}

Most are unpopulated... I just don't think it's worth doing anything with these...

In [111]:
lens = {}
for oid in xref_mrg.loc[obsolete_idx]['id_id']:
    if oid not in overlap:
        lens[oid] = len(qq(oid))

In [112]:
lens

{'DOID:0050053': 0,
 'DOID:0050119': 0,
 'DOID:0050287': 0,
 'DOID:0050309': 0,
 'DOID:0050329': 0,
 'DOID:0050399': 0,
 'DOID:0060052': 0,
 'DOID:0060120': 0,
 'DOID:0080013': 0,
 'DOID:12052': 0,
 'DOID:12255': 0,
 'DOID:12269': 0,
 'DOID:1550': 0,
 'DOID:157': 0,
 'DOID:2370': 0,
 'DOID:2376': 0,
 'DOID:2947': 0,
 'DOID:3166': 31,
 'DOID:3680': 0,
 'DOID:4121': 0,
 'DOID:5928': 0,
 'DOID:8524': 0,
 'DOID:9195': 0,
 'DOID:9678': 0,
 'DOID:9829': 0,
 'DOID:9915': 0,
 'DOID:9929': 0}

In [113]:
nonzero = [k for k, v in lens.items() if v > 0]

lens1 = {}
for oid in xref_mrg.loc[obsolete_idx].query('id_id in @nonzero')['id_xref']:
    if oid not in overlap:
        lens1[oid] = len(qq(oid))
lens1

{'DOID:0060888': 4}

In [114]:
nodes.query('id == "DOID:0060888"')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
314941,DOID:0060888,transient myeloproliferative syndrome,Disease,DOID:0060888|DOID:3166|GARD:12765|ICD10CM:D47....,,MST|TAM|transient abnormal myelopoiesis|transi...,,


In [115]:
nodes.query('id == "DOID:3166"')

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
321132,DOID:3166,obsolete leukemoid reaction,Disease,DOID:0060888|DOID:3166|ICD10CM:D72.823|ICD9CM:...,,,,NCIthesaurus


Obsolete has this NCIthesaurus tag.... I just don't think it's worth touching these...

In [116]:
mesh_do_idx = combo_type[combo_type == ('MESH', 'DOID')].index
xref_mrg.loc[mesh_do_idx]

Unnamed: 0,id_id,name_id,label_id,source_id,synonyms_id,alt_ids_id,subsets_id,id_xref,name_xref,label_xref,xrefs,source_xref,synonyms_xref,alt_ids_xref,subsets_xref,id_curi,xref_curi
168,MESH:C537436,aromatase deficiency,Disease,,,,,DOID:0090122,aromatase excess syndrome,Disease,MESH:C537436,,AEXS|familial hyperestrogenism|hereditary prep...,,,MESH,DOID
170,MESH:D009410,nerve degeneration,Disease,,,,,DOID:1289,neurodegenerative disease,Disease,MESH:D009410,,Neurodegenerative disease|degenerative disease,DOID:4874,DO_FlyBase_slim|NCIthesaurus,MESH,DOID
171,MESH:D058267,pulicosis,Disease,,,,,DOID:0050266,tungiasis,Disease,MESH:D058267,,,,,MESH,DOID
172,MESH:D060050,stable angina,Disease,,,,,DOID:60164,pain relief,Disease,MESH:D060050,,,,,MESH,DOID


In [117]:
to_mrg = xref_mrg.loc[mesh_do_idx, ['id_id', 'id_xref']].rename(columns={'id_id': 'id_old', 'id_xref': 'id'})
to_mrg

Unnamed: 0,id_old,id
168,MESH:C537436,DOID:0090122
170,MESH:D009410,DOID:1289
171,MESH:D058267,DOID:0050266
172,MESH:D060050,DOID:60164


In [118]:
new_mrg_map = pd.concat([new_mrg_map, to_mrg], sort=False, ignore_index=True)
new_mrg_map

Unnamed: 0,id,id_old
0,NCBITaxon:11520,NCBITaxon:1600125
1,NCBITaxon:11520,NCBITaxon:1960534
2,NCBITaxon:11520,NCBITaxon:682049
3,DOID:0090122,MESH:C537436
4,DOID:1289,MESH:D009410
5,DOID:0050266,MESH:D058267
6,DOID:60164,MESH:D060050


In [119]:
mesh_hp_idx = combo_type[combo_type == ('MESH', 'HP')].index
xref_mrg.loc[mesh_hp_idx]

Unnamed: 0,id_id,name_id,label_id,source_id,synonyms_id,alt_ids_id,subsets_id,id_xref,name_xref,label_xref,xrefs,source_xref,synonyms_xref,alt_ids_xref,subsets_xref,id_curi,xref_curi
169,MESH:D002828,choristoma,Disease,,,,,HP:0002282,gray matter heterotopia,Phenotype,MESH:D002828,,,,,MESH,HP


In [120]:
to_mrg = xref_mrg.loc[mesh_hp_idx, ['id_id', 'id_xref']].rename(columns={'id_id': 'id_old', 'id_xref': 'id'})
new_mrg_map = pd.concat([new_mrg_map, to_mrg], sort=False, ignore_index=True)
new_mrg_map

Unnamed: 0,id,id_old
0,NCBITaxon:11520,NCBITaxon:1600125
1,NCBITaxon:11520,NCBITaxon:1960534
2,NCBITaxon:11520,NCBITaxon:682049
3,DOID:0090122,MESH:C537436
4,DOID:1289,MESH:D009410
5,DOID:0050266,MESH:D058267
6,DOID:60164,MESH:D060050
7,HP:0002282,MESH:D002828


## Anatomy

In [121]:
anat_issue = xref_mrg.query('label_id == "Anatomy"').copy()
anat_issue

Unnamed: 0,id_id,name_id,label_id,source_id,synonyms_id,alt_ids_id,subsets_id,id_xref,name_xref,label_xref,xrefs,source_xref,synonyms_xref,alt_ids_xref,subsets_xref,id_curi,xref_curi
0,UBERON:0000009,submucosa,Anatomy,,,,,UBERON:0004925,submucosa,Anatomy,UBERON:0000009,,,,,UBERON,UBERON
1,UBERON:0000037,ovarian follicle,Anatomy,,,,,UBERON:0001305,ovarian follicle,Anatomy,UBERON:0000037,,,,,UBERON,UBERON
2,UBERON:0000065,respiratory tract,Anatomy,,,,,UBERON:0001005,respiratory tract,Anatomy,UBERON:0000065,,,,,UBERON,UBERON
3,UBERON:0000082,kidney,Anatomy,,,,,UBERON:0002113,kidney,Anatomy,UBERON:0000082,,,,,UBERON,UBERON
4,UBERON:0000093,sulcus,Anatomy,,,,,UBERON:0014764,sulcus,Anatomy,UBERON:0000093,,,,,UBERON,UBERON
5,UBERON:0000396,vallate papilla,Anatomy,,,,,UBERON:0014389,gustatory papilla,Anatomy,UBERON:0000396,,,,,UBERON,UBERON
6,UBERON:0000915,thoracic segment of trunk,Anatomy,,,,,UBERON:0001443,thorax,Anatomy,UBERON:0000915,,,,,UBERON,UBERON
7,UBERON:0000935,anterior commissure,Anatomy,,,,,UBERON:0002694,anterior commissure,Anatomy,UBERON:0000935,,,,,UBERON,UBERON
8,UBERON:0001005,respiratory tract,Anatomy,,,,,UBERON:0000065,respiratory tract,Anatomy,UBERON:0001005,,,,,UBERON,UBERON
9,UBERON:0001015,muscle,Anatomy,,,,,UBERON:0001630,muscle,Anatomy,UBERON:0001015,,,,,UBERON,UBERON


In [122]:
import obonet

In [123]:
# NOTE(review): read straight from the live PURL, so this cell needs network access and
# the ontology content is whatever is published at run time -- consider a pinned local copy
uber = obonet.read_obo('http://purl.obolibrary.org/obo/uberon/basic.obo')

In [124]:
uber_to_name = {id_: data.get('name') for id_, data in uber.nodes(data=True)}

In [125]:
anat_issue['uber_id_name'] = anat_issue['id_id'].map(uber_to_name)
anat_issue['uber_xref_name'] = anat_issue['id_xref'].map(uber_to_name)

In [126]:
len(anat_issue)

56

In [127]:
to_mrg = anat_issue[anat_issue['name_id'] == anat_issue['uber_id_name']][['id_id', 'id_xref']].rename(columns={'id_id': 'id', 'id_xref': 'id_old'})
to_mrg

Unnamed: 0,id,id_old
0,UBERON:0000009,UBERON:0004925
2,UBERON:0000065,UBERON:0001005
4,UBERON:0000093,UBERON:0014764
5,UBERON:0000396,UBERON:0014389
6,UBERON:0000915,UBERON:0001443
7,UBERON:0000935,UBERON:0002694
13,UBERON:0001305,UBERON:0000037
18,UBERON:0001640,UBERON:0006637
19,UBERON:0001649,UBERON:0006090
21,UBERON:0001982,UBERON:2005259


In [128]:
new_mrg_map = pd.concat([new_mrg_map, to_mrg], sort=False, ignore_index=True)

In [129]:
mrg_ids = to_mrg[['id', 'id_old']].stack().unique()

In [130]:
anat_issue.query('id_id not in @mrg_ids and id_xref not in @mrg_ids')[['id_id', 'name_id', 'uber_id_name', 'id_xref', 'name_xref', 'uber_xref_name']]

Unnamed: 0,id_id,name_id,uber_id_name,id_xref,name_xref,uber_xref_name
9,UBERON:0001015,muscle,musculature,UBERON:0001630,muscle,muscle organ
10,UBERON:0001020,nerve,nervous system commissure,UBERON:0001021,peripheral nerve,nerve
11,UBERON:0001021,peripheral nerve,nerve,UBERON:0001020,nerve,nervous system commissure
14,UBERON:0001436,phalanx,phalanx of manus,UBERON:0001449,phalanx,phalanx of pes
16,UBERON:0001449,phalanx,phalanx of pes,UBERON:0001436,phalanx,phalanx of manus
17,UBERON:0001630,muscle,muscle organ,UBERON:0001015,muscle,musculature
20,UBERON:0001758,tooth supporting structure,periodontium,UBERON:0008266,tooth supporting structure,periodontal ligament
24,UBERON:0002360,meninges,meninx,UBERON:0010743,meninges,meningeal cluster
28,UBERON:0002727,medullary laminae of thalamus,medial medullary lamina of globus pallidus,UBERON:0002765,medullary laminae of thalamus,lateral medullary lamina of globus pallidus
29,UBERON:0002765,medullary laminae of thalamus,lateral medullary lamina of globus pallidus,UBERON:0002727,medullary laminae of thalamus,medial medullary lamina of globus pallidus


## Proteins

These are a mess, we will just have to blanket fix them... 

If it's xref'd to a gene, we can keep it, but if it's xref'd to another protein, we will just pick one and drop the remainder.

In [131]:
prot_issue = xref_mrg.query('label_id == "Protein"').copy()
prot_issue

Unnamed: 0,id_id,name_id,label_id,source_id,synonyms_id,alt_ids_id,subsets_id,id_xref,name_xref,label_xref,xrefs,source_xref,synonyms_xref,alt_ids_xref,subsets_xref,id_curi,xref_curi
202,UniProt:A0A068W6U3,egrg_000749900.2,Protein,,,,,UniProt:A0A068W7A3,egrg_000749900.1,Protein,UniProt:A0A068W6U3,,,,,UniProt,UniProt
203,UniProt:A0A068W6U3,egrg_000749900.2,Protein,,,,,UniProt:A0A068W9M3,egrg_000749900.3,Protein,UniProt:A0A068W6U3,,,,,UniProt,UniProt
204,UniProt:A0A068W7A3,egrg_000749900.1,Protein,,,,,UniProt:A0A068W6U3,egrg_000749900.2,Protein,UniProt:A0A068W7A3,,,,,UniProt,UniProt
205,UniProt:A0A068W7A3,egrg_000749900.1,Protein,,,,,UniProt:A0A068W9M3,egrg_000749900.3,Protein,UniProt:A0A068W7A3,,,,,UniProt,UniProt
206,UniProt:A0A068W7P1,egrg_000776300.2,Protein,,,,,UniProt:A0A068WAF9,egrg_000776300.1,Protein,UniProt:A0A068W7P1,,,,,UniProt,UniProt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5648,UniProt:V9TJE9,nicotinic acetylcholine receptor,Protein,,,,,UniProt:G4VE52,nicotinic acetylcholine receptor,Protein,UniProt:V9TJE9,,,,,UniProt,UniProt
5649,UniProt:V9TKF5,nicotinic acetylcholine receptor,Protein,,,,,UniProt:G4VRZ9,nicotinic acetylcholine receptor,Protein,UniProt:V9TKF5,,,,,UniProt,UniProt
5650,UniProt:V9TMX9,nicotinic acetylcholine receptor non alpha,Protein,,,,,UniProt:G4VN54,nicotinic acetylcholine receptor non alpha,Protein,UniProt:V9TMX9,,,,,UniProt,UniProt
5651,UniProt:V9TMX9,nicotinic acetylcholine receptor non alpha,Protein,,,,,UniProt:Q6TMX8,nicotinic acetylcholine receptor non alpha,Protein,UniProt:V9TMX9,,,,,UniProt,UniProt


In [132]:
from queue import Queue
from tqdm import tqdm

def get_subnets(input_adj_list):
    """
    Groups the identifiers of an adjacency list into connected sub-networks.

    A breadth-first traversal is started from every identifier that has not been
    reached by an earlier traversal. The identifier a traversal starts from
    becomes the key of that sub-network.

    :param input_adj_list: dict, identifier -> collection of neighbouring identifiers.
    :return: defaultdict(set), start identifier -> set of every identifier seen as a
        neighbour during its traversal. The start identifier itself is only a
        member when some reached node lists it as a neighbour, which is always the
        case when the adjacency list is symmetric.
    """
    # Imported here so the function does not rely on a later cell having been run
    from collections import defaultdict, deque

    subnets = defaultdict(set)
    visited = set()

    # Snapshot the keys up front so the loop is independent of the mapping object
    for net_id in tqdm(list(input_adj_list.keys())):
        if net_id in visited:
            continue

        visited.add(net_id)
        pending = deque([net_id])

        while pending:
            cur = pending.popleft()

            # `.get` tolerates neighbours that have no entry of their own
            for neighbour in input_adj_list.get(cur, ()):
                subnets[net_id].add(neighbour)
                # mark on enqueue so a node is never queued twice
                if neighbour not in visited:
                    visited.add(neighbour)
                    pending.append(neighbour)

    return subnets

In [133]:
from collections import defaultdict

# Adjacency list over the protein pairs where both identifiers are UniProt
adj_list = defaultdict(set)

for row in prot_issue.query('id_curi == "UniProt" and xref_curi == "UniProt"').itertuples():
    # Add both directions so the relation is symmetric
    adj_list[row.id_xref].add(row.id_id)
    adj_list[row.id_id].add(row.id_xref)

In [134]:
subnets = get_subnets(adj_list)

100%|██████████| 2512/2512 [00:00<00:00, 89242.02it/s]


In [135]:
# One frame per sub-network: every member (`id_old`) is mapped onto the
# sub-network's key (`id`); the frames are then stacked into a single merge map
to_mrg = pd.concat([
    pd.DataFrame({'id': [root] * len(members), 'id_old': list(members)})
    for root, members in subnets.items()
])
to_mrg

Unnamed: 0,id,id_old
0,UniProt:A0A068W7A3,UniProt:A0A068W7A3
1,UniProt:A0A068W7A3,UniProt:A0A068W9M3
2,UniProt:A0A068W7A3,UniProt:A0A068W6U3
0,UniProt:A0A068WAF9,UniProt:A0A068WAF9
1,UniProt:A0A068WAF9,UniProt:A0A068W7P1
...,...,...
4,UniProt:Q70W85,UniProt:Q9B8X8
0,UniProt:Q6WVP6,UniProt:Q6WVP6
1,UniProt:Q6WVP6,UniProt:Q6TA39
0,UniProt:Q94759,UniProt:Q94759


In [136]:
to_mrg['id'].nunique()

1104

In [137]:
new_mrg_map = pd.concat([new_mrg_map, to_mrg], sort=False, ignore_index=True)

## Cleanup and merge

In [138]:
mrg_ids = new_mrg_map.stack().unique()

In [139]:
len(mrg_ids)

2556

In [140]:
len(mrg_ids) - len(new_mrg_map['id'].unique())

1431

In [141]:
not_merged = xref_mrg.query('id_id not in @mrg_ids')
len(not_merged)

1770

In [142]:
# Remove offending xrefs from the nodes we don't merge.
#
# Each row of `xref_mrg` (and so of `not_merged`) records that node `id_xref` lists
# `id_id` -- the primary identifier of another node -- among its xrefs. The string to
# edit is therefore the xrefs of node `id_xref`, and the entry to drop is `id_id`.
# `remove_xref` takes (xref_str, xref_to_remove), in that order.

# Index label of the first row for every node id, built once rather than queried per row
first_rows = nodes[~nodes['id'].duplicated()]
node_idx = dict(zip(first_rows['id'], first_rows.index))

for row in tqdm(not_merged.itertuples(), total=len(not_merged)):
    idx = node_idx.get(row.id_xref)
    if idx is None:
        continue

    cur_xrefs = nodes.loc[idx, 'xrefs']
    # The same node can come up in several rows; once its last xref has been
    # removed the value is NaN and there is nothing left to edit
    if isinstance(cur_xrefs, str):
        nodes.loc[idx, 'xrefs'] = remove_xref(cur_xrefs, row.id_id)

100%|██████████| 1770/1770 [00:37<00:00, 47.14it/s]


In [143]:
to_fix = nodes.query('id in @mrg_ids').copy()
to_keep = nodes.query('id not in @mrg_ids')

In [144]:
new_mrg_map.nunique()

id        1125
id_old    2535
dtype: int64

In [145]:
# make sure many to 1 and not many to many
len(new_mrg_map.set_index('id_old')['id'].to_dict()) == len(new_mrg_map)

True

In [146]:
# old id -> surviving id
new_mrg_map_dict = new_mrg_map.set_index('id_old')['id'].to_dict()

# Re-key absorbed nodes; ids without an entry in the map keep their own id
to_fix['id'] = to_fix['id'].map(new_mrg_map_dict).fillna(to_fix['id'])

# NOTE(review): presumably collapses the rows that now share an `id`, joining the
# listed columns on '|' -- confirm against data_tools.df_processing
to_fix = dfp.combine_group_cols_on_char(to_fix, ['id'], [c for c in to_fix.columns if c not in ['id', 'name', 'label']],
                                        sort=True, prog=False)

In [147]:
nodes = pd.concat([to_keep, to_fix], sort=False).sort_values(['label', 'id']).reset_index(drop=True)

In [148]:
edges = gt.re_id_edges(edges, new_mrg_map, 'id_old', 'id')

In [149]:
print('{:,}'.format(len(edges)))
edges = dfp.combine_group_cols_on_char(edges, ['start_id', 'type', 'end_id'], sort=True, prog=True)
print('{:,}'.format(len(edges)))

9,281,977


HBox(children=(FloatProgress(value=0.0, description='total_progress', max=5.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='p_val', max=15853.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='adj_p', max=15853.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='source', max=15853.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='license', max=15853.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='pmids', max=15853.0, style=ProgressStyle(description_widt…



9,263,585


In [150]:
nodes_ids = nodes['id'].unique()

In [151]:
edge_ids = edges[['start_id', 'end_id']].stack().unique()

In [152]:
# Every identifier used by an edge must exist in the final node table, so compare
# against `nodes_ids` (taken from `nodes` above). `node_ids` is a different variable:
# it was built from an earlier version of the edges, not from the nodes.
set(edge_ids) - set(nodes_ids) == set()

True

In [153]:
# Since we have the obo, lets get the correct uberon names...
nodes['name'] = nodes['id'].map(uber_to_name).fillna(nodes['name'])

In [154]:
nodes.count()

id          790981
name        790966
label       790981
xrefs       557235
source       14322
synonyms      9695
alt_ids       2469
subsets       6864
dtype: int64

In [155]:
noname_idx = nodes[nodes['name'].isnull()].index

nodes.loc[noname_idx]

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
331008,CHEBI:144367,,Drug,3po|CHEBI:144367|IKEY:UJJUKZPBUMCSJZ-BQYQJAHWS...,informa,,,
334432,IKEY:AKLMFDDQCHURPW-ISIOAQNYSA-N,,Drug,1pyrrolidineacetamiden5ar6as7s10as9aminocarbon...,gvk|integrity,,,
335068,IKEY:CJGOZEVWXQGMCS-UHFFFAOYSA-N,,Drug,IKEY:CJGOZEVWXQGMCS-UHFFFAOYSA-N|IKEY:YCWROMXN...,gvk|informa|integrity,,,
335190,IKEY:CSOBIBXVIYAXFM-UHFFFAOYSA-N,,Drug,2910dimethoxy4oxo2246trimethylphenylimino67dih...,gvk|informa|integrity,,,
335220,IKEY:CUKZGIUJLFWPNP-UHFFFAOYSA-N,,Drug,IKEY:CUKZGIUJLFWPNP-UHFFFAOYSA-N|IKEY:YJPVTCSB...,gvk,,,
336079,IKEY:GHUYIIGPWBMOGY-KRWDZBQOSA-N,,Drug,IKEY:CYHWMBVXXDIZNZ-KRWDZBQOSA-N|IKEY:GHUYIIGP...,gvk|integrity,,,
336399,IKEY:HHRDZACHVMGHOB-UHFFFAOYSA-N,,Drug,225dimethylaminomethylfurfurylthioethylamino56...,informa,,,
336624,IKEY:HXLOHDZQBKCUCR-FNNAPWSISA-N,,Drug,3quinolinecarboxamide12dihydron3endo82r2hydrox...,gvk|informa|integrity,,,
336724,IKEY:IFPBSRFVNPCZMK-UHFFFAOYSA-N,,Drug,18naphthyridine3carboxylicacid1cyclopropyl6flu...,gvk,,,
337119,IKEY:JGPNCLKRECLYTO-UHFFFAOYSA-N,,Drug,5nitro2furaldehyde2ethylsemicarbazone|IKEY:JGP...,gvk,,,


In [156]:
# Nodes without a name fall back to the first '|'-separated entry of their xrefs
nodes.loc[noname_idx, 'name'] = nodes.loc[noname_idx, 'xrefs'].apply(lambda s: s.split('|')[0])
nodes.loc[noname_idx]

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
331008,CHEBI:144367,3po,Drug,3po|CHEBI:144367|IKEY:UJJUKZPBUMCSJZ-BQYQJAHWS...,informa,,,
334432,IKEY:AKLMFDDQCHURPW-ISIOAQNYSA-N,1pyrrolidineacetamiden5ar6as7s10as9aminocarbon...,Drug,1pyrrolidineacetamiden5ar6as7s10as9aminocarbon...,gvk|integrity,,,
335068,IKEY:CJGOZEVWXQGMCS-UHFFFAOYSA-N,IKEY:CJGOZEVWXQGMCS-UHFFFAOYSA-N,Drug,IKEY:CJGOZEVWXQGMCS-UHFFFAOYSA-N|IKEY:YCWROMXN...,gvk|informa|integrity,,,
335190,IKEY:CSOBIBXVIYAXFM-UHFFFAOYSA-N,2910dimethoxy4oxo2246trimethylphenylimino67dih...,Drug,2910dimethoxy4oxo2246trimethylphenylimino67dih...,gvk|informa|integrity,,,
335220,IKEY:CUKZGIUJLFWPNP-UHFFFAOYSA-N,IKEY:CUKZGIUJLFWPNP-UHFFFAOYSA-N,Drug,IKEY:CUKZGIUJLFWPNP-UHFFFAOYSA-N|IKEY:YJPVTCSB...,gvk,,,
336079,IKEY:GHUYIIGPWBMOGY-KRWDZBQOSA-N,IKEY:CYHWMBVXXDIZNZ-KRWDZBQOSA-N,Drug,IKEY:CYHWMBVXXDIZNZ-KRWDZBQOSA-N|IKEY:GHUYIIGP...,gvk|integrity,,,
336399,IKEY:HHRDZACHVMGHOB-UHFFFAOYSA-N,225dimethylaminomethylfurfurylthioethylamino56...,Drug,225dimethylaminomethylfurfurylthioethylamino56...,informa,,,
336624,IKEY:HXLOHDZQBKCUCR-FNNAPWSISA-N,3quinolinecarboxamide12dihydron3endo82r2hydrox...,Drug,3quinolinecarboxamide12dihydron3endo82r2hydrox...,gvk|informa|integrity,,,
336724,IKEY:IFPBSRFVNPCZMK-UHFFFAOYSA-N,18naphthyridine3carboxylicacid1cyclopropyl6flu...,Drug,18naphthyridine3carboxylicacid1cyclopropyl6flu...,gvk,,,
337119,IKEY:JGPNCLKRECLYTO-UHFFFAOYSA-N,5nitro2furaldehyde2ethylsemicarbazone,Drug,5nitro2furaldehyde2ethylsemicarbazone|IKEY:JGP...,gvk,,,


In [157]:
# Fix the lowercase names: restore the original-case names captured in `id_to_name`
# before lower-casing. The names taken from the UBERON obo must win over those,
# otherwise this step would put the old names straight back. Nodes with neither
# keep the name they currently have.
obo_names = nodes['id'].map(uber_to_name)
orig_case_names = nodes['id'].map(id_to_name)
nodes['name'] = obo_names.fillna(orig_case_names).fillna(nodes['name'])

In [158]:
mrg_map.to_csv(out_dir.joinpath('merge_map_1.csv'), index=False)
new_mrg_map.to_csv(out_dir.joinpath('merge_map_2.csv'), index=False)

nodes.to_csv(out_dir.joinpath('nodes.csv'), index=False)
edges.to_csv(out_dir.joinpath('edges.csv'), index=False)