# Cleaning up Disease and Phenotype

Diseases and phenotypes come from different data sources and are represented with different data types.

In [1]:
import pandas as pd
from pathlib import Path
from data_tools import df_processing as dfp

import rdflib
import ontospy

  from tqdm.autonotebook import tqdm


In [2]:
# Stage name; also used as the sub-directory that holds this notebook's outputs.
this_name = '10a_Disease_Pheno_Cleanup_parse_owls'
out_dir = Path('../2_pipeline').resolve() / this_name / 'out'

# Create the output directory and any missing parents; a pre-existing directory is fine.
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def uri_to_id(uri):
    """Reduce a URI string to a short identifier.

    If the URI contains a '#', the text after the last '#' is returned as is.
    Otherwise the text after the last '/' is returned with every underscore
    replaced by a colon (e.g. '.../obo/DOID_0040001' -> 'DOID:0040001').
    """
    _, hash_sep, fragment = uri.rpartition('#')
    if hash_sep:
        return fragment
    last_segment = uri.rsplit('/', 1)[-1]
    # Note: all underscores are replaced, not only the first one.
    return last_segment.replace('_', ':')

In [4]:
def get_class_id(entity):
    """Return the short identifier derived from the entity's URI."""
    full_uri = entity.uri.toPython()
    return uri_to_id(full_uri)

In [5]:
def get_class_name(entity):
    """Return the entity's best label as a plain Python value.

    A label that is exactly a built-in ``str`` is returned unchanged; any
    other object is converted through its ``toPython()`` method.
    """
    label = entity.bestLabel()
    # Exact type check on purpose: str subclasses still go through toPython().
    return label if type(label) is str else label.toPython()

In [6]:
def get_property_objects(entity, property_uri):
    """Collect the values an entity's triples hold for one predicate.

    Every triple in ``entity.triples`` whose predicate equals ``property_uri``
    contributes one value. If the triple's object is a blank node, the
    triple's subject is used instead; if that is a blank node too, the triple
    is skipped. Values starting with 'http' are shortened with ``uri_to_id``.

    Returns a list of Python values (empty when nothing matches).
    """
    collected = []
    for triple in entity.triples:
        if triple[1].toPython() != property_uri:
            continue

        node = triple[2]
        if type(node) == rdflib.term.BNode:
            # Fall back to the subject when the object is anonymous.
            node = triple[0]
            if type(node) == rdflib.term.BNode:
                continue

        value = node.toPython()
        collected.append(uri_to_id(value) if value.startswith('http') else value)
    return collected

In [7]:
def get_concept_info(entity):
    """Build a flat record describing one ontology class.

    The record holds the class 'id' and 'name' plus four list-valued
    annotation fields ('synonyms', 'xrefs', 'alt_ids', 'subsets'). Any field
    whose value is an empty list is replaced by NaN.
    """
    annotation_props = (
        ('synonyms', 'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym'),
        ('xrefs', 'http://www.geneontology.org/formats/oboInOwl#hasDbXref'),
        ('alt_ids', 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'),
        ('subsets', 'http://www.geneontology.org/formats/oboInOwl#inSubset'),
    )

    record = {'id': get_class_id(entity), 'name': get_class_name(entity)}
    for column, prop_uri in annotation_props:
        record[column] = get_property_objects(entity, prop_uri)

    # Empty lists become NaN so that missing annotations read as nulls downstream.
    cleaned = {}
    for column, value in record.items():
        cleaned[column] = float('nan') if value == [] else value
    return cleaned
    

In [8]:
def get_relationship_info(entity):
    """Build a record linking a class to its ``owl:someValuesFrom`` targets.

    'sbjs' is the class identifier and 'objs' is the list of identifiers
    collected for the someValuesFrom predicate. An empty list is replaced
    by NaN.
    """
    record = {
        'sbjs': get_class_id(entity),
        'objs': get_property_objects(entity, 'http://www.w3.org/2002/07/owl#someValuesFrom'),
    }
    return {key: (float('nan') if val == [] else val) for key, val in record.items()}

## Parse Diseases

In [9]:
%%time
# Load and parse the Disease Ontology OWL file from its OBO PURL.
# This is slow (several minutes of wall time in the recorded run).
do = ontospy.Ontospy('http://purl.obolibrary.org/obo/doid.owl')

CPU times: user 4min 34s, sys: 882 ms, total: 4min 35s
Wall time: 4min 36s


In [10]:
# One concept record and one relationship record per Disease Ontology class.
dis_info_json, dis_rel_json = [], []

for e in do.all_classes:
    concept_record = get_concept_info(e)
    dis_info_json.append(concept_record)
    relationship_record = get_relationship_info(e)
    dis_rel_json.append(relationship_record)

In [11]:
# Tabulate the per-class concept records: one row per class, one column per record key.
dis_info_df = pd.DataFrame(dis_info_json)

In [12]:
# Count how often each xref source (the text before the first ':') occurs.
(
    dfp.expand_df_on_col(dis_info_df, 'xrefs')
    .dropna(subset=['xrefs'])['xrefs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
)

UMLS_CUI                   7032
OMIM                       5182
SNOMEDCT_US_2019_09_01     4823
NCI                        4687
ICD10CM                    3654
MESH                       3553
ICD9CM                     2270
GARD                       1922
ORDO                       1864
EFO                         130
KEGG                         41
MEDDRA                       34
SNOMEDCT_US_2018_03_01       14
ICDO                         14
SNOMEDCT_US_2020_03_01        8
SNOMED_CT_US_2018_03_01       3
SNOMEDCT_US_2020_09_01        2
stedman                       1
DERMO                         1
OERDO                         1
SNOMEDCT_2020_03_01           1
UMS_CUI                       1
Name: xrefs, dtype: int64

In [13]:
# Collapse each multi-valued column into a single value per row before export.
# Only list cells are combined: missing cells (float NaN) pass through, and
# cells already combined by an earlier run are left untouched, so the cell can
# be re-run safely.
# NOTE(review): char_combine_iter presumably joins the items into one delimited
# string — confirm in data_tools.
for col in ['synonyms', 'xrefs', 'alt_ids', 'subsets']:
    dis_info_df[col] = dis_info_df[col].apply(lambda s: dfp.char_combine_iter(s) if isinstance(s, list) else s)

In [14]:
# Save the disease node table to the output directory, without the row index.
dis_info_df.to_csv(out_dir.joinpath('DO_node_info.csv'), index=False)

In [15]:
# Tabulate the per-class relationship records ('sbjs' / 'objs' columns).
dis_rel_df = pd.DataFrame(dis_rel_json)

In [16]:
# Keep only classes that have at least one target, then expand the 'objs'
# lists (the displayed result has one subject/object pair per row).
dis_rel_df = dfp.expand_df_on_col(dis_rel_df.dropna(subset=['objs']), 'objs')
dis_rel_df

Unnamed: 0,sbjs,objs
0,DOID:0040001,FOODON:00002239
1,DOID:0040002,CHEBI:15365
2,DOID:0040003,CHEBI:18208
3,DOID:0040004,CHEBI:2676
4,DOID:0040005,CHEBI:29007
...,...,...
6083,DOID:9972,SYMP:0000470
6084,DOID:9986,CL:0000542
6085,DOID:9988,UBERON:0000955
6086,DOID:9988,UBERON:0002240


In [17]:
# Which ontologies do the disease relationship targets come from?
(
    dis_rel_df['objs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(50)
)

GENO         2118
UBERON       1517
SYMP          861
CL            548
NCBITaxon     397
SO            313
HP            209
CHEBI          88
FOODON         25
DOID           12
Name: objs, dtype: int64

In [18]:
# Save the disease edge table to the output directory, without the row index.
dis_rel_df.to_csv(out_dir.joinpath('DO_edge_info.csv'), index=False)

## Parse Phenotypes

In [19]:
%%time
# Load and parse the Human Phenotype Ontology OWL file from its OBO PURL.
# This is slow (roughly a quarter of an hour of wall time in the recorded run).
hp = ontospy.Ontospy('http://purl.obolibrary.org/obo/hp.owl')

CPU times: user 14min 53s, sys: 3.05 s, total: 14min 56s
Wall time: 14min 57s


In [20]:
# One concept record and one relationship record per class in the HP ontology graph.
pheno_info_json, pheno_rel_json = [], []

for e in hp.all_classes:
    concept_record = get_concept_info(e)
    pheno_info_json.append(concept_record)
    relationship_record = get_relationship_info(e)
    pheno_rel_json.append(relationship_record)

In [21]:
# Tabulate the per-class concept records: one row per class, one column per record key.
pheno_info_df = pd.DataFrame(pheno_info_json)

In [22]:
# Preview ten random rows. The fixed seed makes the preview reproducible
# across Restart & Run All.
pheno_info_df.sample(10, random_state=0)

Unnamed: 0,id,name,synonyms,xrefs,alt_ids,subsets
26650,UBERON:0016512,lumen of duodenum,"[duodenal lumen, doudenal lumen]","[FMA:14589, EMAPA:19080]",,
14917,HP:0025259,Stiff elbow,"[Elbow stiffness, Stiff elbow]",,,
12778,HP:0011166,Focal myoclonic seizure,"[Local myoclonic seizures, Segmental myoclonic...",[UMLS:C4023501],[HP:0025191],
13915,HP:0012311,Monocytosis,[High blood monocyte number],"[UMLS:C0085702, SNOMEDCT_US:19636003]",,
12036,HP:0010387,Osteolytic defects of the phalanges of the 5th...,,[UMLS:C4023865],,
5810,HP:0001076,Glabellar hemangioma,[Glabellar capillary hemangioma],[UMLS:C1854408],,
25890,UBERON:0011300,gray matter of telencephalon,[predominantly gray regional part of telenceph...,"[BIRNLEX:1067, FMA:83911]",[UBERON:0024186],
20156,HP:0430016,Abnormality of tensor veli palatini muscle,,[UMLS:C4073198],,
21440,PR:000003460,V(D)J recombination-activating protein 2,"[RAG2, RAG-2]",,[PR:000013690],
21498,PR:000018444,"interleukin-2 receptor subunit alpha, signal p...",[IL2RA/SigPep-],,,


In [23]:
# Which ontologies do the class ids in the HP graph belong to (top 15 prefixes)?
uri_col = 'id'
(
    dfp.expand_df_on_col(pheno_info_df, uri_col)
    .dropna(subset=[uri_col])[uri_col]
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(15)
)

HP           15530
UBERON        5389
GO            2423
CHEBI         1580
CL             706
PATO           587
PR             501
NBO            163
gene           106
MGI            101
MPATH           78
HsapDv          16
BFO             16
NCBITaxon       13
CARO             9
Name: id, dtype: int64

In [24]:
# Which sources do the xrefs across all classes point to (top 15 prefixes)?
uri_col = 'xrefs'
(
    dfp.expand_df_on_col(pheno_info_df, uri_col)
    .dropna(subset=[uri_col])[uri_col]
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(15)
)

UMLS           14597
SNOMEDCT_US     4672
FMA             3347
EMAPA           2849
MSH             2176
MA              1956
NCIT            1840
PMID            1702
EHDAA2          1092
BTO              961
VHOG             910
EHDAA            717
MESH             670
ZFA              662
TAO              630
Name: xrefs, dtype: int64

In [25]:
# Boolean mask selecting the rows whose id carries the 'HP:' prefix.
hpo_idx = pheno_info_df['id'].str.startswith('HP:')
# Preview ten random HP rows. The fixed seed makes the preview reproducible
# across Restart & Run All.
pheno_info_df[hpo_idx].sample(10, random_state=0)

Unnamed: 0,id,name,synonyms,xrefs,alt_ids,subsets
15816,HP:0030557,Best corrected visual acuity 0.4 LogMAR,,[UMLS:C4073030],,
19611,HP:0100860,Dilatation of Inferior mesenteric artery,,"[UMLS:C0340625, SNOMEDCT_US:195289005]",,
9200,HP:0006257,Abnormality of carpal bone ossification,[Abnormal ankle bone maturation],[UMLS:C4025075],,
9305,HP:0006436,obsolete Shortening of the tibia,,,,
5738,HP:0000993,Molluscoid pseudotumors,[Molluscoid pseudotumor],[UMLS:C1844597],,
12676,HP:0011062,Misalignment of incisors,"[Crooked front teeth, Crooked incisors, Misali...","[UMLS:C4023556, UMLS:C4280343]",,
8372,HP:0004839,Pyropoikilocytosis,[hereditary pyropoikilocytosis],"[MSH:C563004, UMLS:C0520739, SNOMEDCT_US:9434008]",[HP:0004805],
10790,HP:0009007,Biceps hypoplasia,"[Underdeveloped biceps, Hypoplastic biceps]",[UMLS:C1862499],,
15137,HP:0025493,Palmoplantar erythema,,,,
15693,HP:0030431,Osteochondroma,"[Osteochondromas, Osteocartilaginous exostoses]","[SNOMEDCT_US:52299001, SNOMEDCT_US:307573009, ...",,


In [26]:
# Xref source tally restricted to the rows selected by hpo_idx.
uri_col = 'xrefs'
(
    dfp.expand_df_on_col(pheno_info_df[hpo_idx], uri_col)
    .dropna(subset=[uri_col])[uri_col]
    .apply(lambda s: s.split(':')[0])
    .value_counts()
)

UMLS           13041
SNOMEDCT_US     4672
MSH             2176
Fyler            222
NCIT             219
MEDDRA            93
ICD-10            38
EPCC              13
MP                10
ORPHA              6
MPATH              4
ICD-O              3
ICD-9              3
SNOMED_CT          2
DOID               1
EFO                1
ICD9               1
COHD               1
ICD10              1
Name: xrefs, dtype: int64

In [27]:
# Collapse each multi-valued column into a single value per row before export.
# Only list cells are combined: missing cells (float NaN) pass through, and
# cells already combined by an earlier run are left untouched, so the cell can
# be re-run safely.
# NOTE(review): char_combine_iter presumably joins the items into one delimited
# string — confirm in data_tools.
for col in ['synonyms', 'xrefs', 'alt_ids', 'subsets']:
    pheno_info_df[col] = pheno_info_df[col].apply(lambda s: dfp.char_combine_iter(s) if isinstance(s, list) else s)

In [28]:
# Save the phenotype node table to the output directory, without the row index.
pheno_info_df.to_csv(out_dir.joinpath('HP_node_info.csv'), index=False)

In [29]:
# Tabulate the per-class relationship records ('sbjs' / 'objs' columns).
pheno_rel_df = pd.DataFrame(pheno_rel_json)

In [30]:
# Discard classes without any relationship target ('objs' is NaN for those).
pheno_rel_df = pheno_rel_df.dropna(subset=['objs'])

In [31]:
# Expand the 'objs' lists; the displayed result has one subject/object pair per row.
pheno_rel_df = dfp.expand_df_on_col(pheno_rel_df, 'objs')
pheno_rel_df

Unnamed: 0,sbjs,objs
0,gene:symbol:report?hgnc:id=10658,NCBITaxon:9606
1,gene:symbol:report?hgnc:id=10720,NCBITaxon:9606
2,gene:symbol:report?hgnc:id=10903,NCBITaxon:9606
3,gene:symbol:report?hgnc:id=11241,NCBITaxon:9606
4,gene:symbol:report?hgnc:id=11249,NCBITaxon:9606
...,...,...
32702,UBERON:8300002,UBERON:0010708
32703,UBERON:8300003,UBERON:0010709
32704,UBERON:8300003,UBERON:0010709
32705,UBERON:8300004,UBERON:0010709


In [32]:
# Which ontologies do the relationship targets come from?
(
    pheno_rel_df['objs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(50)
)

UBERON                  18561
PATO                     6608
GO                       3351
CHEBI                    1716
NCBITaxon                 722
PR                        707
CL                        408
SO                        185
gene                      106
MGI                       103
NBO                        87
HP                         75
HsapDv                     29
MPATH                      18
MOD                        12
CP                         10
BFO                         4
ZDB-GENE-980526-110         1
ZDB-GENE-030131-1577        1
ZDB-GENE-980526-501         1
ZDB-GENE-041001-112         1
ZDB-GENE-000210-20          1
Name: objs, dtype: int64

In [33]:
# Mask of edges that touch an HP term on either side.
# Note: this rebinds hpo_idx, which earlier indexed pheno_info_df.
hpo_idx = (
    pheno_rel_df['sbjs'].str.startswith('HP:')
    | pheno_rel_df['objs'].str.startswith('HP:')
)

In [34]:
# Target prefixes for the edges that involve an HP term.
(
    pheno_rel_df.loc[hpo_idx, 'objs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(50)
)

PATO      6325
UBERON    5663
GO         282
CL         220
HP          75
NBO         72
CHEBI       46
PR          27
MPATH       18
HsapDv      10
Name: objs, dtype: int64

In [35]:
# Subject prefixes for the edges that involve an HP term.
(
    pheno_rel_df.loc[hpo_idx, 'sbjs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(50)
)

HP    12738
Name: sbjs, dtype: int64

In [36]:
def get_uri_edges(curi, edges):
    """Return the rows of ``edges`` that involve the given id prefix.

    A row is kept when its 'sbjs' or its 'objs' value starts with
    ``curi + ':'``. The original index labels are preserved.
    """
    prefix = curi + ':'
    subject_hit = edges['sbjs'].str.startswith(prefix)
    object_hit = edges['objs'].str.startswith(prefix)
    return edges[subject_hit | object_hit]

In [37]:
# Subject prefixes among the edges that involve an NCBITaxon term.
(
    get_uri_edges('NCBITaxon', pheno_rel_df)['sbjs']
    .apply(lambda s: s.split(':')[0])
    .value_counts()
    .head(50)
)

PR                      504
gene                    106
MGI                     101
CL                        6
ZDB-GENE-980526-110       1
ZDB-GENE-980526-501       1
ZDB-GENE-041001-112       1
ZDB-GENE-030131-1577      1
ZDB-GENE-000210-20        1
Name: sbjs, dtype: int64

In [38]:
# Edges that involve both an NCBITaxon term and a CL term.
get_uri_edges(
    'CL',
    get_uri_edges('NCBITaxon', pheno_rel_df),
).head(50)

Unnamed: 0,sbjs,objs
2237,CL:0000738,NCBITaxon:7742
2751,CL:0001062,NCBITaxon:9606
2774,CL:0001200,NCBITaxon:40674
2776,CL:0001201,NCBITaxon:40674
2780,CL:0001203,NCBITaxon:9606
2788,CL:0001204,NCBITaxon:9606


In [39]:
# Number of edges that involve an HP term on either side.
len(pheno_rel_df[hpo_idx])

12738

In [40]:
# Save the full edge table (not only the HP-involving edges) without the row index.
pheno_rel_df.to_csv(out_dir.joinpath('HP_edge_info.csv'), index=False)

## Parse Symptoms

In [41]:
# Load and parse the Symptom Ontology OWL file from its OBO PURL.
symp = ontospy.Ontospy('http://purl.obolibrary.org/obo/symp.owl')

In [42]:
# One concept record and one relationship record per Symptom Ontology class.
symp_info_json, symp_rel_json = [], []

for e in symp.all_classes:
    concept_record = get_concept_info(e)
    symp_info_json.append(concept_record)
    relationship_record = get_relationship_info(e)
    symp_rel_json.append(relationship_record)

In [43]:
# Preview only the symptom classes that carry at least one xref.
pd.DataFrame(symp_info_json).dropna(subset=['xrefs'])

Unnamed: 0,id,name,synonyms,xrefs,alt_ids,subsets
447,SYMP:0000448,epistaxis,[nosebleed],"[UMLS_ICD9CM_2005_AUI:A0055387, SyOID:10057, I...",,
448,SYMP:0000449,cardiogenic shock,,"[UMLS_CUI:C0036980, UMLS_ICD9CM_2005_AUI:A0243...",,
449,SYMP:0000450,shock,,"[ICD9CM_2005:785.5, UMLS_CUI:C0159051, UMLS_IC...",,
450,SYMP:0000451,septic shock,,"[SyOID:10220, UMLS_CUI:C0036983, ICD9CM_2005:7...",,
451,SYMP:0000452,abnormal sputum,,"[UMLS_ICD9CM_2005_AUI:A0284206, SyOID:10681, U...",,
...,...,...,...,...,...,...
813,SYMP:0000818,localized superficial mass,,[ICD9CM_2005:782.2],,
814,SYMP:0000819,localized superficial swelling,,[ICD9CM_2005:782.2],,
815,SYMP:0000820,mass in chest,,[ICD9CM_2005:786.6],,
816,SYMP:0000821,swelling in chest,,[ICD9CM_2005:<new dbxref>],,


In [44]:
# Tabulate the per-class concept records: one row per class, one column per record key.
symp_df = pd.DataFrame(symp_info_json)

In [45]:
# Number of symptom classes parsed.
len(symp_df)

945

In [46]:
# Collapse each multi-valued column into a single value per row before export.
# Only list cells are combined: missing cells (float NaN) pass through, and
# cells already combined by an earlier run are left untouched, so the cell can
# be re-run safely.
# NOTE(review): char_combine_iter presumably joins the items into one delimited
# string — confirm in data_tools.
for col in ['synonyms', 'xrefs', 'alt_ids', 'subsets']:
    symp_df[col] = symp_df[col].apply(lambda s: dfp.char_combine_iter(s) if isinstance(s, list) else s)

In [47]:
# Remove the 'subsets' column before export. errors='ignore' keeps this cell
# re-runnable: once the column is gone, a plain drop would raise KeyError.
symp_df = symp_df.drop(columns='subsets', errors='ignore')
# Save the symptom node table to the output directory, without the row index.
symp_df.to_csv(out_dir.joinpath('SYMP_node_info.csv'), index=False)

In [48]:
# Check whether any symptom class has relationship targets (the recorded output is empty).
pd.DataFrame(symp_rel_json).dropna(subset=['objs'])

Unnamed: 0,sbjs,objs


No relationships to save...