In [1]:
import pandas as pd

## Orphadata

orphadata: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/en_product4_HPO.xml 

**orphanet parsed data:** 

 associations_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-disease-symptom.tsv
 
 diseases_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-diseases.tsv
 
 symptoms_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-symptoms.tsv

## Hetionet data

associations_file: https://github.com/dhimmel/medline/blob/gh-pages/data/disease-symptom-cooccurrence.tsv

diseases_file: https://github.com/dhimmel/disease-ontology/blob/gh-pages/data/slim-terms.tsv
    
symptoms_file: https://github.com/dhimmel/mesh/blob/gh-pages/data/symptoms.tsv

#### Associations

commit = '60d611892bf387b5b23c5f2e2e3bc472cfce85f3'

url = rawgit('dhimmel', 'medline', commit, 'data/disease-symptom-cooccurrence.tsv')

disease_symptom_df = pandas.read_table(url)

disease_symptom_df = disease_symptom_df[disease_symptom_df.p_fisher < 0.005]

disease_symptom_df.head(2)

#### Diseases
commit = '75050ea2d4f60e745d3f3578ae03560a2cc0e444'

url = rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')

disease_df = pandas.read_table(url)

disease_df.head(2)

#### Symptoms
commit = 'a7036a37302973b15ab949aab4056d9bc062910e'

url = rawgit('dhimmel', 'mesh', commit, 'data/symptoms.tsv')

symptom_df = pandas.read_table(url)

symptom_df.head(2)

## Diseases

### Orphanet DO coverage
**Orphanet2DO mappings from MONDO (extracted from 'equivalentClass' defined classes)**

In [2]:
# Orphanet disease table parsed from Orphadata (columns: orphanet_code, orphanet_term).
# `path` is the data directory prefix that later cells reuse for their own loads.
path = "/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/"
orpha_diseases = pd.read_csv("{}orphanet-diseases.tsv".format(path), sep="\t")
orpha_diseases.head(2)

Unnamed: 0,orphanet_code,orphanet_term
0,Orphanet:79430,Hermansky-Pudlak syndrome
1,Orphanet:99797,Anodontia


In [3]:
# Orphanet -> DO cross-references extracted from MONDO (columns: orphanet, do).
mondo_mappings = pd.read_csv("{}mondo-orpha2do-mappings.tsv".format(path), sep="\t")
mondo_mappings.head(2)

Unnamed: 0,orphanet,do
0,Orphanet:205,DOID:3803
1,Orphanet:79213,DOID:12798


In [4]:
# Which Orphadata diseases also appear in the MONDO Orphanet -> DO mapping?
orphanet = set(orpha_diseases["orphanet_code"])
mondo = set(mondo_mappings["orphanet"])
mapped = len(orphanet.intersection(mondo))
# Percentage of Orphadata diseases that have a DO mapping.
coverage = 100 * mapped / len(orphanet)
print('Diseases (orphanumbers) in orphadata: {}'.format(len(orphanet)))
print('Diseases (orphanumbers) in mondo with DO: {}'.format(len(mondo)))
print('Diseases (orphanumbers) in orphadata mapped to DO: {}'.format(mapped))
print('**DO coverage: {}% of diseases in orphadata**'.format(round(coverage)))

Diseases (orphanumbers) in orphadata: 2383
Diseases (orphanumbers) in mondo with DO: 1458
Diseases (orphanumbers) in orphadata mapped to DO: 612
**DO coverage: 26% of diseases in orphadata**


**Orphanet2DO mappings from DO**

In [5]:
# Re-declare the data directory and re-read the Orphanet disease table
# (same file as in the MONDO section above).
path = "/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/"
orpha_diseases = pd.read_csv("{}orphanet-diseases.tsv".format(path), sep="\t")
# Orphanet -> DO cross-references taken from DO itself (columns: orphanet, do).
do_mappings = pd.read_csv("{}do-orpha2do-mappings.tsv".format(path), sep="\t")
do_mappings.head(2)

Unnamed: 0,orphanet,do
0,Orphanet:2322,DOID:0060473
1,Orphanet:166068,DOID:0060274


In [6]:
# Which Orphadata diseases also appear in the DO-provided Orphanet -> DO mapping?
orphanet = set(orpha_diseases["orphanet_code"])
do = set(do_mappings["orphanet"])
mapped = len(orphanet.intersection(do))
# Percentage of Orphadata diseases that have a DO mapping.
coverage = 100 * mapped / len(orphanet)
print('Diseases (orphanumbers) in orphanet: {}'.format(len(orphanet)))
print('Diseases (orphanumbers) in do with DO: {}'.format(len(do)))
print('Diseases (orphanumbers) in orphadata mapped to DO: {}'.format(mapped))
print('**DO coverage: {}% of diseases in orphadata**'.format(round(coverage)))

Diseases (orphanumbers) in orphanet: 2383
Diseases (orphanumbers) in do with DO: 618
Diseases (orphanumbers) in orphadata mapped to DO: 241
**DO coverage: 10% of diseases in orphadata**


### Orphanet - Hetionet Overlap

In [7]:
# Orphanet - Hetionet overlap, MONDO flavour.
# Orphanet diseases already translated to DO ids via MONDO (columns: orphanumber, doid).
orphanet_mappings = pd.read_csv("{}orphanet-diseases-mondo-orpha2do.tsv".format(path), sep="\t")
orphanet_mappings.head(2)

Unnamed: 0,orphanumber,doid
0,Orphanet:205,DOID:3803
1,Orphanet:564,DOID:0050778


In [8]:
# Hetionet disease nodes (DO slim terms). Note the relative path: it is resolved
# against the notebook's working directory, unlike the `path`-prefixed loads.
hetionet_table = pd.read_csv("data/hetionet-do-slim-terms.tsv", sep="\t")
hetionet_table.head(2)

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic


In [9]:
# Overlap between the DO ids reachable from Orphanet (via the mapping table
# currently held in `orphanet_mappings`) and the DO ids already in Hetionet.
orphanet = set(orphanet_mappings.doid)
hetionet = set(hetionet_table.doid)
# DO ids present in both resources
common = len(orphanet & hetionet)
# DO ids only Orphanet contributes, i.e. new disease nodes
diff = len(orphanet - hetionet)
print('Diseases in orphanet (DO): {}'.format(len(orphanet)))
print('Diseases in hetionet (DO): {}'.format(len(hetionet)))
print('Overlap: {}'.format(common))
print('**We will introduce {} new diseases from orphadata**'.format(diff))
# Raw string: the backslashes before the parentheses are printed literally.
# In a normal string literal `\(` is an invalid escape sequence (warning today,
# scheduled to become an error); the printed text is unchanged.
print(r'+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)')

Diseases in orphanet (DO): 612
Diseases in hetionet (DO): 137
Overlap: 4
**We will introduce 608 new diseases from orphadata**
+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)


In [10]:
# Orphanet - Hetionet overlap, DO flavour.
# Orphanet diseases translated to DO ids via DO's own xrefs (columns: orphanumber, doid).
orphanet_mappings = pd.read_csv("{}orphanet-diseases-do-orpha2do.tsv".format(path), sep="\t")
orphanet_mappings.head(2)

Unnamed: 0,orphanumber,doid
0,Orphanet:124,DOID:1339
1,Orphanet:1416,DOID:1156


In [11]:
# Overlap between the DO ids reachable from Orphanet (via the mapping table
# currently held in `orphanet_mappings`) and the DO ids already in Hetionet.
orphanet = set(orphanet_mappings.doid)
hetionet = set(hetionet_table.doid)
# DO ids present in both resources
common = len(orphanet & hetionet)
# DO ids only Orphanet contributes, i.e. new disease nodes
diff = len(orphanet - hetionet)
print('Diseases in orphanet (DO): {}'.format(len(orphanet)))
print('Diseases in hetionet (DO): {}'.format(len(hetionet)))
print('Overlap: {}'.format(common))
print('**We will introduce {} new diseases from orphadata**'.format(diff))
# Raw string: the backslashes before the parentheses are printed literally.
# In a normal string literal `\(` is an invalid escape sequence (warning today,
# scheduled to become an error); the printed text is unchanged.
print(r'+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)')

Diseases in orphanet (DO): 377
Diseases in hetionet (DO): 137
Overlap: 2
**We will introduce 375 new diseases from orphadata**
+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)


## Symptoms (Hetionet::MeSH) - Phenotypes (Orphanet::HPO)

### Orphanet MESH coverage
**hp2mesh mappings inferred through UMLS**

In [12]:
# Orphanet phenotype table (columns: hp_code, hp_term).
orpha_phenotypes = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orpha_phenotypes.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0100962,Shyness
1,HP:0002132,Porencephaly


In [13]:
# HP -> MeSH mappings inferred through UMLS.
umls_mappings = pd.read_csv("{}umls2016aa-hp2mesh-mappings.tsv".format(path), sep="\t")
# Prefix the MeSH ids with "MESH:" (the HPO-derived mapping table shows the same form).
umls_mappings["mesh_code"] = "MESH:" + umls_mappings["mesh_code"].astype(str)
umls_mappings.head(2)

Unnamed: 0,hp_code,hp_term,mesh_code,mesh_term
0,HP:0009037,Segmental spinal muscular atrophy,MESH:C566670,"Spinal Muscular Atrophy, Segmental"
1,HP:0100620,Germinoma,MESH:D018237,Germinomas


In [14]:
# Which Orphanet HP codes have a MeSH mapping in the UMLS-inferred table?
orphanet = set(orpha_phenotypes["hp_code"])
umls = set(umls_mappings["hp_code"])
mapped = len(orphanet.intersection(umls))
# Percentage of Orphanet phenotypes with a MeSH mapping.
coverage = 100 * mapped / len(orphanet)
print('Phenotypes (hp) in orphanet: {}'.format(len(orphanet)))
print('Phenotypes (hp) in umls with mesh: {}'.format(len(umls)))
print('HP mapped: {}'.format(mapped))
print('**MESH coverage: {}**'.format(round(coverage)))

Phenotypes (hp) in orphanet: 4014
Phenotypes (hp) in umls with mesh: 908
HP mapped: 560
**MESH coverage: 14**


**hp2mesh mappings from HPO**

In [15]:
# HP -> MeSH cross-references taken from HPO (columns: hp_code, hp_term, mesh_code).
hpo_mappings = pd.read_csv("{}hp-hp2mesh-mappings.tsv".format(path), sep="\t")
hpo_mappings.head(2)

Unnamed: 0,hp_code,hp_term,mesh_code
0,HP:0200042,Skin ulcer,MESH:D012883
1,HP:0000873,Diabetes insipidus,MESH:D003919


In [16]:
# Which Orphanet HP codes have a MeSH mapping in the HPO-provided table?
orphanet = set(orpha_phenotypes["hp_code"])
hpo = set(hpo_mappings["hp_code"])
mapped = len(orphanet.intersection(hpo))
# Percentage of Orphanet phenotypes with a MeSH mapping.
coverage = 100 * mapped / len(orphanet)
print('Phenotypes (hp) in orphanet: {}'.format(len(orphanet)))
print('Phenotypes (hp) in hpo with mesh: {}'.format(len(hpo)))
print('HP mapped: {}'.format(mapped))
print('**MESH coverage: {}**'.format(round(coverage)))

Phenotypes (hp) in orphanet: 4014
Phenotypes (hp) in hpo with mesh: 1036
HP mapped: 812
**MESH coverage: 20**


**hp2mesh mappings from HPO and UMLS**

In [17]:
# Are the UMLS and HPO hp2mesh mappings the same? Combine both sources and
# recompute the MeSH coverage of the Orphanet phenotypes.
#
# The three HP-code sets are rebuilt from their source tables instead of being
# read from whatever `orphanet`, `umls` and `hpo` currently hold: later cells
# rebind `umls` and `hpo` to sets of MeSH codes, so re-running this cell out of
# order would otherwise silently intersect HP codes with MeSH codes.
orphanet = set(orpha_phenotypes.hp_code)
umls = set(umls_mappings.hp_code)
hpo = set(hpo_mappings.hp_code)
# umls - hpo mappings overlap (HP codes mapped by both sources)
intersection = len(hpo & umls)
# orphanet - all mappings overlap (Orphanet HP codes mapped by both sources)
intersectionOrpha = len(orphanet & umls & hpo)
# use hpo + umls mappings UNION to map orphanet phenotypes to mesh
union = (umls | hpo)
unionOrpha = len(orphanet & union)
# New MESH coverage (percentage of Orphanet phenotypes mapped by either source)
coverage = (unionOrpha*100/len(orphanet))
# Optional diagnostics for the two intersection counts, left disabled:
#print("umls-hpo common phenotypes (hp) with mappings to mesh: {}".format(intersection))
#print("orphanet-umls-hpo common phenotypes (hp) with mappings to mesh: {}".format(intersectionOrpha))
print("Using hp2mesh mappings from both hpo and umls:")
print("HP mapped: {}".format(unionOrpha))
print('**MESH coverage: {}**'.format(round(coverage)))

Using hp2mesh mappings from both hpo and umls:
HP mapped: 1020
**MESH coverage: 25**


### Hetionet HPO coverage
**mesh2hp mappings inferred through UMLS**

In [18]:
# Hetionet symptom nodes (MeSH terms); relative path, resolved against the
# notebook's working directory.
hetionet_symptoms = pd.read_csv("data/hetionet-symptoms.tsv", sep="\t")
# Prefix the MeSH ids with "MESH:" so they are comparable with the mapping tables.
hetionet_symptoms["mesh_id"] = "MESH:" + hetionet_symptoms["mesh_id"].astype(str)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [19]:
# Which Hetionet MeSH symptoms have an HP mapping in the UMLS-inferred table?
# Note: `umls` now holds MeSH codes (earlier cells used it for HP codes).
hetionet = set(hetionet_symptoms["mesh_id"])
umls = set(umls_mappings["mesh_code"])
mapped = len(hetionet.intersection(umls))
# Percentage of Hetionet symptoms with an HP mapping.
coverage = 100 * mapped / len(hetionet)
print('Symptoms (mesh) in hetionet: {}'.format(len(hetionet)))
print('Symptoms (mesh) in umls with hp: {}'.format(len(umls)))
print('MESH mapped: {}'.format(mapped))
print('**HPO coverage: {}**'.format(round(coverage)))

Symptoms (mesh) in hetionet: 438
Symptoms (mesh) in umls with hp: 881
MESH mapped: 70
**HPO coverage: 16**


**mesh2hp mappings from HPO**

In [20]:
# Which Hetionet MeSH symptoms have an HP mapping in the HPO-provided table?
# Note: `hpo` now holds MeSH codes (earlier cells used it for HP codes).
hetionet = set(hetionet_symptoms["mesh_id"])
hpo = set(hpo_mappings["mesh_code"])
mapped = len(hetionet.intersection(hpo))
# Percentage of Hetionet symptoms with an HP mapping.
coverage = 100 * mapped / len(hetionet)
print('Symptoms (mesh) in hetionet: {}'.format(len(hetionet)))
print('Symptoms (mesh) in hpo with hp: {}'.format(len(hpo)))
print('MESH mapped: {}'.format(mapped))
print('**HPO coverage: {}**'.format(round(coverage)))

Symptoms (mesh) in hetionet: 438
Symptoms (mesh) in hpo with hp: 1027
MESH mapped: 155
**HPO coverage: 35**


**mesh2hp mappings from HPO and UMLS**

In [21]:
# Are the UMLS and HPO mesh2hp mappings the same? Combine both sources and
# recompute the HPO coverage of the Hetionet symptoms.
#
# The three MeSH-code sets are rebuilt from their source tables instead of
# being read from whatever `hetionet`, `umls` and `hpo` currently hold: those
# names are bound to HP-code sets in other cells, so re-running this cell out
# of order would otherwise silently mix HP and MeSH codes.
# NOTE: a later cell replaces `hetionet_symptoms` with its merge against
# `hpo_mappings`; this cell is meant to run before that happens.
hetionet = set(hetionet_symptoms.mesh_id)
umls = set(umls_mappings.mesh_code)
hpo = set(hpo_mappings.mesh_code)
# umls - hpo mappings overlap (MeSH codes mapped by both sources)
intersection = len(hpo & umls)
# hetionet - all mappings overlap (Hetionet MeSH codes mapped by both sources)
intersectionHetio = len(hetionet & umls & hpo)
# use hpo + umls mappings UNION to map hetionet symptoms to hp
union = (umls | hpo)
unionHetio = len(hetionet & union)
# New HPO coverage (percentage of Hetionet symptoms mapped by either source)
coverage = (unionHetio*100/len(hetionet))
# Optional diagnostics for the two intersection counts, left disabled:
#print("umls-hpo common symptoms (mesh) with mappings to hp: {}".format(intersection))
#print("hetionet-umls-hpo common symptoms (mesh) with mappings to hp: {}".format(intersectionHetio))
print("Using mesh2hp mappings from both hpo and umls:")
print("MESH mapped: {}".format(unionHetio))
print('**HPO coverage: {}**'.format(round(coverage)))

Using mesh2hp mappings from both hpo and umls:
MESH mapped: 165
**HPO coverage: 38**


### Orphanet - Hetionet Overlap (HPO)

In [22]:
# Orphanet - Hetionet comparison in HP space (using the HPO mappings).
# Orphanet phenotypes, already expressed as HP codes.
orphanet_symptoms = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0100962,Shyness
1,HP:0002132,Porencephaly


In [23]:
# Translate Hetionet symptoms to HP: join on the MeSH id against the HPO mapping
# table (merge default, i.e. an inner join, so unmapped symptoms are dropped).
# `hetionet_symptoms` is rebound to the joined table.
hetionet_symptoms = pd.merge(
    hetionet_symptoms,
    hpo_mappings,
    left_on="mesh_id",
    right_on="mesh_code",
)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn,hp_code,hp_term,mesh_code
0,MESH:D000377,Agnosia,1,HP:0010524,Agnosia,MESH:D000377
1,MESH:D000472,"Alkalosis, Respiratory",0,HP:0001950,Respiratory alkalosis,MESH:D000472


In [24]:
# Compare the two resources on HP codes.
orphanet = set(orphanet_symptoms["hp_code"])
hetionet = set(hetionet_symptoms["hp_code"])
# HP codes present in both / HP codes only Orphanet contributes
common = len(orphanet.intersection(hetionet))
diff = len(orphanet.difference(hetionet))
print('Phenotypes in orphanet (HP): {}'.format(len(orphanet)))
print('Symptoms in hetionet (HP): {}'.format(len(hetionet)))
print('Overlap: {}'.format(common))
print('**We will introduce {} new phenotypes from orphadata**'.format(diff))

Phenotypes in orphanet (HP): 4014
Symptoms in hetionet (HP): 155
Overlap: 132
**We will introduce 3882 new phenotypes from orphadata**


### Orphanet - Hetionet Overlap (MESH)

In [42]:
# Orphanet - Hetionet comparison in MeSH space (using the HPO mappings).
# Only about 20% of the Orphanet phenotypes have a MeSH mapping there.
# Reload the Orphanet phenotype table and keep a handle to the HP-level frame.
orphanet_symptoms = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orphanet_symptoms_hp = orphanet_symptoms
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0100962,Shyness
1,HP:0002132,Porencephaly


In [43]:
# Translate Orphanet phenotypes from HP to MeSH: join on hp_code against the HPO
# mapping table (merge default, i.e. an inner join, so unmapped phenotypes are dropped).
orphanet_symptoms = pd.merge(orphanet_symptoms, hpo_mappings, on="hp_code")
# Keep a handle to the MeSH-level frame.
orphanet_symptoms_mesh = orphanet_symptoms
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term_x,hp_term_y,mesh_code
0,HP:0100962,Shyness,Shyness,MESH:D012792
1,HP:0100608,Metrorrhagia,Metrorrhagia,MESH:D008796


In [44]:
# Hetionet symptoms in MeSH space: re-read the table from disk, replacing
# whatever `hetionet_symptoms` currently holds.
hetionet_symptoms = pd.read_csv("data/hetionet-symptoms.tsv", sep="\t")
# Prefix the MeSH ids with "MESH:" so they are comparable with the mapping tables.
hetionet_symptoms["mesh_id"] = "MESH:" + hetionet_symptoms["mesh_id"].astype(str)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [45]:
# Row counts of the three tables (rows, not distinct codes).
print('Orphanet symptoms (HP): {}'.format(orphanet_symptoms_hp.shape[0]))
print('Orphanet symptoms (MESH): {}'.format(orphanet_symptoms_mesh.shape[0]))
print('Hetionet symptoms (MESH): {}'.format(hetionet_symptoms.shape[0]))

Orphanet symptoms (HP): 4014
Orphanet symptoms (MESH): 812
Hetionet symptoms (MESH): 438


In [48]:
# Compare the two resources on distinct MeSH codes.
orphanet = set(orphanet_symptoms["mesh_code"])
hetionet = set(hetionet_symptoms["mesh_id"])
overlap = len(orphanet.intersection(hetionet))
print('Phenotypes in orphanet (MESH): {}'.format(len(orphanet)))
print('Symptoms in hetionet (MESH): {}'.format(len(hetionet)))
print('Overlap: {}'.format(overlap))

Phenotypes in orphanet (MESH): 808
Symptoms in hetionet (MESH): 438
Overlap: 132
