In [1]:
import pandas as pd

## Orphadata

orphadata: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/en_product4_HPO.xml 

**orphanet parsed data:** 

 associations_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-disease-symptom.tsv
 
 diseases_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-diseases.tsv
 
 symptoms_file: /home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/orphanet-symptoms.tsv

## Hetionet data

associations_file: https://github.com/dhimmel/medline/blob/gh-pages/data/disease-symptom-cooccurrence.tsv

diseases_file: https://github.com/dhimmel/disease-ontology/blob/gh-pages/data/slim-terms.tsv
    
symptoms_file: https://github.com/dhimmel/mesh/blob/gh-pages/data/symptoms.tsv

#### Associations

commit = '60d611892bf387b5b23c5f2e2e3bc472cfce85f3'

url = rawgit('dhimmel', 'medline', commit, 'data/disease-symptom-cooccurrence.tsv')

disease_symptom_df = pandas.read_table(url)

disease_symptom_df = disease_symptom_df[disease_symptom_df.p_fisher < 0.005]

disease_symptom_df.head(2)

#### Diseases
commit = '75050ea2d4f60e745d3f3578ae03560a2cc0e444'

url = rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')

disease_df = pandas.read_table(url)

disease_df.head(2)

#### Symptoms
commit = 'a7036a37302973b15ab949aab4056d9bc062910e'

url = rawgit('dhimmel', 'mesh', commit, 'data/symptoms.tsv')

symptom_df = pandas.read_table(url)

symptom_df.head(2)

## Diseases

### Orphanet DO coverage
**Orphanet2DO mappings from MONDO (extracted from 'equivalentClass' defined classes)**

In [2]:
# Read the Orphanet disease table (tab-separated) and preview two rows.
# NOTE(review): `path` is an absolute, machine-specific directory, whereas the
# Hetionet tables in this notebook are read from a relative "data/" folder --
# consider unifying so the notebook can run on another machine.
path = "/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/"
orpha_diseases = pd.read_csv("{}orphanet-diseases.tsv".format(path), sep="\t")
orpha_diseases.head(2)

Unnamed: 0,orphanet_code,orphanet_term
0,Orphanet:232,Sickle cell anemia
1,Orphanet:2075,Genito-palato-cardiac syndrome


In [3]:
# Read the Orphanet -> DO cross-references extracted from MONDO
# (tab-separated file in the same data directory as the Orphanet tables).
mondo_mappings = pd.read_csv("{}mondo-orpha2do-mappings.tsv".format(path), sep="\t")
mondo_mappings.head(2)

Unnamed: 0,orphanet,do
0,Orphanet:3193,DOID:1929
1,Orphanet:100045,DOID:0110199


In [4]:
# Unique Orphanet identifiers on each side: the Orphadata disease list and
# the MONDO Orphanet -> DO mapping table.
orphanet = set(orpha_diseases["orphanet_code"])
mondo = set(mondo_mappings["orphanet"])
# Orphadata diseases that also appear in the MONDO mapping table.
mapped = len(orphanet.intersection(mondo))
# DO coverage: mapped diseases as a percentage of all Orphadata diseases.
coverage = 100 * mapped / len(orphanet)
print('Diseases (orphanumbers) in orphadata: {}'.format(len(orphanet)))
print('Diseases (orphanumbers) in mondo with DO: {}'.format(len(mondo)))
print('Diseases (orphanumbers) in orphadata mapped to DO: {}'.format(mapped))
print('**DO coverage: {}% of diseases in orphadata**'.format(round(coverage)))

Diseases (orphanumbers) in orphadata: 2592
Diseases (orphanumbers) in mondo with DO: 1862
Diseases (orphanumbers) in orphadata mapped to DO: 781
**DO coverage: 30% of diseases in orphadata**


**Orphanet2DO mappings from DO**

In [5]:
# Same data directory and Orphanet disease table as used for the MONDO
# comparison; both are (re)defined here before loading the DO-side mappings.
path = "/home/nuria/workspace/repurposing-hetio/rephetio-dhimmelstein/hetionet+hpo/data/"
orpha_diseases = pd.read_csv("{}orphanet-diseases.tsv".format(path), sep="\t")
# Orphanet -> DO cross-references taken from the Disease Ontology (DO) file.
do_mappings = pd.read_csv("{}do-orpha2do-mappings.tsv".format(path), sep="\t")
do_mappings.head(2)

Unnamed: 0,orphanet,do
0,Orphanet:101076,DOID:0110208
1,Orphanet:2203,DOID:9274


In [6]:
# Unique Orphanet identifiers in Orphadata and in the DO-derived mapping table.
orphanet = set(orpha_diseases["orphanet_code"])
do = set(do_mappings["orphanet"])
# Orphadata diseases that have a cross-reference in the DO mapping table.
mapped = len(orphanet.intersection(do))
# DO coverage: mapped diseases as a percentage of all Orphadata diseases.
coverage = 100 * mapped / len(orphanet)
print('Diseases (orphanumbers) in orphanet: {}'.format(len(orphanet)))
print('Diseases (orphanumbers) in do with DO: {}'.format(len(do)))
print('Diseases (orphanumbers) in orphadata mapped to DO: {}'.format(mapped))
print('**DO coverage: {}% of diseases in orphadata**'.format(round(coverage)))

Diseases (orphanumbers) in orphanet: 2592
Diseases (orphanumbers) in do with DO: 726
Diseases (orphanumbers) in orphadata mapped to DO: 281
**DO coverage: 11% of diseases in orphadata**


### Orphanet - Hetionet Overlap

In [7]:
# Orphanet - Hetionet disease overlap, MONDO variant.
# Load the Orphanet diseases that were mapped to DO ids through MONDO.
orphanet_mappings = pd.read_csv(
    "{}orphanet-diseases-mondo-orpha2do.tsv".format(path), sep="\t"
)
orphanet_mappings.head(2)

Unnamed: 0,orphanumber,doid
0,Orphanet:90291,DOID:418
1,Orphanet:374,DOID:2907


In [8]:
# Hetionet disease nodes (DO slim terms).
# NOTE(review): read from a path relative to the working directory, unlike the
# Orphanet tables, which use the absolute `path` prefix.
hetionet_table = pd.read_csv("data/hetionet-do-slim-terms.tsv", sep="\t")
hetionet_table.head(2)

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic


In [9]:
# Compare the DO ids reached from Orphanet (current `orphanet_mappings` table)
# with the DO ids of the Hetionet disease table.
orphanet = set(orphanet_mappings.doid)
hetionet = set(hetionet_table.doid)
common = len(orphanet & hetionet)  # DO ids present in both resources
diff = len(orphanet - hetionet)    # DO ids reachable from Orphanet only
print('Diseases in orphanet (DO): {}'.format(len(orphanet)))
print('Diseases in hetionet (DO): {}'.format(len(hetionet)))
print('Overlap: {}'.format(common))
print('**We will introduce {} new diseases from orphadata**'.format(diff))
# Raw string: "\(" and "\)" are not valid escape sequences in a normal string
# literal (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
# The raw literal prints exactly the same text, backslashes included.
print(r'+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)')

Diseases in orphanet (DO): 781
Diseases in hetionet (DO): 137
Overlap: 6
**We will introduce 775 new diseases from orphadata**
+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)


In [10]:
# Orphanet - Hetionet disease overlap, DO variant.
# Load the Orphanet diseases mapped to DO ids through DO's own
# cross-references. This rebinds `orphanet_mappings`, replacing the
# MONDO-based table loaded earlier.
orphanet_mappings = pd.read_csv(
    "{}orphanet-diseases-do-orpha2do.tsv".format(path), sep="\t"
)
orphanet_mappings.head(2)

Unnamed: 0,orphanumber,doid
0,Orphanet:101076,DOID:0110208
1,Orphanet:56,DOID:9270


In [11]:
# Compare the DO ids reached from Orphanet (current `orphanet_mappings` table)
# with the DO ids of the Hetionet disease table.
orphanet = set(orphanet_mappings.doid)
hetionet = set(hetionet_table.doid)
common = len(orphanet & hetionet)  # DO ids present in both resources
diff = len(orphanet - hetionet)    # DO ids reachable from Orphanet only
print('Diseases in orphanet (DO): {}'.format(len(orphanet)))
print('Diseases in hetionet (DO): {}'.format(len(hetionet)))
print('Overlap: {}'.format(common))
print('**We will introduce {} new diseases from orphadata**'.format(diff))
# Raw string: "\(" and "\)" are not valid escape sequences in a normal string
# literal (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
# The raw literal prints exactly the same text, backslashes included.
print(r'+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)')

Diseases in orphanet (DO): 463
Diseases in hetionet (DO): 137
Overlap: 3
**We will introduce 460 new diseases from orphadata**
+ "NGLY1 deficiency" manually mapped to (with Elvira): DOID:0060728 \(from ORDO:404454 - OMIM:615273\)


## Symptoms (Hetionet::MeSH) - Phenotypes (Orphanet::HPO)

### Orphanet MESH coverage
**hp2mesh mappings inferred through UMLS**

In [12]:
# Read the Orphanet phenotype table (HP codes and terms) and preview two rows.
orpha_phenotypes = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orpha_phenotypes.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0004912,Hypophosphatemic rickets
1,HP:0000712,Emotional lability


In [13]:
# Read the HP -> MeSH mappings inferred through UMLS and prefix every MeSH
# code with "MESH:" so it is written the same way as the HPO-derived mesh_code
# column and the Hetionet ids it is later compared with.
umls_mappings = (
    pd.read_csv("{}umls2016aa-hp2mesh-mappings.tsv".format(path), sep="\t")
    .assign(mesh_code=lambda df: 'MESH:' + df['mesh_code'].astype(str))
)
umls_mappings.head(2)

Unnamed: 0,hp_code,hp_term,mesh_code,mesh_term
0,HP:0000870,Hyperprolactinaemia,MESH:D006966,"Secretion, Inappropriate Prolactin"
1,HP:0003641,Hemoglobinuria,MESH:D006456,Hemoglobinuria


In [14]:
# Unique HP codes used by Orphanet and unique HP codes that UMLS maps to MeSH.
orphanet = set(orpha_phenotypes["hp_code"])
umls = set(umls_mappings["hp_code"])
# Orphanet phenotypes that have a UMLS-inferred MeSH mapping.
mapped = len(orphanet.intersection(umls))
# MeSH coverage: mapped phenotypes as a percentage of all Orphanet phenotypes.
coverage = 100 * mapped / len(orphanet)
print('Phenotypes (hp) in orphanet: {}'.format(len(orphanet)))
print('Phenotypes (hp) in umls with mesh: {}'.format(len(umls)))
print('HP mapped: {}'.format(mapped))
print('**MESH coverage: {}**'.format(round(coverage)))

Phenotypes (hp) in orphanet: 4543
Phenotypes (hp) in umls with mesh: 908
HP mapped: 600
**MESH coverage: 13**


**hp2mesh mappings from HPO**

In [15]:
# Read the HP -> MeSH cross-references provided by HPO itself.
hpo_mappings = pd.read_csv("{}hp-hp2mesh-mappings.tsv".format(path), sep="\t")
hpo_mappings.head(2)

Unnamed: 0,hp_code,hp_term,mesh_code
0,HP:0009829,Phocomelia,MESH:D004480
1,HP:0012622,Chronic kidney disease,MESH:D051436


In [16]:
# Unique HP codes used by Orphanet and unique HP codes that HPO maps to MeSH.
orphanet = set(orpha_phenotypes["hp_code"])
hpo = set(hpo_mappings["hp_code"])
# Orphanet phenotypes that have an HPO-provided MeSH mapping.
mapped = len(orphanet.intersection(hpo))
# MeSH coverage: mapped phenotypes as a percentage of all Orphanet phenotypes.
coverage = 100 * mapped / len(orphanet)
print('Phenotypes (hp) in orphanet: {}'.format(len(orphanet)))
print('Phenotypes (hp) in hpo with mesh: {}'.format(len(hpo)))
print('HP mapped: {}'.format(mapped))
print('**MESH coverage: {}**'.format(round(coverage)))

Phenotypes (hp) in orphanet: 4543
Phenotypes (hp) in hpo with mesh: 2105
HP mapped: 1461
**MESH coverage: 32**


**hp2mesh mappings from HPO and UMLS**

In [17]:
# Do the UMLS-inferred and the HPO-provided hp2mesh mappings cover the same
# HP terms? At this point `orphanet`, `umls` and `hpo` are sets of HP codes.

# HP terms mapped by both sources (kept for inspection; not printed).
intersection = len(hpo.intersection(umls))
# ... restricted to Orphanet phenotypes (kept for inspection; not printed).
intersectionOrpha = len(orphanet.intersection(umls, hpo))
# Pool both sources and count the Orphanet phenotypes with any MeSH mapping.
union = umls.union(hpo)
unionOrpha = len(orphanet.intersection(union))
# MeSH coverage with the pooled mappings, as a percentage of Orphanet phenotypes.
coverage = 100 * unionOrpha / len(orphanet)
print("Using hp2mesh mappings from both hpo and umls:")
print("HP mapped: {}".format(unionOrpha))
print('**MESH coverage: {}**'.format(round(coverage)))

Using hp2mesh mappings from both hpo and umls:
HP mapped: 1463
**MESH coverage: 32**


### Hetionet HPO coverage
**mesh2hp mappings inferred through UMLS**

In [18]:
# Read the Hetionet symptom table and prefix every MeSH id with "MESH:" so it
# is written the same way as the mesh_code columns of the mapping tables.
hetionet_symptoms = (
    pd.read_csv("data/hetionet-symptoms.tsv", sep="\t")
    .assign(mesh_id=lambda df: 'MESH:' + df['mesh_id'].astype(str))
)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [19]:
# Unique MeSH ids of Hetionet symptoms and unique MeSH ids that UMLS links to HP.
# NOTE: `umls` is rebound here -- it previously held HP codes, from now on it
# holds MeSH ids.
hetionet = set(hetionet_symptoms["mesh_id"])
umls = set(umls_mappings["mesh_code"])
# Hetionet symptoms that have a UMLS-inferred HP mapping.
mapped = len(hetionet.intersection(umls))
# HPO coverage: mapped symptoms as a percentage of all Hetionet symptoms.
coverage = 100 * mapped / len(hetionet)
print('Symptoms (mesh) in hetionet: {}'.format(len(hetionet)))
print('Symptoms (mesh) in umls with hp: {}'.format(len(umls)))
print('MESH mapped: {}'.format(mapped))
print('**HPO coverage: {}**'.format(round(coverage)))

Symptoms (mesh) in hetionet: 438
Symptoms (mesh) in umls with hp: 881
MESH mapped: 70
**HPO coverage: 16**


**mesh2hp mappings from HPO**

In [20]:
# Unique MeSH ids of Hetionet symptoms and unique MeSH ids that HPO links to HP.
# NOTE: `hpo` is rebound here -- it previously held HP codes, from now on it
# holds MeSH ids.
hetionet = set(hetionet_symptoms["mesh_id"])
hpo = set(hpo_mappings["mesh_code"])
# Hetionet symptoms that have an HPO-provided HP mapping.
mapped = len(hetionet.intersection(hpo))
# HPO coverage: mapped symptoms as a percentage of all Hetionet symptoms.
coverage = 100 * mapped / len(hetionet)
print('Symptoms (mesh) in hetionet: {}'.format(len(hetionet)))
print('Symptoms (mesh) in hpo with hp: {}'.format(len(hpo)))
print('MESH mapped: {}'.format(mapped))
print('**HPO coverage: {}**'.format(round(coverage)))

Symptoms (mesh) in hetionet: 438
Symptoms (mesh) in hpo with hp: 1812
MESH mapped: 215
**HPO coverage: 49**


**mesh2hp mappings from HPO and UMLS**

In [21]:
# Do the UMLS-inferred and the HPO-provided mesh2hp mappings cover the same
# MeSH ids? At this point `hetionet`, `umls` and `hpo` are sets of MeSH ids.

# MeSH ids mapped by both sources (kept for inspection; not printed).
intersection = len(hpo.intersection(umls))
# ... restricted to Hetionet symptoms (kept for inspection; not printed).
intersectionHetio = len(hetionet.intersection(umls, hpo))
# Pool both sources and count the Hetionet symptoms with any HP mapping.
union = umls.union(hpo)
unionHetio = len(hetionet.intersection(union))
# HPO coverage with the pooled mappings, as a percentage of Hetionet symptoms.
coverage = 100 * unionHetio / len(hetionet)
print("Using mesh2hp mappings from both hpo and umls:")
print("MESH mapped: {}".format(unionHetio))
print('**HPO coverage: {}**'.format(round(coverage)))

Using mesh2hp mappings from both hpo and umls:
MESH mapped: 215
**HPO coverage: 49**


### Orphanet - Hetionet Overlap (HPO)

In [22]:
# Orphanet - Hetionet phenotype overlap in HP space.
# Orphanet side: phenotypes are already HP-coded, so the table is used as is.
orphanet_symptoms = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0004912,Hypophosphatemic rickets
1,HP:0000712,Emotional lability


In [23]:
# Hetionet side: attach HP codes to the MeSH-coded symptoms using the HPO
# mappings (default inner join, so symptoms without a mapping are dropped).
# NOTE(review): this rebinds `hetionet_symptoms`; re-running the cell would
# merge the already-merged frame again. A later cell reloads the raw table.
hetionet_symptoms = pd.merge(
    hetionet_symptoms,
    hpo_mappings,
    left_on='mesh_id',
    right_on='mesh_code',
)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn,hp_code,hp_term,mesh_code
0,MESH:D000370,Ageusia,1,HP:0000224,Decreased taste sensation,MESH:D000370
1,MESH:D000377,Agnosia,1,HP:0030222,Visual agnosia,MESH:D000377


In [24]:
# Unique HP codes on each side: Orphanet phenotypes and HP-mapped Hetionet symptoms.
orphanet = set(orphanet_symptoms["hp_code"])
hetionet = set(hetionet_symptoms["hp_code"])
# HP codes shared by both, and HP codes found in Orphanet only.
common = len(orphanet.intersection(hetionet))
diff = len(orphanet.difference(hetionet))
# The overlap is expressed relative to the Hetionet HP set.
common_percentage = round(100 * common / len(hetionet))
print('Phenotypes in orphanet (HP): {}'.format(len(orphanet)))
print('Symptoms in hetionet (HP): {}'.format(len(hetionet)))
print('Overlap: {} ({}%)'.format(common, common_percentage))
print('**We will introduce {} new phenotypes from orphadata**'.format(diff))

Phenotypes in orphanet (HP): 4543
Symptoms in hetionet (HP): 310
Overlap: 233 (75%)
**We will introduce 4310 new phenotypes from orphadata**


### Orphanet - Hetionet Overlap (MESH)

In [25]:
# Orphanet - Hetionet phenotype overlap in MeSH space (via the HPO mappings).
# Reload the HP-coded Orphanet phenotypes; `orphanet_symptoms_hp` keeps a
# handle on this HP-level table, and `orphanet_symptoms` starts out as the
# same object.
orphanet_symptoms_hp = pd.read_csv("{}orphanet-symptoms.tsv".format(path), sep="\t")
orphanet_symptoms = orphanet_symptoms_hp
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term
0,HP:0004912,Hypophosphatemic rickets
1,HP:0000712,Emotional lability


In [26]:
# Translate Orphanet HP codes to MeSH using the HPO mappings (default inner
# join on hp_code, so phenotypes without a MeSH mapping are dropped). Both
# tables carry an hp_term column, hence the hp_term_x / hp_term_y pair.
# NOTE(review): `orphanet_symptoms` is rebound to the merged frame; re-running
# this cell without re-running the load cell would merge it a second time.
orphanet_symptoms_mesh = pd.merge(orphanet_symptoms, hpo_mappings, on='hp_code')
orphanet_symptoms = orphanet_symptoms_mesh
orphanet_symptoms.head(2)

Unnamed: 0,hp_code,hp_term_x,hp_term_y,mesh_code
0,HP:0004912,Hypophosphatemic rickets,Hypophosphatemic rickets,MESH:D063730
1,HP:0012056,Cutaneous melanoma,Cutaneous melanoma,MESH:D008545


In [27]:
# Hetionet side, MeSH-coded: reload the raw symptom table (the earlier
# `hetionet_symptoms` was replaced by its HP-merged version) and add the
# "MESH:" prefix so ids match the mesh_code column of the mapping table.
hetionet_symptoms = (
    pd.read_csv("data/hetionet-symptoms.tsv", sep="\t")
    .assign(mesh_id=lambda df: 'MESH:' + df['mesh_id'].astype(str))
)
hetionet_symptoms.head(2)

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,MESH:D000006,"Abdomen, Acute",1
1,MESH:D000270,Adie Syndrome,0


In [28]:
# Size of each vocabulary: rows of the HP-level Orphanet table, distinct MeSH
# codes after mapping, and rows of the Hetionet symptom table.
print('Orphanet symptoms (HP): {}'.format(orphanet_symptoms_hp.shape[0]))
print('Orphanet symptoms (MESH): {}'.format(len(orphanet_symptoms_mesh['mesh_code'].unique())))
print('Hetionet symptoms (MESH): {}'.format(hetionet_symptoms.shape[0]))

Orphanet symptoms (HP): 4543
Orphanet symptoms (MESH): 1322
Hetionet symptoms (MESH): 438


In [29]:
# Unique MeSH ids on each side: MeSH-mapped Orphanet phenotypes and Hetionet symptoms.
orphanet = set(orphanet_symptoms["mesh_code"])
hetionet = set(hetionet_symptoms["mesh_id"])
# Shared MeSH ids, also expressed as a percentage of the Hetionet symptom set.
overlap = len(orphanet.intersection(hetionet))
overlap_percentage = round(100 * overlap / len(hetionet))
print('Phenotypes in orphanet (MESH): {}'.format(len(orphanet)))
print('Symptoms in hetionet (MESH): {}'.format(len(hetionet)))
print('Overlap: {} ({}%)'.format(overlap, overlap_percentage))

Phenotypes in orphanet (MESH): 1322
Symptoms in hetionet (MESH): 438
Overlap: 179 (41%)
