In [69]:
import pandas as pd

### Get list of disease nodes

In [70]:
# read list of disease nodes from hetionet
commit = '75050ea2d4f60e745d3f3578ae03560a2cc0e444'
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/' + commit + '/data/slim-terms.tsv'
disease_df = pd.read_table(url)
disease_df.head(2)

Unnamed: 0,doid,name,source,pathophysiology
0,DOID:2531,hematologic cancer,DOcancerslim,neoplastic
1,DOID:1319,brain cancer,DOcancerslim,neoplastic


In [71]:
# add rare diseases from orphadata
# read rd from gene annotation
url = '/home/nuria/workspace/repurposing/rephetio-su/orphanet/data/orphanet-mondo-disease-gene.tsv'
orpha_gene_df = pd.read_table(url)
orpha_gene_df.head(2)

Unnamed: 0,orphanet_code,orphanet_term,orphanet_gene_id,gene_orphanumber,gene_name,gene_type,gene_symbol,dga_type,dga_status
0,DOID:0050461,Aspartylglucosaminuria,15470,119513,aspartylglucosaminidase,gene with protein product,AGA,Disease-causing germline mutation(s) in,Assessed
1,DOID:0050441,Multiple sulfatase deficiency,15570,119899,sulfatase modifying factor 1,gene with protein product,SUMF1,Disease-causing germline mutation(s) in,Assessed


In [72]:
# read rd from phenotype annotation
url = '/home/nuria/workspace/repurposing/rephetio-su/orphanet/data/orphanet-mondo-disease-symptom.tsv'
orpha_pheno_df = pd.read_table(url)
orpha_pheno_df.head(2)

Unnamed: 0,orphanet_code,orphanet_term,hp_code,hp_term
0,DOID:10923,Sickle cell anemia,HP:0004870,Chronic hemolytic anemia
1,DOID:10923,Sickle cell anemia,HP:0001878,Hemolytic anemia


In [73]:
# merge all disease sets
hetionet_df = disease_df[['doid', 'name']]
rd_gene_df = orpha_gene_df[['orphanet_code', 'orphanet_term']]
rd_pheno_df = orpha_pheno_df[['orphanet_code', 'orphanet_term']]
rd_gene_df = rd_gene_df.rename(columns={'orphanet_code':'doid', 'orphanet_term':'name'})
rd_pheno_df = rd_pheno_df.rename(columns={'orphanet_code':'doid', 'orphanet_term':'name'})
rd_gene_df = rd_gene_df.drop_duplicates()
rd_pheno_df = rd_pheno_df.drop_duplicates()
all_df = pd.concat([hetionet_df,rd_gene_df,rd_pheno_df])
rd_df = all_df.drop_duplicates()
print('#rd: {}'.format(len(rd_df)))
rd_df.head(2)

#rd: 5062


Unnamed: 0,doid,name
0,DOID:2531,hematologic cancer
1,DOID:1319,brain cancer


In [74]:
rd_df.to_csv('data/diseases.tsv', sep='\t', index=False, header=True)

In [75]:
len(rd_gene_df.doid.unique()), len(rd_pheno_df.doid.unique()), len(set(list(rd_gene_df.doid)) & set(list(rd_pheno_df.doid)))

(3569, 2592, 1235)

In [76]:
hetionet = set(list(disease_df.doid))
orpha_gene = set(list(rd_gene_df.doid))
common = (hetionet & orpha_gene)
len(hetionet), len(orpha_gene), len(common), common

(137,
 3569,
 5,
 {'DOID:0050156', 'DOID:10283', 'DOID:11555', 'DOID:12236', 'DOID:8986'})

In [77]:
hetionet = set(list(disease_df.doid))
orpha_pheno = set(list(rd_pheno_df.doid))
common = (hetionet & orpha_pheno)
len(hetionet), len(orpha_pheno), len(common), common

(137,
 2592,
 6,
 {'DOID:12236',
  'DOID:12365',
  'DOID:13378',
  'DOID:3277',
  'DOID:418',
  'DOID:8986'})

In [78]:
common = (hetionet & orpha_gene & orpha_pheno)
common

{'DOID:12236', 'DOID:8986'}