In [1]:
%matplotlib inline

import pandas as pd
from pathlib import Path
from biothings_client import get_client

from metapaths.tools import obo_tools as ot
from metapaths.tools.processing import expand_col_on_char, expand_split_col
from hetnet_ml.src import graph_tools as gt


# Notebook name doubles as the name of this step's folder under the pipeline directory.
this_name = '11_Incorporation_of_Phenotypes_HPO'

# Inputs come from the download step; outputs go to <pipeline>/<this_name>/out.
# Both are given relative to the working directory and made absolute with resolve().
data_dir = Path('../2_pipeline/00_download_data/out/').resolve()
out_dir = (Path('../2_pipeline/') / this_name / 'out').resolve()

In [2]:
# Create the output directory together with any missing parents.
# exist_ok=True makes the call a no-op when the directory is already there, which
# replaces the separate exists() check and keeps the cell safe to re-run.
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# The node table and the edge table are both read from the same HPO OBO download.
hpo_obo_file = data_dir / 'hpo.obo'
hp_nodes = ot.get_ontology_nodes(hpo_obo_file)
hp_struct = ot.get_ontology_edges(hpo_obo_file)

In [4]:
# Column labels applied to the header-less HPO annotation table.
# NOTE(review): in the rows displayed below, 'modifier' holds values like 'P' and
# 'date_created' holds curator ids such as 'HPO:skoehler', so the labels after
# 'frequency' look shifted relative to the file's real layout — confirm against the
# HPO file-format description. The cells visible in this notebook only rely on
# 'db', 'db_object_id', 'hp_id' and 'evidence_code'.
tab_cols = [
    'db', 'db_object_id', 'db_name', 'qualifier', 'hp_id', 'db_reference',
    'evidence_code', 'onset_modifier', 'frequency', 'sex', 'modifier', 'aspect',
    'date_created', 'assigned_by',
]
gene_cols = ['gene_id', 'gene_symbol', 'hp_name', 'hp_id']

# Every column is read as str so identifiers keep their exact textual form.
hp_anno = pd.read_csv(
    data_dir / 'phenotype_annotation.tab',
    sep='\t', header=None, names=tab_cols, dtype=str,
)
# Lines (or line remainders) starting at '#' are treated as comments in the gene file.
hp_gene = pd.read_csv(
    data_dir / 'ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt',
    sep='\t', header=None, names=gene_cols, comment='#', dtype=str,
)

In [5]:
# Lookup from HPO term id to term name (for a repeated id the last name wins).
hp_name_map = dict(zip(hp_nodes['id'], hp_nodes['name']))

# Label every annotation row with the name of its HPO term.
hp_anno['hp_name'] = hp_anno['hp_id'].map(hp_name_map)

In [6]:
# Row counts of the disease annotation table and the gene-to-phenotype table.
len(hp_anno), len(hp_gene)

(166084, 142947)

In [7]:
# Preview the first two annotation rows to check how the columns were labelled.
hp_anno.head(2)

Unnamed: 0,db,db_object_id,db_name,qualifier,hp_id,db_reference,evidence_code,onset_modifier,frequency,sex,modifier,aspect,date_created,assigned_by,hp_name
0,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0000252,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler,,Microcephaly
1,DECIPHER,1,Wolf-Hirschhorn Syndrome,,HP:0001249,DECIPHER:1,IEA,,,,P,WOLF-HIRSCHHORN SYNDROME,HPO:skoehler,,Intellectual disability


In [8]:
# Preview the first two gene-to-phenotype rows.
hp_gene.head(2)

Unnamed: 0,gene_id,gene_symbol,hp_name,hp_id
0,8192,CLPP,Seizures,HP:0001250
1,8192,CLPP,Short stature,HP:0004322


## Examine Genes

Genes are identified by Entrez gene IDs, so checking for overlap with the current network should be straightforward.

In [9]:
# Distinct gene ids that appear in the gene-to-phenotype table.
hp_genes = hp_gene['gene_id'].unique()

In [10]:
# Load the nodes of the current network, all columns as str.
# gt.remove_colons is applied to the raw frame (presumably normalising the
# ':'-containing column headers of the node file — TODO confirm).
nodes = gt.remove_colons(pd.read_csv('../2_pipeline/07_Filter_non_human_annotations/out/nodes_filt.csv', dtype=str))

# Preview only: the first few network nodes whose id is an HPO-annotated gene id.
# Bounded with head() so the cell never dumps the whole filtered frame.
nodes.query('id in @hp_genes').head(10)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id,mesh_ids,chebi_ids
22691,2,alpha-2-macroglobulin,Gene,,,,A2M,100061692|100173946|100390764|100543551|100657...,106524|231245|246347|679499,PA24357,A0A1U7TC46|A0A2K5E7U5|A0A2K5KBI2|A0A2K6N6A9|A0...,P01023,,
22693,144568,alpha-2-macroglobulin like 1,Gene,,,,A2ML1,100061421|100127688|100152492|100347314|100407...,126860,PA142670460,A0A0D9RAZ7|A0A1S3AMH8|A0A1U7S0T5|A0A1U7SWH3|A0...,A8K2U0,,
22696,53947,"alpha 1,4-galactosyltransferase (P blood group)",Gene,,,,A4GALT,100017110|100090950|100172807|100347783|100399...,119825,PA143485570|PA24359,A0A096N8Q2|A0A0D9SD56|A0A0S2Z5J1|A0A1S2ZRQ3|A0...,Q9NPC4,,
22699,8086,aladin WD repeat nucleoporin,Gene,,,,AAAS,100063811|100154333|100218436|100356621|100405...,113759|230214,PA24361,A0A087XX44|A0A091CTK4|A0A096NFX9|A0A0P7XRK7|A0...,Q9NRG9,,
22708,79719,alpha and gamma adaptin binding protein,Gene,,,,AAGAB,100018736|100052900|100085922|100167316|100171...,122835|211824|251245,PA165478457,A0A087XSX4|A0A091HBL0|A0A091RR32|A0A091SVE7|A0...,Q6PD74,,
22715,16,alanyl-tRNA synthetase,Gene,,,,AARS,100011180|100054983|100074385|100172296|100195...,106534|231572,PA24367,A0A087QNA5|A0A087YD10|A0A091HLZ9|A0A091JF69|A0...,P49588,,
22716,57505,"alanyl-tRNA synthetase 2, mitochondrial",Gene,,,,AARS2,100013354|100067843|100155115|100231217|100337...,121569,PA162375129,A0A091DVT7|A0A091FJP6|A0A091LAT9|A0A093BS06|A0...,Q5JTZ9,,
22720,10157,aminoadipate-semialdehyde synthase,Gene,,,,AASS,100016400|100056260|100076030|100222251|100352...,115459,PA24369,A0A087VFH2|A0A091DEE2|A0A091GVL3|A0A091HV79|A0...,Q9UDR5,,
22724,18,4-aminobutyrate aminotransferase,Gene,,,,ABAT,100026169|100051470|100075373|100158654|100174...,106536|1149627|88904,PA24372,A0A096NLK8|A0A0D9R8N1|A0A1S3EUJ4|A0A1S3WN71|A0...,P80404,,
22725,19,ATP binding cassette subfamily A member 1,Gene,,,,ABCA1,100013781|100054241|100075984|100152112|100220...,106537|197900,PA24373,A0A096P0Z3|A0A0A0R2Y4|A0A0D9RFW1|A0A0N8ETW3|A0...,O95477,,


In [11]:
# Number of distinct (non-missing) gene ids with a phenotype annotation.
hp_gene['gene_id'].nunique()

4016

In [12]:
# Ids of every node in the network, regardless of label, for fast membership tests.
node_ids = set(nodes['id'])

# Annotated gene ids that have no node in the network.
set(hp_gene['gene_id']).difference(node_ids)

{'101928376'}

Only 1 gene is missing... Don't think we'll add it, but let's at least look at what it is

In [13]:
# Look up the one gene id missing from the network through the biothings gene client
# (presumably a query against a remote annotation service — needs network access).
mg = get_client('gene')
# Only the name, symbol and uniprot fields are requested for this Entrez id.
mg.getgene(101928376, fields=['name', 'symbol', 'uniprot'])

{'_id': '101928376',
 '_score': 13.905111,
 'name': 'IL12A antisense RNA 1',
 'symbol': 'IL12A-AS1'}

## Now HP to Disease Links

HP has OMIM ids for diseases... luckily our CTD network already has xrefs to OMIM for many diseases, so we will use these to merge concepts.

In [14]:
# Number of distinct OMIM entries that carry at least one HPO annotation.
hp_anno.loc[hp_anno['db'] == 'OMIM', 'db_object_id'].nunique()

7605

In [15]:
# Build a prefixed identifier (e.g. 'OMIM:100100') from the source database and its
# object id, so annotations can be compared with the network's alternate disease ids.
hp_anno['disease_id'] = hp_anno['db'] + ':' + hp_anno['db_object_id']

In [16]:
# Number of distinct Disease nodes currently in the network.
nodes.query('label == "Disease"')['id'].nunique()

7201

In [17]:
# Map each Disease node id to an alternate disease id taken from its '|'-delimited
# 'alt_disease_ids' column; nodes without alternate ids are dropped.
# NOTE(review): expand_col_on_char presumably yields one row per '|'-separated id. If so,
# a node with several alternate ids sits on several rows with the same 'id', and
# to_dict() keeps only one value per node id — the other alternate ids silently fall
# out of the lookup, which would inflate the "missing" count below. Confirm whether
# an alt-id -> node-id mapping was intended instead.
mesh_to_omim = (expand_col_on_char(nodes.query('label == "Disease"'), 'alt_disease_ids', '|')
                    .dropna(subset=['alt_disease_ids'])
                    .set_index('id')['alt_disease_ids'].to_dict())

# Number of HPO-annotated OMIM diseases whose id is not among the mapped alternate ids.
len(set(hp_anno.query('db == "OMIM"')['disease_id']) - set(mesh_to_omim.values()))

4930

In [18]:
# OMIM diseases annotated by HPO, and the alternate ids the network knows about.
hpo_omim_ids = set(hp_anno.query('db == "OMIM"')['disease_id'])
network_alt_ids = set(mesh_to_omim.values())

# Annotated OMIM diseases with no counterpart in the network.
missing_diseases = hpo_omim_ids.difference(network_alt_ids)
hp_anno[hp_anno['disease_id'].isin(missing_diseases)]['disease_id'].nunique()

4930

In [19]:
# Annotated OMIM diseases that do have a counterpart among the network's alternate ids.
annotated_omim = set(hp_anno.query('db == "OMIM"')['disease_id'])
found_diseases = annotated_omim.intersection(mesh_to_omim.values())

# Keep only the annotation rows for those diseases (independent copy of the rows).
hp_to_dis = hp_anno[hp_anno['disease_id'].isin(found_diseases)].copy()

# Distinct diseases retained, then total annotation rows retained.
print(hp_to_dis['disease_id'].nunique())
print(len(hp_to_dis))

2675
41319


In [20]:
# Break the retained annotations down by evidence code.
hp_to_dis['evidence_code'].value_counts()

IEA    21763
TAS    15392
PCS     4164
Name: evidence_code, dtype: int64

In [21]:
# Electronic annotations (IEA) make up the largest share of rows, so only the
# remaining evidence codes are kept, with one row per (disease, phenotype) pair.
# NOTE(review): the 'qualifier' column is not consulted here — confirm whether the
# file can carry negated annotations that should be excluded before building edges.
not_electronic = hp_to_dis['evidence_code'] != 'IEA'
hp_to_dis = hp_to_dis[not_electronic].drop_duplicates(subset=['disease_id', 'hp_id'])
hp_to_dis.head(4)

Unnamed: 0,db,db_object_id,db_name,qualifier,hp_id,db_reference,evidence_code,onset_modifier,frequency,sex,modifier,aspect,date_created,assigned_by,hp_name,disease_id
341,OMIM,100100,#100100 PRUNE BELLY SYNDROME; PBS;;ABDOMINAL M...,,HP:0001627,OMIM:100100,TAS,,,,P,,HPO:skoehler,,Abnormal heart morphology,OMIM:100100
434,OMIM,101000,"NEUROFIBROMATOSIS, TYPE II",,HP:0000360,PMID:1484939,PCS,,HP:0040284,,P,,HPO:probinson,12/120,Tinnitus,OMIM:101000
435,OMIM,101000,"NEUROFIBROMATOSIS, TYPE II",HP:0012833,HP:0000365,PMID:1484939,PCS,,HP:0040284,,P,,HPO:probinson,42/120,Hearing impairment,OMIM:101000
436,OMIM,101000,"NEUROFIBROMATOSIS, TYPE II",,HP:0000957,PMID:1484939,PCS,,HP:0040284,,P,,HPO:iea,43%,Cafe-au-lait spot,OMIM:101000


Many HPO terms have xrefs to either MeSH or OMIM. Those that are already classified as diseases in our network are not of interest, so we will compare the xrefs of the HPO ids to our network and keep only the terms whose xrefs do not match an existing Disease node.

In [22]:
# Split each term's '|'-delimited xref string into a list, expand it with the project
# helper, and rewrite the 'MSH:' prefix as 'MESH:' (the form seen in the network ids).
xref_lists = hp_nodes['xref'].dropna().str.split('|')
hp_xrefs = expand_split_col(xref_lists)['xref'].str.replace('MSH:', 'MESH:').unique()

In [23]:
# Same expansion as for the xref list, but indexed by HPO id so each xref keeps its term;
# the helper's 'old_idx' column is renamed to 'hp_id'.
xrefs_by_hp = hp_nodes.set_index('id')['xref'].dropna().str.split('|')
hp_to_xref = expand_split_col(xrefs_by_hp).rename(columns={'old_idx': 'hp_id'})

# How many HPO terms carry more than one MSH xref?
is_mesh_xref = hp_to_xref['xref'].str.contains('MSH:')
mesh_xrefs_per_term = hp_to_xref[is_mesh_xref]['hp_id'].value_counts()
(mesh_xrefs_per_term > 1).sum()

78

In [24]:
# Number of distinct HPO terms that have at least one xref.
hp_to_xref['hp_id'].nunique()

11686

In [25]:
# Distinct ids of the network's Disease nodes, used to test HPO xrefs against.
disease_ids = nodes.query('label == "Disease"')['id'].unique()

In [26]:
# Normalise the prefix so xrefs can be compared with the network's 'MESH:' ids.
hp_to_xref['xref'] = hp_to_xref['xref'].str.replace('MSH:', 'MESH:')

# Split the xref rows by whether the xref is already a Disease node in the network.
xref_is_disease = hp_to_xref['xref'].isin(disease_ids)
hp_disease_ids = hp_to_xref[xref_is_disease]['hp_id'].unique()
non_disease_hp = hp_to_xref[~xref_is_disease].copy()
non_disease_hp['name'] = non_disease_hp['hp_id'].map(hp_name_map)

# Count of MESH xref rows that do not correspond to a Disease node.
still_mesh = non_disease_hp['xref'].str.contains('MESH:')
len(non_disease_hp[still_mesh])

453

In [27]:
# Drop annotations whose HPO term is itself cross-referenced to a Disease node.
is_disease_like = hp_to_dis['hp_id'].isin(hp_disease_ids)
hp_to_dis = hp_to_dis[~is_disease_like].copy()

# Remaining rows, distinct phenotypes, distinct diseases — one per line.
for remaining in (len(hp_to_dis),
                  hp_to_dis['hp_id'].nunique(),
                  hp_to_dis['disease_id'].nunique()):
    print(remaining)

12483
2824
2245


In [28]:
# Spot-check a random handful of the remaining annotations. The fixed seed keeps the
# displayed rows identical across Restart & Run All.
hp_to_dis.sample(10, random_state=0)

Unnamed: 0,db,db_object_id,db_name,qualifier,hp_id,db_reference,evidence_code,onset_modifier,frequency,sex,modifier,aspect,date_created,assigned_by,hp_name,disease_id
54030,OMIM,602579,"CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Ib;...",,HP:0011473,OMIM:602579,TAS,,,,P,"CONGENITAL DISORDER OF GLYCOSYLATION, TYPE IB;...",HPO:skoehler,,Villous atrophy,OMIM:602579
50994,OMIM,601152,#601152 HEREDITARY MOTOR AND SENSORY NEUROPATH...,,HP:0008587,OMIM:601152,TAS,,HP:0040283,,P,,HPO:probinson,HP:0040283,Mild neurosensory hearing impairment,OMIM:601152
15494,OMIM,186570,#186570 TARSAL-CARPAL COALITION SYNDROME; TCC;...,,HP:0002967,OMIM:186570,PCS,,HP:0040283,,P,,HPO:iea,HP:0040283,Cubitus valgus,OMIM:186570
65625,OMIM,610717,NEUTRAL LIPID STORAGE DISEASE WITH MYOPATHY,,HP:0009046,OMIM:610717,TAS,,,,P,,HPO:skoehler,,Difficulty running,OMIM:610717
59048,OMIM,607595,BRAIN SMALL VESSEL DISEASE WITH OR WITHOUT OCU...,,HP:0000519,OMIM:607595,TAS,,HP:0040283,,P,,HPO:skoehler,HP:0040283,Developmental cataract,OMIM:607595
69285,OMIM,612718,Cerebral creatine deficiency syndrome 3,,HP:0025051,PMID:20682460,PCS,,,,P,CEREBRAL CREATINE DEFICIENCY SYNDROME 3,HP:probinson,,Reduced brain creatine level by MRS,OMIM:612718
15499,OMIM,186570,#186570 TARSAL-CARPAL COALITION SYNDROME; TCC;...,,HP:0008368,OMIM:186579;PMID:11545688,PCS,,HP:0040282,,P,,HPO:iea,HP:0040282,Tarsal synostosis,OMIM:186570
39137,OMIM,273750,#273750 THREE M SYNDROME 1; 3M1;;3@M SYNDROME;...,,HP:0001382,OMIM:273750,TAS,,,,P,,HPO:probinson,,Joint hypermobility,OMIM:273750
59587,OMIM,607812,CRANIOLENTICULOSUTURAL DYSPLASIA,,HP:0000343,OMIM:607812,TAS,,,,P,,HPO:probinson,,Long philtrum,OMIM:607812
53552,OMIM,602398,#602398 DESMOSTEROLOSIS,,HP:0000463,OMIM:602398,TAS,,,,P,,HPO:skoehler,,Anteverted nares,OMIM:602398


In [29]:
# Distinct HPO ids that survive the disease-annotation filtering above.
hp_to_keep = hp_to_dis['hp_id'].unique()

So these are the HPO ids that we'll keep for our network... 
Not all the Gene to HP ids will be contained in this set so we need to filter a bit

In [30]:
# Distinct genes left when restricting to kept HPO ids and to ids present in the network.
hp_gene.query('hp_id in @hp_to_keep and gene_id in @node_ids')['gene_id'].nunique()

3951

In [31]:
# Distinct HPO ids left under the same restriction.
hp_gene.query('hp_id in @hp_to_keep and gene_id in @node_ids')['hp_id'].nunique()

2803

In [32]:
# Total gene-to-phenotype rows left under the same restriction.
len(hp_gene.query('hp_id in @hp_to_keep and gene_id in @node_ids'))

67367

## Aggregate the new edges and save to disk

In [33]:
# Invert the node-id -> alternate-id lookup so annotation disease ids can be translated
# back to network node ids (if two nodes share an alternate id, the later one wins).
omim_to_mesh = {alt_id: node_id for node_id, alt_id in mesh_to_omim.items()}

# Disease -> Phenotype edges: the disease becomes the start node, the HPO term the end node.
hp_to_dis_edges = (
    hp_to_dis
    .rename(columns={'disease_id': 'start_id', 'hp_id': 'end_id'})
    .assign(
        start_id=lambda edges: edges['start_id'].map(omim_to_mesh),
        type='presents_DpPH',
    )
)

hp_to_dis_edges[['start_id', 'end_id', 'type']].head(2)

Unnamed: 0,start_id,end_id,type
443,MESH:D016518,HP:0007935,presents_DpPH
445,MESH:D016518,HP:0009590,presents_DpPH


In [34]:
# Phenotype -> Gene edges, limited to kept HPO ids and to genes present in the network;
# the HPO term becomes the start node and the gene the end node.
keep_rows = hp_gene['hp_id'].isin(hp_to_keep) & hp_gene['gene_id'].isin(node_ids)
hp_to_gene_edges = (
    hp_gene[keep_rows]
    .rename(columns={'hp_id': 'start_id', 'gene_id': 'end_id'})
    .assign(type='associated_with_PHawG')
)

hp_to_gene_edges[['start_id', 'end_id', 'type']].head(2)

Unnamed: 0,start_id,end_id,type
1,HP:0004322,8192,associated_with_PHawG
2,HP:0000786,8192,associated_with_PHawG


In [35]:
# Stack both edge sets into one table with a fresh 0..n-1 index, keeping only the
# three columns that describe an edge.
edge_cols = ['start_id', 'end_id', 'type']
all_edges = pd.concat(
    [hp_to_dis_edges[edge_cols], hp_to_gene_edges[edge_cols]],
    ignore_index=True,
)
all_edges.head(2)

Unnamed: 0,start_id,end_id,type
0,MESH:D016518,HP:0007935,presents_DpPH
1,MESH:D016518,HP:0009590,presents_DpPH


In [36]:
# Every id that appears at either end of an edge.
edge_ids = set(all_edges['start_id']).union(set(all_edges['end_id']))

# Keep only the HPO terms that take part in an edge and label them as Phenotype nodes.
# assign() builds a new frame, so no column is written onto a slice of hp_nodes
# (the original assignment triggered pandas' SettingWithCopyWarning).
hp_nodes_filt = hp_nodes.query('id in @edge_ids')[['id', 'name']].assign(label='Phenotype')

In [37]:
# Persist the new edges and Phenotype nodes as CSV files (without the index column)
# in this step's output directory.
for frame, filename in ((all_edges, 'edges.csv'), (hp_nodes_filt, 'nodes.csv')):
    frame.to_csv(out_dir / filename, index=False)