# Building out the CTD network

We've downloaded the files from CTD.  Some contain node to node relationships, others contian information about the nodes including identifiers and hierarchial structures.  We will use these data to build a node and edges file that can be imported into neo4j

In [1]:
# python package imports
import pandas as pd
from pathlib import Path

# personal package imports
import hetnet_ml.src.graph_tools as gt
from metapaths.tools.hetnet_file_processing import read_ctd
from metapaths.tools import processing as pr

In [2]:
file_info = pd.read_csv('../0_data/manual/edge_file_info.csv')
load_dir = Path('../2_pipeline/00_download_data/out/').resolve()

In [3]:
id_files = file_info.query('file_source == "CTD" and file_type == "ids"').set_index('source_type')

# Nodes

In [5]:
nodes = list()

### Compounds

In [6]:
compound_info = read_ctd(load_dir.joinpath(id_files.loc['Compound', 'file_name']))
compound_info.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,definition,parent_ids,tree_numbers,parent_tree_numbers,synonyms,drug_bank_ids
0,(0.017ferrocene)amylose,MESH:C089250,,,MESH:D000688|MESH:D005296,D01.490.200/C089250|D02.691.550.200/C089250|D0...,D01.490.200|D02.691.550.200|D09.301.915.361|D0...,(0.017 ferrocene)amylose,
1,001-C8-NBD,MESH:C114385,,,MESH:D009842|MESH:D010069,D03.383.129.462.580/C114385|D12.644.456/C114385,D03.383.129.462.580|D12.644.456,001 C8 NBD|H-MeTyr-Arg-MeArg-D-Leu-NH(CH2)8NH-...,


ID and name are required for making a node file.  We will use the parent IDs to make a tree of the hierarchy, to better mapping of edges.  Finally, keeping info like tree numbers and any xrefs (like drugbank) will be useful for future network joining and analysis of compound classes.

In [7]:
comp_nodes = compound_info[['chemical_id', 'chemical_name', 'tree_numbers', 'drug_bank_ids']]
comp_nodes = comp_nodes.rename(columns={'chemical_id': 'id', 'chemical_name': 'name'})
comp_nodes['label'] = 'Compound'

comp_nodes.head(5)

Unnamed: 0,id,name,tree_numbers,drug_bank_ids,label
0,MESH:C089250,(0.017ferrocene)amylose,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,Compound
1,MESH:C114385,001-C8-NBD,D03.383.129.462.580/C114385|D12.644.456/C114385,,Compound
2,MESH:C114386,001-C8 oligopeptide,D12.644.456/C114386,,Compound
3,MESH:C434150,"0231A , Streptomyces",D03.633.400/C434150,,Compound
4,MESH:C434149,"0231B, Streptomyces",D03.633.400/C434149,,Compound


In [8]:
nodes.append(comp_nodes)

In [9]:
comp_p2c = pr.get_parent_to_child_map(compound_info, 'parent_ids', 'chemical_id')

In [10]:
type(comp_nodes) == pd.DataFrame

True

In [11]:
def test_parent_child(pc_info, node_df, parent_2_child_map, idx=0):

    if type(pc_info) == pd.DataFrame:        
        test_id = pc_info.dropna(subset=['parent_ids']).iloc[idx]['parent_ids'].split('|')[0]
    elif type(pc_info) == str:
        test_id = pc_info
    node_df = node_df.set_index('id')
    print('Parent: {}'.format(node_df.loc[test_id, 'name']))
    print('Children:')
    for child in parent_2_child_map[test_id]:
        print('\t{}'.format(node_df.loc[child, 'name']))


In [12]:
test_parent_child(compound_info, comp_nodes, comp_p2c)

Parent: Amylose
Children:
	carboxymethylamylose
	Cibacron Blue-amylose
	Cibacron Blue F 3GA-amylose
	thymol amylose
	chiralpak AS
	6-deoxyamylose
	oxyamylose
	sodium amylose succinate
	amylose triacetate
	(0.017ferrocene)amylose
	amylose tris(1-phenylethylcarbamate)
	amylose tris(n-hexylcarbamate)
	amylose-tris-(5-chloro-2-methylphenylcarbamate)
	Chiralpak AD
	amylose-2-acetyl-3,6-bis(phenylcarbamate)
	amylose tris(ethylcarbamate)
	amylose tri(3,5-dimethyl carbamate)


### Diseases

In [13]:
disease_info = read_ctd(load_dir.joinpath(id_files.loc['Disease', 'file_name']))
disease_info.head(2)

Unnamed: 0,disease_name,disease_id,alt_disease_ids,definition,parent_ids,tree_numbers,parent_tree_numbers,synonyms,slim_mappings
0,10p Deletion Syndrome (Partial),MESH:C538288,,,MESH:D002872|MESH:D025063,C16.131.260/C538288|C16.320.180/C538288|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,"Chromosome 10, 10p- Partial|Chromosome 10, mon...",Congenital abnormality|Genetic disease (inborn...
1,13q deletion syndrome,MESH:C535484,,,MESH:D002872|MESH:D025063,C16.131.260/C535484|C16.320.180/C535484|C23.55...,C16.131.260|C16.320.180|C23.550.210.050.500.500,Chromosome 13q deletion|Chromosome 13q deletio...,Congenital abnormality|Genetic disease (inborn...


In [14]:
dis_nodes = disease_info[['disease_id', 'disease_name', 'tree_numbers', 'alt_disease_ids']]
dis_nodes = dis_nodes.rename(columns={'disease_id': 'id', 'disease_name': 'name'})
dis_nodes['label'] = 'Disease'

dis_nodes.head(5)

Unnamed: 0,id,name,tree_numbers,alt_disease_ids,label
0,MESH:C538288,10p Deletion Syndrome (Partial),C16.131.260/C538288|C16.320.180/C538288|C23.55...,,Disease
1,MESH:C535484,13q deletion syndrome,C16.131.260/C535484|C16.320.180/C535484|C23.55...,,Disease
2,MESH:C579849,15q24 Microdeletion,C10.597.606.360/C579849|C16.131.260/C579849|C1...,DO:DOID:0060395,Disease
3,MESH:C579850,16p11.2 Deletion Syndrome,C10.597.606.360/C579850|C16.131.260/C579850|C1...,,Disease
4,MESH:C567076,"17,20-Lyase Deficiency, Isolated",C12.706.316.090.500/C567076|C13.351.875.253.09...,,Disease


In [15]:
nodes.append(dis_nodes)

In [16]:
dis_p2c_map = pr.get_parent_to_child_map(disease_info, 'parent_ids', 'disease_id')

In [17]:
# IDX 0 == chromoasome deletion with too many children to print nicely
test_parent_child(disease_info, dis_nodes, dis_p2c_map, idx=20)

Parent: Urea Cycle Disorders, Inborn
Children:
	Neonatal-onset citrullinemia type 2
	Hyperargininemia
	HHH syndrome
	3-methylcrotonyl CoA carboxylase 2 deficiency
	Carbamoyl-Phosphate Synthase I Deficiency Disease
	CTNL1
	Ornithine Carbamoyltransferase Deficiency Disease
	Osteopetrosis with renal tubular acidosis
	Citrulline transport defect
	Adult-onset citrullinemia type 2
	ORNITHINE TRANSCARBAMYLASE DEFICIENCY, HYPERAMMONEMIA DUE TO
	3-methylcrotonyl CoA carboxylase 1 deficiency
	N-acetyl glutamate synthetase deficiency
	Citrullinemia
	Argininosuccinic Aciduria


### Genes

In [18]:
gene_info = read_ctd(load_dir.joinpath(id_files.loc['Gene', 'file_name']))
gene_info.head(2)

Unnamed: 0,gene_symbol,gene_name,gene_id,alt_gene_ids,synonyms,bio_gridids,pharm_gkbids,uni_prot_ids
0,03B03F,"DNA segment, 03B03F (Research Genetics)",27777,,,,,
1,03B03R,"DNA segment, 03B03R (Research Genetics)",27778,,,,,


Both Gene symbol and Gene ID could be use as identifiers... we also want all the other ids, too.. No parent-child relationships to worry about here.

In [19]:
gene_nodes = gene_info[['gene_id', 'gene_name', 'gene_symbol', 'alt_gene_ids', 'bio_gridids', 'pharm_gkbids', 'uni_prot_ids']]
gene_nodes = gene_nodes.rename(columns={'gene_id': 'id', 'gene_name': 'name'})
gene_nodes['label'] = 'Gene'

gene_nodes.head(5)

Unnamed: 0,id,name,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,label
0,27777,"DNA segment, 03B03F (Research Genetics)",03B03F,,,,,Gene
1,27778,"DNA segment, 03B03R (Research Genetics)",03B03R,,,,,Gene
2,53288,"DNA segment, 03.MMHAP34FRA.seq",03.MMHAP34FRA.SEQ,,,,,Gene
3,5658107,,064YA,,,,,Gene
4,56573,"DNA segment, 102g4T7",102G4T7,,,,,Gene


In [20]:
nodes.append(gene_nodes)

### Pathways

In [21]:
pathway_info = read_ctd(load_dir.joinpath(id_files.loc['Pathway', 'file_name']))
pathway_info.head(2)

Unnamed: 0,pathway_name,pathway_id
0,2-LTR circle formation,REACT:R-HSA-164843
1,2-Oxocarboxylic acid metabolism,KEGG:hsa01210


This is about as simple as it can be, just take ids and names

In [22]:
pw_nodes = pathway_info[['pathway_id', 'pathway_name']].copy()
pw_nodes.columns = [c.split('_')[-1] for c in pw_nodes.columns]
pw_nodes['label'] = 'Pathway'

pw_nodes.head(5)

Unnamed: 0,id,name,label
0,REACT:R-HSA-164843,2-LTR circle formation,Pathway
1,KEGG:hsa01210,2-Oxocarboxylic acid metabolism,Pathway
2,REACT:R-HSA-73843,5-Phosphoribose 1-diphosphate biosynthesis,Pathway
3,REACT:R-HSA-2161541,Abacavir metabolism,Pathway
4,REACT:R-HSA-2161517,Abacavir transmembrane transport,Pathway


In [23]:
nodes.append(pw_nodes)

## What other nodes do we need info on?

In [24]:
set(file_info.query('file_source == "CTD"')[['source_type', 'target_type']].stack()) - \
        set([n.loc[0,'label'] for n in nodes])


{'Biological Process',
 'Cellular Component',
 'GO Term',
 'Molecular Function',
 'relationships'}

All are GO Term related, so best practice to use GO Ontology to sort out relationships

In [25]:
import obonet
import networkx as nx

In [26]:
go_file = file_info.query('file_source == "go"').iloc[0,0]
go = obonet.read_obo(load_dir.joinpath(go_file))

In [27]:
go_nodes = dict()
go_nodes['id'] = [g for g in go]
go_nodes['name'] = [info['name'] for nid, info in go.nodes(data=True)]
go_nodes['label'] = [info['namespace'] for nid, info in go.nodes(data=True)]

go_nodes = pd.DataFrame(go_nodes)
go_nodes['label'] = go_nodes['label'].str.replace('_', ' ').str.title()
go_nodes.head(5)

Unnamed: 0,id,name,label
0,GO:0000001,mitochondrion inheritance,Biological Process
1,GO:0000002,mitochondrial genome maintenance,Biological Process
2,GO:0000003,reproduction,Biological Process
3,GO:0000006,high-affinity zinc transmembrane transporter a...,Molecular Function
4,GO:0000007,low-affinity zinc ion transmembrane transporte...,Molecular Function


In [28]:
nodes.append(go_nodes)

In [29]:
# Networkx makes this mapping easy
go_p2c_map = {g: nx.ancestors(go, g) for g in go_nodes['id']}

In [30]:
test_parent_child('GO:0015800', go_nodes, go_p2c_map)

Parent: acidic amino acid transport
Children:
	negative regulation of glutamate secretion, neurotransmission
	aspartate secretion, neurotransmission
	negative regulation of glutamate uptake involved in transmission of nerve impulse
	regulation of aspartate secretion
	glutamate transmembrane import into vacuole
	L-glutamate import involved in cellular response to nitrogen starvation
	gamma-aminobutyric acid import
	regulation of glutamate uptake involved in transmission of nerve impulse
	L-glutamate transmembrane transport
	positive regulation of glutamate uptake involved in transmission of nerve impulse
	positive regulation of glutamate secretion
	positive regulation of glutamate neurotransmitter secretion in response to membrane depolarization
	positive regulation of glutamate secretion, neurotransmission
	positive regulation of aspartate secretion
	glutamate secretion, neurotransmission
	positive regulation of L-glutamate import across plasma membrane
	L-glutamate import
	aspartate s

In [31]:
nodes = pd.concat(nodes, sort=False, ignore_index=True)
len(nodes)

747662

In [32]:
nodes = nodes[sorted(nodes.columns, key=lambda x: x not in ['id', 'name', 'label'])]

In [33]:
nodes.shape

(747662, 11)

# Edges

Need some info about certain types of relationships in this netowrk

In [34]:
edges = list()

In [35]:
rel_info = read_ctd(load_dir.joinpath(id_files.loc['relationships', 'file_name']))

In [36]:
ixn_map_to_par = pr.make_child_to_root_map(rel_info, 'parent_code', 'code', sep='|')
ixn_map_to_par_full = pr.convert_abbrev_mapper_to_full(ixn_map_to_par, rel_info, 'code', 'type_name')

In [37]:
e_files = file_info.query('file_source == "CTD" and file_type == "edges"')['file_name'].tolist()

In [38]:
print(e_files[0])

CTD_chem_gene_ixns.csv.gz


### Chem to Genes

In [39]:
chem_gene = read_ctd(load_dir.joinpath(e_files[0]))
chem_gene.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,gene_symbol,gene_id,gene_forms,organism,organism_id,interaction,interaction_actions,pub_med_ids
0,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287


interaction_actions are pipe, separated... We want those that are unique on their own line. However, the interactions used are a bit too granular, so we also want to map to thier parents.  Finally, we want to keep all PMIDs and organism ids for each interaction for potnetial future use.

In [40]:
chem_gene = pr.expand_col_on_char(chem_gene, 'interaction_actions', '|')
chem_gene.head(5)

Unnamed: 0,chemical_name,chemical_id,cas_rn,gene_symbol,gene_id,gene_forms,organism,organism_id,interaction,interaction_actions,pub_med_ids
0,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding,26474287
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^folding,26474287
2,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,decreases^activity,26474287
3,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding,26474287
4,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,decreases^reaction,26474287


In [41]:
ixn_dirs = chem_gene['interaction_actions'].str.split('^', expand=True)[0].unique()
ixn_dirs

array(['affects', 'decreases', 'increases'], dtype=object)

In [42]:
directed_ixn_map_to_par_full = pr.prepend_direction_to_map(ixn_map_to_par_full, ixn_dirs, '^')

In [43]:
chem_gene['parent_ixn'] = chem_gene['interaction_actions'].map(directed_ixn_map_to_par_full)
chem_gene.head(5)

Unnamed: 0,chemical_name,chemical_id,cas_rn,gene_symbol,gene_id,gene_forms,organism,organism_id,interaction,interaction_actions,pub_med_ids,parent_ixn
0,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^binding,26474287,affects^binding
1,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,affects^folding,26474287,affects^folding
2,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 affects the folding of and results in...,decreases^activity,26474287,decreases^activity
3,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding,26474287,affects^binding
4,10074-G5,C534883,,MAX,4149,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,decreases^reaction,26474287,decreases^reaction


In [44]:
# Fix typing on organism IDs as they will be floats...
idx = chem_gene.dropna(subset=['organism_id']).index
chem_gene.loc[idx, 'organism_id'] = chem_gene.loc[idx, 'organism_id'].apply(lambda s: str(int(s)))

In [45]:
%%time
chem_gene = pr.combine_group_rows_on_char(chem_gene, ['chemical_id', 'gene_id', 'parent_ixn'],
                                          ['pub_med_ids', 'organism_id'])

CPU times: user 7min 18s, sys: 1.86 s, total: 7min 19s
Wall time: 7min 15s


In [46]:
cg_edges = chem_gene[['chemical_id', 'gene_id', 'parent_ixn', 'pub_med_ids', 'organism_id']]
cg_edges = cg_edges.rename(columns={'chemical_id': 'start_id', 'gene_id': 'end_id'})

In [47]:
cg_edges['parent_ixn'].value_counts()

increases^expression               479825
decreases^expression               410983
affects^cotreatment                184777
affects^expression                 137093
decreases^reaction                  95223
increases^metabolic processing      66522
affects^binding                     36161
increases^reaction                  30886
increases^activity                  29001
decreases^metabolic processing      28742
decreases^activity                  21926
affects^reaction                    17713
affects^metabolic processing        16070
increases^transport                 12559
affects^response to substance       12279
increases^response to substance      7964
decreases^response to substance      6965
increases^abundance                  6253
affects^localization                 5965
decreases^transport                  2632
affects^activity                     2382
affects^transport                    1700
increases^mutagenesis                1529
decreases^abundance               

Lots of different things here, but basically all we care about is +, -, and general association in terms of activity... for now we will keep a distinction between expression and activity, but these may be merged later.

In [48]:
keep_ixns = [direction+'^expression' for direction in ixn_dirs]
keep_ixns += [direction+'^activity' for direction in ixn_dirs]

In [49]:
cg_edges = cg_edges.query('parent_ixn in @keep_ixns')

cg_edges['abbv'] = 'C' + cg_edges['parent_ixn'].apply(lambda i: ''.join([s[0] for s in i.split('^')])) + 'G'
cg_edges['type'] = cg_edges['parent_ixn'].str.replace('^', '_') + '_' + cg_edges['abbv']
cg_edges['start_id'] = 'MESH:' + cg_edges['start_id']

cg_edges.head(5)

Unnamed: 0,start_id,end_id,parent_ixn,pub_med_ids,organism_id,abbv,type
3,MESH:C000121,4313,decreases^activity,25899827,9606,CdaG,decreases_activity_CdaG
4,MESH:C000121,4313,decreases^expression,25899827,9606,CdeG,decreases_expression_CdeG
5,MESH:C000121,4318,decreases^activity,25899827,9606,CdaG,decreases_activity_CdaG
6,MESH:C000121,4318,decreases^expression,25899827,9606,CdeG,decreases_expression_CdeG
8,MESH:C000152,367,decreases^activity,11118046,9606,CdaG,decreases_activity_CdaG


In [50]:
edges.append(cg_edges)

In [51]:
len(cg_edges)

1081210

    from itertools import chain

    comp_p2c_df = pd.DataFrame({'parent_id': list(chain(*[[k]*len(v) for k, v in comp_p2c.items()])), 
                                    'child_id': list(chain(*[v for k, v in comp_p2c.items()])) })

    cg_expand = cg_edges.copy()
    cg_expand = cg_expand.merge(comp_p2c_df, how='inner', left_on='start_id', right_on='parent_id')

    cg_expand['start_id'] = cg_expand['child_id']
    cg_expand = cg_expand.drop(['parent_id', 'child_id'], axis=1)

    cg_expand = pd.concat([cg_edges, cg_expand])


In [52]:
cg_edges['source'] = 'CTD'
cg_edges['evidence'] = 'curated'

In [53]:
print(e_files[1])

CTD_chemicals_diseases.csv.gz


### Chem to Disease

In [54]:
chem_dis = read_ctd(load_dir.joinpath(e_files[1]))
chem_dis.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,disease_name,disease_id,direct_evidence,inference_gene_symbol,inference_score,omim_ids,pub_med_ids
0,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131
1,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.4,,26432044


In [55]:
chem_dis.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,disease_name,disease_id,direct_evidence,inference_gene_symbol,inference_score,omim_ids,pub_med_ids
0,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131
1,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.4,,26432044


Want chem_id, and disease_id... only want curated for now, so those with direct evedence. Direct evidence will be the semmantic type of the edges

In [56]:
chem_dis['direct_evidence'].value_counts()

marker/mechanism    62915
therapeutic         34573
Name: direct_evidence, dtype: int64

In [57]:
cd_edges = chem_dis.dropna(subset=['direct_evidence'])[['chemical_id', 'disease_id', 'direct_evidence', 'pub_med_ids']]
cd_edges.head(2)

Unnamed: 0,chemical_id,disease_id,direct_evidence,pub_med_ids
0,C046983,MESH:D054198,therapeutic,4519131
72,C112297,MESH:D006948,marker/mechanism,19098162


In [58]:
cd_edges = cd_edges.rename(columns={'chemical_id': 'start_id', 'disease_id': 'end_id'})
cd_edges['abbv'] = 'C' + cd_edges['direct_evidence'].apply(lambda x: x[0]) + 'D'
cd_edges['type'] = cd_edges['direct_evidence'].str.replace('/', '_or_') + '_' + cd_edges['abbv']
cd_edges['start_id'] = 'MESH:' + cd_edges['start_id']

cd_edges.head(5)

Unnamed: 0,start_id,end_id,direct_evidence,pub_med_ids,abbv,type
0,MESH:C046983,MESH:D054198,therapeutic,4519131,CtD,therapeutic_CtD
72,MESH:C112297,MESH:D006948,marker/mechanism,19098162,CmD,marker_or_mechanism_CmD
87,MESH:C112297,MESH:D012640,marker/mechanism,26348896,CmD,marker_or_mechanism_CmD
135,MESH:C039775,MESH:D004827,therapeutic,17516704,CtD,therapeutic_CtD
190,MESH:C425777,MESH:D006948,marker/mechanism,15765258,CmD,marker_or_mechanism_CmD


In [59]:
len(cd_edges)

97488

In [60]:
cd_edges['source'] = 'CTD'
cd_edges['evidence'] = 'curated'

In [61]:
edges.append(cd_edges)

    cd_expand = cd_edges.copy()

    cd_expand = cd_expand.merge(comp_p2c_df, how='inner', left_on='start_id', right_on='parent_id')

    dis_p2c_df = pd.DataFrame({'parent_id': list(chain(*[[k]*len(v) for k, v in dis_p2c_map.items()])), 
                                    'child_id': list(chain(*[v for k, v in dis_p2c_map.items()])) })

    cd_expand = cd_expand.merge(dis_p2c_df, how='inner', left_on='end_id', right_on='parent_id')
    cd_expand.shape

In [62]:
e_files[2]

'CTD_chem_go_enriched.csv.gz'

### Chem to GO

In [63]:
chem_go = read_ctd(load_dir.joinpath(e_files[2]))
chem_go.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,ontology,goterm_name,goterm_id,highest_golevel,pvalue,corrected_pvalue,target_match_qty,target_total_qty,background_match_qty,background_total_qty
0,10074-G5,C534883,,Molecular Function,E-box binding,GO:0070888,9,1e-06,0.000652,2,2,49,43112
1,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,Biological Process,cation transmembrane transport,GO:0098655,6,1.9e-05,0.00963,3,4,723,43112


Chemical IDs and GO Term IDs are most important... keep the correcte p-vals in cues we want to do some filtering later

In [64]:
chem_go.shape

(5416205, 13)

In [65]:
cgo_edges = chem_go[['chemical_id', 'goterm_id', 'ontology', 'corrected_pvalue']]

In [66]:
cgo_edges = cgo_edges.rename(columns={'chemical_id': 'start_id', 'goterm_id': 'end_id'})
cgo_edges['abbv'] = 'Caw' + cgo_edges['ontology'].apply(lambda s: ''.join([w[0] for w in s.split(' ')]))
cgo_edges['type'] = 'associated_with_' + cgo_edges['abbv']
cgo_edges['start_id'] = 'MESH:' + cgo_edges['start_id']

cgo_edges = cgo_edges.drop('ontology', axis=1)

cgo_edges.head(5)

Unnamed: 0,start_id,end_id,corrected_pvalue,abbv,type
0,MESH:C534883,GO:0070888,0.000652,CawMF,associated_with_CawMF
1,MESH:C112297,GO:0098655,0.00963,CawBP,associated_with_CawBP
2,MESH:C112297,GO:0071804,6.9e-05,CawBP,associated_with_CawBP
3,MESH:C112297,GO:0098662,0.00665,CawBP,associated_with_CawBP
4,MESH:C112297,GO:0098660,0.00935,CawBP,associated_with_CawBP


In [67]:
cgo_edges['source'] = 'CTD'
cgo_edges['evidence'] = 'computed'

In [68]:
e_files[3]

'CTD_chem_pathways_enriched.csv.gz'

## Chemical - Pathway

In [69]:
chem_path = read_ctd(load_dir.joinpath(e_files[3]))
chem_path.head(2)

Unnamed: 0,chemical_name,chemical_id,cas_rn,pathway_name,pathway_id,pvalue,corrected_pvalue,target_match_qty,target_total_qty,background_match_qty,background_total_qty
0,10074-G5,C534883,,"Cell Cycle, Mitotic",REACT:R-HSA-69278,0.000142,0.00922,2,2,514,43112
1,10074-G5,C534883,,Cyclin A:Cdk2-associated events at S phase entry,REACT:R-HSA-69656,3e-06,0.000174,2,2,71,43112


In [70]:
cp_edges = chem_path[['chemical_id', 'pathway_id', 'corrected_pvalue']]

In [71]:
cp_edges = cp_edges.rename(columns={'chemical_id': 'start_id', 'pathway_id': 'end_id'})
cp_edges['abbv'] = 'CawPW'
cp_edges['type'] = 'associated_with_' + cp_edges['abbv']
cp_edges['start_id'] = 'MESH:' + cp_edges['start_id']

cp_edges.head(5)

Unnamed: 0,start_id,end_id,corrected_pvalue,abbv,type
0,MESH:C534883,REACT:R-HSA-69278,0.00922,CawPW,associated_with_CawPW
1,MESH:C534883,REACT:R-HSA-69656,0.000174,CawPW,associated_with_CawPW
2,MESH:C534883,REACT:R-HSA-69202,0.000179,CawPW,associated_with_CawPW
3,MESH:C534883,REACT:R-HSA-69206,0.000491,CawPW,associated_with_CawPW
4,MESH:C534883,KEGG:hsa04010,0.00227,CawPW,associated_with_CawPW


In [72]:
cp_edges['source'] = 'CTD'
cp_edges['evidence'] = 'computed'

In [73]:
edges.append(cp_edges)

In [74]:
e_files[4]

'CTD_genes_diseases.csv.gz'

### Gene - Disease

In [75]:
gene_dis = read_ctd(load_dir.joinpath(e_files[4]))
gene_dis.head(2)

  if self.run_code(code, result):


Unnamed: 0,gene_symbol,gene_id,disease_name,disease_id,direct_evidence,inference_chemical_name,inference_score,omim_ids,pub_med_ids
0,11-BETA-HSD3,100174880,"Abnormalities, Drug-Induced",MESH:D000014,,Endocrine Disruptors,5.16,,22659286
1,11-BETA-HSD3,100174880,Anemia,MESH:D000740,,"Water Pollutants, Chemical",4.2,,26546277


In [76]:
gene_dis.head(2)

Unnamed: 0,gene_symbol,gene_id,disease_name,disease_id,direct_evidence,inference_chemical_name,inference_score,omim_ids,pub_med_ids
0,11-BETA-HSD3,100174880,"Abnormalities, Drug-Induced",MESH:D000014,,Endocrine Disruptors,5.16,,22659286
1,11-BETA-HSD3,100174880,Anemia,MESH:D000740,,"Water Pollutants, Chemical",4.2,,26546277


Similar to chemical disease, we're only going to look at curated instances, so those with a valid 'direct_evidence' value.

Other than that, PMIDs will be captured.

In [77]:
gd_edges = gene_dis.dropna(subset=['direct_evidence'])[['gene_id', 'disease_id', 'direct_evidence', 'pub_med_ids']]

In [78]:
gd_edges.shape

(30957, 4)

In [79]:
gd_edges['direct_evidence'].value_counts()

marker/mechanism                29067
therapeutic                      1615
marker/mechanism|therapeutic      275
Name: direct_evidence, dtype: int64

In [80]:
gd_edges = pr.expand_col_on_char(gd_edges, 'direct_evidence', '|')

In [81]:
gd_edges = gd_edges.rename(columns={'gene_id': 'start_id', 'disease_id': 'end_id'})
gd_edges['abbv'] = 'G' + gd_edges['direct_evidence'].apply(lambda x: x[0]) + 'D'
gd_edges['type'] = gd_edges['direct_evidence'].str.replace('/', '_or_') + '_' + gd_edges['abbv']

gd_edges.head(5)

Unnamed: 0,start_id,end_id,direct_evidence,pub_med_ids,abbv,type
0,50518,MESH:D003920,marker/mechanism,1473152,GmD,marker_or_mechanism_GmD
1,50518,MESH:D003924,marker/mechanism,8146154,GmD,marker_or_mechanism_GmD
2,50518,MESH:D008113,marker/mechanism,15175105,GmD,marker_or_mechanism_GmD
3,50518,MESH:D009369,marker/mechanism,1473152,GmD,marker_or_mechanism_GmD
4,50518,MESH:D009765,marker/mechanism,1473152|25447408|25448685|8146154,GmD,marker_or_mechanism_GmD


In [82]:
gd_edges['source'] = 'CTD'
gd_edges['evidence'] = 'computed'

In [83]:
edges.append(gd_edges)

In [84]:
e_files[5]

'CTD_genes_pathways.csv.gz'

### Gene - Pathway

In [85]:
gene_path = read_ctd(load_dir.joinpath(e_files[5]))
gene_path.head(2)

Unnamed: 0,gene_symbol,gene_id,pathway_name,pathway_id
0,A1BG,1,Hemostasis,REACT:R-HSA-109582
1,A1BG,1,Immune System,REACT:R-HSA-168256


Not a lot of info here... we'll just take gene's and pathways with a part-of edge

In [86]:
gp_edges = gene_path[['gene_id', 'pathway_id']]
gp_edges = gp_edges.rename(columns={'gene_id': 'start_id', 'pathway_id': 'end_id'})
gp_edges['abbv'] = 'GpoPW'
gp_edges['type'] = 'part_of_' + gp_edges['abbv']

gp_edges.head(5)

Unnamed: 0,start_id,end_id,abbv,type
0,1,REACT:R-HSA-109582,GpoPW,part_of_GpoPW
1,1,REACT:R-HSA-168256,GpoPW,part_of_GpoPW
2,1,REACT:R-HSA-168249,GpoPW,part_of_GpoPW
3,1,REACT:R-HSA-6798695,GpoPW,part_of_GpoPW
4,1,REACT:R-HSA-76002,GpoPW,part_of_GpoPW


In [87]:
gp_edges['source'] = 'CTD'
gp_edges['evidence'] = 'curated'

In [88]:
edges.append(gp_edges)

In [89]:
e_files[6]

'CTD_diseases_pathways.csv.gz'

### Disease - Pathway

In [90]:
dis_path =  read_ctd(load_dir.joinpath(e_files[6]))
dis_path.head(2)

Unnamed: 0,disease_name,disease_id,pathway_name,pathway_id,inference_gene_symbol
0,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,Androgen biosynthesis,REACT:R-HSA-193048,HSD17B3
1,17-Hydroxysteroid Dehydrogenase Deficiency,MESH:C537805,"Fatty acid, triacylglycerol, and ketone body m...",REACT:R-HSA-535734,HSD17B3


We'll keep inference gene symbol for now to mark that this was not curated

In [91]:
dis_path.shape

(563169, 5)

In [92]:
dp_edges = dis_path[['disease_id', 'pathway_id', 'inference_gene_symbol']]
dp_edges = dp_edges.rename(columns={'disease_id': 'start_id', 'pathway_id': 'end_id'})
dp_edges['abbv'] = 'DawPW'
dp_edges['type'] = 'associated_with_' + dp_edges['abbv']

dp_edges.head(5)

Unnamed: 0,start_id,end_id,inference_gene_symbol,abbv,type
0,MESH:C537805,REACT:R-HSA-193048,HSD17B3,DawPW,associated_with_DawPW
1,MESH:C537805,REACT:R-HSA-535734,HSD17B3,DawPW,associated_with_DawPW
2,MESH:C537805,REACT:R-HSA-75105,HSD17B3,DawPW,associated_with_DawPW
3,MESH:C537805,KEGG:hsa01100,HSD17B3,DawPW,associated_with_DawPW
4,MESH:C537805,REACT:R-HSA-1430728,HSD17B3,DawPW,associated_with_DawPW


In [93]:
dp_edges['source'] = 'CTD'
dp_edges['evidence'] = 'computed'

In [94]:
edges.append(dp_edges)

In [95]:
e_files[7]

'CTD_pheno_term_ixns.csv.gz'

### Chem-Pheno

In [96]:
chem_pheno = read_ctd(load_dir.joinpath(e_files[7]))
chem_pheno.head(2)

Unnamed: 0,chemicalname,chemicalid,casrn,phenotypename,phenotypeid,comentionedterms,organism,organismid,interaction,interactionactions,anatomyterms,inferencegenesymbols,pubmedids,unnamed_13
0,10074-G5,C534883,,ATP biosynthetic process,GO:0006754,,Homo sapiens,9606.0,10074-G5 analog results in decreased ATP biosy...,decreases^phenotype,1^HL-60 Cells^D018922,,26036281,
1,10074-G5,C534883,,ATP biosynthetic process,GO:0006754,,Homo sapiens,9606.0,10074-G5 analog results in decreased ATP biosy...,decreases^phenotype,"1^Lung^D008168|2^Cell Line, Tumor^D045744",,26036281,


In [97]:
chem_pheno.shape

(185731, 14)

In [98]:
chem_pheno = pr.expand_col_on_char(chem_pheno, 'interactionactions', '|')

In [99]:
chem_pheno['parent_ixn'] = chem_pheno['interactionactions'].apply(lambda x: ixn_map_to_par_full.get(x, x))
idx = chem_pheno.dropna(subset=['organismid']).index
chem_pheno.loc[idx, 'organismid'] = chem_pheno.loc[idx, 'organismid'].astype(int).astype(str)
chem_pheno.head(2)

Unnamed: 0,chemicalname,chemicalid,casrn,phenotypename,phenotypeid,comentionedterms,organism,organismid,interaction,interactionactions,anatomyterms,inferencegenesymbols,pubmedids,unnamed_13,parent_ixn
0,10074-G5,C534883,,ATP biosynthetic process,GO:0006754,,Homo sapiens,9606,10074-G5 analog results in decreased ATP biosy...,decreases^phenotype,1^HL-60 Cells^D018922,,26036281,,decreases^phenotype
1,10074-G5,C534883,,ATP biosynthetic process,GO:0006754,,Homo sapiens,9606,10074-G5 analog results in decreased ATP biosy...,decreases^phenotype,"1^Lung^D008168|2^Cell Line, Tumor^D045744",,26036281,,decreases^phenotype


In [100]:
%%time
chem_pheno = pr.combine_group_rows_on_char(chem_pheno, ['chemicalid', 'phenotypeid', 'parent_ixn'],
                                                ['pubmedids', 'organismid'])

CPU times: user 36.6 s, sys: 400 ms, total: 37 s
Wall time: 35.3 s


In [101]:
chem_pheno['parent_ixn'].value_counts()

increases^phenotype                 37477
decreases^reaction                  28480
decreases^phenotype                 18896
affects^phenotype                   11422
affects^cotreatment                  9789
increases^reaction                   6941
increases^abundance                  2880
affects^reaction                     2597
decreases^abundance                  1736
increases^response to substance       847
affects^binding                       838
affects^localization                  428
decreases^activity                    423
increases^expression                  400
increases^activity                    388
affects^response to substance         325
decreases^response to substance       321
increases^secretion                   300
increases^chemical synthesis          236
decreases^expression                  235
decreases^chemical synthesis          139
affects^abundance                     129
increases^uptake                      119
increases^oxidation               

In [102]:
keep_pheno_ixns = [direction+'^phenotype' for direction in ixn_dirs]
keep_pheno_ixns += [direction+'^reaction' for direction in ixn_dirs]

In [103]:
cph_edges = chem_pheno[['chemicalid', 'phenotypeid', 'parent_ixn', 'pubmedids', 'organismid']]
cph_edges = cph_edges.rename(columns={'chemicalid': 'start_id', 'phenotypeid': 'end_id', 
                                      'pubmedids': 'pub_med_ids', 'organismid': 'organism_id'})

In [104]:
id_to_label = nodes.set_index('id')['label'].apply(lambda s: ''.join(c[0] for c in s.split(' '))).to_dict()

In [105]:
cph_edges = cph_edges.query('parent_ixn in @keep_pheno_ixns')

cph_edges['abbv'] = 'C' + cph_edges['parent_ixn'].apply(lambda i: ''.join([s[0] for s in i.split('^')]))  
cph_edges['abbv'] = cph_edges['abbv'] + cph_edges['end_id'].map(id_to_label)


cph_edges['type'] = cph_edges['parent_ixn'].str.replace('^', '_') + '_' + cph_edges['abbv']
cph_edges['start_id'] = 'MESH:' + cph_edges['start_id']

cph_edges.head(5)

Unnamed: 0,start_id,end_id,parent_ixn,pub_med_ids,organism_id,abbv,type
0,MESH:C000121,GO:0008283,decreases^phenotype,25899827,9606,CdpBP,decreases_phenotype_CdpBP
1,MESH:C000121,GO:0016477,decreases^phenotype,25899827,9606,CdpBP,decreases_phenotype_CdpBP
3,MESH:C000152,GO:0006702,decreases^phenotype,11118046,9606,CdpBP,decreases_phenotype_CdpBP
4,MESH:C000152,GO:0008284,decreases^phenotype,11118046,9606,CdpBP,decreases_phenotype_CdpBP
5,MESH:C000152,GO:0008284,decreases^reaction,11118046,9606,CdrBP,decreases_reaction_CdrBP


In [106]:
cph_edges['source'] = 'CTD'
cph_edges['evidence'] = 'curated'

In [107]:
edges.append(cph_edges)

In [108]:
e_files[8]

'CTD_Phenotype-Disease_biological_process_associations.csv.gz'

### Dis - BP

In [109]:
dis_bp = read_ctd(load_dir.joinpath(e_files[8]))
dis_bp.head(2)

Unnamed: 0,goname,goid,disease_name,disease_id,inference_chemical_qty,inference_chemical_names,inference_gene_qty,inference_gene_symbols
0,10-formyltetrahydrofolate biosynthetic process,GO:0009257,Abruptio Placentae,MESH:D000037,0,,1,MTHFD1
1,10-formyltetrahydrofolate biosynthetic process,GO:0009257,"Neural tube defect, folate-sensitive",MESH:C536409,0,,1,MTHFD1


In [110]:
dis_bp.shape

(1802197, 8)

In [111]:
dbp_edges = dis_bp[['goid', 'disease_id']].rename(columns={'goid': 'start_id', 'disease_id': 'end_id'})
dbp_edges['abbv'] = 'BPawD'
dbp_edges['type'] = 'associated_with_' + dbp_edges['abbv']

dbp_edges.head(5)

Unnamed: 0,start_id,end_id,abbv,type
0,GO:0009257,MESH:D000037,BPawD,associated_with_BPawD
1,GO:0009257,MESH:C536409,BPawD,associated_with_BPawD
2,GO:0009257,MESH:D009436,BPawD,associated_with_BPawD
3,GO:0009258,MESH:D056486,BPawD,associated_with_BPawD
4,GO:0009258,MESH:D000860,BPawD,associated_with_BPawD


In [113]:
dbp_edges['source'] = 'CTD'
dbp_edges['evidence'] = 'computed'

In [114]:
edges.append(dbp_edges)

In [115]:
e_files[9]

'CTD_Phenotype-Disease_cellular_component_associations.csv.gz'

### Dis - CC

In [116]:
dis_cc = read_ctd(load_dir.joinpath(e_files[9]))
dis_cc.head(2)

Unnamed: 0,goname,goid,disease_name,disease_id,inference_chemical_qty,inference_chemical_names,inference_gene_qty,inference_gene_symbols
0,3M complex,GO:1990393,Autistic Disorder,MESH:D001321,0,,1,CUL7
1,3M complex,GO:1990393,"Carcinoma, Renal Cell",MESH:D002292,0,,1,CUL7


In [117]:
dis_cc.shape

(126497, 8)

In [118]:
dcc_edges = dis_cc[['goid', 'disease_id']].rename(columns={'goid': 'start_id', 'disease_id': 'end_id'})
dcc_edges['abbv'] = 'CCawD'
dcc_edges['type'] = 'associated_with_' + dcc_edges['abbv']

dcc_edges.head(5)

Unnamed: 0,start_id,end_id,abbv,type
0,GO:1990393,MESH:D001321,CCawD,associated_with_CCawD
1,GO:1990393,MESH:D002292,CCawD,associated_with_CCawD
2,GO:1990393,MESH:C535314,CCawD,associated_with_CCawD
3,GO:1990393,MESH:C567862,CCawD,associated_with_CCawD
4,GO:1990393,MESH:D001749,CCawD,associated_with_CCawD


In [119]:
dcc_edges['source'] = 'CTD'
dcc_edges['evidence'] = 'computed'

In [120]:
edges.append(dcc_edges)

In [121]:
e_files[10]

'CTD_Phenotype-Disease_molecular_function_associations.csv.gz'

### Dis - MF

In [122]:
dis_mf = read_ctd(load_dir.joinpath(e_files[10]))
dis_mf.head(2)

Unnamed: 0,goname,goid,disease_name,disease_id,inference_chemical_qty,inference_chemical_names,inference_gene_qty,inference_gene_symbols
0,10-hydroxy-9-(phosphonooxy)octadecanoate phosp...,GO:0033885,Acute Kidney Injury,MESH:D058186,0,,1,EPHX2
1,10-hydroxy-9-(phosphonooxy)octadecanoate phosp...,GO:0033885,Autism Spectrum Disorder,MESH:D000067877,0,,1,EPHX2


In [123]:
dis_mf.shape

(200321, 8)

In [124]:
dmf_edges = dis_mf[['goid', 'disease_id']].rename(columns={'goid': 'start_id', 'disease_id': 'end_id'})
dmf_edges['abbv'] = 'MFawD'
dmf_edges['type'] = 'associated_with_' + dmf_edges['abbv']

dmf_edges.head(5)

Unnamed: 0,start_id,end_id,abbv,type
0,GO:0033885,MESH:D058186,MFawD,associated_with_MFawD
1,GO:0033885,MESH:D000067877,MFawD,associated_with_MFawD
2,GO:0033885,MESH:D064420,MFawD,associated_with_MFawD
3,GO:0033885,MESH:D006333,MFawD,associated_with_MFawD
4,GO:0033885,OMIM:143890,MFawD,associated_with_MFawD


In [125]:
dmf_edges['source'] = 'CTD'
dmf_edges['evidence'] = 'computed'

In [126]:
edges.append(dmf_edges)

# Putting it all together

In [127]:
edges = pd.concat(edges, sort=False, ignore_index=True)
edges.head(2)

Unnamed: 0,start_id,end_id,parent_ixn,pub_med_ids,organism_id,abbv,type,source,evidence,direct_evidence,corrected_pvalue,inference_gene_symbol
0,MESH:C000121,4313,decreases^activity,25899827,9606,CdaG,decreases_activity_CdaG,CTD,curated,,,
1,MESH:C000121,4313,decreases^expression,25899827,9606,CdeG,decreases_expression_CdeG,CTD,curated,,,


In [128]:
edges = edges[sorted(edges.columns, key=lambda x: x not in ['start_id', 'end_id', 'type'])]
edges.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,source,evidence,direct_evidence,corrected_pvalue,inference_gene_symbol
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,CTD,curated,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,CTD,curated,,,


In [129]:
'{:,}'.format(len(edges))

'5,359,668'

In [130]:
edge_ids = edges[['start_id', 'end_id']].stack().unique()

In [131]:
len(edge_ids)

90726

In [132]:
print('{:,}'.format(len(nodes)))
nodes_filt = nodes.query('id in @edge_ids')
print('{:,}'.format(len(nodes_filt)))

747,662
90,736


### Some QC on the nodes and edges...

#### Make sure all IDs found in edges are also found in nodes...

In [133]:
len(set(edge_ids) - set(nodes['id']))

25

In [134]:
len(set(edge_ids) - set(nodes_filt['id']))

25

In [135]:
set(edge_ids) - set(nodes_filt['id'])

{'GO:0000060',
 'GO:0004874',
 'GO:0008565',
 'GO:0015238',
 'GO:0015307',
 'GO:0017082',
 'GO:0033160',
 'GO:0033216',
 'GO:0034437',
 'GO:0038052',
 'GO:0042891',
 'GO:0042895',
 'GO:0042954',
 'GO:0097286',
 'GO:0097689',
 'GO:1903414',
 'GO:1903756',
 'GO:1903757',
 'GO:1903758',
 'GO:1904167',
 'GO:1904169',
 'GO:2000824',
 'GO:2001273',
 'GO:2001274',
 'GO:2001275'}

These are all Obsolete GO terms... so we will remove them from the graph..

In [136]:
obsolete_go_terms = set(edge_ids) - set(nodes_filt['id'])
print('Edges before: {:,}'.format(len(edges)))
edges = edges.query('start_id not in @obsolete_go_terms and end_id not in @obsolete_go_terms')
print('Edges after: {:,}'.format(len(edges)))

Edges before: 5,359,668
Edges after: 5,354,828


#### Ensure all unique node identifiers have only 1 'label'

In [137]:
print('IDs with more than one type: {}'.format(nodes[nodes['id'].duplicated(keep=False)]['id'].nunique()))

IDs with more than one type: 59


59 examples... let's look at the first 5 and see if there's a pattern..

In [138]:
nodes[nodes['id'].duplicated(keep=False)].sort_values('id').head(10)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids
100582,MESH:C536075,Hyaloideoretinal degeneration of Wagner,Compound,D09.698.735.200.750/C536075,,,,,,,
176555,MESH:C536075,Hyaloideoretinal degeneration of Wagner,Disease,C11.270.612/C536075|C11.768.585/C536075,,OMIM:143200,,,,,
102253,MESH:C536285,Iminoglycinuria,Compound,D12.125.481/C536285,,,,,,,
177023,MESH:C536285,Iminoglycinuria,Disease,C12.777.419.815/C536285|C13.351.968.419.815/C5...,,OMIM:242600,,,,,
78949,MESH:C557815,Deficiency of interleukin-1 receptor antagonist,Compound,D23.529.374.460/C557815,,,,,,,
174279,MESH:C557815,Deficiency of interleukin-1 receptor antagonist,Disease,C16.320.382/C557815|C17.800.827.368/C557815,,,,,,,
97525,MESH:C562477,Halothane Hepatitis,Compound,D02.455.526.340/C562477,,,,,,,
176126,MESH:C562477,Halothane Hepatitis,Disease,C06.552.100/C562477|C25.100.562/C562477|C25.72...,,,,,,,
114485,MESH:C562609,Metachromatic Leukodystrophy due to Saposin B ...,Compound,D08.211.790.500/C562609,,,,,,,
178638,MESH:C562609,Metachromatic Leukodystrophy due to Saposin B ...,Disease,C10.228.140.163.100.362.550/C562609|C10.228.14...,,OMIM:249900,,,,,


They appear to be diseases mis-labeld as compounds in some instances....

In [139]:
nodes[nodes['id'].duplicated(keep=False)].groupby('label')['id'].nunique()

label
Compound    59
Disease     59
Name: id, dtype: int64

All of these duplicated IDs are represented as both a Compound and a Disease... So we will drop the compound version

In [140]:
drop_node_idx = nodes[nodes['id'].duplicated(keep=False)].query('label == "Compound"').index
nodes = nodes.drop(drop_node_idx)

edge_ids = edges[['start_id', 'end_id']].stack().unique()
print('Unique IDs in Edges {:,}'.format(len(edge_ids)))
print('Nodes before dropping: {:,}'.format(len(nodes_filt)))
nodes_filt = nodes.query('id in @edge_ids')
print('Nodes after dropping: {:,}'.format(len(nodes_filt)))

Unique IDs in Edges 90,698
Nodes before dropping: 90,736
Nodes after dropping: 90,698


#### Validate Node types match edge labels

Finally, we want to make sure our node typing is all correct...
Nodes that say they're a compound appear as compounds in the edge file, etc....

In [141]:
node_abv_to_full = {'C': 'Compound',
                    'D': 'Disease',
                    'G': 'Gene',
                    'PW': 'Pathway',
                    'MF': 'Molecular Function',
                    'BP': 'Biological Process',
                    'CC': 'Cellular Component'}

In [142]:
# combine_nodes_and_edges uses typing from the node dataframe to type out the concpets in the edge
combo = gt.combine_nodes_and_edges(nodes, edges)
col_order = ['start_id', 'start_name', 'start_label', 'type', 'abbv', 'end_id', 'end_name', 'end_label']
combo.head(2)[col_order]

Unnamed: 0,start_id,start_name,start_label,type,abbv,end_id,end_name,end_label
0,MESH:C000121,pipoxolan,Compound,decreases_activity_CdaG,CdaG,4313,matrix metallopeptidase 2,Gene
1,MESH:C000121,pipoxolan,Compound,decreases_expression_CdeG,CdeG,4313,matrix metallopeptidase 2,Gene


In [143]:
# The edge abbreviation will have the desired typing of these two, so we can compare the two to see if 
# any edges have concepts that are mis-typed...
combo['actual_start'] = combo['abbv'].apply(lambda a: node_abv_to_full[gt.parse_edge_abbrev(a)[0]])
combo['actual_end'] = combo['abbv'].apply(lambda a: node_abv_to_full[gt.parse_edge_abbrev(a)[2]])
col_order += ['actual_start', 'actual_end']
combo.head(2)[col_order]

Unnamed: 0,start_id,start_name,start_label,type,abbv,end_id,end_name,end_label,actual_start,actual_end
0,MESH:C000121,pipoxolan,Compound,decreases_activity_CdaG,CdaG,4313,matrix metallopeptidase 2,Gene,Compound,Gene
1,MESH:C000121,pipoxolan,Compound,decreases_expression_CdeG,CdeG,4313,matrix metallopeptidase 2,Gene,Compound,Gene


In [144]:
combo.query('actual_start != start_label or actual_end != end_label')[col_order]

Unnamed: 0,start_id,start_name,start_label,type,abbv,end_id,end_name,end_label,actual_start,actual_end
1107641,MESH:C564403,Coenzyme Q10 Deficiency,Disease,marker_or_mechanism_CmD,CmD,MESH:D003865,"Depressive Disorder, Major",Disease,Compound,Disease
1107642,MESH:C564403,Coenzyme Q10 Deficiency,Disease,marker_or_mechanism_CmD,CmD,MESH:D006333,Heart Failure,Disease,Compound,Disease
1107643,MESH:C564403,Coenzyme Q10 Deficiency,Disease,marker_or_mechanism_CmD,CmD,MESH:D009203,Myocardial Infarction,Disease,Compound,Disease
1903827,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-211859,Biological oxidations,Pathway,Compound,Pathway
1903828,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,KEGG:hsa_M00109,"C21-Steroid hormone biosynthesis, progesterone...",Pathway,Compound,Pathway
1903829,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-211897,Cytochrome P450 - arranged by substrate type,Pathway,Compound,Pathway
1903830,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-211976,Endogenous sterols,Pathway,Compound,Pathway
1903831,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-194002,Glucocorticoid biosynthesis,Pathway,Compound,Pathway
1903832,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-556833,Metabolism of lipids and lipoproteins,Pathway,Compound,Pathway
1903833,MESH:D065606,Metabolic Side Effects of Drugs and Substances,Disease,associated_with_CawPW,CawPW,REACT:R-HSA-196071,Metabolism of steroid hormones,Pathway,Compound,Pathway


These are all diseases that should be classified as compounds... similar to our duplicated node IDs above.  Likely, they appear because the disease is a compound deficiency, or otherwise highly related to a compound, as in the compound is a causative agent for the disease.

We'll drop them for now.

In [145]:
bad_edge = combo.query('actual_start != start_label or actual_end != end_label').index
edges = edges.drop(bad_edge)

# Final Stats

In [146]:
print('Nodes: {:,}'.format(len(nodes_filt)))
print('Edges: {:,}'.format(len(edges)))

Nodes: 90,698
Edges: 5,354,816


In [147]:
nodes['label'].value_counts()

Gene                  516505
Compound              170729
Biological Process     29704
Disease                12989
Molecular Function     11113
Cellular Component      4200
Pathway                 2363
Name: label, dtype: int64

In [148]:
edges['type'].value_counts()

associated_with_BPawD        1798362
associated_with_CawPW        1215923
associated_with_DawPW         563169
increases_expression_CieG     479825
decreases_expression_CdeG     410983
associated_with_MFawD         199530
affects_expression_CaeG       137093
part_of_GpoPW                 135809
associated_with_CCawD         126497
marker_or_mechanism_CmD        62912
increases_phenotype_CipBP      35984
therapeutic_CtD                34573
marker_or_mechanism_GmD        29342
increases_activity_CiaG        29001
decreases_reaction_CdrBP       26730
decreases_activity_CdaG        21926
decreases_phenotype_CdpBP      16993
affects_phenotype_CapBP        11221
increases_reaction_CirBP        6694
affects_reaction_CarBP          2419
affects_activity_CaaG           2382
therapeutic_GtD                 1890
decreases_phenotype_CdpMF       1831
decreases_reaction_CdrMF        1662
increases_phenotype_CipMF       1337
increases_reaction_CirMF         220
affects_phenotype_CapMF          189
a

# Saving it out

In [149]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
// hack to get the filename for this notebook

<IPython.core.display.Javascript object>

In [150]:
out_dir = Path('../2_pipeline/').joinpath(nb_name.split('.')[0]).joinpath('out')
if not out_dir.exists():
    out_dir.mkdir(parent=True)

In [151]:
gt.add_colons(nodes, id_name='identifier').to_csv(out_dir.joinpath('nodes_all.csv'), index=False)
gt.add_colons(nodes_filt, id_name='identifier').to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

gt.add_colons(edges).to_csv(out_dir.joinpath('edges.csv'), index=False)

Didn't end up using the parent to child mappings, but lets save them for easy access in future uses

In [152]:
from itertools import chain

comp_p2c_df = pd.DataFrame({'parent_id': list(chain(*[[k]*len(v) for k, v in comp_p2c.items()])), 
                                'child_id': list(chain(*[v for k, v in comp_p2c.items()])) })
comp_p2c_df['type'] = 'Compound'


dis_p2c_df = pd.DataFrame({'parent_id': list(chain(*[[k]*len(v) for k, v in dis_p2c_map.items()])), 
                                'child_id': list(chain(*[v for k, v in dis_p2c_map.items()])) })
dis_p2c_df['type'] = 'Disease'

go_p2c_df = pd.DataFrame({'parent_id': list(chain(*[[k]*len(v) for k, v in go_p2c_map.items()])), 
                                'child_id': list(chain(*[v for k, v in go_p2c_map.items()])) })
go_p2c_df['type'] = 'GO'

p_to_c = pd.concat([comp_p2c_df, dis_p2c_df, go_p2c_df], sort=False)
p_to_c.to_csv(out_dir.joinpath('parent_to_child_mappings.csv'), index=False)