# GO Nodes into Edges

GO Biological Process nodes sometimes have to do with regulation...

E.G. GO:0045861	negative regulation of proteolysis	

Any Term that links to this go term essentially has an edge:
    negatively_regulates -> GO:0006508	proteolysis

Therefore we will determine these regulation nodes and convert them to edges.

In [1]:
import pandas as pd
from pathlib import Path
from metapaths.tools.hetnet_file_processing import read_reactome
from metapaths.tools.processing import regularize_colnames, head, expand_col_on_char, combine_group_rows_on_char
from hetnet_ml.src import graph_tools as gt
import tools.obo_tools as ot

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
network_dir = load_dir.parent.parent.joinpath('05_GO_Genes_and_protein_families/out')

In [2]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
// hack to get the filename for this notebook

<IPython.core.display.Javascript object>

In [3]:
out_dir = Path('../2_pipeline/').joinpath(nb_name.split('.')[0]).joinpath('out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
all_nodes = gt.remove_colons(pd.read_csv(network_dir.joinpath('nodes_all.csv'), dtype=str))
edges = gt.remove_colons(pd.read_csv(network_dir.joinpath('edges.csv'), dtype=str))

In [5]:
all_node_ids = all_nodes['id'].values
new_nodes = []
new_edges = []

In [6]:
all_nodes.head(2)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id,mesh_ids,chebi_ids
0,MESH:C089250,(0.017ferrocene)amylose,Compound,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,,,,,,,,,
1,MESH:C114385,001-C8-NBD,Compound,D03.383.129.462.580/C114385|D12.644.456/C114385,,,,,,,,,,


# GO regulation...

In [7]:
go_terms = all_nodes.query('label in {}'.format(['Molecular Function', 'Cellular Component', 'Biological Process']))
len(go_terms)

45017

In [8]:
reg_terms = go_terms['name'].str.contains('regulation of')
reg_terms.sum()

10748

In [9]:
reg_ids = go_terms[reg_terms]['id'].values

## We'll get the regulation Edges from go.obo

Need some functions to parse the obo file...

In [10]:
go_rels = ot.get_ontology_edges(load_dir.joinpath('go.obo'))

In [11]:
go_rels.head(10)

Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
0,GO:0000001,mitochondrion inheritance,is_a,GO:0048308,organelle inheritance,GO,GO
1,GO:0000001,mitochondrion inheritance,is_a,GO:0048311,mitochondrion distribution,GO,GO
2,GO:0000002,mitochondrial genome maintenance,is_a,GO:0007005,mitochondrion organization,GO,GO
3,GO:0000003,reproduction,is_a,GO:0008150,biological_process,GO,GO
4,GO:0000006,high-affinity zinc transmembrane transporter a...,is_a,GO:0005385,zinc ion transmembrane transporter activity,GO,GO
5,GO:0000007,low-affinity zinc ion transmembrane transporte...,is_a,GO:0005385,zinc ion transmembrane transporter activity,GO,GO
6,GO:0000009,"alpha-1,6-mannosyltransferase activity",is_a,GO:0000030,mannosyltransferase activity,GO,GO
7,GO:0000010,trans-hexaprenyltranstransferase activity,is_a,GO:0016765,"transferase activity, transferring alkyl or ar...",GO,GO
8,GO:0000011,vacuole inheritance,is_a,GO:0007033,vacuole organization,GO,GO
9,GO:0000011,vacuole inheritance,is_a,GO:0048308,organelle inheritance,GO,GO


In [12]:
go_rels['rel_type'].value_counts()

is_a                    77058
part_of                  7933
regulates                3579
negatively_regulates     3131
positively_regulates     3105
has_part                  733
occurs_in                 193
happens_during              7
ends_during                 1
Name: rel_type, dtype: int64

In [13]:
go_rels.query('rel_type == "part_of"').sample(5)

Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
134,GO:0000100,S-methylmethionine transmembrane transporter a...,part_of,GO:0015806,S-methylmethionine transport,GO,GO
48808,GO:0048905,anterior lateral line neuromast mantle cell di...,part_of,GO:0048901,anterior lateral line neuromast development,GO,GO
22613,GO:0021628,olfactory nerve formation,part_of,GO:0021627,olfactory nerve morphogenesis,GO,GO
60188,GO:0070743,interleukin-23 complex,part_of,GO:0005615,extracellular space,GO,GO
77249,GO:1901260,peptidyl-lysine hydroxylation involved in bact...,part_of,GO:0072580,bacterial-type EF-P lysine modification,GO,GO


In [14]:
reg_rels = ['regulates', 'negatively_regulates', 'positively_regulates']
reg_res = go_rels.query('rel_type in @reg_rels')

In [15]:
reg_ids = reg_res['src_id'].values

In [16]:
combo = gt.combine_nodes_and_edges(all_nodes, edges)
combo_cols = ['start_id', 'start_name', 'start_label', 'type', 'end_id', 'end_name', 'end_label']

In [20]:
no_map_terms = go_terms[reg_terms].query('id not in @reg_ids')['id'].values
print('Number of edges containing Regulation terms that cannot be converted to an edge: {:,}'.format(
            len(combo.query('start_id in @no_map_terms and end_label == "Disease"'))))
combo.query('start_id in @no_map_terms and end_label == "Disease"')[combo_cols].sample(5)

Number of edges containing Regulation terms that cannot be converted to an edge: 62,099


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
4589622,GO:0035810,positive regulation of urine volume,Biological Process,associated_with_BPawD,MESH:D009140,Musculoskeletal Diseases,Disease
4763535,GO:0099509,regulation of presynaptic cytosolic calcium io...,Biological Process,associated_with_BPawD,MESH:C563134,"Spinocerebellar Ataxia, X-Linked 1",Disease
4666560,GO:0008217,regulation of blood pressure,Biological Process,associated_with_BPawD,MESH:C536056,Osteopetrosis autosomal dominant type 1,Disease
3967266,GO:0045792,negative regulation of cell size,Biological Process,associated_with_BPawD,MESH:D009358,"Congenital, Hereditary, and Neonatal Diseases ...",Disease
4361165,GO:0010460,positive regulation of heart rate,Biological Process,associated_with_BPawD,MESH:D052456,Hypoalphalipoproteinemias,Disease


In [21]:
map_terms = go_terms.query('id in @reg_ids')['id'].values
print('Number of edges containing Regulation terms that can become an edge: {:,}'.format(
    len(combo.query('start_id in @map_terms and end_label == "Disease"'))))
combo.query('start_id in @map_terms and end_label == "Disease"')[combo_cols].sample(5)

Number of edges containing Regulation terms that can become an edge: 656,949


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
4350018,GO:0045722,positive regulation of gluconeogenesis,Biological Process,associated_with_BPawD,MESH:D011628,"Puberty, Delayed",Disease
4262322,GO:0060355,positive regulation of cell adhesion molecule ...,Biological Process,associated_with_BPawD,MESH:D001172,"Arthritis, Rheumatoid",Disease
4483534,GO:0045778,positive regulation of ossification,Biological Process,associated_with_BPawD,MESH:D013119,Spinal Cord Injuries,Disease
4346262,GO:0010628,positive regulation of gene expression,Biological Process,associated_with_BPawD,MESH:D012174,Retinitis Pigmentosa,Disease
4050183,GO:0071638,negative regulation of monocyte chemotactic pr...,Biological Process,associated_with_BPawD,MESH:D010003,Osteoarthritis,Disease


In [22]:
tgt_ids = reg_res['tgt_id'].values
map_terms1 = go_terms.query('id in @tgt_ids')['id'].values
print(len(combo.query('start_id in @map_terms1 and end_label == "Disease"')))
combo.query('start_id in @map_terms1 and end_label == "Disease"')[combo_cols].sample(5)

532791


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
3516631,GO:0019221,cytokine-mediated signaling pathway,Biological Process,associated_with_BPawD,MESH:D011018,"Pneumonia, Pneumococcal",Disease
4147031,GO:0006809,nitric oxide biosynthetic process,Biological Process,associated_with_BPawD,MESH:D009325,Nausea,Disease
4949394,GO:0036466,synaptic vesicle recycling via endosome,Biological Process,associated_with_BPawD,MESH:D000236,Adenoma,Disease
4814280,GO:0003016,respiratory system process,Biological Process,associated_with_BPawD,MESH:D005512,Food Hypersensitivity,Disease
5011555,GO:0016192,vesicle-mediated transport,Biological Process,associated_with_BPawD,MESH:D002779,Cholestasis,Disease


In [23]:
# Regulations of regulation.... not sure what to do here but just drop
print(len(reg_res[reg_res['tgt_name'].str.contains('regulation of')]))
reg_res[reg_res['tgt_name'].str.contains('regulation of')].sample(5)

34


Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
74831,GO:1900545,regulation of phenotypic switching by regulati...,regulates,GO:0100057,regulation of phenotypic switching by transcri...,GO,GO
74370,GO:1900382,regulation of thiamine biosynthetic process by...,regulates,GO:0100016,regulation of thiamine biosynthetic process by...,GO,GO
74384,GO:1900389,regulation of glucose import by regulation of ...,regulates,GO:0100018,regulation of glucose import by transcription ...,GO,GO
54755,GO:0060239,positive regulation of signal transduction inv...,positively_regulates,GO:0032005,signal transduction involved in positive regul...,GO,GO
84747,GO:1904001,positive regulation of pyrimidine-containing c...,positively_regulates,GO:0100068,positive regulation of pyrimidine-containing c...,GO,GO


In [24]:
# Things without 'regulation of' that appearantly regulate...
print(len(reg_res[~reg_res['src_name'].str.contains('regulation of')]))
reg_res[~reg_res['src_name'].str.contains('regulation of')].sample(5)

166


Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
6400,GO:0004860,protein kinase inhibitor activity,negatively_regulates,GO:0004672,protein kinase activity,GO,GO
10716,GO:0008073,ornithine decarboxylase inhibitor activity,negatively_regulates,GO:0004586,ornithine decarboxylase activity,GO,GO
53312,GO:0052371,regulation by organism of entry into other org...,regulates,GO:0051828,entry into other organism involved in symbioti...,GO,GO
64057,GO:0072587,DNA topoisomerase (ATP-hydrolyzing) activator ...,positively_regulates,GO:0003918,DNA topoisomerase type II (ATP-hydrolyzing) ac...,GO,GO
64951,GO:0075235,modulation of zoospore movement on or near host,regulates,GO:0075234,zoospore movement on or near host,GO,GO


In [25]:
edges.columns

Index(['start_id', 'end_id', 'type', 'parent_ixn', 'pub_med_ids',
       'organism_id', 'abbv', 'source', 'evidence', 'direct_evidence',
       'corrected_pvalue', 'inference_gene_symbol', 'qualifier',
       'db_reference', 'evidence_code', 'with_or_from', 'date', 'assigned_by',
       'experiments', 'support_type'],
      dtype='object')

In [26]:
combo_cols += ['abbv', 'parent_ixn', 'db_reference', 'evidence_code', 'source', 'evidence']
mrg = pd.merge(combo[combo_cols], reg_res, how='left', left_on='end_id', right_on='src_id')

In [27]:
'{:,}'.format(len(mrg))

'6,478,252'

In [31]:
# don't wnat to touch the curated edges... they already have direction to them
reg_edges = mrg[mrg['parent_ixn'].isnull()]
reg_edges = reg_edges.dropna(subset=['tgt_id'])

reg_edges.head(2)

Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label,abbv,parent_ixn,db_reference,evidence_code,source,evidence,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
5354839,100,adenosine deaminase,Gene,involved_in_GinBP,GO:0002636,positive regulation of germinal center formation,Biological Process,GinBP,,GO_REF:0000107,IEA,GO,computed,GO:0002636,positive regulation of germinal center formation,positively_regulates,GO:0002467,germinal center formation,GO,GO
5354840,100,adenosine deaminase,Gene,involved_in_GinBP,GO:0002686,negative regulation of leukocyte migration,Biological Process,GinBP,,GO_REF:0000107,IEA,GO,computed,GO:0002686,negative regulation of leukocyte migration,negatively_regulates,GO:0050900,leukocyte migration,GO,GO


In [32]:
id_to_label = all_nodes.set_index('id')['label'].to_dict()

In [33]:
abbv, et = gt.get_abbrev_dict_and_edge_tuples(all_nodes, edges.drop_duplicates(subset=['type']))

In [34]:
reg_edges = reg_edges.drop(['end_id', 'end_label', 'end_name'], axis=1)
reg_edges['end_label'] = reg_edges['tgt_id'].map(id_to_label)
reg_edges['abbv'] = reg_edges['start_label'].map(abbv) + \
                        reg_edges['rel_type'].apply(lambda s: ''.join([z[0] for z in s.split('_')])) + \
                        reg_edges['end_label'].map(abbv)
reg_edges['type'] = reg_edges['rel_type'] + '_' + reg_edges['abbv']
reg_edges = reg_edges.rename(columns={'tgt_id':'end_id', 'tgt_name': 'end_name'})
reg_edges.head(2)

Unnamed: 0,start_id,start_name,start_label,type,abbv,parent_ixn,db_reference,evidence_code,source,evidence,src_id,src_name,rel_type,end_id,end_name,src_src,tgt_src,end_label
5354839,100,adenosine deaminase,Gene,positively_regulates_GprBP,GprBP,,GO_REF:0000107,IEA,GO,computed,GO:0002636,positive regulation of germinal center formation,positively_regulates,GO:0002467,germinal center formation,GO,GO,Biological Process
5354840,100,adenosine deaminase,Gene,negatively_regulates_GnrBP,GnrBP,,GO_REF:0000107,IEA,GO,computed,GO:0002686,negative regulation of leukocyte migration,negatively_regulates,GO:0050900,leukocyte migration,GO,GO,Biological Process


In [35]:
reg_edges = reg_edges[combo_cols].drop(['start_name', 'start_label', 'end_name', 'end_label'], axis=1).reset_index(drop=True)
reg_edges.head(2)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code,source,evidence
0,100,positively_regulates_GprBP,GO:0002467,GprBP,,GO_REF:0000107,IEA,GO,computed
1,100,negatively_regulates_GnrBP,GO:0050900,GnrBP,,GO_REF:0000107,IEA,GO,computed


# Putting it all together

In [36]:
reg_edges.head(2)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code,source,evidence
0,100,positively_regulates_GprBP,GO:0002467,GprBP,,GO_REF:0000107,IEA,GO,computed
1,100,negatively_regulates_GnrBP,GO:0050900,GnrBP,,GO_REF:0000107,IEA,GO,computed


In [37]:
reg_edges['type'].value_counts()

positively_regulates_GprBP     15578
negatively_regulates_GnrBP     10464
regulates_GrBP                 10036
positively_regulates_GprMF      2144
negatively_regulates_GnrMF      1593
regulates_PFrBP                  898
regulates_GrMF                   851
negatively_regulates_NnrBP       737
positively_regulates_NprBP       464
negatively_regulates_PFnrBP      267
positively_regulates_XprBP       197
positively_regulates_PFprBP      196
negatively_regulates_PFnrMF      111
regulates_XrBP                   109
positively_regulates_PFprMF       90
negatively_regulates_XnrBP        63
regulates_PFrMF                   58
negatively_regulates_NnrMF        50
regulates_NrBP                    48
regulates_PWrBP                   46
negatively_regulates_RXnrBP       36
positively_regulates_PWprBP       30
positively_regulates_XprMF        17
positively_regulates_RXprBP       16
regulates_RXrBP                   13
regulates_XrMF                    12
positively_regulates_NprMF        12
n

In [38]:
print('Total number of new edges: {:,}'.format(len(reg_edges)))
print('Number of unique new edges: {:,}'.format(len(reg_edges.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

Total number of new edges: 44,176
Number of unique new edges: 43,704


Why are so many edges duplicated? what kind are they?

In [39]:
ix = reg_edges.duplicated(keep=False)
reg_edges[ix].sort_values(['start_id', 'end_id', 'type']).head(10)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code,source,evidence
780,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP,GO,curated
781,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP,GO,curated
1940,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA,GO,curated
1941,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA,GO,curated
2322,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IDA|IBA,GO,curated
2324,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IDA|IBA,GO,curated
2626,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA,GO,curated
2628,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA,GO,curated
2805,1124,positively_regulates_GprMF,GO:0003924,GprMF,,GO_REF:0000002,IEA,GO,computed
2807,1124,positively_regulates_GprMF,GO:0003924,GprMF,,GO_REF:0000002,IEA,GO,computed


In [40]:
reg_edges[ix]['type'].value_counts()

positively_regulates_GprMF     92
negatively_regulates_GnrMF     60
positively_regulates_PFprMF    24
regulates_GrMF                 10
negatively_regulates_PFnrMF    10
negatively_regulates_XnrMF      2
Name: type, dtype: int64

In [41]:
dup_types = reg_edges[ix]['type'].unique()

dup_dfs = []

for dt in dup_types:
    dup_dfs.append(reg_edges[ix].query('type == @dt').sort_values(['start_id', 'end_id', 'type']).head(6))
    
pd.concat(dup_dfs)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code,source,evidence
780,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP,GO,curated
781,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP,GO,curated
2626,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA,GO,curated
2628,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA,GO,curated
4045,132864,negatively_regulates_GnrMF,GO:0003924,GnrMF,,GO_REF:0000024,ISS,GO,computed
4046,132864,negatively_regulates_GnrMF,GO:0003924,GnrMF,,GO_REF:0000024,ISS,GO,computed
1940,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA,GO,curated
1941,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA,GO,curated
2322,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IDA|IBA,GO,curated
2324,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IDA|IBA,GO,curated


I see no reason for any of these duplications.... We'll just do a simple 'drop_duplicates' rather than merging any columns

In [42]:
reg_edges = reg_edges.drop_duplicates(subset=['start_id', 'end_id', 'type']).copy()

In [43]:
len(reg_edges)

43704

In [44]:
all_node_ids = set(all_nodes['id']) 

print(len(reg_edges))
reg_edges_filt = reg_edges.query('start_id in @all_node_ids and end_id in @all_node_ids')
print(len(reg_edges_filt))

43704
43704


In [45]:
all_edges = pd.concat([edges, reg_edges_filt], sort=False)
all_edges.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,source,evidence,direct_evidence,corrected_pvalue,inference_gene_symbol,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,CTD,curated,,,,,,,,,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,CTD,curated,,,,,,,,,,,


In [46]:
all_edge_ids = all_edges[['start_id', 'end_id']].stack()
filt_nodes = all_nodes.query('id in @all_edge_ids')
len(filt_nodes)

126691

# Save to Disk

In [48]:
gt.add_colons(reg_edges).to_csv(out_dir.joinpath('new_edges.csv'), index=False)

In [49]:
gt.add_colons(all_nodes, id_name='identifier').to_csv(out_dir.joinpath('nodes_all.csv'), index=False)
gt.add_colons(filt_nodes, id_name='identifier').to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

gt.add_colons(all_edges).to_csv(out_dir.joinpath('edges.csv'), index=False)