# GO Nodes into Edges

GO Biological Process nodes sometimes have to do with regulation...

E.g. GO:0045861	negative regulation of proteolysis

Any term that links to this GO term essentially has an edge:
    negatively_regulates -> GO:0006508	proteolysis

Therefore we will determine these regulation nodes and convert them to edges.

In [1]:
import pandas as pd
from pathlib import Path
from data_tools.df_processing import regularize_colnames, expand_col_on_char
from data_tools import graphs as gt
from data_tools import obo_processing as ot

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Input locations: the raw downloads, and the network written by the
# '05b_Other_Ontologies' step, which sits two levels up from the downloads.
load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
network_dir = load_dir.parents[1] / '05b_Other_Ontologies' / 'out'

  from tqdm.autonotebook import tqdm


In [2]:
# Output folder for this notebook; created (with parents) if it does not exist.
this_name = '05c_Computing_GO_Regulation_Edges'
out_dir = (Path('../2_pipeline/') / this_name / 'out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the existing network; every column is read as a string.
all_nodes, edges = (
    pd.read_csv(network_dir / csv_name, dtype=str)
    for csv_name in ('nodes.csv', 'edges.csv')
)

In [4]:
# Scratch containers plus the array of known node identifiers.
new_nodes, new_edges = [], []
all_node_ids = all_nodes['id'].values

In [5]:
# Peek at the first two node records.
all_nodes.iloc[:2]

Unnamed: 0,id,name,label,xrefs
0,UBERON:0000002,cervix,Anatomy,MESH:D002584|UBERON:0000002
1,UBERON:0000004,human nose,Anatomy,MESH:D009666|UBERON:0000004


# GO regulation...

In [6]:
# Keep only nodes whose label is one of the three GO namespaces.
go_labels = ['Molecular Function', 'Cellular Component', 'Biological Process']
go_terms = all_nodes[all_nodes['label'].isin(go_labels)]
len(go_terms)

45877

In [7]:
# Boolean mask over go_terms: True where the term name mentions 'regulation of'.
reg_terms = go_terms['name'].str.contains(pat='regulation of')
reg_terms.sum()

10941

In [8]:
# Ids of the GO terms picked out by the name-based mask.
reg_ids = go_terms.loc[reg_terms, 'id'].values

## We'll get the regulation Edges from go.obo

Need some functions to parse the obo file...

In [9]:
# Pull the relationship table out of the downloaded GO ontology file.
go_obo_path = load_dir / 'go.obo'
go_rels = ot.get_ontology_edges(go_obo_path)

In [10]:
# First ten parsed GO relationships.
go_rels.iloc[:10]

Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
0,GO:0000001,mitochondrion inheritance,is_a,GO:0048308,organelle inheritance,GO,GO
1,GO:0000001,mitochondrion inheritance,is_a,GO:0048311,mitochondrion distribution,GO,GO
2,GO:0000002,mitochondrial genome maintenance,is_a,GO:0007005,mitochondrion organization,GO,GO
3,GO:0000003,reproduction,is_a,GO:0008150,biological_process,GO,GO
4,GO:0000006,high-affinity zinc transmembrane transporter a...,is_a,GO:0005385,zinc ion transmembrane transporter activity,GO,GO
5,GO:0000007,low-affinity zinc ion transmembrane transporte...,is_a,GO:0005385,zinc ion transmembrane transporter activity,GO,GO
6,GO:0000009,"alpha-1,6-mannosyltransferase activity",is_a,GO:0000030,mannosyltransferase activity,GO,GO
7,GO:0000010,trans-hexaprenyltranstransferase activity,is_a,GO:0016765,"transferase activity, transferring alkyl or ar...",GO,GO
8,GO:0000011,vacuole inheritance,is_a,GO:0007033,vacuole organization,GO,GO
9,GO:0000011,vacuole inheritance,is_a,GO:0048308,organelle inheritance,GO,GO


In [11]:
# How many GO relationships there are of each relation type.
go_rels['rel_type'].value_counts()

is_a                    77058
part_of                  7933
regulates                3579
negatively_regulates     3131
positively_regulates     3105
has_part                  733
occurs_in                 193
happens_during              7
ends_during                 1
Name: rel_type, dtype: int64

In [12]:
# Spot-check a few 'part_of' relationships.
# A fixed seed keeps the preview identical across Restart & Run All.
go_rels.query('rel_type == "part_of"').sample(5, random_state=0)

Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
18990,GO:0016990,arginine deiminase activity,part_of,GO:0018101,protein citrullination,GO,GO
24673,GO:0030658,transport vesicle membrane,part_of,GO:0030133,transport vesicle,GO,GO
4869,GO:0003221,right ventricular cardiac muscle tissue morpho...,part_of,GO:0003215,cardiac right ventricle morphogenesis,GO,GO
50199,GO:0050908,detection of light stimulus involved in visual...,part_of,GO:0007601,visual perception,GO,GO
63293,GO:0072210,metanephric nephron development,part_of,GO:0001656,metanephros development,GO,GO


In [13]:
# The three relation types that express regulation, and the GO relationships using them.
reg_rels = ['regulates', 'negatively_regulates', 'positively_regulates']
reg_res = go_rels[go_rels['rel_type'].isin(reg_rels)]

In [14]:
# Source-term ids of every regulation relationship (replaces the earlier
# name-based reg_ids with the ontology-derived ones).
reg_ids = reg_res.loc[:, 'src_id'].values

In [15]:
# Columns shown when previewing an edge together with its endpoint annotations.
combo_cols = [
    'start_id', 'start_name', 'start_label',
    'type',
    'end_id', 'end_name', 'end_label',
]
# Edge table combined with the node table via the project helper.
combo = gt.combine_nodes_and_edges(all_nodes, edges)

In [16]:
# Terms that look like regulation by name but have no regulation relationship
# in go.obo, so they cannot be turned into an edge.
no_map_terms = go_terms[reg_terms].query('id not in @reg_ids')['id'].values

# Evaluate the edge filter once and reuse it for both the count and the preview.
no_map_disease_edges = combo.query('start_id in @no_map_terms and end_label == "Disease"')
print('Number of edges containing Regulation terms that cannot be converted to an edge: {:,}'.format(
            len(no_map_disease_edges)))
# A fixed seed keeps the sampled preview reproducible on re-run.
no_map_disease_edges[combo_cols].sample(5, random_state=0)

Number of edges containing Regulation terms that cannot be converted to an edge: 2,041


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
38721,GO:0010460,positive regulation of heart rate,Biological Process,associated_with,DOID:0110224,Brugada syndrome 7,Disease
80453,GO:0045814,"negative regulation of gene expression, epigen...",Biological Process,associated_with,DOID:14731,Weaver syndrome,Disease
106928,GO:0098736,negative regulation of the force of heart cont...,Biological Process,associated_with,DOID:12978,Plasmodium vivax malaria,Disease
38734,GO:0010460,positive regulation of heart rate,Biological Process,associated_with,MONDO:0009314,GTP cyclohydrolase I deficiency,Disease
76439,GO:0043620,regulation of DNA-templated transcription in r...,Biological Process,associated_with,DOID:5363,myxoid liposarcoma,Disease


In [17]:
# GO terms that are the source of a regulation relationship and can therefore
# be rewritten as an edge to the regulated term.
map_terms = go_terms.query('id in @reg_ids')['id'].values

# Evaluate the edge filter once and reuse it for both the count and the preview.
map_disease_edges = combo.query('start_id in @map_terms and end_label == "Disease"')
print('Number of edges containing Regulation terms that can become an edge: {:,}'.format(
    len(map_disease_edges)))
# A fixed seed keeps the sampled preview reproducible on re-run.
map_disease_edges[combo_cols].sample(5, random_state=0)

Number of edges containing Regulation terms that can become an edge: 20,447


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
112386,GO:1904261,positive regulation of basement membrane assem...,Biological Process,associated_with,DOID:3773,third ventricle chordoid glioma,Disease
99740,GO:0071338,positive regulation of hair follicle cell prol...,Biological Process,associated_with,MONDO:0014757,macrothrombocytopenia-lymphedema-developmental...,Disease
70432,GO:0042035,regulation of cytokine biosynthetic process,Biological Process,associated_with,DOID:11714,gestational diabetes,Disease
70799,GO:0042130,negative regulation of T cell proliferation,Biological Process,associated_with,DOID:2436,glomangioma,Disease
39764,GO:0010804,negative regulation of tumor necrosis factor-m...,Biological Process,associated_with,MONDO:0014884,"cholestasis, progressive familial intrahepatic...",Disease


In [18]:
# GO terms that are the *target* of a regulation relationship, and how many
# Disease-ending edges start from them.
tgt_ids = reg_res['tgt_id'].values
map_terms1 = go_terms.query('id in @tgt_ids')['id'].values

# Evaluate the edge filter once and reuse it for both the count and the preview.
tgt_disease_edges = combo.query('start_id in @map_terms1 and end_label == "Disease"')
print(len(tgt_disease_edges))
# A fixed seed keeps the sampled preview reproducible on re-run.
tgt_disease_edges[combo_cols].sample(5, random_state=0)

25093


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
69587,GO:0038027,apolipoprotein A-I-mediated signaling pathway,Biological Process,associated_with,DOID:1388,Tangier disease,Disease
18133,GO:0005261,cation channel activity,Molecular Function,associated_with,DOID:0111129,focal segmental glomerulosclerosis 2,Disease
103000,GO:0086014,atrial cardiac muscle cell action potential,Biological Process,associated_with,DOID:0080249,erythrokeratodermia variabilis et progressiva 3,Disease
16179,GO:0005089,Rho guanyl-nucleotide exchange factor activity,Molecular Function,associated_with,DOID:440,neuromuscular disease,Disease
89115,GO:0051289,protein homotetramerization,Biological Process,associated_with,DOID:0060007,CD3zeta deficiency,Disease


In [19]:
# Regulation relationships whose target is itself a 'regulation of' term.
# The original note here says to "just drop" these.
# NOTE(review): reg_res is passed unfiltered to the later merge, so nothing in
# view actually removes them -- confirm whether they should be excluded.
reg_of_reg = reg_res[reg_res['tgt_name'].str.contains('regulation of')]
print(len(reg_of_reg))
# A fixed seed keeps the sampled preview reproducible on re-run.
reg_of_reg.sample(5, random_state=0)

34


Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
80902,GO:1902574,negative regulation of leucine import by regul...,regulates,GO:0100065,negative regulation of leucine import by trans...,GO,GO
15759,GO:0010999,regulation of eIF2 alpha phosphorylation by heme,regulates,GO:0010998,regulation of translational initiation by eIF2...,GO,GO
74387,GO:1900392,regulation of transport by negative regulation...,negatively_regulates,GO:0100020,regulation of transport by transcription from ...,GO,GO
93348,GO:2000531,regulation of fatty acid biosynthetic process ...,regulates,GO:0100070,regulation of fatty acid biosynthetic process ...,GO,GO
62496,GO:0071810,regulation of fever generation by regulation o...,regulates,GO:0100009,regulation of fever generation by prostaglandi...,GO,GO


In [20]:
# Source terms that take part in a regulation relationship even though their
# name does not contain 'regulation of' (e.g. inhibitor/activator activities).
non_named_regulators = reg_res[~reg_res['src_name'].str.contains('regulation of')]
print(len(non_named_regulators))
# A fixed seed keeps the sampled preview reproducible on re-run.
non_named_regulators.sample(5, random_state=0)

166


Unnamed: 0,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
69853,GO:0099106,ion channel regulator activity,regulates,GO:0015267,channel activity,GO,GO
22139,GO:0019869,chloride channel inhibitor activity,negatively_regulates,GO:0005254,chloride channel activity,GO,GO
35353,GO:0036478,L-dopa decarboxylase activator activity,positively_regulates,GO:0036468,L-dopa decarboxylase activity,GO,GO
11450,GO:0008657,DNA topoisomerase (ATP-hydrolyzing) inhibitor ...,negatively_regulates,GO:0003918,DNA topoisomerase type II (ATP-hydrolyzing) ac...,GO,GO
15349,GO:0010852,cyclase inhibitor activity,negatively_regulates,GO:0009975,cyclase activity,GO,GO


In [21]:
# List the edge-table columns to see which metadata fields are available.
edges.columns

Index(['start_id', 'end_id', 'type', 'dsrc_type', 'comp_type', 'p_val',
       'adj_p', 'source', 'license', 'experiments', 'support_type', 'pmids'],
      dtype='object')

In [22]:
# Carry the edge metadata through the merge.  The list is extended without
# in-place mutation and without re-adding names already present, so running
# this cell again does not produce duplicated columns.
edge_meta_cols = ['source', 'dsrc_type', 'comp_type', 'p_val', 'adj_p', 'license']
combo_cols = combo_cols + [col for col in edge_meta_cols if col not in combo_cols]

# Left-join every edge onto the regulation relationships whose source term is
# the edge's end node; edges with no match get NaN in the reg_res columns.
mrg = pd.merge(combo[combo_cols], reg_res, how='left', left_on='end_id', right_on='src_id')

In [23]:
# Row count of the merged table, with thousands separators.
format(len(mrg), ',')

'3,497,850'

In [24]:
# Keep only merged rows that actually matched a regulation relationship.
matched_regulation = mrg['tgt_id'].notna()
reg_edges = mrg[matched_regulation]

reg_edges.iloc[:2]

Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label,source,dsrc_type,comp_type,p_val,adj_p,license,src_id,src_name,rel_type,tgt_id,tgt_name,src_src,tgt_src
237375,InterPro:IPR000240,Serpin B9/maspin,Protein Family,enables,GO:0004867,serine-type endopeptidase inhibitor activity,Molecular Function,InterPro,curated,,,,custom open,GO:0004867,serine-type endopeptidase inhibitor activity,negatively_regulates,GO:0004252,serine-type endopeptidase activity,GO,GO
237392,InterPro:IPR000272,"Ion-transport regulator, FXYD motif",Protein Family,enables,GO:0099106,ion channel regulator activity,Molecular Function,InterPro,curated,,,,custom open,GO:0099106,ion channel regulator activity,regulates,GO:0015267,channel activity,GO,GO


In [25]:
# Lookup table from node id to node label.
id_to_label = dict(zip(all_nodes['id'], all_nodes['label']))

In [26]:
# Re-point each edge at the regulated term: discard the old edge type and end
# node, label the new end node, then let the regulation relationship supply
# the new type / end_id / end_name.
reg_edges = (
    reg_edges
    .drop(columns=['type', 'end_id', 'end_label', 'end_name'])
    .assign(end_label=lambda frame: frame['tgt_id'].map(id_to_label))
    .rename(columns={'tgt_id': 'end_id', 'tgt_name': 'end_name', 'rel_type': 'type'})
)
reg_edges.iloc[:2]

Unnamed: 0,start_id,start_name,start_label,source,dsrc_type,comp_type,p_val,adj_p,license,src_id,src_name,type,end_id,end_name,src_src,tgt_src,end_label
237375,InterPro:IPR000240,Serpin B9/maspin,Protein Family,InterPro,curated,,,,custom open,GO:0004867,serine-type endopeptidase inhibitor activity,negatively_regulates,GO:0004252,serine-type endopeptidase activity,GO,GO,Molecular Function
237392,InterPro:IPR000272,"Ion-transport regulator, FXYD motif",Protein Family,InterPro,curated,,,,custom open,GO:0099106,ion channel regulator activity,regulates,GO:0015267,channel activity,GO,GO,Molecular Function


In [27]:
# Reduce to the edge-file columns, renumber the rows, and mark the provenance
# of these edges as computed by punning.
reg_edges = (
    reg_edges[combo_cols]
    .drop(columns=['start_name', 'start_label', 'end_name', 'end_label'])
    .reset_index(drop=True)
    .assign(dsrc_type='computed', comp_type='punning')
)
reg_edges.iloc[:2]

Unnamed: 0,start_id,type,end_id,source,dsrc_type,comp_type,p_val,adj_p,license
0,InterPro:IPR000240,negatively_regulates,GO:0004252,InterPro,computed,punning,,,custom open
1,InterPro:IPR000272,regulates,GO:0015267,InterPro,computed,punning,,,custom open


In [28]:
# Number of new regulation edges per original data source.
reg_edges['source'].value_counts()

Gene Ontology|WikiData             37650
WikiData                           19745
Gene Ontology                       2739
Gene Ontology|Reactome|WikiData     2242
InterPro                            1620
Reactome                             338
Reactome|WikiData                    212
Cell Ontology                         36
Gene Ontology|Reactome                17
UBERON                                12
Name: source, dtype: int64

# Putting it all together

In [29]:
# Reminder of what the computed regulation edges look like.
reg_edges.iloc[:2]

Unnamed: 0,start_id,type,end_id,source,dsrc_type,comp_type,p_val,adj_p,license
0,InterPro:IPR000240,negatively_regulates,GO:0004252,InterPro,computed,punning,,,custom open
1,InterPro:IPR000272,regulates,GO:0015267,InterPro,computed,punning,,,custom open


In [30]:
# Number of new edges per regulation type.
reg_edges['type'].value_counts()

regulates               24626
positively_regulates    23042
negatively_regulates    16943
Name: type, dtype: int64

In [31]:
# Compare the raw number of new edges with the number of distinct
# (start_id, end_id, type) triples.
n_new_edges = len(reg_edges)
n_unique_edges = len(reg_edges.drop_duplicates(subset=['start_id', 'end_id', 'type']))
print('Total number of new edges: {:,}'.format(n_new_edges))
print('Number of unique new edges: {:,}'.format(n_unique_edges))

Total number of new edges: 64,611
Number of unique new edges: 63,621


Why are so many edges duplicated? What kind are they?

In [32]:
# Flag every row that shares its (start_id, end_id, type) key with another row.
# This is the same key used for the unique-edge count and for the later
# drop_duplicates, so rows that differ only in metadata (e.g. source or
# license) are included in the inspection instead of being silently missed.
edge_key = ['start_id', 'end_id', 'type']
ix = reg_edges.duplicated(subset=edge_key, keep=False)
reg_edges[ix].sort_values(edge_key).head(10)

Unnamed: 0,start_id,type,end_id,source,dsrc_type,comp_type,p_val,adj_p,license
22365,CPX:CPX-128,negatively_regulates,GO:0038023,Gene Ontology,computed,punning,,,CC-BY 4.0
22366,CPX:CPX-128,negatively_regulates,GO:0038023,Gene Ontology,computed,punning,,,CC-BY 4.0
22461,CPX:CPX-98,negatively_regulates,GO:0008233,Gene Ontology,computed,punning,,,CC-BY 4.0
22462,CPX:CPX-98,negatively_regulates,GO:0008233,Gene Ontology,computed,punning,,,CC-BY 4.0
22,InterPro:IPR003913,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
1224,InterPro:IPR003913,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
33,InterPro:IPR007648,negatively_regulates,GO:0016887,InterPro,computed,punning,,,custom open
1293,InterPro:IPR007648,negatively_regulates,GO:0016887,InterPro,computed,punning,,,custom open
37,InterPro:IPR008296,negatively_regulates,GO:0008233,InterPro,computed,punning,,,custom open
1313,InterPro:IPR008296,negatively_regulates,GO:0008233,InterPro,computed,punning,,,custom open


In [33]:
# Break the flagged duplicate rows down by regulation type.
reg_edges[ix]['type'].value_counts()

positively_regulates    1054
negatively_regulates     738
regulates                 56
Name: type, dtype: int64

In [34]:
# Show up to six flagged rows for each regulation type, sorted so that
# duplicated edges sit next to each other.
dup_rows = reg_edges[ix]
dup_types = dup_rows['type'].unique()

dup_dfs = [
    dup_rows[dup_rows['type'] == dt].sort_values(['start_id', 'end_id', 'type']).head(6)
    for dt in dup_types
]

pd.concat(dup_dfs)

Unnamed: 0,start_id,type,end_id,source,dsrc_type,comp_type,p_val,adj_p,license
22,InterPro:IPR003913,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
1224,InterPro:IPR003913,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
80,InterPro:IPR026817,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
1700,InterPro:IPR026817,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
83,InterPro:IPR027107,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
1715,InterPro:IPR027107,positively_regulates,GO:0003924,InterPro,computed,punning,,,custom open
22365,CPX:CPX-128,negatively_regulates,GO:0038023,Gene Ontology,computed,punning,,,CC-BY 4.0
22366,CPX:CPX-128,negatively_regulates,GO:0038023,Gene Ontology,computed,punning,,,CC-BY 4.0
22461,CPX:CPX-98,negatively_regulates,GO:0008233,Gene Ontology,computed,punning,,,CC-BY 4.0
22462,CPX:CPX-98,negatively_regulates,GO:0008233,Gene Ontology,computed,punning,,,CC-BY 4.0


I see no reason for any of these duplications... We'll just do a simple `drop_duplicates` rather than merging any columns.

In [35]:
# Keep the first occurrence of every (start_id, end_id, type) triple.
repeated_edge = reg_edges.duplicated(subset=['start_id', 'end_id', 'type'])
reg_edges = reg_edges.loc[~repeated_edge].copy()

In [36]:
# Number of regulation edges left after de-duplication.
reg_edges.shape[0]

63621

In [37]:
# Discard edges that reference a node id missing from the node table,
# printing the edge count before and after.
all_node_ids = set(all_nodes['id'])

print(len(reg_edges))
start_known = reg_edges['start_id'].isin(all_node_ids)
end_known = reg_edges['end_id'].isin(all_node_ids)
reg_edges_filt = reg_edges[start_known & end_known]
print(len(reg_edges_filt))

63621
63554


In [38]:
# Append the computed regulation edges to the existing edge table.
edge_frames = [edges, reg_edges_filt]
all_edges = pd.concat(edge_frames, ignore_index=True, sort=False)
all_edges.iloc[:2]

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,support_type,pmids
0,CHEMBL:CHEMBL1743034,NCBIGene:3605,Neutralizing antibody,computed,merge,,,WikiData,CC0 1.0,,,
1,CHEBI:10055,NCBIGene:153,agonist,computed,merge,,,WikiData,CC0 1.0,,,


In [39]:
# Every node id used as an edge endpoint, and the nodes that appear in that set.
all_edge_ids = all_edges[['start_id', 'end_id']].stack()
node_has_edge = all_nodes['id'].isin(all_edge_ids)
filt_nodes = all_nodes[node_has_edge]
len(filt_nodes)

369829

# Save to Disk

In [40]:
# Write the full node table, the edge-connected node table, and the combined
# edge table to the output directory (no index column).
outputs = (
    ('nodes.csv', all_nodes),
    ('nodes_filt.csv', filt_nodes),
    ('edges.csv', all_edges),
)
for csv_name, frame in outputs:
    frame.to_csv(out_dir / csv_name, index=False)