# GO Nodes into Edges

GO Biological Process nodes sometimes have to do with regulation...

E.G. GO:0045861	negative regulation of proteolysis	

Any Term that links to this go term essentially has an edge:
    negatively_regulates -> GO:0006508	proteolysis

Therefore we will determine these regulation nodes and convert them to edges.

In [1]:
import pandas as pd
from pathlib import Path
from metapaths.tools.hetnet_file_processing import read_reactome
from metapaths.tools.processing import regularize_colnames, head, expand_col_on_char, combine_group_rows_on_char
from hetnet_ml.src import graph_tools as gt

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

load_dir = Path('../2_pipeline/00_download_data/out/').resolve()
network_dir = load_dir.parent.parent.joinpath('05_GO_Genes_and_protein_families/out')

In [2]:
all_nodes = gt.remove_colons(pd.read_csv(network_dir.joinpath('nodes_all.csv'), dtype=str))
edges = gt.remove_colons(pd.read_csv(network_dir.joinpath('edges.csv'), dtype=str))

In [3]:
all_node_ids = all_nodes['id'].values
new_nodes = []
new_edges = []

In [4]:
all_nodes.head(2)

Unnamed: 0,id,name,label,tree_numbers,drug_bank_ids,alt_disease_ids,gene_symbol,alt_gene_ids,bio_gridids,pharm_gkbids,uni_prot_ids,uniprot_id,mesh_ids,chebi_ids
0,MESH:C089250,(0.017ferrocene)amylose,Compound,D01.490.200/C089250|D02.691.550.200/C089250|D0...,,,,,,,,,,
1,MESH:C114385,001-C8-NBD,Compound,D03.383.129.462.580/C114385|D12.644.456/C114385,,,,,,,,,,


# GO regulation...

In [5]:
go_terms = all_nodes.query('label in {}'.format(['Molecular Function', 'Cellular Component', 'Biological Process']))
len(go_terms)

45017

In [6]:
reg_terms = go_terms['name'].str.contains('regulation of')
reg_terms.sum()

10748

In [7]:
reg_ids = go_terms[reg_terms]['id'].values

## We'll get the regulation Edges from go.obo

Need some functions to parse the obo file...

In [8]:
def read_lines(filename):
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            yield line

            
def parse_id(line):
    return line.lstrip('id:').strip()


def parse_name(line):
    return line.lstrip('name:').strip()


def parse_relationship(line, go_id, name):
    value = line.lstrip('relationship:').strip()
    rel_info, tgt_name = value.split(' ! ')
    id_idx = rel_info.index('GO:')
    tgt_id = rel_info[id_idx:]
    rel_type = rel_info[:id_idx].strip()
    
    return {'go_id': go_id, 'go_name': name, 'rel_type': rel_type, 'tgt_id': tgt_id, 'tgt_name': tgt_name}            


def read_obo_relationships(filename):
    relationships = []

    for l in read_lines(filename):
        if l.startswith('id:'):
            go_id = parse_id(l)
        if l.startswith('name:'):
            name = parse_name(l)
        if l.startswith('relationship:'):
            relationship = parse_relationship(l, go_id, name)
            relationships.append(relationship)

    return pd.DataFrame(relationships)

In [9]:
go_rels = read_obo_relationships(load_dir.joinpath('go.obo'))

In [10]:
go_rels.head(10)

Unnamed: 0,go_id,go_name,rel_type,tgt_id,tgt_name
0,GO:0000018,regulation of DNA recombination,regulates,GO:0006310,DNA recombination
1,GO:0000019,regulation of mitotic recombination,regulates,GO:0006312,mitotic recombination
2,GO:0000022,mitotic spindle elongation,part_of,GO:0000070,mitotic sister chromatid segregation
3,GO:0000022,mitotic spindle elongation,part_of,GO:0007052,mitotic spindle organization
4,GO:0000027,ribosomal large subunit assembly,part_of,GO:0042255,ribosome assembly
5,GO:0000027,ribosomal large subunit assembly,part_of,GO:0042273,ribosomal large subunit biogenesis
6,GO:0000028,ribosomal small subunit assembly,part_of,GO:0042255,ribosome assembly
7,GO:0000028,ribosomal small subunit assembly,part_of,GO:0042274,ribosomal small subunit biogenesis
8,GO:0000030,mannosyltransferase activity,part_of,GO:0097502,mannosylation
9,GO:0000036,acyl carrier activity,part_of,GO:0006633,fatty acid biosynthetic process


In [11]:
go_rels['rel_type'].value_counts()

part_of                 7933
regulates               3579
negatively_regulates    3131
positively_regulates    3105
has_part                 733
occurs_in                193
happens_during             7
ends_during                1
Name: rel_type, dtype: int64

In [12]:
print(len(go_rels.query('rel_type == "part_of"')))
go_rels.query('rel_type == "part_of"').sample(5)

7933


Unnamed: 0,go_id,go_name,rel_type,tgt_id,tgt_name
6203,GO:0035857,eosinophil fate specification,part_of,GO:0035854,eosinophil fate commitment
3355,GO:0015247,aminophospholipid transmembrane transporter ac...,part_of,GO:0015917,aminophospholipid transport
6121,GO:0035599,aspartic acid methylthiotransferase activity,part_of,GO:0018339,peptidyl-L-beta-methylthioaspartic acid biosyn...
11669,GO:0072326,vulval cell fate determination,part_of,GO:0072325,vulval cell fate commitment
9989,GO:0060681,branch elongation involved in ureteric bud bra...,part_of,GO:0001658,branching involved in ureteric bud morphogenesis


In [13]:
reg_rels = ['regulates', 'negatively_regulates', 'positively_regulates']
reg_res = go_rels.query('rel_type in @reg_rels')

In [14]:
reg_ids = reg_res['go_id'].values

In [15]:
combo = gt.combine_nodes_and_edges(all_nodes, edges)
combo_cols = ['start_id', 'start_name', 'start_label', 'type', 'end_id', 'end_name', 'end_label']

In [16]:
no_map_terms = go_terms[reg_terms].query('id not in @reg_ids')['id'].values
print(len(combo.query('start_id in @no_map_terms and end_label == "Disease"')))
combo.query('start_id in @no_map_terms and end_label == "Disease"')[combo_cols].sample(5)

62099


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
4794880,GO:0061418,regulation of transcription from RNA polymeras...,Biological Process,associated_with_BPawD,MESH:D002114,Calcinosis,Disease
4471937,GO:0051092,positive regulation of NF-kappaB transcription...,Biological Process,associated_with_BPawD,MESH:D003537,Cystadenoma,Disease
4115201,GO:0043116,negative regulation of vascular permeability,Biological Process,associated_with_BPawD,MESH:D007247,"Infertility, Female",Disease
4911051,GO:0023019,signal transduction involved in regulation of ...,Biological Process,associated_with_BPawD,MESH:D009362,Neoplasm Metastasis,Disease
4578923,GO:1901522,positive regulation of transcription from RNA ...,Biological Process,associated_with_BPawD,MESH:D008654,Mesothelioma,Disease


In [17]:
map_terms = go_terms.query('id in @reg_ids')['id'].values
print(len(combo.query('start_id in @map_terms and end_label == "Disease"')))
combo.query('start_id in @map_terms and end_label == "Disease"')[combo_cols].sample(5)

656949


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
4281435,GO:1900039,positive regulation of cellular response to hy...,Biological Process,associated_with_BPawD,MESH:D012770,"Shock, Cardiogenic",Disease
4028952,GO:2000483,negative regulation of interleukin-8 secretion,Biological Process,associated_with_BPawD,MESH:D007249,Inflammation,Disease
4792913,GO:0006357,regulation of transcription by RNA polymerase II,Biological Process,associated_with_BPawD,MESH:D018335,Rhabdoid Tumor,Disease
4234354,GO:2001171,positive regulation of ATP biosynthetic process,Biological Process,associated_with_BPawD,MESH:D009421,Nervous System Malformations,Disease
4271968,GO:0030335,positive regulation of cell migration,Biological Process,associated_with_BPawD,MESH:D020325,Migraine with Aura,Disease


In [18]:
tgt_ids = reg_res['tgt_id'].values
map_terms1 = go_terms.query('id in @tgt_ids')['id'].values
print(len(combo.query('start_id in @map_terms1 and end_label == "Disease"')))
combo.query('start_id in @map_terms1 and end_label == "Disease"')[combo_cols].sample(5)

532791


Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label
3418930,GO:0044267,cellular protein metabolic process,Biological Process,associated_with_BPawD,MESH:D001480,Basal Ganglia Diseases,Disease
3723884,GO:0043967,histone H4 acetylation,Biological Process,associated_with_BPawD,MESH:D015417,Hereditary Sensory and Motor Neuropathy,Disease
4618984,GO:0016579,protein deubiquitination,Biological Process,associated_with_BPawD,MESH:D009422,Nervous System Diseases,Disease
3326623,GO:0008150,biological_process,Biological Process,associated_with_BPawD,MESH:C566593,"Hypomagnesemia 1, Intestinal",Disease
3555883,GO:0006310,DNA recombination,Biological Process,associated_with_BPawD,MESH:D001855,Bone Marrow Diseases,Disease


In [19]:
# Regulations of regulation.... not sure what to do here but just drop
print(len(reg_res[reg_res['tgt_name'].str.contains('regulation of')]))
reg_res[reg_res['tgt_name'].str.contains('regulation of')].sample(5)

34


Unnamed: 0,go_id,go_name,rel_type,tgt_id,tgt_name
13862,GO:1900545,regulation of phenotypic switching by regulati...,regulates,GO:0100057,regulation of phenotypic switching by transcri...
13809,GO:1900465,negative regulation of arginine catabolic proc...,negatively_regulates,GO:0100045,negative regulation of arginine catabolic proc...
13768,GO:1900415,regulation of fungal-type cell wall biogenesis...,regulates,GO:0100033,regulation of fungal-type cell wall biogenesis...
13863,GO:1900547,negative regulation of phenotypic switching by...,regulates,GO:0100059,negative regulation of phenotypic switching by...
11339,GO:0071809,regulation of fever generation by regulation o...,regulates,GO:0100008,regulation of fever generation by prostaglandi...


In [20]:
# Things without 'regulation of' that appearantly regulate...
print(len(reg_res[~reg_res['go_name'].str.contains('regulation of')]))
reg_res[~reg_res['go_name'].str.contains('regulation of')].sample(5)

166


Unnamed: 0,go_id,go_name,rel_type,tgt_id,tgt_name
6514,GO:0042030,ATPase inhibitor activity,negatively_regulates,GO:0016887,ATPase activity
181,GO:0000765,response to pheromone regulating pheromone-ind...,regulates,GO:0000762,pheromone-induced unidirectional conjugation
3676,GO:0019834,phospholipase A2 inhibitor activity,negatively_regulates,GO:0004623,phospholipase A2 activity
16846,GO:1905318,meiosis I spindle assembly checkpoint,negatively_regulates,GO:0051755,meiotic sister chromatid arm separation
9618,GO:0060228,phosphatidylcholine-sterol O-acyltransferase a...,positively_regulates,GO:0004607,phosphatidylcholine-sterol O-acyltransferase a...


In [21]:
edges.columns

Index(['start_id', 'end_id', 'type', 'parent_ixn', 'pub_med_ids',
       'organism_id', 'abbv', 'direct_evidence', 'corrected_pvalue',
       'inference_gene_symbol', 'qualifier', 'db_reference', 'evidence_code',
       'with_or_from', 'date', 'assigned_by', 'experiments', 'support_type'],
      dtype='object')

In [22]:
combo_cols += ['abbv', 'parent_ixn', 'db_reference', 'evidence_code']
mrg = pd.merge(combo[combo_cols], reg_res, how='left', left_on='end_id', right_on='go_id')

In [23]:
len(mrg)

6333568

In [24]:
# don't wnat to touch the curated edges... they already have direction to them
reg_edges = mrg[mrg['parent_ixn'].isnull()]
reg_edges = reg_edges.dropna(subset=['tgt_id'])

reg_edges.head(2)

Unnamed: 0,start_id,start_name,start_label,type,end_id,end_name,end_label,abbv,parent_ixn,db_reference,evidence_code,go_id,go_name,rel_type,tgt_id,tgt_name
5354839,100,adenosine deaminase,Gene,involved_in_GinBP,GO:0002636,positive regulation of germinal center formation,Biological Process,GinBP,,GO_REF:0000107,IEA,GO:0002636,positive regulation of germinal center formation,positively_regulates,GO:0002467,germinal center formation
5354840,100,adenosine deaminase,Gene,involved_in_GinBP,GO:0002686,negative regulation of leukocyte migration,Biological Process,GinBP,,GO_REF:0000107,IEA,GO:0002686,negative regulation of leukocyte migration,negatively_regulates,GO:0050900,leukocyte migration


In [25]:
id_to_label = all_nodes.set_index('id')['label'].to_dict()

In [26]:
abbv, et = gt.get_abbrev_dict_and_edge_tuples(all_nodes, edges.drop_duplicates(subset=['type']))

In [27]:
reg_edges = reg_edges.drop(['end_id', 'end_label', 'end_name'], axis=1)
reg_edges['end_label'] = reg_edges['tgt_id'].map(id_to_label)
reg_edges['abbv'] = reg_edges['start_label'].map(abbv) + \
                        reg_edges['rel_type'].apply(lambda s: ''.join([z[0] for z in s.split('_')])) + \
                        reg_edges['end_label'].map(abbv)
reg_edges['type'] = reg_edges['rel_type'] + '_' + reg_edges['abbv']
reg_edges = reg_edges.rename(columns={'tgt_id':'end_id', 'tgt_name': 'end_name'})
reg_edges.head(2)

Unnamed: 0,start_id,start_name,start_label,type,abbv,parent_ixn,db_reference,evidence_code,go_id,go_name,rel_type,end_id,end_name,end_label
5354839,100,adenosine deaminase,Gene,positively_regulates_GprBP,GprBP,,GO_REF:0000107,IEA,GO:0002636,positive regulation of germinal center formation,positively_regulates,GO:0002467,germinal center formation,Biological Process
5354840,100,adenosine deaminase,Gene,negatively_regulates_GnrBP,GnrBP,,GO_REF:0000107,IEA,GO:0002686,negative regulation of leukocyte migration,negatively_regulates,GO:0050900,leukocyte migration,Biological Process


In [28]:
reg_edges = reg_edges[combo_cols].drop(['start_name', 'start_label', 'end_name', 'end_label'], axis=1).reset_index(drop=True)
reg_edges.head(2)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code
0,100,positively_regulates_GprBP,GO:0002467,GprBP,,GO_REF:0000107,IEA
1,100,negatively_regulates_GnrBP,GO:0050900,GnrBP,,GO_REF:0000107,IEA


# Putting it all together

In [29]:
reg_edges.head(2)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code
0,100,positively_regulates_GprBP,GO:0002467,GprBP,,GO_REF:0000107,IEA
1,100,negatively_regulates_GnrBP,GO:0050900,GnrBP,,GO_REF:0000107,IEA


In [30]:
reg_edges['type'].value_counts()

positively_regulates_GprBP     15578
negatively_regulates_GnrBP     10464
regulates_GrBP                 10036
positively_regulates_GprMF      2144
negatively_regulates_GnrMF      1593
regulates_PFrBP                  898
regulates_GrMF                   851
negatively_regulates_NnrBP       737
positively_regulates_NprBP       464
negatively_regulates_PFnrBP      267
positively_regulates_XprBP       197
positively_regulates_PFprBP      196
negatively_regulates_PFnrMF      111
regulates_XrBP                   109
positively_regulates_PFprMF       90
negatively_regulates_XnrBP        63
regulates_PFrMF                   58
negatively_regulates_NnrMF        50
regulates_NrBP                    48
regulates_PWrBP                   46
negatively_regulates_RXnrBP       36
positively_regulates_PWprBP       30
positively_regulates_XprMF        17
positively_regulates_RXprBP       16
regulates_RXrBP                   13
regulates_XrMF                    12
positively_regulates_NprMF        12
n

In [31]:
print('Total number of new edges: {:,}'.format(len(reg_edges)))
print('Number of unique new edges: {:,}'.format(len(reg_edges.drop_duplicates(subset=['start_id', 'end_id', 'type']))))

Total number of new edges: 44,176
Number of unique new edges: 43,704


Why are so many edges duplicated? what kind are they?

In [32]:
ix = reg_edges.duplicated(keep=False)
reg_edges[ix].sort_values(['start_id', 'end_id', 'type']).head(10)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code
780,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP
781,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP
1940,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA
1941,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA
2322,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IBA|IDA
2324,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IBA|IDA
2626,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA
2628,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA
2805,1124,positively_regulates_GprMF,GO:0003924,GprMF,,GO_REF:0000002,IEA
2807,1124,positively_regulates_GprMF,GO:0003924,GprMF,,GO_REF:0000002,IEA


In [33]:
reg_edges[ix]['type'].value_counts()

positively_regulates_GprMF     92
negatively_regulates_GnrMF     60
positively_regulates_PFprMF    24
regulates_GrMF                 10
negatively_regulates_PFnrMF    10
negatively_regulates_XnrMF      2
Name: type, dtype: int64

In [34]:
dup_types = reg_edges[ix]['type'].unique()

dup_dfs = []

for dt in dup_types:
    dup_dfs.append(reg_edges[ix].query('type == @dt').sort_values(['start_id', 'end_id', 'type']).head(6))
    
pd.concat(dup_dfs)

Unnamed: 0,start_id,type,end_id,abbv,parent_ixn,db_reference,evidence_code
780,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP
781,10221,negatively_regulates_GnrMF,GO:0004672,GnrMF,,PMID:15299019,IMP
2626,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA
2628,11142,negatively_regulates_GnrMF,GO:0004691,GnrMF,,PMID:21873635,IBA
4045,132864,negatively_regulates_GnrMF,GO:0003924,GnrMF,,GO_REF:0000024,ISS
4046,132864,negatively_regulates_GnrMF,GO:0003924,GnrMF,,GO_REF:0000024,ISS
1940,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA
1941,10681,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:10521509,IDA
2322,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IBA|IDA
2324,10928,positively_regulates_GprMF,GO:0003924,GprMF,,PMID:7673236,IBA|IDA


I see no reason for any of these duplications.... We'll just do a simple 'drop_duplicates' rather than merging any columns

In [35]:
reg_edges = reg_edges.drop_duplicates(subset=['start_id', 'end_id', 'type']).copy()

In [36]:
len(reg_edges)

43704

In [38]:
all_node_ids = set(all_nodes['id']) 

print(len(reg_edges))
reg_edges_filt = reg_edges.query('start_id in @all_node_ids and end_id in @all_node_ids')
print(len(reg_edges_filt))

43704
43704


In [41]:
all_edges = pd.concat([edges, reg_edges_filt], sort=False)
all_edges.head(2)

Unnamed: 0,start_id,end_id,type,parent_ixn,pub_med_ids,organism_id,abbv,direct_evidence,corrected_pvalue,inference_gene_symbol,qualifier,db_reference,evidence_code,with_or_from,date,assigned_by,experiments,support_type
0,MESH:C000121,4313,decreases_activity_CdaG,decreases^activity,25899827,9606,CdaG,,,,,,,,,,,
1,MESH:C000121,4313,decreases_expression_CdeG,decreases^expression,25899827,9606,CdeG,,,,,,,,,,,


In [43]:
all_edge_ids = all_edges[['start_id', 'end_id']].stack()
filt_nodes = all_nodes.query('id in @all_edge_ids')
len(filt_nodes)

126348

# Save to Disk

In [44]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
// hack to get the filename for this notebook

<IPython.core.display.Javascript object>

In [45]:
out_dir = Path('../2_pipeline/').joinpath(nb_name.split('.')[0]).joinpath('out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

In [47]:
#gt.add_colons(new_nodes_df, id_name='identifier').to_csv(out_dir.joinpath('new_nodes.csv'), index=False)
gt.add_colons(reg_edges).to_csv(out_dir.joinpath('new_edges.csv'), index=False)

In [48]:
gt.add_colons(all_nodes, id_name='identifier').to_csv(out_dir.joinpath('nodes_all.csv'), index=False)
gt.add_colons(filt_nodes, id_name='identifier').to_csv(out_dir.joinpath('nodes_filt.csv'), index=False)

gt.add_colons(all_edges).to_csv(out_dir.joinpath('edges.csv'), index=False)