# Reactions and Regulation

We started with reactions as their own nodes... so if a Compound regulates another compound through a action on a reaction the path was represend like this (Compound 1) -[negatively regulates]- (Reaction) -[has_part]- (Compound 2).  As reactions are human abstractoins of chemical-chemical interactions, we feel it is better to represent these interactions as direct relationships between (Compound 1) and (Compound 2).  Therefore we will be merging different concepts across the reaction node to better represent these direct relationships

In [1]:
import pandas as pd
from pathlib import Path

from data_tools import df_processing as dfp
from data_tools import graphs as gt

from wikidataintegrator.wdi_core import WDItemEngine

  from tqdm.autonotebook import tqdm


In [2]:
rhea_react = pd.read_csv('ftp://ftp.expasy.org/databases/rhea/tsv/rhea2reactome.tsv', sep='\t')
rhea_react.head(2)

Unnamed: 0,RHEA_ID,DIRECTION,MASTER_ID,ID
0,10041,LR,10040,R-HSA-176606.2
1,10041,LR,10040,R-HSA-8953499.2


In [3]:
rhea_react.columns = [c.lower() for c in rhea_react.columns]
rhea_react['rhea_id'] = 'RHEA:' + rhea_react['rhea_id'].astype(str)
rhea_react['id'] = 'REACT:' + rhea_react['id']
rhea_react.head(2)

Unnamed: 0,rhea_id,direction,master_id,id
0,RHEA:10041,LR,10040,REACT:R-HSA-176606.2
1,RHEA:10041,LR,10040,REACT:R-HSA-8953499.2


In [4]:
rhea_react['spl_id'] = rhea_react['id'].apply(lambda s: s.split('.')[0])

In [5]:
rhea_uniprot = pd.read_csv('ftp://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot.tsv', sep='\t')
rhea_uniprot.head(2)

Unnamed: 0,RHEA_ID,DIRECTION,MASTER_ID,ID
0,10008,UN,10008,O17433
1,10008,UN,10008,O34564


In [6]:
rhea_uniprot.columns = [c.lower() for c in rhea_uniprot.columns]
rhea_uniprot['rhea_id'] = 'RHEA:' + rhea_uniprot['rhea_id'].astype(str)
rhea_uniprot['id'] = 'UniProt:' + rhea_uniprot['id']
rhea_uniprot.head(2)

Unnamed: 0,rhea_id,direction,master_id,id
0,RHEA:10008,UN,10008,UniProt:O17433
1,RHEA:10008,UN,10008,UniProt:O34564


In [7]:
cheb_names = pd.read_csv('ftp://ftp.expasy.org/databases/rhea/tsv/chebiId_name.tsv', sep='\t', header=None, names=['id', 'name'])
cheb_names.head(2)

Unnamed: 0,id,name
0,CHEBI:7,(+)-car-3-ene
1,CHEBI:20,"(1R,4S)-camphene"


In [8]:
# Get chemicals involved in reactions

query = """PREFIX rh:<http://rdf.rhea-db.org/>
            SELECT DISTINCT ?rhea_id ?chebi_id WHERE {
          ?reaction rdfs:subClassOf rh:Reaction .
          ?reaction rh:status rh:Approved .
          ?reaction rh:side ?reactionSide .
          ?reactionSide rh:contains ?sideContents .
          ?sideContents rh:compound ?sideCompound .
          ?sideCompound rh:accession ?chebi_id .
          ?reaction rh:accession ?rhea_id . 
        }"""

In [9]:
# Get names and citations related to different reactions

query2 = """PREFIX rh:<http://rdf.rhea-db.org/>
            SELECT DISTINCT ?rhea_id ?cit ?eqn ?trans WHERE {
          ?reaction rdfs:subClassOf rh:Reaction .
          # ?reaction rh:status rh:Approved .
          ?reaction rh:equation ?eqn .
          ?reaction rh:citation ?cit .
          ?reaction rh:accession ?rhea_id . 
          ?reaction rh:isTransport ?trans .
        }"""

In [10]:
rhea_db = WDItemEngine.execute_sparql_query(query, endpoint='https://sparql.rhea-db.org/sparql', as_dataframe=True)
rhea_db

Unnamed: 0,rhea_id,chebi_id
0,RHEA:51132,CHEBI:26386
1,RHEA:26006,CHEBI:142217
2,RHEA:26006,CHEBI:142345
3,RHEA:40719,GENERIC:14730
4,RHEA:40719,GENERIC:14731
...,...,...
62308,RHEA:52216,POLYMER:15071
62309,RHEA:56932,CHEBI:141179
62310,RHEA:56936,CHEBI:141179
62311,RHEA:57904,GENERIC:15030


In [11]:
rhea_nodes = WDItemEngine.execute_sparql_query(query2, endpoint='https://sparql.rhea-db.org/sparql', as_dataframe=True)
rhea_nodes

Unnamed: 0,rhea_id,cit,eqn,trans
0,RHEA:13973,http://rdf.ncbi.nlm.nih.gov/pubmed/16664761,diphosphate + H(+)(in) + H2O = 2 H(+)(out) + 2...,1
1,RHEA:13973,http://rdf.ncbi.nlm.nih.gov/pubmed/16667022,diphosphate + H(+)(in) + H2O = 2 H(+)(out) + 2...,1
2,RHEA:13973,http://rdf.ncbi.nlm.nih.gov/pubmed/2479537,diphosphate + H(+)(in) + H2O = 2 H(+)(out) + 2...,1
3,RHEA:13973,http://rdf.ncbi.nlm.nih.gov/pubmed/29691313,diphosphate + H(+)(in) + H2O = 2 H(+)(out) + 2...,1
4,RHEA:19121,http://rdf.ncbi.nlm.nih.gov/pubmed/8626454,an S-substituted glutathione(in) + ATP + H2O =...,1
...,...,...,...,...
27046,RHEA:57884,http://rdf.ncbi.nlm.nih.gov/pubmed/15697234,diphosphate + H2O + Na(+)(in) = H(+) + Na(+)(o...,1
27047,RHEA:57884,http://rdf.ncbi.nlm.nih.gov/pubmed/17605473,diphosphate + H2O + Na(+)(in) = H(+) + Na(+)(o...,1
27048,RHEA:57896,http://rdf.ncbi.nlm.nih.gov/pubmed/28002401,2-oxoglutarate + a 5'-end (N(7)-methyl 5'-trip...,0
27049,RHEA:57900,http://rdf.ncbi.nlm.nih.gov/pubmed/30197295,2-oxoglutarate + N(6)-methyladenosine in U6 sn...,0


In [12]:
rhea_nodes['cit'] = rhea_nodes['cit'].apply(lambda c: c.split('/')[-1])
rhea_nodes = dfp.combine_group_cols_on_char(rhea_nodes, ['rhea_id'], sort=True, prog=False)
rhea_nodes

Unnamed: 0,rhea_id,cit,eqn,trans
0,RHEA:27746,17420004,H(+) + L-cysteine = CO2 + cysteamine,0
1,RHEA:45912,11514237,"a (2E,4E)-dienoyl-CoA + H(+) + NADPH = a 4,5-s...",0
2,RHEA:46564,12815048,acetyl-CoA + apo-[ACP] = acetyl-[ACP] + adenos...,0
3,RHEA:46736,9305923,H2O + monoacyl-sn-glycero-3-phosphate = a mono...,0
4,RHEA:50868,11192938,"(18R)-hydroxy-(5Z,8Z,11Z,14Z,16E)-eicosapentae...",0
...,...,...,...,...
11645,RHEA:63844,11087748|20356456,[thioredoxin]-dithiol + cumene hydroperoxide =...,0
11646,RHEA:64016,1520296|7721776,alpha-L-Fuc-(1->2)-beta-D-Gal-(1->4)-D-Glc + G...,0
11647,RHEA:64024,17346033|20466767|22408253|27425635,"3,4-dihydroxybenzoyl-[aryl-carrier protein] + ...",0
11648,RHEA:64028,17346033|20466767|22408253|27425635,"3,4-dihydroxybenzoyl-[aryl-carrier protein] + ...",0


### Just a quick look see what chemicals are present in most approved reactions...

In [13]:
top_cheb = rhea_db[rhea_db['chebi_id'].str.startswith('CHEBI:')]['chebi_id'].value_counts().head(40).rename('count').to_frame().reset_index()
top_cheb.merge(cheb_names, how='left', left_on=['index'], right_on=['id']).drop('index', axis=1)

Unnamed: 0,count,id,name
0,7566,CHEBI:15378,H(+)
1,4971,CHEBI:15377,H2O
2,2147,CHEBI:15379,O2
3,1165,CHEBI:57287,CoA
4,1108,CHEBI:58349,NADP(+)
5,1107,CHEBI:57783,NADPH
6,1019,CHEBI:30616,ATP
7,975,CHEBI:57540,NAD(+)
8,948,CHEBI:57945,NADH
9,839,CHEBI:33019,diphosphate


In [14]:
load_dir = Path('../2_pipeline').resolve()

In [15]:
prev_dir = load_dir.joinpath('10e_Diseae_Phenotype_Cleanup_adding_edges', 'out')
nodes = pd.read_csv(prev_dir.joinpath('nodes.csv'), dtype=str)
edges = pd.read_csv(prev_dir.joinpath('edges.csv'), dtype=str)

node_ids = set(nodes['id'].unique())
edge_ids = set(edges[['start_id', 'end_id']].stack().unique())

In [16]:
combo = gt.combine_nodes_and_edges(nodes, edges)

In [17]:
len(combo.query('start_label == "Reaction" or end_label == "Reaction"'))

543936

In [18]:
new_edges = []

## Format relationships from Rhea

In [19]:
len(rhea_db.query('chebi_id in @edge_ids'))

32102

In [20]:
len(rhea_db.query('chebi_id in @node_ids'))

45234

In [21]:
len(rhea_react.query('spl_id in @node_ids'))

895

In [22]:
len(rhea_react)

900

In [23]:
from data_tools.wiki import get_curi_xrefs

In [24]:
uniprot_xrefs = get_curi_xrefs(nodes, 'UniProt')

In [25]:
uniprot_ids = set(uniprot_xrefs['xrefs'].unique())| set([u for u in node_ids if u.startswith('UniProt:')])

In [26]:
rhea_uniprot.query('id in @uniprot_ids')

Unnamed: 0,rhea_id,direction,master_id,id
8,RHEA:10008,UN,10008,UniProt:P58004
15,RHEA:10008,UN,10008,UniProt:Q9Y6P5
24,RHEA:10024,UN,10024,UniProt:O44498
25,RHEA:10024,UN,10024,UniProt:O44757
28,RHEA:10024,UN,10024,UniProt:P34544
...,...,...,...,...
297117,RHEA:64224,UN,64224,UniProt:Q5ZRP2
297152,RHEA:64224,UN,64224,UniProt:Q837T2
297161,RHEA:64224,UN,64224,UniProt:Q8CXS1
297196,RHEA:64228,UN,64228,UniProt:Q6P1A2


In [27]:
rhea_uniprot.query('id in @uniprot_ids')

Unnamed: 0,rhea_id,direction,master_id,id
8,RHEA:10008,UN,10008,UniProt:P58004
15,RHEA:10008,UN,10008,UniProt:Q9Y6P5
24,RHEA:10024,UN,10024,UniProt:O44498
25,RHEA:10024,UN,10024,UniProt:O44757
28,RHEA:10024,UN,10024,UniProt:P34544
...,...,...,...,...
297117,RHEA:64224,UN,64224,UniProt:Q5ZRP2
297152,RHEA:64224,UN,64224,UniProt:Q837T2
297161,RHEA:64224,UN,64224,UniProt:Q8CXS1
297196,RHEA:64228,UN,64228,UniProt:Q6P1A2


In [28]:
rhea_mappable = rhea_react.query('spl_id in @node_ids')['rhea_id'].unique()

In [29]:
rhea_uniprot.query('id in @uniprot_ids and rhea_id in @rhea_mappable')

Unnamed: 0,rhea_id,direction,master_id,id
271,RHEA:10133,LR,10132,UniProt:Q08AH1
272,RHEA:10133,LR,10132,UniProt:Q08AH3
273,RHEA:10133,LR,10132,UniProt:Q68CK6
2410,RHEA:10225,LR,10224,UniProt:O00764
4774,RHEA:10389,LR,10388,UniProt:P35558
...,...,...,...,...
270313,RHEA:52357,LR,52356,UniProt:O00204
270317,RHEA:52369,LR,52368,UniProt:O00204
270325,RHEA:52373,LR,52372,UniProt:P49888
272715,RHEA:53509,LR,53508,UniProt:Q16873


In [30]:
rhea_chems = rhea_db.query('chebi_id in @node_ids')['rhea_id'].unique()
rhea_prot_mapped = rhea_uniprot.query('id in @uniprot_ids and (rhea_id in @rhea_mappable or rhea_id in @rhea_chems)')['rhea_id'].unique()

In [31]:
rhea_db.query('chebi_id in @node_ids and rhea_id in @rhea_prot_mapped')

Unnamed: 0,rhea_id,chebi_id
0,RHEA:51132,CHEBI:26386
13,RHEA:53972,CHEBI:138002
16,RHEA:53992,CHEBI:138003
17,RHEA:53980,CHEBI:138004
18,RHEA:54000,CHEBI:138004
...,...,...
62226,RHEA:57840,CHEBI:43474
62227,RHEA:57884,CHEBI:43474
62242,RHEA:56916,CHEBI:15994
62309,RHEA:56932,CHEBI:141179


In [32]:
rhea_edges = rhea_db.merge(rhea_uniprot, on='rhea_id', how='inner').query('chebi_id in @node_ids and id in @uniprot_ids')

rhea_edges = rhea_edges.merge(uniprot_xrefs, left_on='id', right_on='xrefs', how='left', suffixes=('_xref', '_nw'))
rhea_edges.head(2)

Unnamed: 0,rhea_id,chebi_id,direction,master_id,id_xref,id_nw,xrefs
0,RHEA:51132,CHEBI:26386,UN,51132,UniProt:O43598,NCBIGene:10591,UniProt:O43598
1,RHEA:51132,CHEBI:15377,UN,51132,UniProt:O43598,NCBIGene:10591,UniProt:O43598


In [33]:
rhea_edges = rhea_edges.drop_duplicates(['chebi_id', 'id_xref'])
len(rhea_edges)

21745

In [34]:
rhea_edges.count()

rhea_id      21745
chebi_id     21745
direction    21745
master_id    21745
id_xref      21745
id_nw        21481
xrefs        21481
dtype: int64

In [35]:
rhea_edges['id_nw'] = rhea_edges['id_nw'].fillna(rhea_edges['id_xref'])

In [36]:
rhea_edges = rhea_edges.merge(rhea_nodes[['rhea_id', 'cit', 'trans']], on='rhea_id', how='left')
rhea_edges = rhea_edges.rename(columns={'cit': 'pmids'})

In [37]:
rhea_edges

Unnamed: 0,rhea_id,chebi_id,direction,master_id,id_xref,id_nw,xrefs,pmids,trans
0,RHEA:51132,CHEBI:26386,UN,51132,UniProt:O43598,NCBIGene:10591,UniProt:O43598,17234634|19720067|19822152|20962348,0
1,RHEA:51132,CHEBI:15377,UN,51132,UniProt:O43598,NCBIGene:10591,UniProt:O43598,17234634|19720067|19822152|20962348,0
2,RHEA:51132,CHEBI:62877,UN,51132,UniProt:O43598,NCBIGene:10591,UniProt:O43598,17234634|19720067|19822152|20962348,0
3,RHEA:40719,CHEBI:15378,UN,40719,UniProt:Q9Y4U1,NCBIGene:25974,UniProt:Q9Y4U1,19447654|19801555|21697092,0
4,RHEA:40719,CHEBI:57925,UN,40719,UniProt:Q9Y4U1,NCBIGene:25974,UniProt:Q9Y4U1,19447654|19801555|21697092,0
...,...,...,...,...,...,...,...,...,...
21740,RHEA:57884,CHEBI:29101,UN,57884,UniProt:Q8F641,NCBIGene:1150814,UniProt:Q8F641,15697234|17605473,1
21741,RHEA:57884,CHEBI:15378,UN,57884,UniProt:Q8F641,NCBIGene:1150814,UniProt:Q8F641,15697234|17605473,1
21742,RHEA:57884,CHEBI:43474,UN,57884,UniProt:Q8F641,NCBIGene:1150814,UniProt:Q8F641,15697234|17605473,1
21743,RHEA:57892,CHEBI:57783,UN,57892,UniProt:P37061,NCBIGene:1200486,UniProt:P37061,,


In [38]:
#rhea_edges['type'] = rhea_edges['trans'].map({'1': 'in_transport_reaction_with'}).fillna('in_reaction_with')
rhea_edges['type'] = 'in_reaction_with'

rhea_edges = rhea_edges.drop(['direction', 'master_id', 'id_xref', 'xrefs', 'trans'], axis=1)
rhea_edges = gt.order_cols(rhea_edges.rename(columns={'chebi_id': 'end_id', 'id_nw': 'start_id', 'rhea_id': 'merge_id'}))

rhea_edges['dsrc_type'] = 'curated'
rhea_edges['source'] = 'RheaDB'
rhea_edges['license'] = 'CC-BY 4.0'

rhea_edges.sample(10)

Unnamed: 0,start_id,end_id,type,merge_id,pmids,dsrc_type,source,license
18137,NCBIGene:122481,CHEBI:57930,in_reaction_with,RHEA:18113,,curated,RheaDB,CC-BY 4.0
2829,NCBIGene:1149468,CHEBI:15378,in_reaction_with,RHEA:24793,11264293|11839304|12795595|16142895,curated,RheaDB,CC-BY 4.0
9922,NCBIGene:9107,CHEBI:43474,in_reaction_with,RHEA:39019,11676921|12045210|19901554,curated,RheaDB,CC-BY 4.0
4399,NCBIGene:2182,CHEBI:57287,in_reaction_with,RHEA:30751,10198260|10749848|11772874|11980911|12366803|1...,curated,RheaDB,CC-BY 4.0
9349,NCBIGene:117283,CHEBI:30616,in_reaction_with,RHEA:37467,10567691|10574768|17412958|17690096|17702752|1...,curated,RheaDB,CC-BY 4.0
19737,NCBIGene:5917,CHEBI:30616,in_reaction_with,RHEA:20301,,curated,RheaDB,CC-BY 4.0
10332,NCBIGene:5321,CHEBI:15377,in_reaction_with,RHEA:40427,10085124|10358058|12522102|12672805|14636062|1...,curated,RheaDB,CC-BY 4.0
719,NCBIGene:3931,CHEBI:58168,in_reaction_with,RHEA:21204,10222237|10329423|14636062|15654758|19065001|2...,curated,RheaDB,CC-BY 4.0
21219,NCBIGene:1588,CHEBI:137031,in_reaction_with,RHEA:53200,22773874,curated,RheaDB,CC-BY 4.0
16710,NCBIGene:1150196,CHEBI:58274,in_reaction_with,RHEA:14877,29777624|6319365,curated,RheaDB,CC-BY 4.0


In [39]:
new_edges.append(rhea_edges)

# Update reactions in network

Having to have things go through a reaction node is expensive.... Direct connections between genes and chemicals and whatever they go through is better for the algorithm's small path length limitaitons.  Long path lengths may result in important information not being utilized by the algorithm

In [40]:
combo.query('start_label == "Reaction" or end_label == "Reaction"').groupby(['start_label', 'end_label', 'type']).apply(len).rename('count').to_frame().reset_index().sort_values('type')

Unnamed: 0,start_label,end_label,type,count
29,Reaction,Disease,associated_with,670
30,Reaction,Disease,disrupted_in,377
42,Reaction,Reaction,disrupts,351
35,Reaction,Molecular Function,enables,1688
43,Reaction,Reaction,follows_in_sequence,10363
8,Gene,Reaction,fucntion_altered_in,2361
15,Protein,Reaction,fucntion_altered_in,1
27,Reaction,Compound,has_input,10412
25,Reaction,Complex,has_input,5500
31,Reaction,Drug,has_input,900


In [41]:
chem_types = ['Compound', 'Drug']
gene_types = ['Gene', 'Protein', 'Complex', 'Micro RNA']


chem_react = combo.query('start_label in @chem_types and end_label == "Reaction"')
gene_react = combo.query('start_label in @gene_types and end_label == "Reaction"')

react_chem = combo.query('end_label in @chem_types and start_label == "Reaction"')
react_gene = combo.query('end_label in @gene_types and start_label == "Reaction"')

In [42]:
chem_react['type'].value_counts()

part_of                 33068
positively_regulates      235
negatively_regulates      109
Name: type, dtype: int64

In [43]:
gene_react['type'].value_counts()

part_of                 169564
regulates                 9494
fucntion_altered_in       2362
positively_regulates      1139
negatively_regulates       498
Name: type, dtype: int64

In [44]:
react_chem['type'].value_counts()

has_input     11312
has_output    10901
Name: type, dtype: int64

In [45]:
react_gene['type'].value_counts()

has_input     122741
has_output    121606
Name: type, dtype: int64

In [46]:
cg_edges = chem_react.query('type == "part_of"').merge(gene_react.query('type == "part_of"'), on='end_id')[['start_id_x', 'start_id_y', 'end_id', 'source_x', 'dsrc_type_x', 'comp_type_x', 'license_x']].drop_duplicates()
cg_edges = cg_edges.rename(columns={'start_id_x': 'end_id', 'start_id_y': 'start_id', 'end_id': 'merge_id'})
cg_edges.columns = [c.replace('_x', '') for c in cg_edges.columns]
cg_edges['type'] = 'in_reaction_with'
cg_edges = gt.order_cols(cg_edges)
cg_edges.head(2)

Unnamed: 0,start_id,end_id,type,merge_id,source,dsrc_type,comp_type,license
0,NCBIGene:79001,CHEBI:10033,in_reaction_with,REACT:R-HSA-159790,Reactome,curated,,CC0 1.0
1,NCBIGene:79001,CHEBI:15378,in_reaction_with,REACT:R-HSA-159790,Reactome,curated,,CC0 1.0


In [47]:
new_edges.append(cg_edges)

In [48]:
len(cg_edges.drop_duplicates(subset=['start_id', 'end_id']))

128777

In [49]:
cg_edges2 = react_chem.merge(react_gene, on='start_id')[['end_id_x', 'end_id_y', 'source_x', 'dsrc_type_x', 'comp_type_x', 'license_x', 'start_id']].drop_duplicates()
cg_edges2 = cg_edges2.rename(columns={'end_id_y': 'start_id', 'end_id_x': 'end_id', 'start_id': 'merge_id'})
cg_edges2.columns = [c.replace('_x', '') for c in cg_edges2.columns]
cg_edges2['type'] = 'in_reaction_with'
cg_edges2 = gt.order_cols(cg_edges2)
cg_edges2.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
0,REACT:R-CEL-162401,CHEBI:16618,in_reaction_with,Reactome,curated,,CC0 1.0,REACT:R-CEL-109700
1,REACT:R-CEL-109696,CHEBI:16618,in_reaction_with,Reactome,curated,,CC0 1.0,REACT:R-CEL-109700
2,NCBIGene:179424,CHEBI:16618,in_reaction_with,Reactome,curated,,CC0 1.0,REACT:R-CEL-109700
4,NCBIGene:181524,CHEBI:16618,in_reaction_with,Reactome,curated,,CC0 1.0,REACT:R-CEL-109700


In [50]:
new_edges.append(cg_edges2)

### We will not do Chemical-Chemical or Gene-Gene

Some reactions are of the form GeneA - reacts with - (SET OF GENES) - to produce - GeneB...  
These will cause issues in that the set of genes involved are not specifically in a reaction with each other, but merging over the reaction would have them all the genes in the set linked, rather than only linked to Genes A and B.  

What we really want is the product of (GeneA + GeneB) X  (SET OF GENES) are the involved speices. Unfortunately we do not have this data avaliable currently to do this expansion properly

## Reaction to Taxa -> Gene to Taxa

In [51]:
react_tax = combo.query('start_label == "Reaction" and type == "in_taxon"').copy()
gene_tax = gene_react.merge(react_tax, left_on='end_id', right_on='start_id')

gene_tax_tup = gene_tax[['start_id_x', 'end_id_y']].apply(tuple, axis=1)

In [52]:
old_gene_tax_tup = combo.query('start_label in @gene_types and type == "in_taxon"')[['start_id', 'end_id']].apply(tuple, axis=1)

In [53]:
len(old_gene_tax_tup)

207486

In [54]:
len(set(gene_tax_tup) - set(old_gene_tax_tup))

1284

In [55]:
gene_tax['tup'] = gene_tax_tup
gt_edges = gene_tax.query('tup not in @old_gene_tax_tup')

In [56]:
gt_edges.head(2)

Unnamed: 0,start_id_x,end_id_x,type_x,dsrc_type_x,comp_type_x,p_val_x,adj_p_x,source_x,license_x,experiments_x,...,phase_y,date_y,name_y,name_x_y,name_y_y,start_name_y,end_name_y,start_label_y,end_label_y,tup
3,NCBIGene:4615,REACT:R-HSA-5602316,fucntion_altered_in,curated,,,,Reactome,CC0 1.0,,...,,,,,,Defective MyD88 does not oligomerize within th...,Escherichia coli,Reaction,Taxon,"(NCBIGene:4615, NCBITaxon:562)"
4,NCBIGene:4615,REACT:R-HSA-5602316,part_of,curated,,,,Reactome,CC0 1.0,,...,,,,,,Defective MyD88 does not oligomerize within th...,Escherichia coli,Reaction,Taxon,"(NCBIGene:4615, NCBITaxon:562)"


In [57]:
gt_edges = gt_edges.rename(columns={'start_id_x': 'start_id', 'end_id_y': 'end_id', 'end_id_x': 'merge_id'})[['start_id', 'end_id', 'source_x', 'dsrc_type_x', 'comp_type_x', 'license_x', 'merge_id']].drop_duplicates()
gt_edges.columns = [c.replace('_x', '') for c in gt_edges.columns]
gt_edges['type'] = 'in_taxon'
gt_edges = gt.order_cols(gt_edges)
gt_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
3,NCBIGene:4615,NCBITaxon:562,in_taxon,Reactome,curated,,CC0 1.0,REACT:R-HSA-5602316
5,NCBIGene:7100,NCBITaxon:562,in_taxon,Reactome,curated,,CC0 1.0,REACT:R-HSA-5602316
410,NCBIGene:4928,NCBITaxon:11320,in_taxon,Reactome,curated,,CC0 1.0,REACT:R-HSA-1176059
411,NCBIGene:4928,NCBITaxon:407754,in_taxon,Reactome,curated,,CC0 1.0,REACT:R-HSA-1176059


In [58]:
new_edges.append(gt_edges)

## Gene -> GO, Chem -> GO

#### Gene - GO

In [59]:
go_types = ['Molecular Function', 'Biological Process', 'Cellular Component']

In [60]:
rx_go = combo.query('start_label == "Reaction" and end_label in @go_types')

In [61]:
gene_go = gene_react.query('type == "part_of"').merge(rx_go, left_on='end_id', right_on='start_id', how='inner').copy()
gene_go_tup = gene_go[['start_id_x', 'end_id_y']].apply(tuple, axis=1)
gene_go['tup'] = gene_go_tup

In [62]:
len(gene_go)

299430

In [63]:
nw_gene_go_tup = combo.query('start_label in @gene_types and end_label in @go_types')[['start_id', 'end_id']].apply(tuple, axis=1)

In [64]:
len(set(gene_go_tup) - set(nw_gene_go_tup))

27726

In [65]:
gene_go.head(2)

Unnamed: 0,start_id_x,end_id_x,type_x,dsrc_type_x,comp_type_x,p_val_x,adj_p_x,source_x,license_x,experiments_x,...,phase_y,date_y,name_y,name_x_y,name_y_y,start_name_y,end_name_y,start_label_y,end_label_y,tup
0,ENSG:ENSG00000076242,REACT:R-HSA-5358510,part_of,curated,,,,Reactome,CC0 1.0,,...,,,,,,MSH2:MSH6 recruits MLH1:PMS2 to mismatch and i...,nucleoplasm,Reaction,Cellular Component,"(ENSG:ENSG00000076242, GO:0005654)"
1,NCBIGene:107984056,REACT:R-HSA-5358510,part_of,curated,,,,Reactome,CC0 1.0,,...,,,,,,MSH2:MSH6 recruits MLH1:PMS2 to mismatch and i...,nucleoplasm,Reaction,Cellular Component,"(NCBIGene:107984056, GO:0005654)"


In [66]:
gg_edges = gene_go.query('tup not in @nw_gene_go_tup')

gg_edges = gg_edges.rename(columns={'start_id_x': 'start_id', 'end_id_y': 'end_id', 'end_id_x': 'merge_id'})[['start_id', 'end_id', 'type_y', 'source_y', 'dsrc_type_y', 'comp_type_y', 'license_y', 'merge_id']].drop_duplicates()
gg_edges.columns = [c.replace('_y', '') for c in gg_edges.columns]
gg_edges = gt.order_cols(gg_edges)
gg_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
14,ENSG:ENSG00000076242,GO:0004519,enables,Reactome,curated,,CC0 1.0,REACT:R-HSA-5358518
18,NCBIGene:2956,GO:0004519,enables,Reactome,curated,,CC0 1.0,REACT:R-HSA-5358518
20,NCBIGene:4292,GO:0004519,enables,Reactome,curated,,CC0 1.0,REACT:R-HSA-5358518
22,NCBIGene:4436,GO:0004519,enables,Reactome,curated,,CC0 1.0,REACT:R-HSA-5358518


In [67]:
gene_go2 = react_gene.merge(rx_go, on='start_id', how='inner').copy()
gene_go2_tup = gene_go2[['end_id_x', 'end_id_y']].apply(tuple, axis=1)
gene_go2['tup'] = gene_go2_tup

In [68]:
gg_edges2 = gene_go2.query('tup not in @nw_gene_go_tup')

gg_edges2 = gg_edges2.rename(columns={'end_id_x': 'start_id', 'end_id_y': 'end_id', 'start_id': 'merge_id'})[['start_id', 'end_id', 'type_y', 'source_y', 'dsrc_type_y', 'comp_type_y', 'license_y', 'merge_id']].drop_duplicates()
gg_edges2.columns = [c.replace('_y', '') for c in gg_edges2.columns]
gg_edges2 = gt.order_cols(gg_edges2)
gg_edges2.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
47,UniProt:A0A1C3NSK2,GO:0005576,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1013012
49,UniProt:A0A1C3NSN6,GO:0005576,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1013012
55,NCBIGene:174803,GO:0005576,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1013012
59,NCBIGene:180944,GO:0005576,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1013012


In [69]:
len(gg_edges2)

64358

In [70]:
combo.query('start_label in @gene_types and end_label in @go_types')['type'].value_counts()

involved_in             294419
part_of                 249607
enables                 226591
regulates                23166
positively_regulates     22080
negatively_regulates     16044
marker_or_mechanism         10
decreases_phenotype          5
increases_phenotype          4
decreases_reaction           3
therapeutic                  3
increases_reaction           1
affects_reaction             1
Name: type, dtype: int64

In [71]:
gg_edges['type'].value_counts()

part_of                 78159
enables                  9962
involved_in              3800
negatively_regulates      342
positively_regulates      155
regulates                 106
Name: type, dtype: int64

In [72]:
gg_edges2['type'].value_counts()

part_of                 52736
enables                  8727
involved_in              2488
negatively_regulates      249
regulates                  86
positively_regulates       72
Name: type, dtype: int64

In [73]:
new_edges.append(gg_edges)
new_edges.append(gg_edges2)

#### Chem - GO

In [74]:
combo.query('start_label in @chem_types and end_label in @go_types')['type'].value_counts()

increases_phenotype    37158
decreases_reaction     28285
decreases_phenotype    18767
affects_phenotype      11388
increases_reaction      6906
part_of                 3576
affects_reaction        2593
inhibitor                746
activator                412
modulator                330
interacts                 81
blocker                   50
substrate                 34
marker_or_mechanism       24
chelating_agent            9
antagonist                 6
therapeutic                6
partial_agonist            3
binding_agent              3
opener                     3
releasing_agent            2
oxidative_enzyme           2
chaperone                  2
agonist                    1
Name: type, dtype: int64

In [75]:
chem_go = chem_react.query('type == "part_of"').merge(rx_go, left_on='end_id', right_on='start_id', how='inner').copy()
chem_go_tup = chem_go[['start_id_x', 'end_id_y']].apply(tuple, axis=1)
chem_go['tup'] = chem_go_tup

In [76]:
len(chem_go)

58656

In [77]:
nw_chem_go_tup = combo.query('start_label in @chem_types and end_label in @go_types')[['start_id', 'end_id']].apply(tuple, axis=1)

In [78]:
len(set(chem_go_tup) - set(nw_chem_go_tup))

5707

In [79]:
cgo_edges = chem_go.query('tup not in @nw_gene_go_tup')

cgo_edges = cgo_edges.rename(columns={'start_id_x': 'start_id', 'end_id_y': 'end_id', 'end_id_x': 'merge_id'})[['start_id', 'end_id', 'type_y', 'source_y', 'dsrc_type_y', 'comp_type_y', 'license_y', 'merge_id']].drop_duplicates()
cgo_edges.columns = [c.replace('_y', '') for c in cgo_edges.columns]
cgo_edges = gt.order_cols(cgo_edges)
cgo_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
0,CHEBI:10033,GO:0005788,part_of,Reactome,curated,,CC0 1.0,REACT:R-HSA-159790
1,CHEBI:10033,GO:0005789,part_of,Reactome,curated,,CC0 1.0,REACT:R-HSA-159790
2,CHEBI:10033,GO:0005829,part_of,Reactome,curated,,CC0 1.0,REACT:R-HSA-159790
3,CHEBI:15378,GO:0005788,part_of,Reactome,curated,,CC0 1.0,REACT:R-HSA-159790


In [80]:
chem_go2 = react_chem.merge(rx_go, on='start_id', how='inner').copy()
chem_go2_tup = chem_go2[['end_id_x', 'end_id_y']].apply(tuple, axis=1)
chem_go2['tup'] = chem_go2_tup

In [81]:
cgo_edges2 = chem_go2.query('tup not in @nw_gene_go_tup')

cgo_edges2 = cgo_edges2.rename(columns={'end_id_x': 'start_id', 'end_id_y': 'end_id', 'start_id': 'merge_id'})[['start_id', 'end_id', 'type_y', 'source_y', 'dsrc_type_y', 'comp_type_y', 'license_y', 'merge_id']].drop_duplicates()
cgo_edges2.columns = [c.replace('_y', '') for c in cgo_edges2.columns]
cgo_edges2 = gt.order_cols(cgo_edges2)
cgo_edges2.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
0,CHEBI:15996,GO:0005759,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1008248
1,CHEBI:16027,GO:0005759,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1008248
2,CHEBI:17552,GO:0005759,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1008248
3,CHEBI:456216,GO:0005759,part_of,Reactome,curated,,CC0 1.0,REACT:R-CEL-1008248


In [82]:
cgo_edges['type'].value_counts()

part_of                 52621
enables                  4738
involved_in              1140
regulates                  53
negatively_regulates       52
positively_regulates       52
Name: type, dtype: int64

In [83]:
cgo_edges2['type'].value_counts()

part_of                 31501
enables                  3799
involved_in               754
negatively_regulates       42
positively_regulates       37
regulates                  31
Name: type, dtype: int64

In [84]:
new_edges.append(cgo_edges)
new_edges.append(cgo_edges2)

## Regulation of reaction components

In [85]:
reg_types = ['negatively_regulates', 'positively_regulates', 'regulates']

In [86]:
gene_reg_rx = combo.query('start_label in @gene_types and end_label == "Reaction" and type in @reg_types')

In [87]:
chem_reg_rx = combo.query('start_label in @chem_types and end_label == "Reaction" and type in @reg_types')

#### Get all the genes and reactions on the same side...

In [88]:
chem_react1 = pd.concat([chem_react, react_chem.rename(columns={'start_id': 'end_id', 'end_id': 'start_id'})], sort=False)
gene_react1 = pd.concat([gene_react, react_gene.rename(columns={'start_id': 'end_id', 'end_id': 'start_id'})], sort=False)

In [89]:
gene_react1['type'].value_counts()

part_of                 169564
has_input               122741
has_output              121606
regulates                 9494
fucntion_altered_in       2362
positively_regulates      1139
negatively_regulates       498
Name: type, dtype: int64

In [90]:
membership = ['part_of', 'has_input', 'has_output']

In [91]:
gene_gene_reg = gene_reg_rx.merge(gene_react1.query('type in {}'.format(['part_of', 'has_output'])), on='end_id', how='inner').drop_duplicates(subset=['start_id_x', 'start_id_y', 'type_x'])

In [92]:
# Any instance where a gene regulates a reaction that the gene is also a part of is going to 
# result in bi-directional regulation of All genes within that reaction
prob_tups = gene_gene_reg.query('start_id_x == start_id_y')[['start_id_x', 'end_id']].apply(tuple, axis=1)

In [93]:
gene_gene_reg['gene_rx_tup'] = gene_gene_reg[['start_id_x', 'end_id']].apply(tuple, axis=1)
gene_gene_reg.query('gene_rx_tup in @prob_tups')

Unnamed: 0,start_id_x,end_id,type_x,dsrc_type_x,comp_type_x,p_val_x,adj_p_x,source_x,license_x,experiments_x,...,phase_y,date_y,name_y,name_x_y,name_y_y,start_name_y,end_name_y,start_label_y,end_label_y,gene_rx_tup
0,NCBIGene:172243,REACT:R-CEL-549241,negatively_regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,str-176,"SLC22A4 cotransports ERGT, Na+ from extracellu...",Gene,Reaction,"(NCBIGene:172243, REACT:R-CEL-549241)"
1,NCBIGene:172243,REACT:R-CEL-549241,negatively_regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,rnt-1,"SLC22A4 cotransports ERGT, Na+ from extracellu...",Gene,Reaction,"(NCBIGene:172243, REACT:R-CEL-549241)"
2,NCBIGene:172243,REACT:R-CEL-549241,negatively_regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,K05F1.6,"SLC22A4 cotransports ERGT, Na+ from extracellu...",Gene,Reaction,"(NCBIGene:172243, REACT:R-CEL-549241)"
3,NCBIGene:172243,REACT:R-CEL-549241,negatively_regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,Y82E9BR.16,"SLC22A4 cotransports ERGT, Na+ from extracellu...",Gene,Reaction,"(NCBIGene:172243, REACT:R-CEL-549241)"
4,NCBIGene:172243,REACT:R-CEL-549241,negatively_regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,oat-1,"SLC22A4 cotransports ERGT, Na+ from extracellu...",Gene,Reaction,"(NCBIGene:172243, REACT:R-CEL-549241)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482427,NCBIGene:9636,REACT:R-HSA-1169307,regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,N-myristoyl GAG (P12493) protein,Monoubiquitination of N-myristoyl GAG (P12493)...,Protein,Reaction,"(NCBIGene:9636, REACT:R-HSA-1169307)"
482428,NCBIGene:9636,REACT:R-HSA-1169307,regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,RPS27A,Monoubiquitination of N-myristoyl GAG (P12493)...,Gene,Reaction,"(NCBIGene:9636, REACT:R-HSA-1169307)"
482429,NCBIGene:9636,REACT:R-HSA-1169307,regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,UBA52,Monoubiquitination of N-myristoyl GAG (P12493)...,Gene,Reaction,"(NCBIGene:9636, REACT:R-HSA-1169307)"
482430,NCBIGene:9636,REACT:R-HSA-1169307,regulates,curated,,,,Reactome,CC0 1.0,,...,,,,,,UBB,Monoubiquitination of N-myristoyl GAG (P12493)...,Gene,Reaction,"(NCBIGene:9636, REACT:R-HSA-1169307)"


In [94]:
gene_gene_reg = gene_gene_reg.query('gene_rx_tup not in @prob_tups').copy()

In [95]:
gene_gene_reg['short_tup'] = gene_gene_reg[['start_id_x', 'start_id_y']].apply(tuple, axis=1)
gene_gene_reg['long_tup'] = gene_gene_reg[['start_id_x', 'type_x', 'start_id_y']].apply(tuple, axis=1)

In [96]:
gene_gene_reg['rev_short_tup'] = gene_gene_reg[['start_id_y', 'start_id_x']].apply(tuple, axis=1)
gene_gene_reg['rev_long_tup'] = gene_gene_reg[['start_id_y', 'type_x', 'start_id_x']].apply(tuple, axis=1)

In [97]:
conflict = set(gene_gene_reg['short_tup']) & set(gene_gene_reg['rev_short_tup'])
len(conflict)

8898

In [98]:
conflict1 = set(gene_gene_reg['long_tup']) & set(gene_gene_reg['rev_long_tup'])
len(conflict1)

8696

In [99]:
gene_gene_reg.query('short_tup in @conflict')

Unnamed: 0,start_id_x,end_id,type_x,dsrc_type_x,comp_type_x,p_val_x,adj_p_x,source_x,license_x,experiments_x,...,name_y_y,start_name_y,end_name_y,start_label_y,end_label_y,gene_rx_tup,short_tup,long_tup,rev_short_tup,rev_long_tup
1345,NCBIGene:173085,REACT:R-CEL-5672972,regulates,curated,,,,Reactome,CC0 1.0,,...,,src-1,MAP2Ks and MAPKs bind to the activated RAF com...,Gene,Reaction,"(NCBIGene:173085, REACT:R-CEL-5672972)","(NCBIGene:173085, NCBIGene:171722)","(NCBIGene:173085, regulates, NCBIGene:171722)","(NCBIGene:171722, NCBIGene:173085)","(NCBIGene:171722, regulates, NCBIGene:173085)"
1349,NCBIGene:173085,REACT:R-CEL-5672972,regulates,curated,,,,Reactome,CC0 1.0,,...,,mpk-1,MAP2Ks and MAPKs bind to the activated RAF com...,Gene,Reaction,"(NCBIGene:173085, REACT:R-CEL-5672972)","(NCBIGene:173085, NCBIGene:175545)","(NCBIGene:173085, regulates, NCBIGene:175545)","(NCBIGene:175545, NCBIGene:173085)","(NCBIGene:175545, regulates, NCBIGene:173085)"
1795,NCBIGene:181082,REACT:R-CEL-5672972,regulates,curated,,,,Reactome,CC0 1.0,,...,,src-1,MAP2Ks and MAPKs bind to the activated RAF com...,Gene,Reaction,"(NCBIGene:181082, REACT:R-CEL-5672972)","(NCBIGene:181082, NCBIGene:171722)","(NCBIGene:181082, regulates, NCBIGene:171722)","(NCBIGene:171722, NCBIGene:181082)","(NCBIGene:171722, regulates, NCBIGene:181082)"
1799,NCBIGene:181082,REACT:R-CEL-5672972,regulates,curated,,,,Reactome,CC0 1.0,,...,,mpk-1,MAP2Ks and MAPKs bind to the activated RAF com...,Gene,Reaction,"(NCBIGene:181082, REACT:R-CEL-5672972)","(NCBIGene:181082, NCBIGene:175545)","(NCBIGene:181082, regulates, NCBIGene:175545)","(NCBIGene:175545, NCBIGene:181082)","(NCBIGene:175545, regulates, NCBIGene:181082)"
2969,NCBIGene:3569,REACT:R-HSA-1112602,regulates,curated,,,,Reactome,CC0 1.0,,...,,STAT3,"Tyrosine phosphorylation of STAT1, STAT3 by IL...",Gene,Reaction,"(NCBIGene:3569, REACT:R-HSA-1112602)","(NCBIGene:3569, NCBIGene:6774)","(NCBIGene:3569, regulates, NCBIGene:6774)","(NCBIGene:6774, NCBIGene:3569)","(NCBIGene:6774, regulates, NCBIGene:3569)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479523,NCBIGene:801,REACT:R-HSA-8986939,regulates,curated,,,,Reactome,CC0 1.0,,...,,PRKACA,MECP2 binds the NCoR/SMRT complex,Gene,Reaction,"(NCBIGene:801, REACT:R-HSA-8986939)","(NCBIGene:801, NCBIGene:5566)","(NCBIGene:801, regulates, NCBIGene:5566)","(NCBIGene:5566, NCBIGene:801)","(NCBIGene:5566, regulates, NCBIGene:801)"
479544,NCBIGene:805,REACT:R-HSA-8986939,regulates,curated,,,,Reactome,CC0 1.0,,...,,PRKACA,MECP2 binds the NCoR/SMRT complex,Gene,Reaction,"(NCBIGene:805, REACT:R-HSA-8986939)","(NCBIGene:805, NCBIGene:5566)","(NCBIGene:805, regulates, NCBIGene:5566)","(NCBIGene:5566, NCBIGene:805)","(NCBIGene:5566, regulates, NCBIGene:805)"
479565,NCBIGene:808,REACT:R-HSA-8986939,regulates,curated,,,,Reactome,CC0 1.0,,...,,PRKACA,MECP2 binds the NCoR/SMRT complex,Gene,Reaction,"(NCBIGene:808, REACT:R-HSA-8986939)","(NCBIGene:808, NCBIGene:5566)","(NCBIGene:808, regulates, NCBIGene:5566)","(NCBIGene:5566, NCBIGene:808)","(NCBIGene:5566, regulates, NCBIGene:808)"
481344,NCBIGene:6928,REACT:R-HSA-210784,positively_regulates,curated,,,,Reactome,CC0 1.0,,...,,ONECUT1,HNF1B-dependent synthesis of HNF6 protein,Gene,Reaction,"(NCBIGene:6928, REACT:R-HSA-210784)","(NCBIGene:6928, NCBIGene:3175)","(NCBIGene:6928, positively_regulates, NCBIGene...","(NCBIGene:3175, NCBIGene:6928)","(NCBIGene:3175, positively_regulates, NCBIGene..."


In [100]:
len(gene_gene_reg.query('short_tup in @conflict')), len(gene_gene_reg.query('short_tup not in @conflict'))

(9061, 43467)

lot less conflicts than non, conflicts... but genes can regulate each other...

In [101]:
gene_gene_reg = gene_gene_reg.drop(['short_tup', 'long_tup', 'rev_short_tup', 'rev_long_tup'], axis=1)

#### Gene regulates chemical

In [102]:
gene_chem_reg = gene_reg_rx.merge(chem_react1.query('type in @membership'), on='end_id', how='inner').drop_duplicates(subset=['start_id_x', 'start_id_y', 'type_x'])

In [103]:
gene_chem_reg[['start_name_x', 'start_name_y', 'type_x']]

Unnamed: 0,start_name_x,start_name_y,type_x
0,rnt-1,"SSU72 homolog, RNA polymerase II CTD phosphatase",negatively_regulates
1,rnt-1,ergothioneine thione form,negatively_regulates
6,Runt domain-containing protein CELE_B0414.2,"SSU72 homolog, RNA polymerase II CTD phosphatase",negatively_regulates
7,Runt domain-containing protein CELE_B0414.2,ergothioneine thione form,negatively_regulates
12,rnt-1,"SSU72 homolog, RNA polymerase II CTD phosphatase",regulates
...,...,...,...
23463,MPDU1,(GlcNAc)2 (Man)8 (PP-Dol)1,positively_regulates
23470,MPDU1,(GlcNAc)2 (Man)7 (PP-Dol)1,regulates
23471,MPDU1,(GlcNAc)2 (Man)8 (PP-Dol)1,regulates
23494,MPDU1,(GlcNAc)2 (Man)9 (PP-Dol)1,positively_regulates


#### Chemical regulates gene

In [104]:
chem_gene_reg = chem_reg_rx.merge(gene_react1.query('type in @membership'), on='end_id', how='inner').drop_duplicates(subset=['start_id_x', 'start_id_y', 'type_x'])

In [105]:
chem_gene_reg[['start_name_x', 'start_name_y', 'type_x']].sample(20)

Unnamed: 0,start_name_x,start_name_y,type_x
4678,ATP(4-),eukaryotic translation initiation factor 3 sub...,positively_regulates
3433,HS,FGF3,positively_regulates
4906,c-GMP-AMP,XRCC5,positively_regulates
67,sapropterin dihydrochloride,GTP cyclohydrolase I,negatively_regulates
2121,"1-phosphatidyl-1D-myo-inositol 3,4-bisphosphate",Eps15 (endocytosis protein) Homologous Sequenc...,positively_regulates
1967,GTP,eif-2alpha,positively_regulates
2403,"1-phosphatidyl-1D-myo-inositol 3,4-bisphosphate",FZD4,positively_regulates
3983,PYD and CARD domain containing,pygl-1,positively_regulates
457,glutathione,glutathione synthetase,negatively_regulates
590,amiloride,egas-3,negatively_regulates


In [106]:
reg_edges = pd.concat([gene_gene_reg, gene_chem_reg, chem_gene_reg], sort=False)

reg_edges = reg_edges.rename(columns={'start_id_x': 'start_id', 'start_id_y': 'end_id', 'end_id': 'merge_id'})

reg_edges = reg_edges[['start_id', 'end_id', 'type_x', 'source_x', 'dsrc_type_x', 'comp_type_x', 'license_x', 'merge_id']].drop_duplicates()
reg_edges.columns = [c.replace('_x', '') for c in reg_edges.columns]

reg_edges['dsrc_type'] = 'computed'
reg_edges['comp_type'] = 'punning'

reg_edges = gt.order_cols(reg_edges)


In [107]:
reg_edges

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,merge_id
307,NCBIGene:8837,NCBIGene:7186,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-HSA-141159
309,NCBIGene:8837,NCBIGene:8717,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-HSA-141159
310,NCBIGene:8837,NCBIGene:8737,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-HSA-141159
319,UniProt:O15519-2,NCBIGene:7186,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-HSA-141159
321,UniProt:O15519-2,NCBIGene:8717,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-HSA-141159
...,...,...,...,...,...,...,...,...
4949,CHEBI:58095,NCBIGene:181633,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-CEL-893596
4950,CHEBI:58095,NCBIGene:185086,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-CEL-893596
4951,CHEBI:58095,UniProt:Q8MP09,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-CEL-893596
4952,CHEBI:58095,UniProt:Q9XX97,negatively_regulates,Reactome,computed,punning,CC0 1.0,REACT:R-CEL-893596


In [108]:
new_edges.append(reg_edges)

## Gene - Disease assocations

In [109]:
react_dis = combo.query('start_label == "Reaction" and end_label == "Disease"')
alt_gene = gene_react.query('type == "fucntion_altered_in"')

gd_edges = react_dis.merge(alt_gene, left_on='start_id', right_on='end_id', how='inner')

alt_gets = gd_edges['start_id_y'].unique()

In [110]:
combo.query('start_id in @alt_gets and end_label == "Disease"')['type'].value_counts()

marker_or_mechanism    1087
associated_with         601
therapeutic              44
Name: type, dtype: int64

In [111]:
react_gene_dis = set(gd_edges[['start_id_y', 'end_id_x']].apply(tuple, axis=1))

In [112]:
nw_gene_dis_tups = set(combo.query('start_id in @alt_gets and end_label == "Disease"')[['start_id', 'end_id']].apply(tuple, axis=1))

In [113]:
len(react_gene_dis)

380

In [114]:
len(nw_gene_dis_tups)

1611

In [115]:
len(react_gene_dis - nw_gene_dis_tups)

355

In [116]:
id_to_name = nodes.set_index('id')['name'].to_dict()

In [117]:
gd_edges['end_id_x'].map(id_to_name).value_counts()

cancer                                         1693
bone development disease                        503
T-cell lymphoblastic leukemia/lymphoma          142
Noonan syndrome                                  34
cardiofaciocutaneous syndrome                    31
ovarian cancer                                   30
cystic fibrosis                                  29
chronic myeloid leukemia                         28
female reproductive endometrioid cancer          25
holoprosencephaly                                18
subacute leukemia                                15
myelodysplastic/myeloproliferative neoplasm      15
acute lymphoblastic leukemia                     15
acrocephalosyndactylia                           14
craniosynostosis                                 13
breast cancer                                     9
large intestine cancer                            5
rhabdomyosarcoma                                  3
multiple myeloma                                  2
glioblastoma

In [118]:
gd_edges['start_id_y'].map(id_to_name).value_counts()

FGF2      71
FGF1      71
FGF9      71
FGF20     68
FGF5      68
          ..
NFKBIA     2
ERLIN1     2
DERL3      2
JAK2       1
ABCC8      1
Name: start_id_y, Length: 141, dtype: int64

These are all pretty generic... but we will keep them 

In [119]:
gd_edges = gd_edges.rename(columns={'start_id_x': 'reactome_id', 'end_id_x': 'end_id', 'start_id_y': 'start_id'})[['start_id', 'end_id', 'type_y', 'source_y', 'dsrc_type_y', 'comp_type_y', 'license_y', 'reactome_id']]


gd_edges.columns = [c.replace('_y', '') for c in gd_edges.columns]
gd_edges['type'] = 'marker_or_mechanism'
gd_edges = gt.order_cols(gd_edges).drop_duplicates()

gd_edges.head(4)

Unnamed: 0,start_id,end_id,type,source,dsrc_type,comp_type,license,reactome_id
0,NCBIGene:11140,DOID:162,marker_or_mechanism,Reactome,curated,,CC0 1.0,REACT:R-HSA-1169421
1,NCBIGene:1950,DOID:162,marker_or_mechanism,Reactome,curated,,CC0 1.0,REACT:R-HSA-1169421
2,NCBIGene:1956,DOID:162,marker_or_mechanism,Reactome,curated,,CC0 1.0,REACT:R-HSA-1169421
3,NCBIGene:3320,DOID:162,marker_or_mechanism,Reactome,curated,,CC0 1.0,REACT:R-HSA-1169421


In [120]:
gd_edges = dfp.combine_group_cols_on_char(gd_edges, ['start_id', 'end_id', 'type'], sort=True, prog=False)

In [121]:
len(gd_edges)

380

In [122]:
new_edges.append(gd_edges)

## Put it all together

In [123]:
edges_to_add = pd.concat(new_edges, ignore_index=True, sort=False)

In [124]:
# In reaction with is a direct curated relationship...
# These edges we made by merging across nodes is equivalent to punning, so lets account for that
pun_idx = edges_to_add.query('type != "in_reaction_with"').index
edges_to_add.loc[pun_idx, 'comp_type'] = 'punning'
edges_to_add.loc[pun_idx, 'dsrc_type'] = 'computed'

In [125]:
edges_to_add = dfp.combine_group_cols_on_char(edges_to_add, ['start_id', 'end_id', 'type'], sort=True, prog=False)

In [126]:
len(edges_to_add)

259373

In [127]:
edges_to_add['type'].value_counts()

in_reaction_with        151733
regulates                48796
part_of                  27468
positively_regulates     11490
enables                   9382
negatively_regulates      5462
involved_in               3378
in_taxon                  1284
marker_or_mechanism        380
Name: type, dtype: int64

In [128]:
new_combo = gt.combine_nodes_and_edges(nodes, edges_to_add)

In [129]:
me_counts = new_combo.groupby(['start_label', 'end_label', 'type']).apply(len).rename('count').to_frame().reset_index().sort_values('type')

In [130]:
len(me_counts)

105

In [131]:
me_counts.sort_values('count', ascending=False).head(50)

Unnamed: 0,start_label,end_label,type,count
55,Gene,Compound,in_reaction_with,102702
66,Gene,Gene,regulates,35602
60,Gene,Drug,in_reaction_with,21920
88,Protein,Compound,in_reaction_with,18069
51,Gene,Cellular Component,part_of,15660
14,Complex,Gene,positively_regulates,7666
58,Gene,Compound,regulates,5649
68,Gene,Molecular Function,enables,5625
7,Complex,Compound,in_reaction_with,4927
27,Compound,Cellular Component,part_of,4701


In [132]:
remove_nodes = nodes.query('label == "Reaction"')['id'].unique()

edges_out = edges.query('start_id not in @remove_nodes and end_id not in @remove_nodes')
print(len(edges), len(edges_out))

9267553 8723617


In [133]:
edges_out = pd.concat([edges_out, edges_to_add], sort=False)
edges_out = dfp.combine_group_cols_on_char(edges_out, ['start_id', 'end_id', 'type'], sort=True)

HBox(children=(FloatProgress(value=0.0, description='total_progress', max=3.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='dsrc_type', max=3391.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='source', max=3391.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='license', max=3391.0, style=ProgressStyle(description_wid…





In [134]:
len(edges_out)

8979599

In [137]:
edge_ids = set(edges_out[['start_id', 'end_id']].stack().unique())
node_ids = set(nodes['id'])

edge_ids - node_ids

set()

# Save

In [135]:
this_name = '11_Reactions_and_Regulation'

out_dir = load_dir.joinpath(this_name, 'out')
out_dir.mkdir(exist_ok=True, parents=True)

In [138]:
# Nodes were unchanged
nodes.to_csv(out_dir.joinpath('nodes.csv'), index=False)
edges_out.to_csv(out_dir.joinpath('edges.csv'), index=False)