# Data overview

In [6]:
# Import dependencies for the complete notebook

import os
import config
from config import INTERACTOMES_PATH, DATA_REACTOME_PATH
from queries import QUERY_GET_ALL_GENES, QUERY_GET_ALL_PROTEINS, QUERY_GET_ALL_PROTEOFORMS
from lib.graph_database_access import get_pathways, get_query_result, make_proteoform_string
from lib.dictionaries import read_dictionary_one_to_set
from lib.networks import get_json_filename, create_pathway_interaction_network, read_graph, get_interactomes
from pathlib import Path

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: c:\git\ProteoformNetworks


In [7]:
# Calculate number of human GENES we can get from Reactome. Genes that participate in a Reaction and a Pathway.
genes = get_query_result(QUERY_GET_ALL_GENES)
print(f"There are {len(genes)} genes.")
if not os.path.exists(DATA_REACTOME_PATH):
    os.makedirs(DATA_REACTOME_PATH)
genes["Id"].to_csv(DATA_REACTOME_PATH + "genes.csv", index=False, header=False)
genes

There are 10976 genes.


Unnamed: 0,Id
0,HSPA8
1,GFAP
2,PARK7
3,CFTR
4,UBC
...,...
10971,AMPH
10972,TOR1A
10973,TOR1B
10974,UBQLN2


In [8]:
# Calculate number of human Proteins we can get from Reactome. Proteins that participate in a Reaction and a Pathway.
proteins = get_query_result(QUERY_GET_ALL_PROTEINS)
print(f"There are {len(proteins)} proteins.")
proteins["Id"].to_csv(DATA_REACTOME_PATH + "proteins.csv", index=False, header=False)
proteins

There are 11074 proteins.


Unnamed: 0,Id
0,P11142
1,P14136
2,Q99497
3,P13569
4,P0CG48
...,...
11069,P49418
11070,O14656
11071,O14657
11072,Q9UHD9


In [9]:
# Calculate number of proteoforms we can get from Reactome
proteoforms = get_query_result(QUERY_GET_ALL_PROTEOFORMS)
proteoforms['Id'] = proteoforms.apply(lambda x: make_proteoform_string(x.Id), axis=1)
print(f"There are {len(proteoforms)} proteoforms.")
proteoforms["Id"].to_csv(DATA_REACTOME_PATH + "proteoforms.csv", index=False, header=False)
proteoforms

There are 14424 proteoforms.


Unnamed: 0,Id
0,A0A075B6P5;
1,A0A075B6S6;
2,A0A096LP49;
3,A0A0A6YYK7;
4,A0A0C4DH25;
...,...
14419,Q9Y6X9;
14420,Q9Y6Y8;
14421,Q9Y6Y9;
14422,"Q9Y6Y9;00160:26,00160:114"


In [10]:
# Create interactomes to make sure mapping files genes-->proteins and proteins-->proteoforms exist
interactomes = get_interactomes(config.DATA_REACTOME_PATH, INTERACTOMES_PATH)

Reading participants of all reactions for level genes...
Reading participants of all reactions for level proteins...
Reading participants of all reactions for level proteoforms...
Reading participants of all reactions for level sm...
Reading components of all complexes for level genes...
Reading components of all complexes for level proteins...
Reading components of all complexes for level proteoforms...
Reading components of all complexes for level sm...
Reading interaction network for  at genes level, method no_sm...
Reading interaction network for  at proteins level, method no_sm...
Reading interaction network for  at proteoforms level, method no_sm...
Reading interaction network for  at genes level, method with_sm...
Reading interaction network for  at proteins level, method with_sm...
Creating interaction network for  at proteoforms level, method with_sm...
Finished creating interactome file for proteoforms-with_sm
Creating interaction network for  at genes level, method with_uniq

In [11]:
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv",
                                                         col_indices=(0, 1))
pathways = get_pathways()["stId"]
print(f"There are {len(pathways)} pathways.")

There are 2112 pathways.


In [12]:
print(f"There are {len(map_proteins_to_proteoforms.keys())} proteins.")
selected_proteins = []
for protein, proteoforms in map_proteins_to_proteoforms.items():
    if len(proteoforms) > 1:
        selected_proteins.append(protein)

print(f"Only {len(selected_proteins)} have multiple proteoforms.")

There are 11074 proteins.
Only 1493 have multiple proteoforms.


In [13]:
selected_pathways = []
for pathway in pathways:
    filename = get_json_filename(config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH, pathway)
    if not Path(filename).exists():
        create_pathway_interaction_network(pathway, config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH)
    G = read_graph(filename)
    if any(protein in selected_proteins for protein in list(G.nodes)):
        selected_pathways.append(pathway)

print(f"There are {len(selected_pathways)} pathways that contain proteoforms.")

    * Creating network networks\pathways\R-HSA-9613829_proteins_no_sm.json


OSError: Cannot save file into a non-existent directory: 'networks\pathways\complexes'

In [None]:
# Examples of genes with multiple protein products

map_genes_to_proteins = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_genes.tsv", col_indices=(1, 0))
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv", col_indices=(0, 1))

map_proteins_to_proteoforms

In [None]:

#  Examples when gene products participate in a different set of reactions.

In [None]:
- Calculate set of reactions where each gene product participates
- Select genes where it's protein products have a different set of reactions
- Quantify the difference overall:
    * How often the multiple protein products participate in the same reaction
    * Quantify the intersection: 