# Data overview

In [53]:
# Import dependencies for the complete notebook

import os
import config
from config import INTERACTOMES_PATH, DATA_REACTOME_PATH
from queries import QUERY_GET_ALL_GENES, QUERY_GET_ALL_PROTEINS, QUERY_GET_ALL_PROTEOFORMS
from lib.graph_database_access import get_pathways, get_query_result, make_proteoform_string, get_reactions
from lib.dictionaries import read_dictionary_one_to_set
from lib.networks import get_json_filename, create_pathway_interaction_network, read_graph, get_interactomes
import pandas as pd
import dataframe_image as dfi

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: c:\git\ProteoformNetworks


In [54]:
# Calculate number of human GENES we can get from Reactome. Genes that participate in a Reaction and a Pathway.
genes = get_query_result(QUERY_GET_ALL_GENES)
print(f"There are {len(genes)} genes.")
if not os.path.exists(DATA_REACTOME_PATH):
    os.makedirs(DATA_REACTOME_PATH)
genes["Id"].to_csv(DATA_REACTOME_PATH + "genes.csv", index=False, header=False)
genes

There are 10976 genes.


Unnamed: 0,Id
0,HSPA8
1,HDAC6
2,PARK7
3,PCNT
4,ARL13B
...,...
10971,DENND4B
10972,ALS2CL
10973,RIN2
10974,RIN1


In [55]:
# Calculate number of human Proteins we can get from Reactome. Proteins that participate in a Reaction and a Pathway.
proteins = get_query_result(QUERY_GET_ALL_PROTEINS)
print(f"There are {len(proteins)} proteins.")
proteins["Id"].to_csv(DATA_REACTOME_PATH + "proteins.csv", index=False, header=False)
proteins

There are 11074 proteins.


Unnamed: 0,Id
0,P11142
1,Q9UBN7
2,Q99497
3,O95613
4,Q3SXY8
...,...
11069,O75064
11070,Q60I27
11071,Q8WYP3
11072,Q13671


In [56]:
# Calculate number of proteoforms we can get from Reactome
proteoforms = get_query_result(QUERY_GET_ALL_PROTEOFORMS)
proteoforms['Id'] = proteoforms.apply(lambda x: make_proteoform_string(x.Id), axis=1)
print(f"These {len(proteoforms)} proteoforms represent {len(proteins)} proteins coded by {len(genes)} genes, making {len(proteoforms)/len(genes)} proteoform per gene on average. ")
proteoforms["Id"].to_csv(DATA_REACTOME_PATH + "proteoforms.csv", index=False, header=False)
proteoforms

These 14246 proteoforms represent 11074 proteins coded by 10976 genes, making 1.2979227405247813 proteoform per gene on average. 


Unnamed: 0,Id
0,A0A075B6P5;
1,A0A075B6S6;
2,A0A096LP49;
3,A0A0A6YYK7;
4,A0A0C4DH25;
...,...
14241,Q9Y6X9;
14242,Q9Y6Y8;
14243,Q9Y6Y9;
14244,"Q9Y6Y9;00160:26,00160:114"


In [57]:
# Create interactomes to make sure mapping files genes-->proteins and proteins-->proteoforms exist
interactomes = get_interactomes(config.DATA_REACTOME_PATH, INTERACTOMES_PATH)

Reading participants of all reactions for level genes...
Reading participants of all reactions for level proteins...
Reading participants of all reactions for level proteoforms...
Reading participants of all reactions for level sm...
Reading components of all complexes for level genes...
Reading components of all complexes for level proteins...
Reading components of all complexes for level proteoforms...
Reading components of all complexes for level sm...
Reading interaction network for  at genes level, method no_sm...
Reading interaction network for  at proteins level, method no_sm...
Reading interaction network for  at proteoforms level, method no_sm...
Reading interaction network for  at genes level, method with_sm...
Reading interaction network for  at proteins level, method with_sm...
Reading interaction network for  at proteoforms level, method with_sm...
Reading interaction network for  at genes level, method with_unique_sm...
Reading interaction network for  at proteins level, 

In [58]:
pathways = get_pathways()["stId"]
print(f"There are {len(pathways)} pathways.")

There are 2112 pathways.


In [59]:
reactions = get_reactions()["stId"]
print(f"There are {len(reactions)} reactions.")

There are 13806 reactions.


In [64]:
# Examples of genes with multiple proteoforms

map_genes_to_proteins = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_genes.tsv", col_indices=(1, 0))
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv", col_indices=(0, 1))

assert len(map_genes_to_proteins.keys()) == len(genes)
assert len(map_proteins_to_proteoforms.keys()) == len(proteins)

map_genes_to_proteoforms = {}
for gene, protein_set in map_genes_to_proteins.items():
    map_genes_to_proteoforms.setdefault(gene, set())
    for protein in protein_set:
        map_genes_to_proteoforms[gene].update(map_proteins_to_proteoforms[protein])
    
map_genes_to_proteoforms

df = pd.DataFrame(map_genes_to_proteoforms.items(), columns=["gene", "proteoforms"])
df["proteins"] = df.apply(lambda row: map_genes_to_proteins[row["gene"]], axis=1)
df["num proteins"] = df.apply(lambda row: len(row["proteins"]), axis=1)
df["num proteoforms"] = df.apply(lambda row: len(row["proteoforms"]), axis=1)
df.sort_values(by=['num proteoforms'], inplace=True, ascending=False)
df.set_index("gene", inplace=True)
dfi.export(df.head(20), 'figures/genes_with_most_proteoforms.png')
df.head(20)

Unnamed: 0_level_0,proteoforms,proteins,num proteins,num proteoforms
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
UBC,"{P0CG48;, P0CG48;01148:291, P0CG48;01148:656, ...",{P0CG48},1,55
H3C1,"{P68431;00084:37, P68431;00047:12, P68431;0006...",{P68431},1,52
H3C15,"{Q71DI3;00084:80, Q71DI3;00085:5, Q71DI3;00085...",{Q71DI3},1,48
HLA-B,"{P30462;, P30491;, P30486;, P30490;, P30460;, ...","{P30460, P30481, P18465, P18463, P30483, P3048...",36,37
HLA-A,"{P30443;, Q09160;, P16189;, P30450;, P30457;, ...","{P30457, P30456, P16188, P01892, P16190, P1031...",21,23
TP53,"{P04637;00046:15,00046:392, P04637;00046:15,00...",{P04637},1,23
UBB,"{P0CG47;01148:6, P0CG47;01148:87, P0CG47;00134...",{P0CG47},1,19
RUNX2,"{Q13950-1;01148:null, Q13950-2;00046:280,00046...",{Q13950},1,16
RB1,"{P06400;00046:249, P06400;00046:230,00046:249,...",{P06400},1,16
PPP1R1B,"{Q9UD71;00047:75, Q9UD71;00046:102,00046:137,0...",{Q9UD71},1,16


In [None]:
# Show the proteoforms per gene
for i, values in df.head(10).iterrows():
    print(f"Gene: {values['gene']} has proteoforms:\n {values['proteoforms']}")

Gene: UBC has proteoforms:
 {'P0CG48;', 'P0CG48;01148:291', 'P0CG48;01148:656', 'P0CG48;01614:380', 'P0CG48;00134:228', 'P0CG48;01614:152', 'P0CG48;01148:443', 'P0CG48;01148:124', 'P0CG48;01148:543', 'P0CG48;01148:200', 'P0CG48;01148:48', 'P0CG48;01148:163', 'P0CG48;01148:619', 'P0CG48;01148:614', 'P0CG48;01614:228', 'P0CG48;01148:428', 'P0CG48;01148:87', 'P0CG48;01614:456', 'P0CG48;01148:386', 'P0CG48;01148:538', 'P0CG48;01148:504', 'P0CG48;01148:580', 'P0CG48;01148:367', 'P0CG48;00134:380', 'P0CG48;00134:304', 'P0CG48;00134:152', 'P0CG48;00134:null', 'P0CG48;01148:595', 'P0CG48;01614:532', 'P0CG48;00134:456', 'P0CG48;00134:532', 'P0CG48;00134:76', 'P0CG48;01148:215', 'P0CG48;01148:391', 'P0CG48;01148:11', 'P0CG48;01148:362', 'P0CG48;01148:671', 'P0CG48;01614:684', 'P0CG48;01148:139', 'P0CG48;01148:467', 'P0CG48;01614:608', 'P0CG48;00134:608', 'P0CG48;01148:519', 'P0CG48;01148:310', 'P0CG48;01148:82', 'P0CG48;01148:6', 'P0CG48;01614:76', 'P0CG48;01148:276', 'P0CG48;01148:352', 'P0CG48

In [None]:

selected_proteins = []

for gene, proteoforms in map_proteins_to_proteoforms.items():
    if len(proteoforms) > 1:
        selected_proteins.append(protein)

print(f"Only {len(selected_proteins)} have multiple proteoforms.")

#TODO: Make dataframe: Gene name, num proteoforms, proteoforms

In [None]:
selected_pathways = []
for pathway in pathways:
    filename = get_json_filename(config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH, pathway)
    if not Path(filename).exists():
        create_pathway_interaction_network(pathway, config.proteins, config.no_sm, config.PATHWAY_GRAPHS_PATH)
    G = read_graph(filename)
    if any(protein in selected_proteins for protein in list(G.nodes)):
        selected_pathways.append(pathway)

print(f"There are {len(selected_pathways)} pathways that contain proteoforms.")

    * Creating network networks\pathways\R-HSA-9613829_proteins_no_sm.json


OSError: Cannot save file into a non-existent directory: 'networks\pathways\complexes'

In [None]:

#  Examples when gene products participate in a different set of reactions.

In [None]:
- Calculate set of reactions where each gene product participates
- Select genes where it's protein products have a different set of reactions
- Quantify the difference overall:
    * How often the multiple protein products participate in the same reaction
    * Quantify the intersection: 