# Data overview

## Load dependencies

In [42]:
import os
import config
from config import INTERACTOMES_PATH, DATA_REACTOME_PATH
from queries import QUERY_GET_ALL_GENES, QUERY_GET_ALL_PROTEINS, QUERY_GET_ALL_PROTEOFORMS, QUERY_GET_ALL_SMALL_MOLECULES
from lib.graph_database_access import get_pathways, get_query_result, make_proteoform_string, get_reactions
from lib.dictionaries import read_dictionary_one_to_set
from lib.networks import get_json_filename, create_pathway_interaction_network, read_graph, get_interactomes
import pandas as pd
import dataframe_image as dfi
import networkx as nx

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: c:\git\ProteoformNetworks


## Get number of entities in Reactome

### Number of genes

In [43]:
genes = get_query_result(QUERY_GET_ALL_GENES)
print(f"There are {len(genes)} genes.")
if not os.path.exists(DATA_REACTOME_PATH):
    os.makedirs(DATA_REACTOME_PATH)
genes["Id"].to_csv(DATA_REACTOME_PATH + "genes.csv", index=False, header=False)
genes

There are 10976 genes.


Unnamed: 0,Id
0,HSPA8
1,HDAC6
2,PARK7
3,PCNT
4,ARL13B
...,...
10971,DENND4B
10972,ALS2CL
10973,RIN2
10974,RIN1


### Number of proteins

In [44]:
# Calculate number of human Proteins we can get from Reactome. Proteins that participate in a Reaction and a Pathway.
proteins = get_query_result(QUERY_GET_ALL_PROTEINS)
print(f"There are {len(proteins)} proteins.")
proteins["Id"].to_csv(DATA_REACTOME_PATH + "proteins.csv", index=False, header=False)
proteins

There are 11074 proteins.


Unnamed: 0,Id
0,P11142
1,Q9UBN7
2,Q99497
3,O95613
4,Q3SXY8
...,...
11069,O75064
11070,Q60I27
11071,Q8WYP3
11072,Q13671


### Number of proteoforms

In [45]:
proteoforms = get_query_result(QUERY_GET_ALL_PROTEOFORMS)
proteoforms['Id'] = proteoforms.apply(lambda x: make_proteoform_string(x.Id), axis=1)
print(f"These {len(proteoforms)} proteoforms represent {len(proteins)} proteins coded by {len(genes)} genes, making {len(proteoforms)/len(genes)} proteoform per gene on average. ")
proteoforms["Id"].to_csv(DATA_REACTOME_PATH + "proteoforms.csv", index=False, header=False)
proteoforms

These 14246 proteoforms represent 11074 proteins coded by 10976 genes, making 1.2979227405247813 proteoform per gene on average. 


Unnamed: 0,Id
0,A0A075B6P5;
1,A0A075B6S6;
2,A0A096LP49;
3,A0A0A6YYK7;
4,A0A0C4DH25;
...,...
14241,Q9Y6X9;
14242,Q9Y6Y8;
14243,Q9Y6Y9;
14244,"Q9Y6Y9;00160:26,00160:114"


### Number of small molecules

In [46]:
# Calculate number of human small molecules we can get from Reactome
small_molecules = get_query_result(QUERY_GET_ALL_SMALL_MOLECULES)
small_molecules["Name"].to_csv(DATA_REACTOME_PATH + "small_molecules.csv", index=False, header=False)
small_molecules

Unnamed: 0,Name,stIds
0,GTP,"[R-ALL-29438:GTP [cytosol], R-ALL-113573:GTP [..."
1,AMP,"[R-ALL-76577:AMP [cytosol], R-ALL-164121:AMP [..."
2,ATP,"[R-ALL-113592:ATP [cytosol], R-ALL-29358:ATP [..."
3,ADP,"[R-ALL-29370:ADP [cytosol], R-ALL-113582:ADP [..."
4,GDP,"[R-ALL-29420:GDP [cytosol], R-ALL-113525:GDP [..."
...,...,...
2052,NSAID,[R-ALL-2672383:NSAID [extracellular region]]
2053,NAADP,[R-ALL-2730676:NAADP [cytosol]]
2054,divalent metal cation,[R-ALL-5252122:divalent metal cation [extracel...
2055,cation,"[R-ALL-5692450:cation [cytosol], R-ALL-5692484..."


### Number of reactions

In [47]:
reactions = get_reactions()["stId"]
print(f"There are {len(reactions)} reactions.")

There are 13806 reactions.


### Number of pathways

In [48]:
pathways = get_pathways()["stId"]
print(f"There are {len(pathways)} pathways.")

There are 2112 pathways.


## Proteoform diversity

### Proteoforms per gene

In [49]:
# Examples of genes with multiple proteoforms

map_genes_to_proteins = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_genes.tsv", col_indices=(1, 0))
map_proteins_to_proteoforms = read_dictionary_one_to_set(INTERACTOMES_PATH, "mapping_proteins_to_proteoforms.tsv", col_indices=(0, 1))

assert len(map_genes_to_proteins.keys()) == len(genes)
assert len(map_proteins_to_proteoforms.keys()) == len(proteins)

map_genes_to_proteoforms = {}
for gene, protein_set in map_genes_to_proteins.items():
    map_genes_to_proteoforms.setdefault(gene, set())
    for protein in protein_set:
        map_genes_to_proteoforms[gene].update(map_proteins_to_proteoforms[protein])
    
map_genes_to_proteoforms

df_num_products_per_gene = pd.DataFrame(map_genes_to_proteoforms.items(), columns=["gene", "proteoforms"])
df_num_products_per_gene["proteins"] = df_num_products_per_gene.apply(lambda row: map_genes_to_proteins[row["gene"]], axis=1)
df_num_products_per_gene["num proteins"] = df_num_products_per_gene.apply(lambda row: len(row["proteins"]), axis=1)
df_num_products_per_gene["num proteoforms"] = df_num_products_per_gene.apply(lambda row: len(row["proteoforms"]), axis=1)
df_num_products_per_gene.sort_values(by=['num proteoforms'], inplace=True, ascending=False)
df_num_products_per_gene.set_index("gene", inplace=True)
dfi.export(df_num_products_per_gene.head(20), 'figures/genes_with_most_proteoforms.png')
df_num_products_per_gene.head(20)

Unnamed: 0_level_0,proteoforms,proteins,num proteins,num proteoforms
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
UBC,"{P0CG48;01148:139, P0CG48;01148:291, P0CG48;01...",{P0CG48},1,55
H3C1,"{P68431;00084:37, P68431;00047:12,00064:15,000...",{P68431},1,52
H3C15,"{Q71DI3;00064:19, Q71DI3;00084:37, Q71DI3;0004...",{Q71DI3},1,48
HLA-B,"{P30495;, P30493;, Q04826;, P10319;, P30475;, ...","{P30480, P30461, P30491, P30484, P30483, Q9536...",36,37
HLA-A,"{P18462;, P16189;, P01892;01148:null, P30447;,...","{P10314, P30459, P16189, P18462, P01892, P3044...",21,23
TP53,"{P04637;00046:15,00046:20,00046:269,00047:284,...",{P04637},1,23
UBB,"{P0CG47;01148:124, P0CG47;00134:76, P0CG47;011...",{P0CG47},1,19
RUNX2,"{Q13950-2;00046:280,00046:284,00046:288, Q1395...",{Q13950},1,16
RB1,"{P06400;00046:780, P06400;00046:249, P06400;00...",{P06400},1,16
PPP1R1B,"{Q9UD71;00046:137,00047:34,00047:75, Q9UD71;00...",{Q9UD71},1,16


In [55]:
df_num_products_per_gene.loc["PLOD3"]

proteoforms        {O60568;}
proteins            {O60568}
num proteins               1
num proteoforms            1
gene                   PLOD3
Name: PLOD3, dtype: object

### Examples of genes with most proteoforms

In [50]:
df_num_products_per_gene["gene"] = df_num_products_per_gene.index
for i, values in df_num_products_per_gene.head(5).iterrows():
    print(f"Gene: {values['gene']} has proteoforms:\n {values['proteoforms']}")

Gene: UBC has proteoforms:
 {'P0CG48;01148:139', 'P0CG48;01148:291', 'P0CG48;01148:82', 'P0CG48;01148:6', 'P0CG48;01148:11', 'P0CG48;01148:124', 'P0CG48;01148:276', 'P0CG48;00134:380', 'P0CG48;01148:443', 'P0CG48;01148:163', 'P0CG48;01148:428', 'P0CG48;01148:391', 'P0CG48;00134:608', 'P0CG48;01614:228', 'P0CG48;01148:671', 'P0CG48;01148:239', 'P0CG48;01148:543', 'P0CG48;01614:456', 'P0CG48;01148:519', 'P0CG48;01614:152', 'P0CG48;01614:304', 'P0CG48;00134:null', 'P0CG48;00134:76', 'P0CG48;01148:538', 'P0CG48;01148:158', 'P0CG48;01148:234', 'P0CG48;01148:315', 'P0CG48;01148:362', 'P0CG48;01614:608', 'P0CG48;01148:215', 'P0CG48;01148:367', 'P0CG48;01148:504', 'P0CG48;01148:614', 'P0CG48;01148:386', 'P0CG48;', 'P0CG48;00134:228', 'P0CG48;00134:456', 'P0CG48;01148:200', 'P0CG48;01148:580', 'P0CG48;01148:63', 'P0CG48;00134:152', 'P0CG48;01614:76', 'P0CG48;00134:532', 'P0CG48;01148:87', 'P0CG48;01148:619', 'P0CG48;01148:595', 'P0CG48;01148:467', 'P0CG48;01614:684', 'P0CG48;01148:352', 'P0CG48

### Number of genes with multiple proteoforms

In [51]:
num_genes_with_multiple_proteoforms = len(df_num_products_per_gene.loc[df_num_products_per_gene["num proteoforms"] > 1])
num_genes_with_no_proteoform_diversity = len(df_num_products_per_gene[df_num_products_per_gene["num proteoforms"] == 1])
assert num_genes_with_multiple_proteoforms + num_genes_with_no_proteoform_diversity == len(df_num_products_per_gene)
print(f"From {len(df_num_products_per_gene)} genes, {num_genes_with_multiple_proteoforms} have multiple proteoforms. {num_genes_with_no_proteoform_diversity} have no proteoform specific annotations.")

From 10976 genes, 1509 have multiple proteoforms. 9467 have no proteoform specific annotations.


### Number of pathways with participant proteoforms with annotations

In [52]:
# Create interactomes to make sure mapping files genes-->proteins and proteins-->proteoforms exist
interactomes = get_interactomes(config.DATA_REACTOME_PATH, INTERACTOMES_PATH)
interactomes

Reading participants of all reactions for level genes...
Reading participants of all reactions for level proteoforms...
Reading participants of all reactions for level sm...
Reading components of all complexes for level genes...
Reading components of all complexes for level proteoforms...
Reading components of all complexes for level sm...
Reading interaction network for  at genes level, method no_sm...
Reading interaction network for  at proteoforms level, method no_sm...
Reading interaction network for  at genes level, method with_sm...
Reading interaction network for  at proteoforms level, method with_sm...
Reading interaction network for  at genes level, method with_unique_sm...
Reading interaction network for  at proteoforms level, method with_unique_sm...


({'genes': <networkx.classes.graph.Graph at 0x1bebf630e50>,
  'proteoforms': <networkx.classes.graph.Graph at 0x1bebf631e10>},
 {'genes': <networkx.classes.graph.Graph at 0x1bebf633a00>,
  'proteoforms': <networkx.classes.graph.Graph at 0x1bebf633730>},
 {'genes': <networkx.classes.graph.Graph at 0x1bebf633430>,
  'proteoforms': <networkx.classes.graph.Graph at 0x1bebf632e90>})

In [53]:
selected_pathways = []
i = interactomes[0]["proteoforms"]
proteoforms_with_annotations = [node for node in i.nodes if any((c in {':', '-'}) for c in str(node))]
proteoforms_with_annotations

pathways = nx.get_node_attributes(i, "pathways")
for proteoform in proteoforms_with_annotations:
    for pathway in pathways[proteoform]:
        selected_pathways.append(pathway)

print(f"There are {len(selected_pathways)} pathways that contain proteoforms with annotations.")

There are 9596 pathways that contain proteoforms with annotations.
