# Size

## Load dependencies

In [9]:
import pandas as pd
import seaborn as sns
import config
import dataframe_image as dfi
from config import INTERACTOMES_PATH, DATA_REACTOME_PATH, genes, proteoforms
from lib.networks import get_interactomes, get_multiindex, get_increase_percentage
from networkx import edge_boundary
from queries import QUERY_GET_ALL_PROTEINS
from lib.graph_database_access import get_query_result

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: c:\git\ProteoformNetworks


## Create interactomes

In [10]:
interactomes_no_sm, interactomes_with_sm, interactomes_with_unique_sm = get_interactomes(DATA_REACTOME_PATH, INTERACTOMES_PATH)
interactomes = [interactomes_no_sm[genes], interactomes_no_sm[proteoforms], interactomes_with_sm[genes], interactomes_with_sm[proteoforms], interactomes_with_unique_sm[genes], interactomes_with_unique_sm[proteoforms]]

Reading participants of all reactions for level genes...
Reading participants of all reactions for level proteins...
Reading participants of all reactions for level proteoforms...
Reading participants of all reactions for level sm...
Reading components of all complexes for level genes...
Reading components of all complexes for level proteins...
Reading components of all complexes for level proteoforms...
Reading components of all complexes for level sm...
Reading interaction network for  at genes level, method no_sm...
Reading interaction network for  at proteins level, method no_sm...
Reading interaction network for  at proteoforms level, method no_sm...
Reading interaction network for  at genes level, method with_sm...
Reading interaction network for  at proteins level, method with_sm...
Reading interaction network for  at proteoforms level, method with_sm...
Reading interaction network for  at genes level, method with_unique_sm...
Reading interaction network for  at proteins level, 

## Number of nodes per type

In [11]:
index = get_multiindex()

num_interactions = pd.Series([i.number_of_edges() for i in interactomes], index=index)
num_nodes = pd.Series([i.number_of_nodes() for i in interactomes], index=index)
num_acc_ent = pd.Series([i.graph['num_entities'] for i in interactomes], index=index)
# num_small_molecules = pd.Series([interactome.graph['num_small_molecules'] for interactome in interactome_list], index=index)

# Calculate number of nodes of each type
df_sizes = pd.DataFrame({
    "Interactions": num_interactions,
    "Nodes": num_nodes,
}, index=index)
dfi.export(df_sizes, 'figures/interactome_num_nodes_by_type.png')
df_sizes

Unnamed: 0_level_0,Unnamed: 1_level_0,Interactions,Nodes
Small Molecules,Entity Level,Unnamed: 2_level_1,Unnamed: 3_level_1
Not Included,genes,366208,10976
Not Included,proteoforms,590415,14246
Included,genes,451490,13033
Included,proteoforms,681891,16303
Reaction-Unique Included,genes,808212,40575
Reaction-Unique Included,proteoforms,1047542,43845


In [12]:
proteins = len(get_query_result(QUERY_GET_ALL_PROTEINS))
print(f"{df_sizes.loc['Not Included', 'proteoforms']['Accessioned Entities']} proteoforms represent {proteins} proteins coded by {df_sizes.loc['Not Included', 'genes']['Accessioned Entities']} genes")

KeyError: 'Accessioned Entities'

## Number of proteoforms per gene

In [None]:
ratio = df_sizes.loc["Not Included", "proteoforms"]["Accessioned Entities"] / df_sizes.loc["Not Included", "genes"]["Accessioned Entities"]
print(f"There are {ratio} proteoforms per gene.")

There are 1.2979227405247813 proteoforms per gene.


## Changes in network size by proteoforms

In [None]:
node_difference = interactomes_no_sm[proteoforms].number_of_nodes() - interactomes_no_sm[genes].number_of_nodes()
nodes_change = round(node_difference * 100 / interactomes_no_sm[genes].number_of_nodes(), 2)
links_difference = interactomes_no_sm[proteoforms].number_of_edges() - interactomes_no_sm[genes].number_of_edges()
links_change = round(links_difference * 100 / interactomes_no_sm[genes].number_of_edges(), 2)
print(f"the proteoform representation yielded {node_difference} ({nodes_change} %) more nodes and {links_difference} ({links_change} %) more connections than the single gene representation")

the proteoform representation yielded 3270 (29.79 %) more nodes and 224207 (61.22 %) more connections than the single gene representation


## Number of proteoforms with annotations

In [None]:
proteoforms_with_annotations = [node for node in interactomes_no_sm['proteoforms'].nodes if any((c in {':', '-'}) for c in str(node))]
proteoforms_no_annotations = [node for node in interactomes_no_sm['proteoforms'].nodes if not any((c in {':', '-'}) for c in str(node))]

percentage = round(len(proteoforms_with_annotations) * 100 / interactomes_no_sm[proteoforms].number_of_nodes(), 2)
print(f"There are {len(proteoforms_with_annotations)} proteoforms with isoform or translational modification annotations, {percentage} % of the total.")
print(f"the vast majority of the proteoforms {len(interactomes_no_sm['proteoforms'].nodes) - len(proteoforms_with_annotations)} ({100 - percentage} %) are not annotated functionally.")

There are 3433 proteoforms with isoform or translational modification annotations, 24.1 % of the total.
the vast majority of the proteoforms 10813 (75.9 %) are not annotated functionally.


## Changes in network size by small molecules

In [None]:
diff_nodes_gene_level = interactomes_with_sm[genes].number_of_nodes() - interactomes_no_sm[genes].number_of_nodes()
diff_nodes_proteoform_level = interactomes_with_sm[proteoforms].number_of_nodes() - interactomes_no_sm[proteoforms].number_of_nodes()
assert diff_nodes_gene_level == diff_nodes_proteoform_level
change_gene_level = round(diff_nodes_gene_level * 100 / interactomes_no_sm[genes].number_of_nodes(), 2)
change_proteoform_level = round(diff_nodes_proteoform_level * 100 / interactomes_no_sm[proteoforms].number_of_nodes(), 2)
print(f"Small molecules addition increases the number of nodes by {diff_nodes_proteoform_level} ({change_gene_level} % and {change_proteoform_level} % respectively).")


nodes_change_gene_level = get_increase_percentage(interactomes_no_sm[genes].number_of_nodes(), interactomes_with_sm[genes].number_of_nodes())
nodes_change_proteoform_level = get_increase_percentage(interactomes_no_sm[proteoforms].number_of_nodes(), interactomes_with_sm[proteoforms].number_of_nodes())
print(f"the gene- and proteoform-centric networks with small molecules thus increases the number of nodes by {nodes_change_gene_level} % and {nodes_change_proteoform_level} %, respectively")

connections_change_gene_level = get_increase_percentage(interactomes_no_sm[genes].number_of_edges(), interactomes_with_sm[genes].number_of_edges())
difference_gene_level = interactomes_with_sm[genes].number_of_edges() - interactomes_no_sm[genes].number_of_edges()
connections_change_proteoform_level = get_increase_percentage(interactomes_no_sm[proteoforms].number_of_edges(), interactomes_with_sm[proteoforms].number_of_edges())
difference_proteoform_level = interactomes_with_sm[proteoforms].number_of_edges() - interactomes_no_sm[proteoforms].number_of_edges()
print(f"Adding small molecules creates {difference_gene_level} and {difference_proteoform_level} new connections, corresponding to an increase of {connections_change_gene_level} % and {connections_change_proteoform_level} %, for the gene- and proteoform-centric networks, respectively")

connections_change_gene_level = get_increase_percentage(interactomes_no_sm[genes].number_of_edges(), interactomes_with_unique_sm[genes].number_of_edges())
difference_gene_level = interactomes_with_unique_sm[genes].number_of_edges() - interactomes_no_sm[genes].number_of_edges()
connections_change_proteoform_level = get_increase_percentage(interactomes_no_sm[proteoforms].number_of_edges(), interactomes_with_unique_sm[proteoforms].number_of_edges())
difference_proteoform_level = interactomes_with_unique_sm[proteoforms].number_of_edges() - interactomes_no_sm[proteoforms].number_of_edges()
print(f"With reaction unique small molecules,")
print(f"the number of new connections is {difference_gene_level} and {difference_proteoform_level}, corresponding to an increase of {connections_change_gene_level} % and {connections_change_proteoform_level} %, for the gene- and proteoform-centric networks, respectively")

Small molecules addition increases the number of nodes by 2057 (18.74 % and 14.44 % respectively).
the gene- and proteoform-centric networks with small molecules thus increases the number of nodes by 18.74 % and 14.44 %, respectively
Adding small molecules creates 85282 and 91476 new connections, corresponding to an increase of 23.29 % and 15.49 %, for the gene- and proteoform-centric networks, respectively
With reaction unique small molecules,
the number of new connections is 442004 and 457127, corresponding to an increase of 120.7 % and 77.42 %, for the gene- and proteoform-centric networks, respectively


## Number of connections among proteoform nodes with/without annotations

In [None]:

edges_nn = edge_boundary(interactomes_no_sm['proteoforms'], proteoforms_no_annotations, proteoforms_no_annotations)
edges_an = edge_boundary(interactomes_no_sm['proteoforms'], proteoforms_with_annotations, proteoforms_no_annotations)
edges_aa = edge_boundary(interactomes_no_sm['proteoforms'], proteoforms_with_annotations, proteoforms_with_annotations)

num_edges_nn = len(list(edges_nn))
num_edges_an = len(list(edges_an))
num_edges_aa = len(list(edges_aa))

print(f"Total number of connections in the proteoform interactome: {interactomes_no_sm['proteoforms'].number_of_edges()}")

percentage = round(num_edges_aa * 100 / interactomes_no_sm['proteoforms'].number_of_edges(), 2)
print(f"Connections when BOTH HAVE annotations:                     {num_edges_aa}, that is {percentage} % of all connections.")

percentage = round(num_edges_an * 100 / interactomes_no_sm['proteoforms'].number_of_edges(), 2)
print(f"Connections when ONE has and ONE does NOT have annotations: {num_edges_an}, that is {percentage} % of all connections.")

percentage = round(num_edges_nn * 100 / interactomes_no_sm['proteoforms'].number_of_edges(), 2)
print(f"Connections when BOTH nodes do NOT have annotations:        {num_edges_nn}, that is {percentage} % of all connections.")


Total number of connections in the proteoform interactome: 590415
Connections when BOTH HAVE annotations:                     143255, that is 24.26 % of all connections.
Connections when ONE has and ONE does NOT have annotations: 101907, that is 17.26 % of all connections.
Connections when BOTH nodes do NOT have annotations:        345253, that is 58.48 % of all connections.
