* Goal 1: Find examples that suggest the use of small molecules in the interaction networks
* Goal 2: Show overview of the network topologies
* Goal 3: Show overview of modifications of proteins in the interface of disease modules
* Goal 4: Show overview of the disease module overlap

- [X] Create interactomes
    - [X] Interactomes without small molecules
    - [X] Interactomes with small molecules
- [ ] Show examples
- [ ] Create some disease modules
    - [X] Select multiform proteins
    - [ ] Select diseases containing multiforms
- [ ] A) Find pairs that overlap only with modified proteoforms
- [ ] B) Find pairs that overlap at gene/protein level and not overlap at proteoform level
    - [ ] Calculate overlap scores: node overlap, jaccard index, separation
    - [ ] Make
    - [ ] Plot overlap score distribution for all selected disease pairs
    - [ ] Plot selected pairs

## Set up configuration

In [12]:
import subprocess

# NOTE(review): this binds `plt` to the top-level matplotlib package, which has
# no `show()`/`figure()`; the pyplot import on the next line rebinds `plt` to the
# plotting API that the plotting cells of this notebook call.
import matplotlib as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): looks like an accidental IDE auto-import from numpy's test suite;
# it can fail on numpy builds that do not ship `numpy.core.tests` - confirm and remove.
from numpy.core.tests.test_einsum import sizes

import config
from config import LEVELS
from datasets.phegeni.filter_genes import create_filtered_file
from interactomes import get_interactome
from lib.dictionaries import read_dictionary_one_to_set
from lib.download import download_if_not_exists
from lib.graph_database import get_query_result
from queries import QUERY_GET_NUM_PROTEOFORMS_PER_PROTEIN

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Working directory: c:\git\pathwayanalysisplatform\proteoformnetworks


## Create disease modules

In [6]:
# Fetch the raw association tables and derive the filtered PheGenI file.
# Each tuple is passed positionally to download_if_not_exists:
# (target folder, file name, source URL, label).

# PheGenI phenotypes
phegeni_download = (
    "resources/PheGenI/",
    "PheGenI_Association.txt",
    "https://www.ncbi.nlm.nih.gov/projects/gap/eqtl/EpiViewBE.cgi?type=dl.tab",
    "PheGenI All_modules",
)
download_if_not_exists(*phegeni_download)
print("PheGenI files READY")

# Threshold handed to create_filtered_file together with the input and output
# paths (presumably a p-value cut-off, given the output file name - confirm).
genome_wide_significance = 5e-8
create_filtered_file(
    "resources/PheGenI/PheGenI_Association.txt",
    "resources/PheGenI/PheGenI_Association_genome_wide_significant.txt",
    genome_wide_significance,
)

# Jensen Lab diseases
jensen_download = (
    "resources/Jensen/",
    "human_disease_textmining_filtered.tsv",
    "http://download.jensenlab.org/human_disease_textmining_filtered.tsv",
    "Jensen Lab Diseases",
)
download_if_not_exists(*jensen_download)
print("Jensen Lab files READY")

PheGenI files READY
Jensen Lab files READY


In [8]:
# Run the proteoforms-per-protein query through lib.graph_database.
# The displayed result has one row per protein with the columns
# Protein, Proteoforms and NumProteoforms.
query = QUERY_GET_NUM_PROTEOFORMS_PER_PROTEIN
df = get_query_result(query)

df

Unnamed: 0,Protein,Proteoforms,NumProteoforms
0,P0CG48,"[[P0CG48], [P0CG48, 00134:152], [P0CG48, 00134...",82
1,P68431,"[[P68431], [P68431, 00046:11, 00047:12, 00064:...",52
2,Q71DI3,"[[Q71DI3], [Q71DI3, 00046:11, 00047:12, 00064:...",48
3,P02452,"[[P02452], [P02452, 00037:null], [P02452, 0003...",36
4,P08123,"[[P08123], [P08123, 00037:null], [P08123, 0003...",36
...,...,...,...
1486,Q9Y6K1,"[[Q9Y6K1], [Q9Y6K1, 01149:null]]",2
1487,Q9Y6Q2,"[[Q9Y6Q2], [Q9Y6Q2, 01150:null]]",2
1488,Q9Y6Q9,"[[Q9Y6Q9], [Q9Y6Q9, 00046:857]]",2
1489,Q9Y6W8,"[[Q9Y6W8-1], [Q9Y6W8-1, 00048:180]]",2


In [31]:
# Distinct protein accessions returned by the proteoform query; used below to
# pick the diseases that contain at least one of them.
# NOTE(review): presumably these are the "multiform" proteins from the task
# list (more than one proteoform) - confirm against the query definition.
selected_proteins = set(df["Protein"].unique())
selected_proteins

{'Q9NTJ3',
 'Q8N163',
 'P42224',
 'P05060',
 'Q96GX5',
 'P15880',
 'Q13421',
 'Q969R5',
 'Q15375',
 'P12259',
 'P98177',
 'P15927',
 'P62318',
 'Q92529',
 'Q8N4Z0',
 'P04000',
 'P03999',
 'Q02763',
 'Q8N114',
 'P07225',
 'Q8IUE6',
 'Q16526',
 'Q14103',
 'Q9UHC7',
 'Q9HCM2',
 'P49427',
 'Q8N6P7',
 'Q13263',
 'P15692',
 'P24394',
 'Q9UKL3',
 'Q9Y6W5',
 'Q14694',
 'P42695',
 'P02679',
 'Q14766',
 'P30044',
 'Q96RR4',
 'Q13217',
 'P26045',
 'O00255',
 'Q02817',
 'Q13882',
 'O60260',
 'P36896',
 'O15350',
 'O43597',
 'Q00987',
 'Q13635',
 'P17844',
 'P34897',
 'P01009',
 'P51114',
 'Q9H8Y8',
 'P19823',
 'O94855',
 'P01106',
 'P04279',
 'Q9UBU3',
 'P35968',
 'O00257',
 'Q01094',
 'P09544',
 'P51610',
 'Q96AH8',
 'Q15797',
 'P51965',
 'Q9H832',
 'P19793',
 'P50542',
 'Q9H195',
 'O14905',
 'Q6P988',
 'Q9Y243',
 'P06241',
 'Q13224',
 'P01583',
 'O43914',
 'Q9UQM7',
 'Q96A08',
 'P30273',
 'P02765',
 'P16118',
 'O94905',
 'Q8IW41',
 'Q53H12',
 'Q14195',
 'P50548',
 'Q969Q5',
 'Q96F24',
 'P01210',

In [14]:
# Load the filtered PheGenI association table (tab-separated) for inspection.
# The path matches the second argument given to create_filtered_file earlier
# in this notebook - presumably the file that call writes.
file_phegeni = "resources/PheGenI/PheGenI_Association_genome_wide_significant.txt"
df_diseases = pd.read_csv(file_phegeni, sep="\t")
df_diseases

Unnamed: 0,#,Trait,SNP rs,Context,Gene,Gene ID,Gene 2,Gene ID 2,Chromosome,Location,P-Value,Source,PubMed,Analysis ID,Study ID,Study Name
0,2,1-Alkyl-2-acetylglycerophosphocholine Esterase,7528419,UTR-3,CELSR2,1952,CELSR2,1952,1,109274569,0.00,NHGRI,22003152,0,,
1,3,1-Alkyl-2-acetylglycerophosphocholine Esterase,12740374,UTR-3,CELSR2,1952,CELSR2,1952,1,109274967,0.00,NHGRI,23118302,0,,
2,4,1-Alkyl-2-acetylglycerophosphocholine Esterase,599839,nearGene-3,PSRC1,84722,PSRC1,84722,1,109279543,0.00,NHGRI,20442857,0,,
3,5,1-Alkyl-2-acetylglycerophosphocholine Esterase,600550,intron,MS4A4E,643680,MS4A4E,643680,11,60230192,0.00,NHGRI,23118302,0,,
4,6,1-Alkyl-2-acetylglycerophosphocholine Esterase,964184,intron,ZPR1,8882,ZPR1,8882,11,116778200,0.00,NHGRI,22003152,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30210,136499,von Willebrand Factor,8176704,intron,ABO,28,ABO,28,9,133260147,0.00,NHGRI,26486471,0,,
30211,136500,von Willebrand Factor,687621,intron,ABO,28,ABO,28,9,133261661,0.00,NHGRI,20231535,0,,
30212,136501,von Willebrand Factor,643434,intron,ABO,28,ABO,28,9,133266941,0.00,NHGRI,23381943,0,,
30213,136502,von Willebrand Factor,505922,intron,ABO,28,ABO,28,9,133273812,0.00,NHGRI,23381943,0,,


In [18]:
# Map every PheGenI trait to the set of gene names associated with it.
# Column 1 of the filtered file is "Trait" and column 4 is "Gene"; the header
# row is skipped.
map_disease_to_genes = read_dictionary_one_to_set(
    "resources/PheGenI/",
    "PheGenI_Association_genome_wide_significant.txt",
    col_indices=(1, 4),
    ignore_header=True,
)
map_disease_to_genes

{'1-Alkyl-2-acetylglycerophosphocholine Esterase': {'ABCG2',
  'APOC1',
  'CELSR2',
  'HERPUD1',
  'LDLR',
  'LPA',
  'MS4A4E',
  'PLA2G7',
  'PSRC1',
  'SCARB1',
  'VMP1',
  'ZPR1'},
 '3-hydroxy-1-methylpropylmercapturic acid': {'ACAP2',
  'APOOP5',
  'ASB2',
  'C8orf37-AS1',
  'CNTNAP4',
  'EGOT',
  'FAM133A',
  'ITM2A',
  'ITPR1',
  'KCNN2',
  'LOC102723313',
  'LOC102724092',
  'MESTP3',
  'NHSL2',
  'PBDC1',
  'PINX1',
  'PRLR',
  'PTCHD1-AS',
  'RBFOX3',
  'REPS2',
  'RNU105B',
  'RPLP0P11',
  'ST3GAL1',
  'STK10',
  'STX18-AS1',
  'TBX3',
  'TOP1P1',
  'TWIST2',
  'ZBTB10'},
 'ADAMTS13 protein, human': {'ADAMTS13', 'OBP2B', 'SUPT3H'},
 'Abdominal Fat': {'CDH18', 'PLCL1'},
 'Acne Vulgaris': {'C11orf49', 'RNASEH2C', 'RPL13AP13', 'SELP', 'TGFB2'},
 'Acquired Immunodeficiency Syndrome': {'HCP5', 'PARD3B'},
 'Acute Coronary Syndrome': {'ADCY9',
  'ARPC1A',
  'CST3',
  'CYP3A4',
  'CYP3A7-CYP3A51P',
  'GATS',
  'GJC3',
  'NYAP1',
  'SLCO1B1',
  'UGT2B7'},
 'Acute-On-Chronic Liver Fail

In [15]:
# Gene -> proteins lookup. The file is named proteins-to-genes, so the column
# order is swapped with col_indices=(1, 0) to key the dictionary by gene
# (assumes column 0 holds the protein and column 1 the gene - confirm).
map_genes_to_proteins = read_dictionary_one_to_set(
    config.GRAPHS_PATH,
    "mapping_proteins_to_genes.tsv",
    col_indices=(1, 0),
)

In [27]:
# Translate each disease's gene set into a protein set. Genes without an entry
# in map_genes_to_proteins are skipped, so a disease may end up with an empty set.
map_disease_to_proteins = {
    disease: set().union(
        *(map_genes_to_proteins[gene] for gene in genes if gene in map_genes_to_proteins)
    )
    for disease, genes in map_disease_to_genes.items()
}
map_disease_to_proteins

{'1-Alkyl-2-acetylglycerophosphocholine Esterase': {'P01130',
  'P02654',
  'P08519',
  'Q13093',
  'Q15011',
  'Q8WTV0',
  'Q9UNQ0'},
 '3-hydroxy-1-methylpropylmercapturic acid': {'O94804',
  'P16471',
  'Q11201',
  'Q14643',
  'Q8NFH8',
  'Q8WVJ9',
  'Q96Q27',
  'Q9H2S1'},
 'ADAMTS13 protein, human': {'O75486', 'Q76LX8'},
 'Abdominal Fat': {'Q13634'},
 'Acne Vulgaris': {'P16109', 'P61812'},
 'Acquired Immunodeficiency Syndrome': set(),
 'Acute Coronary Syndrome': {'O60503',
  'P01034',
  'P08684',
  'P16662',
  'Q92747',
  'Q9Y6L6'},
 'Acute-On-Chronic Liver Failure': {'Q9UIR0'},
 'Adenocarcinoma Of Esophagus': {'O94788',
  'P13569',
  'P28289',
  'Q5VWX1',
  'Q6UUV9',
  'Q8WWH4',
  'Q8WXA8',
  'Q9UJ68'},
 'Adenocarcinoma of lung': {'O14746',
  'O15315',
  'O95445',
  'P05787',
  'P21802',
  'P32297',
  'P35680',
  'P36896',
  'P46379',
  'P51587',
  'Q96AJ9',
  'Q9H3D4',
  'Q9NWV8',
  'Q9UIR0'},
 'Adiponectin': {'P01042',
  'P15692',
  'P35249',
  'P55290',
  'Q15848',
  'Q86U86',
 

In [32]:
# Keep the diseases whose protein set shares at least one member with
# selected_proteins.
selected_diseases = {
    disease
    for disease, proteins in map_disease_to_proteins.items()
    if proteins & selected_proteins
}
selected_diseases

{'3-hydroxy-1-methylpropylmercapturic acid',
 'ADAMTS13 protein, human',
 'Acute Coronary Syndrome',
 'Adenocarcinoma Of Esophagus',
 'Adenocarcinoma of lung',
 'Adiponectin',
 'Age-Related Hearing Impairment 1',
 'Aggression',
 'Agranulocytosis',
 'Alanine Transaminase',
 'Albuminuria',
 'Alcohol Drinking',
 'Alkaline Phosphatase',
 'Alopecia',
 'Alzheimer Disease',
 'Amino Acids',
 'Amyloid beta-Peptides',
 'Amyloidosis, Cerebral, with Spongiform Encephalopathy',
 'Amyotrophic Lateral Sclerosis',
 'Amyotrophic lateral sclerosis 1',
 'Anemia',
 'Angiopoietin-2',
 'Angiotensin-Converting Enzyme Inhibitors',
 'Anthropometry',
 'Anti-Neutrophil Cytoplasmic Antibody-Associated Vasculitis',
 'Anticoagulants',
 'Antidepressive Agents',
 'Antineoplastic Agents',
 'Aorta',
 'Aortic Aneurysm, Abdominal',
 'Aortic Aneurysm, Thoracic',
 'Apolipoproteins B',
 'Apolipoproteins E',
 'Arterial Pressure',
 'Arthritis, Juvenile',
 'Arthritis, Rheumatoid',
 'Asparaginase',
 'Asthma',
 'Astigmatism',
 '

In [33]:
# Number of diseases that contain at least one selected protein.
len(selected_diseases)

375

In [None]:

# Paths to Reactome interactome files and mapping tables, plus the output
# folder for the PheGenI modules.
# NOTE(review): none of these names is referenced by the other cells visible
# in this notebook - confirm they are still needed.
file_vertices = "resources/Reactome/interactome_indexed_vertices.tsv"
file_edges = "resources/Reactome/interactome_edges.tsv"
# NOTE(review): unlike its siblings this path has no file extension - confirm.
file_indexes = "resources/Reactome/interactome_vertices"
file_proteins_to_genes = "resources/Reactome/mapping_proteins_to_genes.tsv"
file_proteins_to_proteoforms = "resources/Reactome/mapping_proteins_to_proteoforms.tsv"
output_path = "resources/PheGenI/modules/"

In [None]:

pass
# Placeholder cell (`pass` only). Notes on what the module-generation step produces:
#   - Module files: gene_modules.tsv, protein_modules.tsv and proteoform_modules.tsv
#   - Single module files at each level: a gene, protein and proteoform file for each trait
#   - Module sizes at the gene, protein and proteoform levels
#   - Variation in module size when going from one level to the next
#   - Three files (genes, proteins and proteoforms) with the overlap similarity
#     of each pair of traits
#   - A file with a selection of trait-pair examples that show a decrease in overlap

In [10]:
# Input and output locations for the module-generation step.
# NOTE(review): `args` is not consumed by the command below, which only lists
# the working directory - presumably a stand-in for the real tool invocation.
args = {
    "modules_file": "resources/PheGenI/PheGenI_Association_genome_wide_significant.txt",
    "gene_vertices": "resources/Reactome/genes_vertices.tsv",
    "protein_vertices": "resources/Reactome/proteins_vertices.tsv",
    "proteoform_vertices": "resources/Reactome/proteoforms_vertices.tsv",
    "small_molecules_vertices": "resources/Reactome/proteoforms_small_molecules_vertices.tsv",
    "gene_edges": "resources/Reactome/genes_interactions.tsv",
    "protein_edges": "resources/Reactome/proteins_interactions.tsv",
    "proteoform_edges": "resources/Reactome/proteoforms_interactions.tsv",
    "proteins_to_genes": "resources/UniProt/mapping_proteins_to_genes.tsv",
    "proteins_to_proteoforms": "resources/UniProt/mapping_proteins_to_proteoforms.tsv",
    "output_path": "reports/All_modules/"
}

# Run the shell command, folding stderr into stdout; a non-zero exit status
# raises CalledProcessError.
completed = subprocess.run(
    "dir",
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    shell=True,
    check=True,
)
result = completed.stdout

print(result.decode())

 Volume in drive C is Windows
 Volume Serial Number is EC1B-53E2

 Directory of c:\git\pathwayanalysisplatform\proteoformnetworks

07/01/2020  11:13 PM    <DIR>          .
07/01/2020  11:13 PM    <DIR>          ..
05/07/2020  02:28 PM                73 .env
07/13/2020  06:44 PM               811 .gitignore
09/08/2020  08:49 PM    <DIR>          .idea
05/07/2020  02:23 PM    <DIR>          .vscode
09/08/2020  08:30 PM    <DIR>          build
09/04/2020  03:17 PM    <DIR>          figures
10/11/2019  09:21 PM            11,359 LICENSE
10/11/2019  09:21 PM               414 README.md
09/02/2020  10:05 PM    <DIR>          reports
04/23/2020  10:52 PM    <DIR>          resources
10/11/2019  09:21 PM    <DIR>          src
11/09/2019  05:48 PM               618 Untitled-1.txt
02/26/2020  11:44 AM    <DIR>          venv
               5 File(s)         13,275 bytes
              10 Dir(s)  613,060,526,080 bytes free



## Compare disease modules across levels

In [None]:
# Module-size tables per level for two module sets: reports/All_modules
# ("connected") and reports/modules_keep_disconnected ("disconnected").
sizes_dict_connected = {
    level: pd.read_csv(f"reports/All_modules/module_sizes_{level}.tsv", sep="\t")
    for level in LEVELS
}
sizes_dict_disconnected = {
    level: pd.read_csv(f"reports/modules_keep_disconnected/module_sizes_{level}.tsv", sep="\t")
    for level in LEVELS
}

# Print summary statistics per level, then tag each table with its module set.
for level in LEVELS:
    connected_sizes = sizes_dict_connected[level]
    disconnected_sizes = sizes_dict_disconnected[level]
    print(f"\n{level}")
    print(connected_sizes.describe())
    print(disconnected_sizes.describe())
    connected_sizes["CONNECTEDNESS"] = "connected"
    disconnected_sizes["CONNECTEDNESS"] = "disconnected"

# Stack the per-level tables; the dictionary key becomes the outer index level
# "level", which is then copied into a regular LEVEL column for plotting.
sizes_df_connected = pd.concat(sizes_dict_connected, names=["level", "index"])
sizes_df_connected = sizes_df_connected.assign(
    LEVEL=sizes_df_connected.index.get_level_values("level")
)
sizes_df_disconnected = pd.concat(sizes_dict_disconnected, names=["level", "index"])
sizes_df_disconnected = sizes_df_disconnected.assign(
    LEVEL=sizes_df_disconnected.index.get_level_values("level")
)

# Long-form table holding both module sets.
sizes_df = pd.concat([sizes_df_connected, sizes_df_disconnected])
sizes_df

In [None]:
# Module-size plots for the two module sets (disconnected nodes kept vs. removed):
#   - sizes at the gene, protein and proteoform levels
#   - still to do: difference in size from genes to proteins and from proteins
#     to proteoforms
# Requires `plt` to be matplotlib.pyplot: the bare `matplotlib` package has no
# `show()`, so every `plt.show()` below would raise AttributeError otherwise.
# Expects the long-form `sizes_df` with SIZE, LEVEL and CONNECTEDNESS columns.

sns.set(style="ticks")

# Mean module size per module set, one panel per level.
g = sns.FacetGrid(sizes_df, col="LEVEL", height=4, aspect=.5, hue="LEVEL", palette="Set2")
g.map(sns.barplot, "CONNECTEDNESS", "SIZE")
plt.show()

# Box plot with the individual modules overlaid; whiskers span the full range.
ax = sns.boxplot(x="SIZE", y="LEVEL", data=sizes_df_connected, whis=np.inf)
ax = sns.stripplot(x="SIZE", y="LEVEL", data=sizes_df_connected, jitter=True, color=".3")
plt.show()

# Violin plot with the individual modules overlaid.
ax = sns.violinplot(x="SIZE", y="LEVEL", data=sizes_df_connected, inner=None, color=".8")
ax = sns.stripplot(x="SIZE", y="LEVEL", data=sizes_df_connected, jitter=True)
plt.show()

# Jitter plot of sizes per level, one panel per module set.
g = sns.catplot(x="LEVEL", y="SIZE",
                hue="LEVEL", col="CONNECTEDNESS",
                data=sizes_df, kind="strip",
                jitter=True,
                height=4, aspect=.7)
plt.show()

# Jitter plot of sizes per module set, one panel per level.
g = sns.catplot(x="CONNECTEDNESS", y="SIZE",
                hue="CONNECTEDNESS", col="LEVEL",
                data=sizes_df, kind="strip",
                jitter=True,
                height=4, aspect=.7)
plt.show()

# Size distributions grouped by level, split by module set.
g = sns.catplot(x="SIZE", y="LEVEL", hue="CONNECTEDNESS", data=sizes_df, kind="violin")
plt.show()

# Size distributions grouped by module set, split by level.
g = sns.catplot(x="SIZE", y="CONNECTEDNESS", hue="LEVEL", data=sizes_df, kind="violin")
plt.show()

In [None]:
# Wide table with one SIZE_<LEVEL> column per level, followed by the absolute
# and percentage change in module size between consecutive levels.
# NOTE(review): the columns are joined on row position, which assumes every
# level file lists the same modules in the same order - confirm.
level_size_columns = []
for level in LEVELS:
    size_column = f"SIZE_{level.upper()}"
    level_sizes = pd.read_csv(
        f"reports/All_modules/module_sizes_{level}.tsv",
        sep="\t",
        names=["MODULES", size_column],
        header=0,
    )
    level_size_columns.append(level_sizes[size_column])
sizes_df = pd.concat(level_size_columns, axis=1)

# Absolute change in size at each level transition.
sizes_df["VARIATION_GENES_TO_PROTEINS"] = sizes_df["SIZE_PROTEINS"] - sizes_df["SIZE_GENES"]
sizes_df["VARIATION_PROTEINS_TO_PROTEOFORMS"] = sizes_df["SIZE_PROTEOFORMS"] - sizes_df["SIZE_PROTEINS"]

# Change relative to the size at the starting level, in percent.
sizes_df["PERCENTAGE_GENES_TO_PROTEINS"] = sizes_df["VARIATION_GENES_TO_PROTEINS"] * 100 / sizes_df["SIZE_GENES"]
sizes_df["PERCENTAGE_PROTEINS_TO_PROTEOFORMS"] = sizes_df["VARIATION_PROTEINS_TO_PROTEOFORMS"] * 100 / sizes_df["SIZE_PROTEINS"]

# Missing values (e.g. 0/0 for empty modules) are reported as 0.
sizes_df = sizes_df.fillna(0)
sizes_df

In [None]:
# Long-form table of the absolute size change: one row per (transition, module)
# with the transition name repeated in a STEP column for plotting.
variation_by_step = {
    "GENES_TO_PROTEINS": sizes_df["VARIATION_GENES_TO_PROTEINS"],
    "PROTEINS_TO_PROTEOFORMS": sizes_df["VARIATION_PROTEINS_TO_PROTEOFORMS"],
}
size_variation = (
    pd.concat(variation_by_step, names=["STEP", "INDEX"], axis=0)
    .to_frame(name="VARIATION")
    .assign(STEP=lambda frame: frame.index.get_level_values("STEP"))
)

size_variation

In [None]:
# Absolute size variation at each level transition, drawn twice: as a box plot
# and as a violin plot, each with the individual modules overlaid.
# Requires `plt` to be matplotlib.pyplot (the bare `matplotlib` package has no `show()`).
# The title is set in a separate statement so that `ax` keeps referring to the
# Axes instead of the Text object returned by `set_title`.
ax = sns.boxplot(x="VARIATION", y="STEP", data=size_variation, whis=np.inf)
ax = sns.stripplot(x="VARIATION", y="STEP", data=size_variation,
                   jitter=True, color=".3")
ax.set_title("SIZE VARIATION AT LEVEL TRANSITION")
plt.show()

ax = sns.violinplot(x="STEP", y="VARIATION", data=size_variation, inner=None, color=".8")
ax = sns.stripplot(x="STEP", y="VARIATION", data=size_variation, jitter=True)
ax.set_title("SIZE VARIATION AT LEVEL TRANSITION")
plt.show()

In [None]:
# Long-form table of the percentage size change: one row per (transition,
# module) with the transition name repeated in a STEP column for plotting.
percentage_by_step = {
    "GENES_TO_PROTEINS": sizes_df["PERCENTAGE_GENES_TO_PROTEINS"],
    "PROTEINS_TO_PROTEOFORMS": sizes_df["PERCENTAGE_PROTEINS_TO_PROTEOFORMS"],
}
percentage_variation = (
    pd.concat(percentage_by_step, names=["STEP", "INDEX"], axis=0)
    .to_frame(name="PERCENTAGE")
    .assign(STEP=lambda frame: frame.index.get_level_values("STEP"))
)

percentage_variation

In [None]:
# Percentage size variation at each level transition, drawn twice: as a box
# plot and as a violin plot, each with the individual modules overlaid.
# Requires `plt` to be matplotlib.pyplot (the bare `matplotlib` package has no `show()`).
# The title is set in a separate statement so that `ax` keeps referring to the
# Axes instead of the Text object returned by `set_title`.
ax = sns.boxplot(x="PERCENTAGE", y="STEP", data=percentage_variation, whis=np.inf)
ax = sns.stripplot(x="PERCENTAGE", y="STEP", data=percentage_variation,
                   jitter=True, color=".3")
ax.set_title("PERCENTAGE OF SIZE VARIATION AT LEVEL TRANSITION")
plt.show()

ax = sns.violinplot(x="STEP", y="PERCENTAGE", data=percentage_variation, inner=None, color=".8")
ax = sns.stripplot(x="STEP", y="PERCENTAGE", data=percentage_variation, jitter=True)
ax.set_title("PERCENTAGE OF SIZE VARIATION AT LEVEL TRANSITION")
plt.show()

## Overlap analysis

In [None]:
# Overlap-similarity score table for every level.
scores = {
    level: pd.read_csv(f"reports/scores_{level}_overlap_similarity.tsv", sep='\t')
    for level in LEVELS
}

# Every level must provide the same number of rows.
row_counts = {len(scores[level]) for level in LEVELS}
assert len(row_counts) == 1, "Error: the number of All_modules vary among levels."

In [None]:
import matplotlib.pyplot as plt

# Histogram plus density estimate of the overlap score, one curve per level on
# shared axes.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases - check
# the installed version before upgrading.
bins = 250
kde_style = {'linewidth': 1}

plt.figure(figsize=(13, 6))
for level in LEVELS:
    level_scores = scores[level]['SCORE']
    sns.distplot(level_scores, hist=True, kde=True, kde_kws=kde_style, label=level, bins=bins)

# The y axis is capped at 20.
plt.ylim(0, 20)
plt.legend(prop={'size': 16}, title='Overlap score density distribution')
plt.title('Overlap score distribution')
plt.xlabel('Overlap score')
plt.ylabel('Density')
plt.show()

In [None]:
# One row per trait pair with the overlap score and overlap size at each level.
df = pd.DataFrame(scores['genes'][['SCORE', 'OVERLAP_SIZE']])
df.columns = ['SCORE_GENES', 'OVERLAP_SIZE_GENES']
for level in ['proteins', 'proteoforms']:
    df['OVERLAP_SIZE_' + level.upper()] = scores[level]['OVERLAP_SIZE']
    df['SCORE_' + level.upper()] = scores[level]['SCORE']

# Change in overlap score from the gene level to the proteoform level.
df['SCORE_VARIATION'] = df['SCORE_PROTEOFORMS'] - df['SCORE_GENES']

# The original code looked up `sizes['PERCENTAGE_VARIATION']`, but `sizes` only
# resolves to an array imported from numpy's test suite, so the lookup could
# never work. The per-trait table is rebuilt here from the module size files.
# NOTE(review): the percentage is taken from genes to proteoforms, mirroring
# SCORE_VARIATION - confirm this is the intended definition. Assumes the first
# column of each size file holds unique trait names that match TRAIT1/TRAIT2.
module_sizes = {
    level: pd.read_csv(
        f"reports/All_modules/module_sizes_{level}.tsv",
        sep="\t",
        names=["MODULES", "SIZE"],
        header=0,
        index_col="MODULES",
    )["SIZE"]
    for level in ("genes", "proteoforms")
}
size_percentage_variation = (
    (module_sizes["proteoforms"] - module_sizes["genes"]) * 100 / module_sizes["genes"]
).fillna(0)  # 0/0 for empty modules counts as no variation

# Sum of the size variation of both traits in the pair; a trait missing from
# the size table yields NaN for that pair.
df['SIZE_PERCENTAGE_VARIATION'] = (
    scores['genes']['TRAIT1'].map(size_percentage_variation)
    + scores['genes']['TRAIT2'].map(size_percentage_variation)
)

# Label every row with its (TRAIT1, TRAIT2) pair.
df.index = scores['genes'].apply(lambda x: (x['TRAIT1'], x['TRAIT2']), axis=1)
df

In [None]:
# Scatter of the size variation of each trait pair against its change in
# overlap score. The data are passed as (x = size, y = overlap) so that they
# agree with the axis labels and the title; the original call had them swapped.
plt.figure(figsize=(13, 6))
plt.scatter(df['SIZE_PERCENTAGE_VARIATION'], df['SCORE_VARIATION'], alpha=0.5)
plt.title('Variation in size VS variation in overlap')
plt.xlabel('Size percentage variation')
plt.ylabel('Overlap variation')
plt.show()

In [None]:
# Overlap-size bounds (not used within this cell).
min_overlap_size = 10
max_overlap_size = 20

# Pairwise overlap data for every level. The files are .tsv, so the tab
# separator is passed explicitly like in every other TSV read of this notebook;
# with the default comma separator each row would land in a single column.
overlap_data = {
    level: pd.read_csv(f"reports/All_modules/pairs_overlap_data_{level}.tsv", sep="\t")
    for level in LEVELS
}


In [None]:
# NODE_OVERLAP_SIZE_VARIATION_GENES_TO_PROTEINS
# NODE_OVERLAP_SIZE_VARIATION_PROTEINS_TO_PROTEOFORMS
# NODE_INTERFACE_SIZE_VARIATION_GENES_TO_PROTEINS
# NODE_INTERFACE_SIZE_VARIATION_PROTEINS_TO_PROTEOFORMS
# NODE_INTERFACE_SIZE_VARIATION_GENES_TO