# Disease module analysis

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bokeh.io import curdoc
from bokeh.layouts import layout
from bokeh.models import DataTable, ColumnDataSource, TableColumn, Div

import config
from Python.analysis_diseases.filter_genes import create_filtered_file
from Python.lib import networks
from Python.lib.dictionaries import read_dictionary_one_to_set, merge_dictionaries
from Python.lib.download import download_if_not_exists

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

config.set_root_wd()

Initial working directory: C:\git\ProteoformNetworks\src\Python
New working directory: C:\git\ProteoformNetworks


### Download disease related gene sets

In [None]:
# PheGenI Phenotypes
download_if_not_exists("resources/PheGenI/",
                       "PheGenI_Association.txt",
                       "https://www.ncbi.nlm.nih.gov/projects/gap/eqtl/EpiViewBE.cgi?type=dl.tab",
                       "PheGenI All_modules")
print("PheGenI files READY")

# Filter
create_filtered_file("resources/PheGenI/PheGenI_Association.txt",
                     "resources/PheGenI/PheGenI_Association_genome_wide_significant.txt",
                     5e-8)

# Jensen Lab Diseases
download_if_not_exists("resources/Jensen/",
                       "human_disease_textmining_filtered.tsv",
                       "http://download.jensenlab.org/human_disease_textmining_filtered.tsv",
                       "Jensen Lab Diseases")
print("Jensen Lab files READY")

### Read gene sets

In [None]:
file_phegeni = "resources/PheGenI/PheGenI_Association_genome_wide_significant.txt"
df_diseases = pd.read_csv(file_phegeni, sep="\t")
df_diseases

In [None]:
map_disease_to_genes1 = read_dictionary_one_to_set("resources/PheGenI/", "PheGenI_Association_genome_wide_significant.txt", col_indices=(1, 4), ignore_header=True)
map_disease_to_genes2 = read_dictionary_one_to_set("resources/PheGenI/", "PheGenI_Association_genome_wide_significant.txt", col_indices=(1, 6), ignore_header=True)
df_diseases.columns
map_disease_to_genes = merge_dictionaries(map_disease_to_genes1, map_disease_to_genes2)
#map_disease_to_genes
print(f"Got map of {len(map_disease_to_genes.keys())} diseases to genes")

In [None]:
df_disease_to_genes = pd.DataFrame(data=[(d, len(g)) for d, g in map_disease_to_genes.items()], columns=["Disease", "Num"])
df_disease_to_genes['Entity'] = config.genes
df_disease_to_genes.sort_values(["Num"], ascending=False, inplace=True)
df_disease_to_genes

In [None]:
sns.violinplot(x="Entity", y="Num", data=df_disease_to_genes, color="0.8")
sns.stripplot(x="Entity", y="Num", data=df_disease_to_genes, jitter=True, zorder=1)
plt.show()

In [None]:
df_disease_to_genes = df_disease_to_genes[df_disease_to_genes['Num']<=100]
df_disease_to_genes

In [None]:
map_genes_to_proteins = read_dictionary_one_to_set(config.GRAPHS_PATH, "mapping_proteins_to_genes.tsv", col_indices=(1, 0))
df_genes_to_proteins = pd.DataFrame(data=[(g,len(ps)) for g, ps in map_genes_to_proteins.items()], columns=["Genes", "NumProteins"])
df_genes_to_proteins.sort_values(["NumProteins"], ascending=False, inplace=True)
df_genes_to_proteins

In [None]:
genes_arr=["Genes"]*len(df_genes_to_proteins)
sns.violinplot(x=df_genes_to_proteins["NumProteins"], color="0.8")
sns.stripplot(x=df_genes_to_proteins["NumProteins"], jitter=True, zorder=1)
plt.show()

In [None]:
filter_genes = df_genes_to_proteins[df_genes_to_proteins["NumProteins"] > 5]
filter_genes = set(filter_genes['Genes'])
filter_diseases = set()
for d, gs in map_disease_to_genes.items():
    if bool(gs & filter_genes):
        filter_diseases.add(d)
print(f"Diseases to delete: \n\n {filter_diseases}")
df_sel_diseases = df_disease_to_genes[~df_disease_to_genes['Disease'].isin(filter_diseases)]
df_sel_diseases


In [None]:
map_disease_to_proteins = {disease: set() for disease in df_sel_diseases["Disease"]}
for disease in df_sel_diseases["Disease"]:
    for g in map_disease_to_genes[disease]:
        if g in map_genes_to_proteins:
            for protein in map_genes_to_proteins[g]:
                map_disease_to_proteins[disease].add(protein)
#map_disease_to_proteins

In [None]:
map_disease_to_numproteins = {d: len(g) for d, g in map_disease_to_proteins.items()}
df_disease_to_proteins = pd.DataFrame(data=map_disease_to_numproteins.items(), columns=["Disease", "Num"])
df_disease_to_proteins['Entity'] = config.proteins
df_disease_to_proteins.sort_values(["Num"], ascending=False, inplace=True)
df_disease_to_proteins

In [None]:
df_disease_to_entitites = pd.concat([df_disease_to_genes, df_disease_to_proteins])
df_disease_to_entitites.sort_values(by=["Disease"], inplace=True)
df_disease_to_entitites

In [None]:
sns.violinplot(x="Entity", y="Num", data=df_disease_to_entitites, color="0.8")
sns.stripplot(x="Entity", y="Num", data=df_disease_to_entitites, jitter=True, zorder=1)
plt.show()

In [None]:
map_protein_to_diseases = {}
for d, ps in map_disease_to_proteins.items():
    for p in ps:
        if not p in map_protein_to_diseases.keys():
            map_protein_to_diseases[p] = set()
        map_protein_to_diseases[p].add(d)
#map_protein_to_diseases

In [None]:
map_protein_to_numdiseases = {p: len(ds) for p, ds in map_protein_to_diseases.items()}
df_p_nd = pd.DataFrame(data=map_protein_to_numdiseases.items(), columns=["Protein", "NumDiseases"])
df_p_nd.sort_values(["NumDiseases"], ascending=False, inplace=True)
df_p_nd

In [None]:
sns.violinplot(data=df_p_nd, color="0.8")
sns.stripplot(data=df_p_nd, jitter=True, zorder=1)
plt.show()

In [None]:
filter_proteins = df_p_nd[df_p_nd['NumDiseases'] > 10]
filter_proteins = set(filter_proteins["Protein"])
filter_proteins
filter_diseases = set()
for d, ps in map_disease_to_proteins.items():
    if bool(ps & filter_proteins):
        filter_diseases.add(d)
print(f"Diseases to delete: \n\n {filter_diseases}")
df_sel_diseases = df_disease_to_genes[~df_disease_to_genes['Disease'].isin(filter_diseases)]
df_sel_diseases


In [None]:
selected_diseases = set()
for disease, proteins in map_disease_to_proteins.items():
    if proteins & selected_proteins:
        selected_diseases.add(disease)
# selected_diseases
for i, val in enumerate(itertools.islice(selected_diseases, 10)):
    print(val)
len(selected_diseases)

In [None]:
participant_records = { l: get_participants(l, GRAPHS_PATH) for l in [*LEVELS, sm]}
components_records = { l: get_components(l, GRAPHS_PATH) for l in [*LEVELS, sm]}

In [None]:
interactomes_with_sm = { l: get_or_create_interaction_network(l, with_sm, participant_records, components_records, GRAPHS_PATH) for l in LEVELS}
for level, interactome in interactomes_with_sm.items():
     print_interactome_details(interactome)

In [None]:
# Create disease module at genes level
# -- We know what genes belong
# -- Create a subraph induced by the gene nodes and the neighboring small molecules

for trait in list(selected_diseases)[:1]:
    node_set = get_nodes_and_adjacent(map_disease_to_genes[trait], interactomes_with_sm[genes])
    S = interactomes_with_sm[genes].subgraph(node_set)
    p = plot_interaction_network(S, coloring=Coloring.ENTITY_TYPE, plot_width=600, plot_height=500, toolbar_location=None, title="Test title", legend_location="right")
    plt.show(p)
# Create disease module at proteins level

# Create disease module at proteoforms level

In [None]:
# TODO: Call C++ program to calculate overlap

In [None]:
# TODO: Plot relative size of lcc for each disease module

In [None]:
# TODO: Plot Number of genes (x) VS Observed module size (y)
# The observed module size is the number of vertices in the lcc of the disease module

### Read overlap data

In [3]:
overlap_data = pd.read_table("../../reports/overlap_similarity_score_variation_examples.tsv")
overlap_data.head()

data = dict(
    trait1 = overlap_data["TRAIT1"],
    trait2 = overlap_data["TRAIT2"],
    score = overlap_data["PROTEINS_TO_PROTEOFORMS"]
)
source = ColumnDataSource(data)

columns = [
        TableColumn(field="trait1", title="Trait1"),
        TableColumn(field="trait2", title="Trait2"),
        TableColumn(field="score", title="Score"),
    ]
data_table = DataTable(source=source, columns=columns, width=600, height=280)
plt.show(data_table)

FileNotFoundError: [Errno 2] File ../../reports/overlap_similarity_score_variation_examples.tsv does not exist: '../../reports/overlap_similarity_score_variation_examples.tsv'

In [None]:
def callback(attrname, old, new):
    print("Attribute changed: ", attrname)
    print("Old: ", old)
    print("New: ", new)
    selectionIndex=source.selected.indices[0]
    print("you have selected the row numr "+ str(selectionIndex) + " which is " 
          + source["trait1"][selectionIndex] + " -- " + source["trait2"][selectionIndex])

source.selected.on_change("indices", callback);

In [None]:
trait1 = "Adiponectin"
trait2 = "Thyrotropin"
path_to_modules = "../../reports/All_modules/"
path_to_figures = "../../figures/overlap_analysis/"

In [None]:
title = f"<p style=\"font-weight:bold;text-align:center;font-size:22px;width:1800px;\">" \
            f"<span style=\"color:green;\">{trait1}</span> with " \
            f"<span style=\"color:blue;\">{trait2}</span>" \
            f"</p>"

graphs_complete = {level: networks.create_graph(trait1, trait2, level, path_to_modules) for level in config.LEVELS}
graphs_interface = {level: networks.create_graph(trait1, trait2, level, path_to_modules, only_interface=True) for level in config.LEVELS}
 
figures_complete_modules = [networks.create_plot(level, graph) for level, graph in graphs_complete.items()]
figures_interfaces = [networks.create_plot(level, graph) for level, graph in graphs_interface.items()]

l = layout(
    [[data_table], 
     [Div(text=f"{title}")], 
     figures_complete_modules, 
     figures_interfaces,
])

plt.show(l)
curdoc().add_root(l)
curdoc().title = "Module pairs visualization"


In [2]:
# TODO: Make scatter plot where each dot is a disease.