# \*\*\*WARNING\*\*\*: Bad analysis technique

I realized that this folder of analyses falls prey to the fallacy of indirect comparison. We compare with_event_tumor to normal, and without_event_tumor to normal, and then compare those two sets of results (with_event_tumor vs. without_event_tumor) to each other. It's better to compare with_event_tumor to without_event_tumor directly, which we do with a t-test in the finished analyses.

# Enrichment NB 03: Analyzing rSEA results

## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import altair as alt
from toolz.curried import pipe

In [2]:
# Significance level applied to the adjusted p-values when the rSEA results are loaded.
ALPHA = 0.05
# Magnitude given to non-significant expression changes in the "simplified_change" column.
MSV = 0.5

# Directory that the Altair data transformer writes chart data into.
plots_dir = "altair_data"

# rSEA enrichment tables (threshold 0.99), one per gene set library.
rsea_results_dir = "results_02_rSEA"
go_results_file = os.path.join(
    rsea_results_dir,
    "enrichment_rsea_thresh_0.99_lib_GO_Biological_Process_2018.tsv",
)
reactome_results_file = os.path.join(
    rsea_results_dir,
    "enrichment_rsea_thresh_0.99_lib_ReactomePathways.tsv",
)

# Differential expression results.
diff_expr_file = os.path.join("results_01_grouped_expr", "all_results.tsv.gz")

# Gene set libraries (.gmt) matching the two enrichment tables above.
gene_set_dir = "gene_set_libraries"
go_gmt = os.path.join(gene_set_dir, "GO_Biological_Process_2018.gmt")
reactome_gmt = os.path.join(gene_set_dir, "ReactomePathways.gmt")

In [3]:
# Altair options

# Remove Altair's cap on the number of rows a chart's data may have.
alt.data_transformers.disable_max_rows()


def json_dir(data, data_dir):
    """Altair data transformer that writes chart data to JSON files in `data_dir`.

    Parameters:
    data: The chart data handed over by Altair.
    data_dir (str): Directory to write the JSON files into; created if missing.

    Returns:
    Whatever `alt.to_json` returns for `data` with the filename template below.
    """
    os.makedirs(data_dir, exist_ok=True)

    # The {prefix}, {hash} and {extension} placeholders are filled in by alt.to_json.
    filename_template = os.path.join(data_dir, "{prefix}-{hash}.{extension}")
    write_json = alt.to_json(filename=filename_template)

    return pipe(data, write_json)


# Make the transformer available under a name, then switch it on for this notebook.
alt.data_transformers.register("json_dir", json_dir)
alt.data_transformers.enable("json_dir", data_dir=plots_dir)

DataTransformerRegistry.enable('json_dir')

In [4]:
# Which rSEA results table to load for each gene set library.
results_files = {
    "go": go_results_file,
    "reactome": reactome_results_file,
}

# Filtered and sorted results table, keyed by gene set library.
results = {}

for gene_set, results_path in results_files.items():
    result = pd.read_csv(results_path, sep="\t", index_col=0)

    # Keep gene sets with non-zero coverage that pass ALPHA on at least one
    # of the two adjusted p-value columns.
    has_coverage = result["Coverage"] > 0
    passes_alpha = (result["SC.adjP"] <= ALPHA) | (result["Comp.0.99.adjP"] <= ALPHA)
    result = result[has_coverage & passes_alpha]

    # Split "cancer_type_group" at its first underscore: the part before it
    # becomes "cancer_type", everything after it becomes "group".
    grp_split = result["cancer_type_group"].str.split("_", n=1, expand=True)
    result = result.assign(cancer_type=grp_split[0], group=grp_split[1])
    result = result.drop(columns="cancer_type_group")

    # Within each cancer type and group: smallest SC.adjP first, then the
    # highest coverage, then the largest gene set.
    result = result.sort_values(
        by=["cancer_type", "group", "SC.adjP", "Coverage", "Size"],
        ascending=[True, True, True, False, False],
    )

    results[gene_set] = result

## Distribution of proportion of gene sets covered

In [5]:
# Histogram of the GO "Coverage" values, in bins of width 0.01 over [0, 1].
go_coverage_x = alt.X(
    "Coverage",
    bin=alt.Bin(step=0.01),
    scale=alt.Scale(domain=[0, 1]),
)

alt.Chart(results["go"]).mark_bar().encode(
    x=go_coverage_x,
    y=alt.Y("count()"),
)

In [6]:
# A duplicated gene name, combined with a gene set made up of only that gene,
# produces one entry with a coverage proportion of 2. The x scale stops at 1,
# so we clip the marks to keep that entry from messing up the chart.
reactome_coverage_x = alt.X(
    "Coverage",
    bin=alt.Bin(step=0.01),
    scale=alt.Scale(domain=[0, 1]),
)

alt.Chart(results["reactome"]).mark_bar(clip=True).encode(
    x=reactome_coverage_x,
    y=alt.Y("count()"),
)

## Look at individual groups

In [7]:
# First rows of every (cancer type, group) combination in the GO results. The
# table was sorted when it was loaded, so these are the top-ranked gene sets.
go_group_heads = results["go"].groupby(["cancer_type", "group"]).head()
go_group_heads.set_index(["cancer_type", "group", "Name"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Size,Coverage,TDP.bound,TDP.estimate,SC.adjP,Comp.0.99.adjP
cancer_type,group,Name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
colon,with_both,protein export from nucleus (GO:0006611),29,0.03,1.0,1.0,6.644517e-07,6.644517e-07
colon,with_both,intracellular protein transport (GO:0006886),348,0.02,1.0,1.0,6.644517e-07,4.999214e-02
colon,with_both,nuclear export (GO:0051168),93,0.02,1.0,1.0,6.644517e-07,4.999214e-02
colon,with_both,mitotic nuclear envelope reassembly (GO:0007084),11,0.18,1.0,1.0,2.536441e-06,4.999214e-02
colon,with_both,peptidyl-serine dephosphorylation (GO:0070262),11,0.18,1.0,1.0,2.536441e-06,4.999214e-02
colon,with_gain,Golgi vesicle budding (GO:0048194),10,0.20,1.0,1.0,4.866931e-09,4.996401e-02
colon,with_gain,positive regulation of viral genome replication (GO:0045070),32,0.16,1.0,1.0,4.866931e-09,4.996401e-02
colon,with_gain,regulation of viral genome replication (GO:0045069),64,0.14,1.0,1.0,4.866931e-09,4.996401e-02
colon,with_gain,positive regulation of viral life cycle (GO:1903902),45,0.13,1.0,1.0,4.866931e-09,4.996401e-02
colon,with_gain,modification by symbiont of host morphology or physiology (GO:0044003),8,0.12,1.0,1.0,4.866931e-09,4.866931e-09


## Look at which gene sets are most enriched across cancer types, for each group

In [8]:
def plot_top_ten(
    enrich_file_path,
    expr_file_path,
    gmt_file_path,
    enrichment_group,
    xtitle,
    pathway_min_size,
    min_cancers_enriched,
    sort_col,
    sort_asc,
    cutoff_col,
    cutoff=0.05
):
    """Plot the (up to) ten pathways enriched in the most cancer types for one group.

    Two charts are stacked vertically. The lower bubble chart has one column per
    pathway (ordered by the number of cancer types it is enriched in, then by its
    mean rank) and one row per cancer type; circle size is the inverse of the
    pathway's rank within that cancer type, and circle color is the gene set's
    mean change in expression in tumor compared to normal. The upper chart has
    one circle per pathway, labelled with the pathway's mean rank across cancer
    types and sized by the inverse of that mean rank.

    Parameters:
    enrich_file_path (str): The path to the file with rSEA enrichment analysis results.
    expr_file_path (str): The path to the file with differential expression analysis results.
    gmt_file_path (str): The path to the .gmt file that was used for the rSEA analysis.
    enrichment_group (str): Which CNV event group to look at. Either 'with_gain', 'without_gain',
        'with_loss', 'without_loss', 'with_both', or 'without_both'.
    xtitle (str): The title for the plot.
    pathway_min_size (int): The minimum number of genes a pathway/gene set must have to be included
        in the chart. Pathways that are not found in the .gmt file are excluded.
    min_cancers_enriched (int): The minimum number of cancers a pathway/gene set must be enriched
        in, in order to be included in the chart.
    sort_col (str): Which column in the rSEA results dataframe to use for ranking pathways.
    sort_asc (bool): Whether to rank the sort column in ascending order.
    cutoff_col (str): Which column in the rSEA results to use for a significance cutoff.
    cutoff (float, optional): What cutoff to use when deciding whether to count a pathway as enriched.

    Returns:
    altair.Chart: The chart.
    pandas.DataFrame: The enrichment data for the group that passed the size and cutoff filters,
        with per-cancer ranks and the per-pathway summary columns merged in.
    pandas.DataFrame: All expression data for the group (not only the significant rows), with
        the "simplified_change" column added.
    pandas.DataFrame: The per-pathway summary of the enrichment data (number of cancer types
        enriched in, and mean rank), created in the process of making the chart.
    """

    # Adjusted p-value threshold for calling a protein's change in expression significant.
    expr_alpha = 0.05

    # Read in the expression data, selecting only the data for the specified group
    all_expression_data = pd.read_csv(expr_file_path, sep="\t")
    all_expression_data = all_expression_data[all_expression_data["group"] == enrichment_group]

    # Make a column where all increases are +1 and all decreases
    # are -1, because these are ratioed abundances, so we can't
    # compare magnitudes between genes--we can only compare whether
    # there was a change, and whether it was positive or negative.
    # Non-significant changes get the smaller magnitude MSV. Rows that
    # match none of the conditions (missing change or adj_p) stay NaN.
    change = all_expression_data["change"]
    is_significant = all_expression_data["adj_p"] < expr_alpha
    is_not_significant = all_expression_data["adj_p"] >= expr_alpha

    simplified_change = np.select(
        [
            (change > 0) & is_significant,      # adj p <  0.05 and change > 0 => +1
            (change > 0) & is_not_significant,  # adj p >= 0.05 and change > 0 => +MSV
            change == 0,                        # change == 0                  => 0
            (change < 0) & is_not_significant,  # adj p >= 0.05 and change < 0 => -MSV
            (change < 0) & is_significant,      # adj p <  0.05 and change < 0 => -1
        ],
        [1, MSV, 0, -MSV, -1],
        default=np.nan,
    )
    all_expression_data = all_expression_data.assign(simplified_change=simplified_change)

    # Select just the proteins where we chose to reject the null hypothesis of no change
    # We'll use this later to calculate average change in expression for each pathway
    # NOTE(review): rows with adj_p exactly 0.05 pass this filter although they were
    # given the non-significant magnitude above -- confirm which boundary is intended.
    expression_data = all_expression_data[all_expression_data["adj_p"] <= expr_alpha]

    # Read in the GMT file so we can have a list of which genes are in each pathway, in
    # order to calculate the average change in expression for the pathways. Each line is
    # tab separated: pathway name, a blank unused field, then the genes. Blank lines are
    # skipped.
    with open(gmt_file_path, "r") as fp:
        gene_lists = [line.strip().split("\t") for line in fp if line.strip()]

    # Create a dataframe mapping pathway name to contained genes
    pathway_data = pd.DataFrame({
        "pathway_name": [fields[0] for fields in gene_lists],
        "pathway_genes": [fields[2:] for fields in gene_lists],  # Index 1 is the unused field
    })

    # Read in the enrichment data
    enrichment_data = pd.read_csv(enrich_file_path, sep="\t", index_col=0)
    enrichment_data = enrichment_data.rename(columns={"Name": "pathway_name"})

    # Split the cancer_type_group column, then select the specified group
    # This assumes that there are no underscores in the cancer type names,
    # and that the cancer_type_group column has the format [CANCER]_[GROUP]
    grp_split = enrichment_data["cancer_type_group"].str.split("_", n=1, expand=True)

    enrichment_data = enrichment_data.assign(
        cancer_type=grp_split[0],
        group=grp_split[1]
    ).drop(columns="cancer_type_group")

    enrichment_data = enrichment_data[enrichment_data["group"] == enrichment_group]

    # Merge pathway data into the enrichment data, so we know which genes are in
    # each enriched pathway
    enrichment_data = enrichment_data.merge(
        pathway_data,
        how="left",
        left_on="pathway_name",
        right_on="pathway_name",
        validate="many_to_one"
    )

    # Select enrichment data where pathways meet the minimum size and the
    # p values for them being enriched pass the cutoff. `.str.len()` gives NaN
    # for pathways that had no match in the GMT file, so those rows fail the
    # size comparison and are dropped instead of raising a TypeError.
    gene_counts = enrichment_data["pathway_genes"].str.len()
    enrichment_data = enrichment_data[
        (gene_counts >= pathway_min_size) &
        (enrichment_data[cutoff_col] <= cutoff)
    ]

    # Assign pathway ranks within each cancer type based on the sort_col.
    enrichment_data = enrichment_data.assign(
        cancer_rank=enrichment_data.groupby("cancer_type")[sort_col].rank(ascending=sort_asc)
    ).sort_values(
        by=["cancer_type", "cancer_rank"]
    ).reset_index(drop=True)

    # Make a table with summary info for all pathways: how many cancer types each
    # one is enriched in, and its mean rank across them. sort=False keeps the
    # pathways in order of first appearance before the explicit sort below.
    ranks_by_pathway = enrichment_data.groupby("pathway_name", sort=False)["cancer_rank"]

    enrichment_summary = pd.DataFrame({
        "pathway_times_enriched": ranks_by_pathway.size(),
        "pathway_avg_rank": ranks_by_pathway.mean(),
    }).reset_index().sort_values(
        by=["pathway_times_enriched", "pathway_avg_rank"],
        ascending=[False, True]
    ).reset_index(drop=True)

    # Merge the summary into the original enrichment data
    enrichment_data = enrichment_data.merge(
        enrichment_summary,
        how="outer",
        left_on="pathway_name",
        right_on="pathway_name",
        validate="many_to_one"
    ).sort_values(
        by=["pathway_times_enriched", "pathway_avg_rank", "cancer_type"],
        ascending=[False, True, True]
    )

    # Select top 10 for our plot (fewer if not enough pathways qualify)
    top_ten = enrichment_summary.loc[
        enrichment_summary["pathway_times_enriched"] >= min_cancers_enriched,
        "pathway_name"
    ].head(10)

    sel_enrich = enrichment_data[enrichment_data["pathway_name"].isin(top_ten)]

    # Calculate the mean expression for each pathway in each cancer type,
    # using only the significantly changed proteins
    mean_exprs = [
        expression_data.loc[
            expression_data["protein_str"].isin(genes) &
            (expression_data["cancer_type"] == cancer_type),
            "simplified_change"
        ].mean()
        for genes, cancer_type in zip(sel_enrich["pathway_genes"], sel_enrich["cancer_type"])
    ]

    sel_enrich = sel_enrich.assign(mean_expr=mean_exprs)

    # Build one x-axis label per pathway for the upper plot from its rounded mean
    # rank. Pathways whose rounded ranks collide get leading spaces prepended, so
    # every pathway keeps its own position on the axis. Labels are assigned per
    # pathway (not per row), so a pathway has the same label in every cancer type.
    one_row_per_pathway = sel_enrich.drop_duplicates(subset="pathway_name", keep="first")
    rounded_ranks = one_row_per_pathway["pathway_avg_rank"].\
        apply(lambda x: round(x, 2)).\
        astype(str)

    label_for_pathway = {}
    for pathway_name, label in zip(one_row_per_pathway["pathway_name"], rounded_ranks):
        while label in label_for_pathway.values():
            label = " " + label
        label_for_pathway[pathway_name] = label

    # Circle sizes are inverse ranks, so that better (lower) ranks get bigger circles
    sel_enrich = sel_enrich.assign(
        rank_size=1 / sel_enrich["cancer_rank"],
        avg_rank_size=1 / sel_enrich["pathway_avg_rank"],
        avg_rank_label=sel_enrich["pathway_name"].map(label_for_pathway)
    )

    # Both x axes follow the row order of sel_enrich: most widely enriched
    # pathways first, then best mean rank
    pathway_order = list(label_for_pathway.keys())
    label_order = list(label_for_pathway.values())

    individual = alt.Chart(sel_enrich).mark_circle().encode(
        x=alt.X(
            "pathway_name:N",
            sort=pathway_order,
            axis=alt.Axis(
                labelAngle=-30,
                labelFontSize=12,
                labelLimit=500,
                title="",
                titleFontSize=16
            )
        ),
        y=alt.Y(
            "cancer_type:N",
            axis=alt.Axis(
                title="Cancer type",
                titleFontSize=12
            ),
        ),
        size=alt.Size(
            "rank_size:Q",
            legend=None
        ),
        color=alt.Color(
            "mean_expr:Q",
            scale=alt.Scale(
                scheme="blueorange",
                domain=[-1, 1]
            ),
            legend=alt.Legend(
                title="Pathway tumor expression"
            )
        )
    ).properties(
        width=400,
        height=300
    )

    aggregate = alt.Chart(sel_enrich).mark_circle().encode(
        x=alt.X(
            "avg_rank_label:N",
            sort=label_order,
            axis=alt.Axis(
                labelAngle=-30,
                labelFontSize=12,
                labelLimit=500,
                title="Overall rank of pathway",
                titleFontSize=12
            )
        ),
        size=alt.Size(
            "avg_rank_size:Q",
            legend=None
        ),
    ).properties(
        width=400
    )

    full_plot = alt.vconcat(
        aggregate, individual
    ).properties(
        title=xtitle
    )

    return full_plot, enrichment_data, all_expression_data, enrichment_summary

In [9]:
# The six CNV event groups, in the order their charts are stacked.
all_groups = [
    "with_gain",
    "without_gain",
    "with_loss",
    "without_loss",
    "with_both",
    "without_both",
]

# Settings shared by every Reactome chart: rank and filter on Comp.0.99.adjP,
# and keep pathways of at least 5 genes that are enriched in at least 5 cancers.
reactome_plot_settings = dict(
    enrich_file_path=reactome_results_file,
    expr_file_path=diff_expr_file,
    gmt_file_path=reactome_gmt,
    pathway_min_size=5,
    min_cancers_enriched=5,
    sort_col="Comp.0.99.adjP",
    sort_asc=True,
    cutoff_col="Comp.0.99.adjP",
    cutoff=0.05,
)

# One chart per group; plot_top_ten also returns its dataframes, which we drop here.
reactome_plots = [
    plot_top_ten(
        enrichment_group=group_name,
        xtitle=f"Reactome {group_name} data - rSEA, threshold = 0.99, sort by Comp_adjP, cutoff Comp_adjP",
        **reactome_plot_settings,
    )[0]
    for group_name in all_groups
]

In [10]:
# Stack the per-group Reactome charts vertically and turn on axis grid lines.
alt.vconcat(*reactome_plots).configure_axis(grid=True)

In [11]:
# The six CNV event groups, in the order their charts are stacked.
all_groups = [
    "with_gain",
    "without_gain",
    "with_loss",
    "without_loss",
    "with_both",
    "without_both",
]

# Settings shared by every GO chart: rank and filter on Comp.0.99.adjP,
# and keep gene sets of at least 5 genes that are enriched in at least 5 cancers.
go_plot_settings = dict(
    enrich_file_path=go_results_file,
    expr_file_path=diff_expr_file,
    gmt_file_path=go_gmt,
    pathway_min_size=5,
    min_cancers_enriched=5,
    sort_col="Comp.0.99.adjP",
    sort_asc=True,
    cutoff_col="Comp.0.99.adjP",
    cutoff=0.05,
)

# One chart per group; plot_top_ten also returns its dataframes, which we drop here.
go_plots = [
    plot_top_ten(
        enrichment_group=group_name,
        xtitle=f"GO {group_name} data - rSEA, threshold = 0.99, sort by Comp_adjP, cutoff Comp_adjP",
        **go_plot_settings,
    )[0]
    for group_name in all_groups
]

In [12]:
# Stack the per-group GO charts vertically and turn on axis grid lines.
alt.vconcat(*go_plots).configure_axis(grid=True)

## Look at which gene sets are most enriched for each cancer type in each group, and see if there are similarities

In [13]:
def plot_single_cancer(
    enrich_file_path,
    expr_file_path,
    gmt_file_path,
    plot_cancer_type,
    enrichment_group,
    pathway_min_size,
    sort_col,
    sort_asc,
    cutoff_col,
    cutoff=0.05
):
    """Make a one-row bubble chart of the (up to) ten top-ranked pathways for one
    cancer type and group.

    The x axis is pathway, ordered by rank within the cancer type; the y axis is
    cancer type (only one value); circle size is the inverse of the pathway's
    enrichment rank; and circle color is the gene set's mean change in expression
    in tumor compared to normal. A second, independent x axis shows each
    pathway's rank.

    Parameters:
    enrich_file_path (str): The path to the file with rSEA enrichment analysis results.
    expr_file_path (str): The path to the file with differential expression analysis results.
    gmt_file_path (str): The path to the .gmt file that was used for the rSEA analysis.
    plot_cancer_type (str): Which cancer type to make the plot for.
    enrichment_group (str): Which CNV event group to look at. Either 'with_gain', 'without_gain',
        'with_loss', 'without_loss', 'with_both', or 'without_both'.
    pathway_min_size (int): The minimum number of genes a pathway/gene set must have to be included
        in the chart. Pathways that are not found in the .gmt file are excluded.
    sort_col (str): Which column in the rSEA results dataframe to use for ranking pathways.
    sort_asc (bool): Whether to rank the sort column in ascending order.
    cutoff_col (str): Which column in the rSEA results to use for a significance cutoff.
    cutoff (float, optional): What cutoff to use when deciding whether to count a pathway as enriched.

    Returns:
    altair.Chart: The chart.
    pandas.DataFrame: The enrichment data for the cancer type and group that passed the size and
        cutoff filters, with ranks assigned.
    pandas.DataFrame: All expression data for the cancer type and group (not only the significant
        rows), with the "simplified_change" column added.
    pandas.DataFrame: The top rows of the enrichment data that were actually plotted, with the
        columns computed for the chart (mean expression, circle size, rank label).
    """

    # Adjusted p-value threshold for calling a protein's change in expression significant.
    expr_alpha = 0.05

    # Read in the expression data, selecting only the data for the specified
    # cancer type and group
    all_expression_data = pd.read_csv(expr_file_path, sep="\t")

    all_expression_data = all_expression_data[
        (all_expression_data["cancer_type"] == plot_cancer_type) &
        (all_expression_data["group"] == enrichment_group)
    ]

    # Make a column where all increases are +1 and all decreases
    # are -1, because these are ratioed abundances, so we can't
    # compare magnitudes between genes--we can only compare whether
    # there was a change, and whether it was positive or negative.
    # Non-significant changes get the smaller magnitude MSV. Rows that
    # match none of the conditions (missing change or adj_p) stay NaN.
    change = all_expression_data["change"]
    is_significant = all_expression_data["adj_p"] < expr_alpha
    is_not_significant = all_expression_data["adj_p"] >= expr_alpha

    simplified_change = np.select(
        [
            (change > 0) & is_significant,      # adj p <  0.05 and change > 0 => +1
            (change > 0) & is_not_significant,  # adj p >= 0.05 and change > 0 => +MSV
            change == 0,                        # change == 0                  => 0
            (change < 0) & is_not_significant,  # adj p >= 0.05 and change < 0 => -MSV
            (change < 0) & is_significant,      # adj p <  0.05 and change < 0 => -1
        ],
        [1, MSV, 0, -MSV, -1],
        default=np.nan,
    )
    all_expression_data = all_expression_data.assign(simplified_change=simplified_change)

    # Select just the proteins where we chose to reject the null hypothesis of no change
    # We'll use this later to calculate average change in expression for each pathway
    # NOTE(review): rows with adj_p exactly 0.05 pass this filter although they were
    # given the non-significant magnitude above -- confirm which boundary is intended.
    expression_data = all_expression_data[all_expression_data["adj_p"] <= expr_alpha]

    # Read in the GMT file so we can have a list of which genes are in each pathway, in
    # order to calculate the average change in expression for the pathways. Each line is
    # tab separated: pathway name, a blank unused field, then the genes. Blank lines are
    # skipped.
    with open(gmt_file_path, "r") as fp:
        gene_lists = [line.strip().split("\t") for line in fp if line.strip()]

    # Create a dataframe mapping pathway name to contained genes
    pathway_data = pd.DataFrame({
        "pathway_name": [fields[0] for fields in gene_lists],
        "pathway_genes": [fields[2:] for fields in gene_lists],  # Index 1 is the unused field
    })

    # Read in the enrichment data
    enrichment_data = pd.read_csv(enrich_file_path, sep="\t", index_col=0)
    enrichment_data = enrichment_data.rename(columns={"Name": "pathway_name"})

    # Split the cancer_type_group column, then select the specified cancer type and group
    # This assumes that there are no underscores in the cancer type names,
    # and that the cancer_type_group column has the format [CANCER]_[GROUP]
    grp_split = enrichment_data["cancer_type_group"].str.split("_", n=1, expand=True)

    enrichment_data = enrichment_data.assign(
        cancer_type=grp_split[0],
        group=grp_split[1]
    ).drop(columns="cancer_type_group")

    enrichment_data = enrichment_data[
        (enrichment_data["cancer_type"] == plot_cancer_type) &
        (enrichment_data["group"] == enrichment_group)
    ]

    # Merge pathway data into the enrichment data, so we know which genes are in
    # each enriched pathway
    enrichment_data = enrichment_data.merge(
        pathway_data,
        how="left",
        left_on="pathway_name",
        right_on="pathway_name",
        validate="many_to_one"
    )

    # Select enrichment data where pathways meet the minimum size and the
    # p values for them being enriched pass the cutoff. `.str.len()` gives NaN
    # for pathways that had no match in the GMT file, so those rows fail the
    # size comparison and are dropped instead of raising a TypeError.
    gene_counts = enrichment_data["pathway_genes"].str.len()
    enrichment_data = enrichment_data[
        (gene_counts >= pathway_min_size) &
        (enrichment_data[cutoff_col] <= cutoff)
    ]

    # Assign pathway ranks within the cancer type based on the sort_col.
    enrichment_data = enrichment_data.assign(
        cancer_rank=enrichment_data.groupby("cancer_type")[sort_col].rank(ascending=sort_asc)
    ).sort_values(
        by=["cancer_type", "cancer_rank"]
    ).reset_index(drop=True)

    # Select top 10 for our plot (fewer if not enough pathways passed the filters)
    sel_enrich = enrichment_data.head(10)

    # Calculate the mean expression for each pathway, using only the significantly
    # changed proteins. The expression data was already restricted to
    # plot_cancer_type above, so no further cancer type filter is needed.
    mean_exprs = [
        expression_data.loc[
            expression_data["protein_str"].isin(genes),
            "simplified_change"
        ].mean()
        for genes in sel_enrich["pathway_genes"]
    ]

    sel_enrich = sel_enrich.assign(mean_expr=mean_exprs)

    # Circle sizes are inverse ranks, so that better (lower) ranks get bigger circles
    sel_enrich = sel_enrich.assign(
        rank_size=1 / sel_enrich["cancer_rank"],
        rank_label=sel_enrich["cancer_rank"].apply(lambda x: round(x, 2)).astype(str))

    # Tied pathways share a rank and therefore a label. Each pass prepends a space
    # to every label that repeats an earlier one, so after enough passes all labels
    # are distinct and every pathway keeps its own position on the rank axis. At
    # most ten rows are plotted, so ten passes are always enough.
    for _ in range(10):
        sel_enrich["rank_label"] = sel_enrich["rank_label"].where(
            cond=~(sel_enrich.duplicated(subset=["cancer_type", "rank_label"], keep="first")),
            other=" " + sel_enrich["rank_label"])

    # Make our plots! Both layers share the y, size and color encodings.
    base = alt.Chart(sel_enrich).encode(
        y=alt.Y(
            "cancer_type:N",
            axis=alt.Axis(
                title=None
            ),
        ),
        size=alt.Size(
            "rank_size:Q",
            legend=None
        ),
        color=alt.Color(
            "mean_expr:Q",
            scale=alt.Scale(
                scheme="blueorange",
                domain=[-1, 1]
            ),
            legend=alt.Legend(
                title="Tumor expression"
            )
        )
    )

    bubbles = base.mark_circle().encode(
        x=alt.X(
            "pathway_name:N",
            sort=list(sel_enrich["pathway_name"]),
            axis=alt.Axis(
                labelAngle=-30,
                labelFontSize=12,
                labelLimit=500,
                title="",
                titleFontSize=16
            )
        )
    )

    # Same circles again, but positioned by rank label, to get a second x axis
    # that shows each pathway's rank. The sort order must list values of the
    # encoded field (rank_label), in the same row order as the pathway axis.
    numbers = base.mark_circle().encode(
        x=alt.X(
            "rank_label:N",
            sort=list(sel_enrich["rank_label"]),
            axis=alt.Axis(
                labelAngle=0,
                labelFontSize=12,
                labelLimit=500,
                labelPadding=15,
                title="Pathway rank",
                titleFontSize=10
            )
        )
    )

    chart = alt.layer(bubbles, numbers).resolve_scale(
        x="independent"
    ).properties(
        width=400,
        height=30
    )

    return chart, enrichment_data, all_expression_data, sel_enrich

In [14]:
def group_chart_by_cancer(group):
    """Stack the Reactome single-cancer charts for one CNV event group.

    Calls plot_single_cancer once per cancer type, ranking and filtering on
    "Comp.0.99.adjP" (cutoff 0.05) with a minimum pathway size of 5, and
    concatenates the resulting charts vertically.

    Parameters:
    group (str): Which CNV event group to plot, e.g. 'with_gain'.

    Returns:
    The vertically concatenated Altair chart, with a centered title.
    """
    # Currently switched off: brca, ccrcc, endometrial, gbm
    cancer_types = ["colon", "hnscc", "lscc", "luad", "ovarian"]

    charts = []
    for cancer_type in cancer_types:
        # plot_single_cancer also returns its dataframes; only the chart is needed here
        chart, *_ = plot_single_cancer(
            enrich_file_path=reactome_results_file,
            expr_file_path=diff_expr_file,
            gmt_file_path=reactome_gmt,
            plot_cancer_type=cancer_type,
            enrichment_group=group,
            pathway_min_size=5,
            sort_col="Comp.0.99.adjP",
            sort_asc=True,
            cutoff_col="Comp.0.99.adjP",
            cutoff=0.05,
        )
        charts.append(chart)

    stacked = alt.vconcat(*charts)
    titled = stacked.properties(title=f"Reactome data: {group} group")

    return titled.configure_title(anchor="middle")

In [15]:
# Reactome single-cancer charts for the "with_gain" group.
group_chart_by_cancer("with_gain")

In [16]:
# Reactome single-cancer charts for the "without_gain" group.
group_chart_by_cancer("without_gain")

In [17]:
# Reactome single-cancer charts for the "with_loss" group.
group_chart_by_cancer("with_loss")

In [18]:
# Reactome single-cancer charts for the "without_loss" group.
group_chart_by_cancer("without_loss")

In [19]:
# Reactome single-cancer charts for the "with_both" group.
group_chart_by_cancer("with_both")

In [20]:
# Reactome single-cancer charts for the "without_both" group.
group_chart_by_cancer("without_both")