# Looking at which proteins are in the same bins across cancer types

1. Load in the groups' unique differential expression results.
2. For each group, see which proteins are consistently included across cancer types.

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import cnvutils
import altair as alt

In [2]:
# Directory holding the grouped differential expression outputs, and the
# gzipped, tab-separated table of each group's unique results inside it.
res_dir = "results_01_grouped_expr"
unique_res_path = os.path.join(res_dir, "unique_results.tsv.gz")

unique_res = pd.read_csv(
    filepath_or_buffer=unique_res_path,
    delimiter="\t",
)

## Get gene location data

In [3]:
# Build the gene location lookup in one pass:
#   1. reset the index and keep only the gene name, chromosome, and arm
#      (presumably "Name" lives in the index of the table cnvutils returns),
#   2. drop genes with no chromosome assigned,
#   3. drop exact duplicate rows, keeping the first occurrence,
#   4. drop rows whose chromosome label ends in "MT" or ".1".
# Step 2 must come before step 4 so the string test never sees a missing value.
gene_locs = (
    cnvutils.get_gene_locations()
    .reset_index(drop=False)
    .loc[:, ["Name", "chromosome", "arm"]]
    .loc[lambda locs: locs["chromosome"].notna()]
    .drop_duplicates(keep="first")
    .loc[lambda locs: ~locs["chromosome"].str.endswith(("MT", ".1"))]
)

# Attach chromosome and arm to every differential expression row. The inner
# join drops proteins with no known location, and validate="many_to_one"
# makes pandas raise if any gene name appears more than once in gene_locs.
unique_res = pd.merge(
    left=unique_res,
    right=gene_locs,
    left_on="protein_str",
    right_on="Name",
    how="inner",
    validate="many_to_one",
)

In [4]:
# Keep only the columns used downstream, in this order. Any column not listed
# here (for example the "Name" join key) is dropped.
unique_res = unique_res.loc[
    :,
    [
        "cancer_type",
        "group",
        "protein",
        "change",
        "p_val",
        "t_stat",
        "protein_str",
        "adj_p",
        "chromosome",
        "arm",
    ],
]

## Separate genes inside the events (cis) from genes outside them (trans)

In [5]:
# Cis proteins: rows whose gene lies inside the event that defines the row's
# group.
#   - groups ending in "gain": genes on 8q, the gained arm
#   - groups ending in "loss": genes on 8p, the lost arm
#   - groups ending in "both": every gene on chromosome 8
cis = unique_res.loc[
    (
        unique_res["group"].str.endswith("gain")
        & unique_res["chromosome"].eq("8")
        & unique_res["arm"].eq("q")
    )
    | (
        unique_res["group"].str.endswith("loss")
        & unique_res["chromosome"].eq("8")
        & unique_res["arm"].eq("p")
    )
    | (
        unique_res["group"].str.endswith("both")
        & unique_res["chromosome"].eq("8")
    )
]

In [6]:
# Trans proteins: every row that is NOT inside the event defining its group.
#
# For the with_gain and without_gain groups, filter out genes located on
# 8q, which is the gained arm.
#
# For the with_loss and without_loss groups, filter out genes located on
# 8p, which is the lost arm.
#
# For the with_both and without_both groups, filter out all genes on
# chromosome 8.
#
# The three conditions are OR-ed together and negated ONCE. Negating each
# condition and OR-ing the results would be equivalent to
# ~(gain_cis & loss_cis & both_cis); a group name cannot end in "gain",
# "loss", and "both" at the same time, so that mask is always True and no
# row would be removed.
trans = unique_res[
    ~(
        (
            unique_res["group"].str.endswith("gain") &
            (unique_res["chromosome"] == "8") &
            (unique_res["arm"] == "q")
        ) |
        (
            unique_res["group"].str.endswith("loss") &
            (unique_res["chromosome"] == "8") &
            (unique_res["arm"] == "p")
        ) |
        (
            unique_res["group"].str.endswith("both") &
            (unique_res["chromosome"] == "8")
        )
    )
]

## Summarize which proteins are different in multiple groups

In [7]:
def make_summaries(prots, chart_title):
    """Count proteins shared by each subset of cancer types and chart the counts.

    For every (group, protein) pair, the distinct cancer types in which the
    pair appears are collected into a sorted list. Pairs that appear in more
    than one cancer type are then tallied by their exact list of cancer types.

    Parameters
    ----------
    prots : pandas.DataFrame
        Table with at least the columns "cancer_type", "group", and
        "protein_str". Other columns are ignored.
    chart_title : str
        Title shown on the bar chart.

    Returns
    -------
    chart : altair.Chart
        Bar chart with one bar per subset of cancer types; bar height is the
        number of (group, protein) pairs found in exactly that subset, and
        color encodes the subset size.
    chart_df : pandas.DataFrame
        The data behind the chart, with columns "included_cancers" (sorted
        list of cancer types), "protein_count", "group_size" (length of the
        list), and "included_str" (the list joined with ", "). Rows are
        ordered by "group_size" descending, then alphabetically.
    """
    prots = prots[["cancer_type", "group", "protein_str"]]

    # One row per (group, protein), holding the sorted list of cancer types
    # in which that pair occurs.
    prots_bin_summary = prots.\
    groupby(["group", "protein_str"]).\
    agg(**{"cancers": ("cancer_type", lambda x: x.drop_duplicates().sort_values().tolist())})

    prots_bin_summary = prots_bin_summary.\
    assign(cancer_count=prots_bin_summary["cancers"].apply(len)).\
    sort_values(by=["cancer_count", "group", "protein_str"], ascending=[False, True, True]).\
    reset_index(drop=False)

    # Only pairs seen in at least two cancer types are of interest.
    shared = prots_bin_summary.loc[prots_bin_summary["cancer_count"] > 1, "cancers"]

    # Tally identical cancer lists. Lists are unhashable, so a tuple is used
    # as the key. The output columns are then built explicitly, because the
    # column names produced by value_counts().reset_index() differ between
    # pandas versions.
    subset_counts = {}
    for cancers in shared:
        key = tuple(cancers)
        subset_counts[key] = subset_counts.get(key, 0) + 1

    chart_df = pd.DataFrame({
        "included_cancers": pd.Series([list(key) for key in subset_counts], dtype=object),
        "protein_count": pd.Series(list(subset_counts.values()), dtype="int64"),
    })

    # tmp_sort is a throwaway key giving alphabetical order within each
    # subset size; it is dropped once the rows are sorted.
    chart_df = chart_df.\
    assign(
        group_size=chart_df["included_cancers"].apply(len),
        included_str=chart_df["included_cancers"].apply(lambda x: ", ".join(x)),
        tmp_sort=chart_df["included_cancers"].apply(lambda x: "".join(x))
    ).\
    sort_values(by=["group_size", "tmp_sort"], ascending=[False, True]).\
    drop(columns="tmp_sort")

    chart = alt.Chart(chart_df).mark_bar().encode(
        x=alt.X(
            "included_str",
            # Keep the bars in the row order established above rather than
            # altair's default ordering.
            sort=chart_df["included_str"].tolist(),
            axis=alt.Axis(
                title="Subset of cancers"
            )
        ),
        y=alt.Y(
            "protein_count",
            axis=alt.Axis(
                title="Number of proteins diff. expressed in all subset members"
            )
        ),
        color=alt.Color(
            "group_size:O"
        )
    ).properties(
        title=chart_title
    )

    return chart, chart_df

### Trans

In [8]:
# Chart and table of cancer-type subsets for the trans proteins.
trans_cht, trans_summary = make_summaries(
    prots=trans,
    chart_title="Trans proteins",
)

### Cis

In [9]:
# Chart and table of cancer-type subsets for the cis proteins.
cis_cht, cis_summary = make_summaries(
    prots=cis,
    chart_title="Cis proteins",
)

In [10]:
# Stack the trans chart above the cis chart and give both the same y scale,
# so bar heights can be compared directly between the two.
alt.vconcat(trans_cht, cis_cht).resolve_scale(y="shared")

## Save results

In [11]:
output_dir = "results_04_expr_across_cancers"

# Create the output directory if it does not exist yet. exist_ok=True folds
# the existence check and the creation into one call, so there is no gap
# between the two in which the directory could appear, and re-running the
# cell is harmless.
os.makedirs(output_dir, exist_ok=True)

### Save the full tables

In [12]:
# Label every row with its protein group and stack the two tables.
# assign() returns new frames, so trans and cis are left untouched and this
# cell can be re-run safely; an in-place insert() raises once the column
# already exists. pd.concat is used because DataFrame.append was removed in
# pandas 2.0.
combined_full = pd.concat([
    trans.assign(protein_group="trans"),
    cis.assign(protein_group="cis"),
])

# Put the label in the first column of the saved table.
combined_full = combined_full[
    ["protein_group"]
    + [col for col in combined_full.columns if col != "protein_group"]
]

In [13]:
# Write the labelled trans + cis table as a gzipped TSV, without the index.
combined_full_path = os.path.join(output_dir, "combined_full.tsv.gz")
combined_full.to_csv(
    path_or_buf=combined_full_path,
    sep="\t",
    index=False,
    compression="gzip",
)

### Save the summaries

In [14]:
# Combine the two summary tables into one, to make saving it easier
trans_summary.insert(0, "protein_group", "trans")
cis_summary.insert(0, "protein_group", "cis")

combined_summaries = trans_summary.append(cis_summary)

In [15]:
# Write the labelled trans + cis summary table as a gzipped TSV, without the
# index.
combined_summaries_path = os.path.join(output_dir, "combined_summaries.tsv.gz")
combined_summaries.to_csv(
    path_or_buf=combined_summaries_path,
    sep="\t",
    index=False,
    compression="gzip",
)

## Thoughts on with vs. without event categories

Open question: the categories without events are closer to normal, so is it natural that they have more of the same proteins differentially expressed across cancer types?

## Plan for tonight

- Make Venn diagrams showing how many proteins were shared between cancer types, in each group
    - Bar plot alternative:
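        - We have 5 cancer types (possibly more later), so there are $\binom{5}{5} + \binom{5}{4} + \binom{5}{3} + \binom{5}{2} = 1 + 5 + 10 + 10 = 26$ possible combinations. This excludes the 5 combinations where a protein is included in only one cancer type and the 1 combination where it is included in none.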
- SEA for proteins common between many cancers
- Adapt trans effects code to do t-tests of cis effects
- Rough draft of presentation for next week