# Compare the distributions of proteins classified as different, equivalent, or ambiguous between samples with and without the event

This is just a sanity check on the classifications.

In [1]:
import os
import pandas as pd
import numpy as np
import cnvutils
import altair as alt
import cptac

# Lift Altair's limit on the number of rows a chart's dataset may contain;
# presumably the long-format expression tables plotted below exceed the default cap.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# These variables specify which chromosome and arm we're working on, and whether to look at cis or trans effects
params = cnvutils.load_params(os.path.join("..", "data", "params.json"))
CHROMOSOME = params["CHROMOSOME"]
ARM = params["ARM"]
CIS_OR_TRANS = "cis"

# Which data source to use. plot_dists reads this flag to pick the event table
# file ("harmonized" when True, "AWG" when False), so it has to be defined here;
# it is also passed to load_tables so both steps use the same source.
PANCAN = False

# Name of the column in the event tables that marks whether a sample has the event
EVENT_COLUMN = "event"

# Get data tables
CANCER_TYPES = params["CANCER_TYPES"]
data_types = ["proteomics"]
tables = cnvutils.load_tables(CANCER_TYPES, data_types, pancan=PANCAN)
proteomics = tables["proteomics"]

                                            

## Read in data tables and reformat

In [3]:
# Load the per-protein results of the difference test (t-test) and the
# equivalence test for the selected arm; both files are tab-separated.
diff_df = pd.read_table(
    os.path.join("..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_ttest.tsv")
)
equiv_df = pd.read_table(
    os.path.join("..", "data", f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_equiv.tsv")
)

## Make lists of different protein classes

Because our classification tests aren't perfect, some proteins end up in more than one class, but that's okay.

In [4]:
# Quick size check: (rows, columns) of the t-test results table
diff_df.shape

(401, 5)

In [5]:
# Quick size check: (rows, columns) of the equivalence-test results table
equiv_df.shape

(400, 4)

In [6]:
# Keep the (cancer type, protein) pairs whose adjusted p-value passes the 0.05
# cutoff in each test. Missing p-values compare as False and are left out.
diff_prots = diff_df[diff_df["adj_p"].le(0.05)][["cancer_type", "protein"]]
equiv_prots = equiv_df[equiv_df["adj_p"].le(0.05)][["cancer_type", "protein"]]

In [7]:
# A pair "fails" a test when its adjusted p-value is above 0.05 or missing.
# `~(p <= 0.05)` covers both cases, because comparing NaN evaluates to False.
equiv_fail = pd.MultiIndex.from_frame(
    equiv_df.loc[~(equiv_df["adj_p"] <= 0.05), ["cancer_type", "protein"]]
)

diff_fail = pd.MultiIndex.from_frame(
    diff_df.loc[~(diff_df["adj_p"] <= 0.05), ["cancer_type", "protein"]]
)

# Ambiguous pairs are the ones that failed both tests
ambig_prots = equiv_fail[equiv_fail.isin(diff_fail)].to_frame(index=False)

In [8]:
# Total number of classified (cancer type, protein) pairs across the three
# classes; a pair that landed in more than one class is counted once per class.
len(ambig_prots) + len(diff_prots) + len(equiv_prots)

402

## For each cancer type, get the expression data and plot it for the different groups

In [9]:
def plot_dists(cancer_type, prots, diff, equiv, ambig, *, pancan=None):
    """Box-plot protein expression by event status for one cancer type.

    One row of box plots is drawn per protein class ("diff", "equiv",
    "ambig"); within a row there is one facet per protein, and each facet
    compares expression between the values of the event column.

    Parameters
    ----------
    cancer_type : str
        Key into `prots`, and the value matched against the "cancer_type"
        column of `diff`, `equiv` and `ambig`.
    prots : mapping of str to pandas.DataFrame
        Proteomics table per cancer type, indexed by sample.
    diff, equiv, ambig : pandas.DataFrame
        Tables with "cancer_type" and "protein" columns listing the
        proteins in each class.
    pancan : bool, optional
        Selects which event table to read: the "harmonized" file when
        true, the "AWG" file otherwise. When omitted, the notebook-level
        `PANCAN` variable is used if it exists, else False.

    Returns
    -------
    altair chart
        Vertical concatenation of one faceted box-plot chart per class
        that has data for this cancer type.
    """
    # The original body read a global PANCAN that the notebook never defines,
    # which raises NameError on a fresh kernel. Honour the global when it is
    # present, but fall back to the AWG tables otherwise.
    if pancan is None:
        pancan = globals().get("PANCAN", False)
    source = "harmonized" if pancan else "AWG"

    # Get the proteomics dataframe
    prot = prots[cancer_type]

    # Drop any multiindex levels
    if prot.index.nlevels > 1:
        prot = prot.droplevel(1, axis=0)

    # Load the per-sample event calls for this cancer type.
    # (A former `event.index.rename('Name')` call was removed: it discarded
    # its result and therefore never had any effect.)
    event = pd.read_csv(
        os.path.join(
            "..",
            "data",
            f"chr{CHROMOSOME}_{cancer_type}_has_event_{source}.tsv",
        ),
        sep="\t",
        index_col=0,
    )

    # Join in the event data, drop samples without an event call, and reshape
    # to long format: one row per (sample, protein) with its expression value.
    prot = (
        prot.join(event)
        .dropna(subset=[EVENT_COLUMN])
        .reset_index()
        .melt(
            id_vars=["Patient_ID", EVENT_COLUMN],
            var_name="protein",
            value_name="expr",
        )
        .sort_values(by="Patient_ID")
    )

    # Column labels may be (protein, database ID) tuples; split them into two
    # columns. Use a positional lookup so an empty table does not raise.
    if not prot.empty and isinstance(prot["protein"].iloc[0], tuple):
        # Reuse prot's index: after sort_values the labels are no longer
        # 0..n-1 in row order, and assign() aligns on labels. A fresh
        # RangeIndex here would attach names to the wrong rows.
        split_prots = pd.DataFrame(prot["protein"].tolist(), index=prot.index)
        prot = prot.assign(
            protein=split_prots[0],
            Database_ID=split_prots[1],
        )
    else:
        prot = prot.assign(Database_ID=np.nan)

    # Select the protein names of each class for this cancer type
    diff_names = diff.loc[diff["cancer_type"] == cancer_type, "protein"]
    equiv_names = equiv.loc[equiv["cancer_type"] == cancer_type, "protein"]
    ambig_names = ambig.loc[ambig["cancer_type"] == cancer_type, "protein"]

    # Select the expression rows for each class and combine them into one
    # table. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    groups = pd.concat([
        prot[prot["protein"].isin(diff_names)].assign(group="diff"),
        prot[prot["protein"].isin(equiv_names)].assign(group="equiv"),
        prot[prot["protein"].isin(ambig_names)].assign(group="ambig"),
    ])

    # Make the plot: one faceted box-plot row per class that has data
    chart = alt.vconcat(*[
        alt.Chart(groups[groups["group"] == group]).mark_boxplot().encode(
            x=alt.X(
                EVENT_COLUMN,
                axis=alt.Axis(
                    labelAngle=-30,
                    title=""
                )
            ),
            y=alt.Y(
                "expr",
                axis=alt.Axis(
                    title=group
                )
            ),
            column=alt.Column(
                "protein",
                title=""
            ),
            color=alt.Color(
                EVENT_COLUMN
            )
        ) for group in groups["group"].unique()
    ])

    return chart

In [11]:
# Stack one titled chart per cancer type into a single figure
alt.vconcat(
    *(
        plot_dists(
            cancer_type,
            proteomics,
            diff_prots,
            equiv_prots,
            ambig_prots,
        ).properties(title=cancer_type)
        for cancer_type in CANCER_TYPES
    )
).configure_title(fontSize=16)