# Compare the distributions of proteins classified as different, equivalent, or ambiguous between samples with and without the event

This is just a sanity check on the classifications.

In [1]:
import pandas as pd
import numpy as np
import cnvutils as ut
import altair as alt
import cptac
import cptac.utils as ut

# Lift Altair's limit on the number of rows a chart's data may contain; the
# melted expression tables plotted below can be large.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# These three values are combined to build the names of the result files
# that are read in below.
CHROMOSOME = "8"
# Must be "p" or "q"; any other value raises a ValueError in the next cell.
ARM = "p"
CIS_OR_TRANS = "cis"

In [3]:
# Pick which event column to analyze and which one to discard, based on the
# arm: "p" looks at losses, "q" looks at gains.
if ARM not in ("p", "q"):
    raise ValueError("Invalid value for ARM variable.")

EVENT_COLUMN, EXCLUDE_COLUMN = (
    ("loss_event", "gain_event") if ARM == "p" else ("gain_event", "loss_event")
)

## Read in data tables and reformat

In [4]:
# Load the two tab-separated result tables, keyed by the kind of test.
result_tables = {
    "equiv": pd.read_csv(f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_equiv.tsv", sep="\t"),
    "diff": pd.read_csv(f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}effects_ttest.tsv", sep="\t"),
}

In [5]:
# Reshape each wide results table (one row per protein, one group of
# "<cancer type>_..." columns per cancer type) into a long table with one row
# per cancer type/protein combination, then store it back in result_tables.
for df_name in result_tables.keys():
    
    res_df = result_tables[df_name].\
    rename(columns={"Name": "protein"}).\
    set_index("protein")
    
    # The cancer type is whatever precedes the first underscore in a column name
    cancer_types = sorted(res_df.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

    # Collect the per-cancer tables and concatenate them once at the end.
    # (DataFrame.append was removed in pandas 2.0.)
    cancer_dfs = []

    for cancer_type in cancer_types:
        # Keep only this cancer type's columns, and only the proteins that
        # have at least one value in them
        cancer_df = res_df.\
        loc[:, res_df.columns.str.startswith(cancer_type)].\
        dropna(axis="index", how="all").\
        reset_index(drop=False)

        # If the cancer type has database IDs, make a separate column that has them.
        # If not, create a column of NaNs (so that the tables all match)
        if f"{cancer_type}_Database_ID" in cancer_df.columns:
            cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
        else:
            cancer_df = cancer_df.assign(Database_ID=np.nan)

        # Rename the pvalue and diff columns to not have the cancer type.
        # Setting and immediately resetting the index is a way of moving the
        # three identifier columns to the front.
        cancer_df = cancer_df.rename(columns={
            f"{cancer_type}_pvalue": "adj_p",
            f"{cancer_type}_diff": "change"
        }).\
        assign(cancer_type=cancer_type).\
        set_index(["cancer_type", "protein", "Database_ID"]).\
        reset_index(drop=False)

        cancer_dfs.append(cancer_df)

    # pd.concat raises on an empty list, so fall back to an empty table
    long_results = pd.concat(cancer_dfs) if cancer_dfs else pd.DataFrame()

    # Remove rows that occur more than once and reset the index. Note that
    # keep=False removes every copy of such a row, not just the extra copies.
    long_results = long_results[~long_results.duplicated(keep=False)].\
    reset_index(drop=True)
    
    # Save
    result_tables[df_name] = long_results
    
diff_df = result_tables["diff"]
equiv_df = result_tables["equiv"]

## Make lists of different protein classes

Because our classification tests aren't perfect, there is some overlap, but that's okay.

In [6]:
# (rows, columns) of the long-format "diff" table
diff_df.shape

(381, 5)

In [7]:
# (rows, columns) of the long-format "equiv" table
equiv_df.shape

(381, 4)

In [8]:
# Cancer type/protein pairs whose adjusted p-value is at most 0.05 in each
# table. Rows with a missing p-value are never selected here.
id_cols = ["cancer_type", "protein"]
diff_prots = diff_df.loc[diff_df["adj_p"].le(0.05), id_cols]
equiv_prots = equiv_df.loc[equiv_df["adj_p"].le(0.05), id_cols]

In [9]:
def _failed_pairs(results):
    """Return the (cancer_type, protein) pairs whose adj_p is above 0.05 or missing."""
    not_significant = results["adj_p"].gt(0.05) | results["adj_p"].isna()
    failed = results.loc[not_significant, ["cancer_type", "protein"]]
    return pd.MultiIndex.from_arrays([failed["cancer_type"], failed["protein"]])


equiv_fail = _failed_pairs(equiv_df)
diff_fail = _failed_pairs(diff_df)

# A pair is ambiguous when it passed neither test, i.e. it appears in both
# lists of failures.
in_both = equiv_fail.isin(diff_fail)
ambig_prots = equiv_fail[in_both].to_frame().reset_index(drop=True)

In [10]:
# Total number of rows across the three class tables
sum(len(class_table) for class_table in (ambig_prots, diff_prots, equiv_prots))

383

## For each cancer type, get the expression data and plot it for the different groups

In [11]:
# Cancer type label -> cptac dataset loader. The labels are also used to find
# each cancer type's event file and to filter the protein class tables.
datasets = dict(
    brca=cptac.Brca,
    colon=cptac.Colon,
    hnscc=cptac.Hnscc,
    lscc=cptac.Lscc,
    luad=cptac.Luad,
    ovarian=cptac.Ovarian,
)

In [12]:
def plot_dists(cancer_type, ds, diff, equiv, ambig):
    """Plot protein expression by event status for each protein class.

    Reads the module-level EVENT_COLUMN and EXCLUDE_COLUMN settings, and the
    file "{cancer_type}_has_event.tsv" from the working directory.

    Parameters:
    cancer_type: Label of the cancer type. Used to locate the event file and
        to filter the three class tables.
    ds: Callable that is invoked as ds(no_internet=True) and returns an object
        with a get_proteomics() method.
    diff, equiv, ambig: Dataframes with "cancer_type" and "protein" columns,
        listing the proteins in each class.

    Returns:
    A vertically concatenated altair chart with one row per class that has
    data. Each row holds boxplots of expression split by EVENT_COLUMN, with
    one column of boxplots per protein.
    """
    # Load the dataset
    ds = ds(no_internet=True)

    # Get the proteomics dataframe
    prot = ds.get_proteomics()

    # Read the event table and discard the column for the other event type
    event = pd.read_csv(f'{cancer_type}_has_event.tsv', sep='\t', index_col=0)
    if EXCLUDE_COLUMN:
        event = event.drop(columns=EXCLUDE_COLUMN)

    # Join in the event data, keep only samples that have an event status, and
    # reshape to one row per sample/protein combination
    prot = prot.\
    join(event).\
    dropna(subset=[EVENT_COLUMN]).\
    reset_index().\
    melt(
        id_vars=["Patient_ID", EVENT_COLUMN],
        var_name="protein",
        value_name="expr"
    ).\
    sort_values(by="Patient_ID")

    # If the protein labels are tuples, split them into a name column and a
    # database ID column
    if not prot.empty and isinstance(prot["protein"].iloc[0], tuple):
        # Build the split table on prot's own index. assign() aligns on index
        # labels, and sorting above has left those labels out of positional
        # order, so a fresh 0..n-1 index would pair names with the wrong rows.
        split_prots = pd.DataFrame(prot["protein"].tolist(), index=prot.index)
        prot = prot.assign(
            protein=split_prots[0],
            Database_ID=split_prots[1]
        )

    else:
        prot = prot.assign(Database_ID=np.nan)

    # Select the protein names in each class for this cancer type
    diff_names = diff.loc[diff["cancer_type"] == cancer_type, "protein"]
    equiv_names = equiv.loc[equiv["cancer_type"] == cancer_type, "protein"]
    ambig_names = ambig.loc[ambig["cancer_type"] == cancer_type, "protein"]

    # Select the expression data for each class. Matching is on protein name
    # only; the database ID is not taken into account.
    diff_expr = prot[prot["protein"].isin(diff_names)].assign(group="diff")
    equiv_expr = prot[prot["protein"].isin(equiv_names)].assign(group="equiv")
    ambig_expr = prot[prot["protein"].isin(ambig_names)].assign(group="ambig")

    # Combine into one table (DataFrame.append was removed in pandas 2.0)
    groups = pd.concat([diff_expr, equiv_expr, ambig_expr])

    # Make the plot: one row of boxplots per class
    chart = alt.vconcat(*[
        alt.Chart(groups[groups["group"] == group]).mark_boxplot().encode(
            x=alt.X(
                EVENT_COLUMN,
                axis=alt.Axis(
                    labelAngle=-30,
                    title=""
                )
            ),
            y=alt.Y(
                "expr",
                axis=alt.Axis(
                    title=group
                )
            ),
            column=alt.Column(
                "protein",
                title=""
            ),
            color=alt.Color(
                EVENT_COLUMN
            )
        ) for group in groups["group"].unique()
    ])

    return chart

In [13]:
# One titled chart per cancer type, stacked vertically
charts_by_cancer = [
    plot_dists(
        name,
        loader,
        diff_prots,
        equiv_prots,
        ambig_prots,
    ).properties(title=name)
    for name, loader in datasets.items()
]

alt.vconcat(*charts_by_cancer).configure_title(fontSize=16)

                           



                               



                              



                               



                               

