# Commonly Different Proteins

Finds proteins within the event that are commonly different between samples with and without the event.

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
# Event parameters -- these are the only values you need to change to run
# this notebook for a different event.

CHROMOSOME = '8' #Example: '8' (must be string)
# The arm must either be 'p' or 'q'
ARM = 'p'
# Options: 'cis' or 'trans'
TRANS_OR_CIS = 'cis'

# Fail fast on a mistyped setting. These values are interpolated into the
# input and output file names, so a typo would otherwise only surface later
# as a confusing missing-file error.
if ARM not in ('p', 'q'):
    raise ValueError(f"ARM must be 'p' or 'q', got {ARM!r}")
if TRANS_OR_CIS not in ('cis', 'trans'):
    raise ValueError(f"TRANS_OR_CIS must be 'cis' or 'trans', got {TRANS_OR_CIS!r}")

## Prepare the dataframe

In [3]:
# The input file name is derived from the event settings.
ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv"

# Read the tab-separated results and index the table by protein name.
ttest_results = (
    pd.read_csv(ttest_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [4]:
# Columns are named "<cancer type>_<field>", so the text before the first
# underscore identifies the cancer type.
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Final column layout shared by every per-cancer table.
long_columns = ["cancer_type", "protein", "Database_ID", "adj_p", "change"]

# Collect one long-format table per cancer type and concatenate once at the
# end. DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies all accumulated rows on every iteration.
cancer_dfs = []

for cancer_type in cancer_types:
    # Match on "<cancer type>_" rather than the bare name so that a cancer
    # type whose name is a prefix of another one does not pick up the other's
    # columns.
    cancer_df = (
        ttest_results
        .loc[:, ttest_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = cancer_df.rename(columns={
        f"{cancer_type}_pvalue": "adj_p",
        f"{cancer_type}_diff": "change"
    }).assign(cancer_type=cancer_type)

    # Reorder the columns
    cancer_dfs.append(cancer_df[long_columns])

# pd.concat raises on an empty list, so fall back to an empty table that
# still has the expected columns.
if cancer_dfs:
    long_results = pd.concat(cancer_dfs)
else:
    long_results = pd.DataFrame(columns=long_columns)

# Drop rows that have an exact duplicate and reset the index. keep=False
# flags every copy of a duplicated row, so all copies are removed.
# NOTE(review): confirm that discarding all copies (rather than keeping one)
# is intended.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

## Select proteins with significant change

In [5]:
# Cutoff applied to the adj_p column. Rows whose adj_p is NaN satisfy neither
# comparison below and are therefore left out of both groups.
significance_cutoff = 0.05

# Rows at or below the cutoff, and the number of proteins per cancer type.
prots = long_results[long_results["adj_p"] <= significance_cutoff].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

# Rows above the cutoff, counted the same way.
fail_prots = long_results[long_results["adj_p"] > significance_cutoff].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

# Label each count table so the two can be told apart once stacked.
prots_cts.insert(0, "count_type", "Significant difference")
fail_cts.insert(0, "count_type", "No significant difference")

# Stack both tables (pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0), group the rows by cancer type, and turn the cancer_type index
# back into a regular column for plotting.
counts = pd.concat([prots_cts, fail_cts]).sort_index().reset_index(drop=False)

## Make Plots

In [6]:
# Bar chart of protein counts: one bar per count type, colored by count type,
# with the "Significant difference" group ordered first.
count_type_order = ["Significant difference"]

count_bars = alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(title=None, labels=False),
        sort=count_type_order
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(title="Number of proteins")
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=count_type_order,
        scale=alt.Scale(
            domain=["Significant difference", "No significant difference"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
)

# One panel (column) per cancer type, with a centered overall title. The
# chart is the last expression of the cell so that it is displayed.
count_bars.facet(
    column=alt.Column("cancer_type", title=None)
).properties(
    title=f"Chr {CHROMOSOME}{ARM} {TRANS_OR_CIS} effects"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was different in

In [7]:
def make_simple_change(change_val):
    """Reduce a change value to its direction.

    Returns 1 for a positive value, -1 for a negative value and 0 for zero.
    A value that is neither greater than, less than, nor equal to zero
    (for example NaN) yields None.
    """
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1
    return 0 if change_val == 0 else None

# Direction of change for every row of prots, as given by make_simple_change.
simplified_change = prots["change"].apply(make_simple_change)
prots = prots.assign(simplified_change=simplified_change)

In [8]:
# For each protein, list the distinct cancer types it appears in (sorted) and
# average its simplified change values. The aggregation is named by string
# ("mean") because passing np.mean to groupby.agg is deprecated in recent
# pandas releases; the result is the same.
prots_summary = prots.groupby("protein").agg(
    cancers=("cancer_type", lambda x: sorted(x.unique())),
    mean_simp_change=("simplified_change", "mean"),
)

# Order proteins by how many cancer types they appear in (most first). Ties
# are broken alphabetically on the concatenated cancer names, held in a
# temporary column that is dropped after sorting.
prots_summary = (
    prots_summary
    .assign(
        num_cancers=prots_summary["cancers"].apply(len),
        tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATP6V1B2,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
CCDC25,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
CHMP7,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
KIF13B,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
PPP2R2A,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
VPS37A,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
XPO7,"[brca, colon, hnscc, lscc, luad, ovarian]",-1.0,6
DOCK5,"[brca, colon, hnscc, lscc, luad]",-1.0,5
PCM1,"[brca, colon, hnscc, lscc, luad]",-1.0,5
AGPAT5,"[brca, colon, hnscc, luad, ovarian]",-1.0,5


In [9]:
# Tally how many proteins fall under each num_cancers value, listed from the
# largest num_cancers value down to the smallest.
num_cancers_tally = prots_summary["num_cancers"].value_counts()
num_cancers_tally.sort_index(ascending=False)

6     7
5     5
4     5
3     7
2    11
1    15
Name: num_cancers, dtype: int64

## Save Results

In [10]:
# Flatten each list of cancer types into one underscore-separated string
# before writing the table out. Values that are already strings are left
# untouched, so re-running this cell does not join the individual characters
# of an already-flattened value.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(
        lambda x: x if isinstance(x, str) else "_".join(x)
    )
)

# Write the summary as a tab-separated file named after the event settings.
output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")

## Research common proteins

You should now have a list of interesting proteins, so it's time to do some research. Record here your findings on each of the interesting genes you found. What are these genes involved in?