# Find proteins within the 8p loss event (cis) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os
import altair as alt

In [2]:
CHROMOSOME = "8"
ARM = "q"
TRANS_OR_CIS = "cis"

ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv"

ttest_results = pd.\
read_csv(ttest_results_file, sep="\t").\
rename(columns={"Name": "protein"}).\
set_index("protein")

In [3]:
ttest_results

Unnamed: 0_level_0,brca_Database_ID,lscc_Database_ID,luad_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,brca_diff,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ANKRD46,NP_001257308.1|NP_001257307.1,NP_001257308.1|NP_001257306.1,NP_001257308.1|NP_001257306.1,NP_001257307,8.642930e-02,,0.550043,0.267920,2.920110e-01,0.145248,0.532538,,0.121750,0.334030,0.403149,0.463378
ASAP1,NP_060952.2|NP_001234925.1,NP_060952.2|NP_001234925.1|NP_001349854.1|NP_0...,NP_060952.2|NP_001234925.1|NP_001349854.1|NP_0...,NP_001234925,8.228966e-02,6.903566e-01,0.824665,0.918349,6.958844e-01,0.159556,0.248438,0.050545,-0.014282,-0.027294,-0.089313,0.104059
ATAD2,NP_054828.2,NP_054828.2,NP_054828.2|NP_001341036.1|NP_060022.2|NP_0012...,NP_054828,1.448011e-07,1.917050e-01,0.172125,0.012599,2.544005e-03,0.748641,1.633439,0.206485,0.172059,0.911561,1.053695,0.073793
ATP6V1C1,NP_001686.1,NP_001686.1,NP_001686.1,NP_001686,5.767420e-04,5.183022e-01,0.348570,0.555959,6.218521e-01,0.019453,0.507593,0.037578,0.055516,-0.132158,0.113121,0.253735
C8orf37,NP_808880.1,NP_808880.1|NP_001350189.1,NP_808880.1|NP_001350189.1,NP_808880,2.476802e-01,,0.833915,0.037022,1.891116e-01,0.335792,0.275712,,-0.036747,0.577460,0.412627,0.207518
CA1,NP_001158302.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001278897.1,NP_001158302,1.515719e-01,4.275252e-01,0.824665,0.717312,5.435762e-01,0.980384,0.817070,-0.207660,0.045576,0.240336,-0.488641,0.023665
CA1,NP_001158302.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001122301.1|NP_001278896.1,NP_001158302,1.515719e-01,4.275252e-01,0.824665,0.717312,5.621904e-01,0.980384,0.817070,-0.207660,0.045576,0.240336,-0.308282,0.023665
CA1,NP_001158302.1|NP_001278896.1,NP_001278897.1,NP_001278897.1,NP_001158302,1.515719e-01,4.275252e-01,0.824665,0.918349,5.435762e-01,0.980384,0.817070,-0.207660,0.045576,0.085460,-0.488641,0.023665
CA1,NP_001158302.1|NP_001278896.1,NP_001278897.1,NP_001122301.1|NP_001278896.1,NP_001158302,1.515719e-01,4.275252e-01,0.824665,0.918349,5.621904e-01,0.980384,0.817070,-0.207660,0.045576,0.085460,-0.308282,0.023665
CA13,NP_940986.1,NP_940986.1,NP_940986.1,NP_940986,2.476802e-01,5.703720e-02,0.348570,0.552088,2.697870e-01,0.079900,0.393535,0.294875,0.115866,0.278019,0.698682,0.397009


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- p_value

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

long_results = pd.DataFrame()

for cancer_type in cancer_types:
    cancer_df = ttest_results.\
    loc[:, ttest_results.columns.str.startswith(cancer_type)].\
    dropna(axis="index", how="all").\
    reset_index(drop=False)
    
    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)
        
    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = cancer_df.rename(columns={
        f"{cancer_type}_pvalue": "adj_p",
        f"{cancer_type}_diff": "change"
    }).\
    assign(cancer_type=cancer_type)
    
    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]
    
    # Append to the overall dataframe
    long_results = long_results.append(cancer_df)

# Drop duplicate rows and reset the index
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,ANKRD46,NP_001257308.1|NP_001257307.1,8.642930e-02,0.532538
1,brca,ASAP1,NP_060952.2|NP_001234925.1,8.228966e-02,0.248438
2,brca,ATAD2,NP_054828.2,1.448011e-07,1.633439
3,brca,ATP6V1C1,NP_001686.1,5.767420e-04,0.507593
4,brca,C8orf37,NP_808880.1,2.476802e-01,0.275712
5,brca,CA13,NP_940986.1,2.476802e-01,0.393535
6,brca,CA3,NP_005172.1,2.041942e-01,0.723476
7,brca,CHMP4C,NP_689497.1,1.217920e-01,0.423675
8,brca,CNGB3,NP_061971.3,5.669402e-02,-0.686776
9,brca,COL14A1,NP_066933.1,3.459163e-04,-1.834133


## Select the proteins with a significant change, and take a detour to make some plots

In [6]:
prots = long_results[long_results["adj_p"] <= 0.05].reset_index(drop=True)
prots_cts = prots.groupby("cancer_type").count()[["protein"]]

fail_prots = long_results[long_results["adj_p"] > 0.05].reset_index(drop=True)
fail_cts = fail_prots.groupby("cancer_type").count()[["protein"]]

prots_cts.insert(0, "count_type", "Significant difference")
fail_cts.insert(0, "count_type", "No significant difference")

counts = prots_cts.append(fail_cts).sort_index().reset_index(drop=False)

alt.Chart(counts).mark_bar().encode(
    x=alt.X(
        "count_type",
        axis=alt.Axis(
            title=None,
            labels=False
        ),
        sort=["Significant difference"]
    ),
    y=alt.Y(
        "protein",
        axis=alt.Axis(
            title="Number of proteins"
        )
    ),
    color=alt.Color(
        "count_type",
        title=None,
        sort=["Significant difference"],
        scale=alt.Scale(
            domain=["Significant difference", "No significant difference"],
            range=["#2d3da4", "#d1d1d1"]
        )
    )
).facet(
    column=alt.Column(
        "cancer_type",
        title=None
    )
).properties(
    title=f"Chr {CHROMOSOME}{ARM} {TRANS_OR_CIS} effects"
).configure_title(
    anchor="middle"
)

## Find how many cancers each protein was different in

In [7]:
def make_simple_change(change_val):
    if change_val == 0:
        return 0
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1

prots = prots.assign(
    simplified_change=prots["change"].apply(make_simple_change)
)

In [8]:
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,brca,ATAD2,NP_054828.2,1.448011e-07,1.633439,1
1,brca,ATP6V1C1,NP_001686.1,5.767420e-04,0.507593,1
2,brca,COL14A1,NP_066933.1,3.459163e-04,-1.834133,-1
3,brca,COX6C,NP_004365.1,3.626047e-02,0.633653,1
4,brca,CPNE3,NP_003900.1|NP_055242.1|NP_705900.1,3.209433e-03,0.475191,1
5,brca,CSMD3,NP_937756.1|NP_937757.1|NP_443132.3,1.840187e-02,0.624113,1
6,brca,DCAF13,NP_056235.4,7.848835e-06,0.747763,1
7,brca,DECR1,NP_001317504.1|NP_001350.1,7.887976e-05,0.840867,1
8,brca,DPYS,NP_001376.1,6.943831e-03,-1.092211,-1
9,brca,DSCC1,NP_076999.2,1.534636e-05,1.157652,1


In [9]:
prots_summary = prots.groupby("protein").agg(**{
    "cancers": ("cancer_type", lambda x: x.sort_values().drop_duplicates(keep="first").tolist()),
    "mean_simp_change": ("simplified_change", np.mean)
})

prots_summary = prots_summary.\
assign(
    num_cancers=prots_summary["cancers"].apply(len),
    tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x))
).\
sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True]).\
drop(columns="tmp_sort")

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CPNE3,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
OTUD6B,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
POP1,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
RIDA,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
RMDN1,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
STK3,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
YWHAZ,"[brca, colon, hnscc, lscc, luad, ovarian]",1,6
DCAF13,"[brca, colon, hnscc, lscc, luad]",1,5
FAM91A1,"[brca, colon, hnscc, lscc, luad]",1,5
NUDCD1,"[brca, colon, hnscc, lscc, luad]",1,5


In [10]:
prots_summary["num_cancers"].value_counts().sort_index(ascending=False)

6     7
5     5
4     9
3    11
2    13
1    25
Name: num_cancers, dtype: int64

## Save results

In [11]:
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(lambda x: "_".join(x))
)

output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")