# Find proteins within the 7p gain event (cis) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
CHROMOSOME = "7"
ARM = "p"
TRANS_OR_CIS = "cis"

ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv"

ttest_results = pd.\
read_csv(ttest_results_file, sep="\t").\
rename(columns={"Name": "protein"}).\
set_index("protein")

In [3]:
ttest_results

Unnamed: 0_level_0,brca_Database_ID,luad_Database_ID,ccrcc_Database_ID,lscc_Database_ID,ovarian_Database_ID,brca_pvalue,ccrcc_pvalue,colon_pvalue,endometrial_pvalue,gbm_pvalue,...,ovarian_pvalue,brca_diff,ccrcc_diff,colon_diff,endometrial_diff,gbm_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABCA13,NP_689914.3,,,,,0.694999,,,,,...,,0.271652,,,,,,,,
ACTB,NP_001092.1,NP_001092.1,NP_001092.1,NP_001092.1,NP_001092,0.044000,0.576605,0.528799,0.803136,0.614257,...,0.432547,0.676841,0.083024,0.163348,-0.161164,0.077727,-0.032212,0.156346,0.198439,-0.140323
ADAP1,NP_006860.1|NP_001271237.1|NP_001271239.1|NP_0...,NP_006860.1|NP_001271237.1|NP_001271238.1|NP_0...,NP_006860.1,NP_006860.1|NP_001271237.1|NP_001271238.1|NP_0...,NP_006860,0.782883,0.831565,0.049201,0.577107,0.868300,...,0.507395,0.115723,0.041141,0.236260,0.293432,0.067999,0.224683,0.198757,0.054962,0.193091
AEBP1,NP_001120.3,NP_001120.3,NP_001120.3,NP_001120.3,NP_001120,0.425282,0.607600,0.912794,0.844926,0.520111,...,0.927980,0.689486,-0.273956,-0.034249,-0.135591,0.203039,0.249068,1.047787,-0.352938,0.035739
AGR2,NP_006399.1,NP_006399.1,,NP_006399.1,NP_006399,0.599861,,0.301292,0.118944,0.244564,...,0.380235,0.696066,,-0.410820,1.844964,0.707360,0.061473,-0.252865,2.222339,0.482515
AGR3,NP_789783.1,NP_789783.1,,NP_789783.1,NP_789783,0.712293,,0.632601,0.075172,,...,0.319678,0.475802,,-0.184550,1.760061,0.211489,-0.051835,-0.479121,2.127087,0.500758
AHR,NP_001612.1,NP_001612.1,NP_001612.1,NP_001612.1,NP_001612,0.021564,0.936993,0.826525,0.211703,0.720948,...,0.630470,0.727305,0.018414,0.035818,0.546107,-0.121580,0.262888,0.361622,1.520007,-0.091651
AIMP2,NP_006294.2|NP_001313535.1|NP_001313538.1,NP_006294.2|NP_001349714.1|NP_001313535.1|NP_0...,NP_006294.2,NP_001313536.1,NP_006294,0.062382,0.957384,0.049201,0.585961,0.321234,...,0.732890,0.321932,0.005477,0.121789,-0.152450,0.086074,0.112943,-0.168035,0.161925,0.046486
AIMP2,NP_006294.2|NP_001313535.1|NP_001313538.1,NP_006294.2|NP_001349714.1|NP_001313535.1|NP_0...,NP_006294.2,NP_001313536.1,NP_001313536,0.062382,0.957384,0.049201,0.585961,0.321234,...,0.756178,0.321932,0.005477,0.121789,-0.152450,0.086074,0.112943,-0.168035,0.161925,0.043220
AIMP2,NP_006294.2|NP_001313535.1|NP_001313538.1,NP_006294.2|NP_001349714.1|NP_001313535.1|NP_0...,NP_006294.2,NP_006294.2|NP_001349714.1|NP_001313535.1|NP_0...,NP_006294,0.062382,0.957384,0.049201,0.585961,0.321234,...,0.732890,0.321932,0.005477,0.121789,-0.152450,0.086074,0.112943,0.077954,0.161925,0.046486


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- p_value

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

long_results = pd.DataFrame()

for cancer_type in cancer_types:
    cancer_df = ttest_results.\
    loc[:, ttest_results.columns.str.startswith(cancer_type)].\
    dropna(axis="index", how="all").\
    reset_index(drop=False)
    
    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)
        
    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = cancer_df.rename(columns={
        f"{cancer_type}_pvalue": "adj_p",
        f"{cancer_type}_diff": "change"
    }).\
    assign(cancer_type=cancer_type)
    
    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]
    
    # Append to the overall dataframe
    long_results = long_results.append(cancer_df)

# Drop duplicate rows and reset the index
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,ABCA13,NP_689914.3,0.694999,0.271652
1,brca,ACTB,NP_001092.1,0.044000,0.676841
2,brca,ADAP1,NP_006860.1|NP_001271237.1|NP_001271239.1|NP_0...,0.782883,0.115723
3,brca,AEBP1,NP_001120.3,0.425282,0.689486
4,brca,AGR2,NP_006399.1,0.599861,0.696066
5,brca,AGR3,NP_789783.1,0.712293,0.475802
6,brca,AHR,NP_001612.1,0.021564,0.727305
7,brca,AMPH,NP_001626.1|NP_647477.1,0.640984,0.494188
8,brca,ANKMY2,NP_064715.1,0.246161,0.429587
9,brca,ANLN,NP_061155.2|NP_001271230.1|NP_001271231.1,0.950362,0.070197


## Select the proteins with a significant change

In [6]:
prots = long_results[long_results["adj_p"] <= 0.05].reset_index(drop=True)

In [7]:
prots.groupby("cancer_type").count()[["protein"]]

Unnamed: 0_level_0,protein
cancer_type,Unnamed: 1_level_1
brca,23
ccrcc,2
colon,19
endometrial,3
gbm,6
hnscc,29
lscc,29
luad,34
ovarian,17


## Find how many cancers each protein was different in

In [8]:
def make_simple_change(change_val):
    if change_val == 0:
        return 0
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1

prots = prots.assign(
    simplified_change=prots["change"].apply(make_simple_change)
)

In [9]:
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,brca,ACTB,NP_001092.1,0.044000,0.676841,1
1,brca,AHR,NP_001612.1,0.021564,0.727305,1
2,brca,BZW2,NP_001153239.1,0.022850,0.879628,1
3,brca,C7orf25,NP_001093328.1|NP_076959.2,0.016791,0.542806,1
4,brca,CCM2,NP_113631.1|NP_001025006.1|NP_001161406.1|NP_0...,0.021564,0.495391,1
5,brca,CHCHD2,NP_001307256.1|NP_057223.1,0.003788,1.011146,1
6,brca,CYCS,NP_061820.1,0.025775,0.873126,1
7,brca,FBXL18,NP_079239.3|NP_001308142.1,0.000087,0.887400,1
8,brca,FKBP14,NP_060416.1,0.025479,1.308385,1
9,brca,FKBP9,NP_009201.2|NP_001271270.1|NP_001271272.1,0.029242,1.106326,1


In [10]:
prots_summary = prots.groupby("protein").agg(**{
    "cancers": ("cancer_type", lambda x: x.sort_values().drop_duplicates(keep="first").tolist()),
    "mean_simp_change": ("simplified_change", np.mean)
})

prots_summary = prots_summary.\
assign(
    num_cancers=prots_summary["cancers"].apply(len),
    tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x))
).\
sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True]).\
drop(columns="tmp_sort")

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NUDCD3,"[brca, ccrcc, colon, hnscc, lscc, luad, ovarian]",1,7
PURB,"[brca, colon, gbm, hnscc, lscc, luad, ovarian]",1,7
ANKMY2,"[ccrcc, endometrial, gbm, lscc, luad, ovarian]",1,6
YKT6,"[colon, gbm, hnscc, lscc, luad, ovarian]",1,6
TMED4,"[brca, colon, luad, ovarian]",1,4
CCM2,"[brca, gbm, lscc, luad]",1,4
SNX8,"[brca, hnscc, lscc, luad]",1,4
DNAAF5,"[colon, hnscc, lscc, luad]",1,4
FOXK1,"[colon, hnscc, lscc, ovarian]",1,4
AVL9,"[colon, lscc, luad, ovarian]",1,4


In [11]:
prots_summary[prots_summary["cancers"].apply(lambda x: "gbm" in x)]

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PURB,"[brca, colon, gbm, hnscc, lscc, luad, ovarian]",1,7
ANKMY2,"[ccrcc, endometrial, gbm, lscc, luad, ovarian]",1,6
YKT6,"[colon, gbm, hnscc, lscc, luad, ovarian]",1,6
CCM2,"[brca, gbm, lscc, luad]",1,4
TWISTNB,"[gbm, hnscc, lscc]",1,3
GET4,"[gbm, luad, ovarian]",1,3


## Save results

In [12]:
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(lambda x: "_".join(x))
)

output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")