# Find proteins outside the 7p gain event (trans) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Event under study (chromosome and arm) and which kind of effect to analyse
CHROMOSOME = "7"
ARM = "p"
TRANS_OR_CIS = "trans"

# The t-test results live in a tab-separated file named after the event
ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_ttest.tsv"

# Load the table, rename the "Name" column to "protein", and index the rows by it
ttest_results = (
    pd.read_csv(ttest_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Display the wide table read from the t-test results file (rows indexed by protein)
ttest_results

Unnamed: 0_level_0,luad_Database_ID,ccrcc_Database_ID,brca_Database_ID,lscc_Database_ID,ovarian_Database_ID,brca_pvalue,ccrcc_pvalue,colon_pvalue,endometrial_pvalue,gbm_pvalue,...,ovarian_pvalue,brca_diff,ccrcc_diff,colon_diff,endometrial_diff,gbm_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,NP_570602.2,NP_570602.2,NP_570602.2,NP_570602.2,NP_570602,0.530942,0.958398,0.983114,0.886467,0.998597,...,0.832045,0.637778,-0.076982,0.020162,0.135875,-0.028792,0.005022,-0.069059,-0.386308,0.270000
A2M,NP_000005.2|NP_001334353.1|NP_001334354.1|K4JD...,NP_000005.2,NP_000005.2,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005,0.559144,0.917086,0.944596,0.923183,0.998597,...,0.386362,0.679209,-0.147537,-0.046689,0.122971,-0.172593,0.038515,-0.288815,-0.568861,0.875176
A2ML1,,,NP_653271.2|NP_001269353.1,NP_653271.2|NP_001269353.1,,0.837349,,,0.335235,,...,,-0.759925,,,1.629831,,0.152537,0.445428,,
AAAS,NP_056480.1|NP_001166937.1,NP_056480.1,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_056480,0.756462,0.928186,0.907737,0.160925,0.998597,...,0.946420,-0.138980,-0.022222,0.044058,-0.305859,0.014517,0.065201,-0.143891,-0.235198,-0.033951
AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417,0.673321,0.878211,0.969772,0.619052,0.998597,...,0.778830,0.607968,-0.197605,0.018276,0.313252,0.018262,0.149180,-0.478311,0.326282,-0.315740
AAGAB,NP_078942.3|NP_001258814.1,NP_078942.3,NP_078942.3|NP_001258815.1,NP_078942.3|NP_001258814.1,NP_078942,0.983022,0.938177,0.788722,0.672843,0.998597,...,0.851050,-0.025664,-0.047418,-0.078541,0.169570,-0.007000,0.047695,0.285221,-0.036647,0.131606
AAK1,,NP_055726.3,NP_055726.3,NP_055726.3,NP_055726,0.806717,0.913118,0.535559,0.901198,0.998597,...,0.801563,0.148686,-0.068680,-0.090858,-0.055098,0.037825,-0.056480,0.140147,,0.209449
AAMDC,NP_001303889.1|NP_001350493.1|NP_001303886.1|N...,,NP_078960.1|NP_001303886.1|NP_001303887.1,NP_001303889.1|NP_001350493.1,NP_078960,0.762502,,0.749697,0.763958,0.998597,...,0.770147,-0.347114,,0.113983,0.250421,-0.049799,-0.078817,-0.341376,-0.120059,-0.391071
AAMP,NP_001289474.1|NP_001078.2,NP_001078.2,NP_001289474.1|NP_001078.2,NP_001289474.1|NP_001078.2,NP_001078,0.689464,0.917086,0.881831,0.323383,0.998597,...,0.836081,-0.296894,-0.081632,-0.046473,-0.599320,-0.032336,0.095153,-0.311343,-0.345418,-0.171680
AAR2,NP_001258803.1,NP_001258803.1,NP_001258803.1,NP_001258803.1,NP_001258803,0.944092,0.959607,0.901309,0.348098,0.998597,...,0.865285,0.049962,0.023126,0.037615,-0.291286,0.050630,0.020543,-0.087412,-0.212407,0.122829


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- adj_p (the p value)

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# Cancer types are the part of each column name before the first underscore
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Column layout shared by every per-cancer table and by the combined table
long_columns = ["cancer_type", "protein", "Database_ID", "adj_p", "change"]

# Collect one long-format table per cancer type and concatenate them once at
# the end. DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop re-copies all accumulated rows on every iteration.
cancer_dfs = []

for cancer_type in cancer_types:
    # Match on "<cancer_type>_" rather than the bare name so that one cancer
    # type can never pick up the columns of another whose name it prefixes.
    # Proteins with no values at all for this cancer type are dropped.
    cancer_df = (
        ttest_results
        .loc[:, ttest_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue and diff columns to not have the cancer type, and
    # record the cancer type in its own column instead
    cancer_df = (
        cancer_df
        .rename(columns={
            f"{cancer_type}_pvalue": "adj_p",
            f"{cancer_type}_diff": "change",
        })
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns and queue the table for concatenation
    cancer_dfs.append(cancer_df[long_columns])

# pd.concat raises on an empty list, so fall back to an empty table with the
# expected columns when the input had no cancer-type columns at all
if cancer_dfs:
    long_results = pd.concat(cancer_dfs)
else:
    long_results = pd.DataFrame(columns=long_columns)

# Drop duplicated rows and reset the index.
# NOTE(review): keep=False removes *every* copy of a row that occurs more than
# once, not just the extra copies -- confirm that discarding such rows
# entirely is intended.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [5]:
# Display the long-format table: one row per cancer type and protein
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,A1BG,NP_570602.2,0.530942,0.637778
1,brca,A2M,NP_000005.2,0.559144,0.679209
2,brca,A2ML1,NP_653271.2|NP_001269353.1,0.837349,-0.759925
3,brca,AAAS,NP_056480.1|NP_001166937.1,0.756462,-0.138980
4,brca,AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,0.673321,0.607968
5,brca,AAGAB,NP_078942.3|NP_001258815.1,0.983022,-0.025664
6,brca,AAK1,NP_055726.3,0.806717,0.148686
7,brca,AAMDC,NP_078960.1|NP_001303886.1|NP_001303887.1,0.762502,-0.347114
8,brca,AAMP,NP_001289474.1|NP_001078.2,0.689464,-0.296894
9,brca,AAR2,NP_001258803.1,0.944092,0.049962


## Select the proteins with a significant change

In [6]:
# Keep only rows whose p value is at or below 0.05, renumbering the rows from zero
is_significant = long_results["adj_p"].le(0.05)
prots = long_results.loc[is_significant].reset_index(drop=True)

In [7]:
# Number of significant proteins in each cancer type
prots.groupby("cancer_type")[["protein"]].count()

Unnamed: 0_level_0,protein
cancer_type,Unnamed: 1_level_1
colon,3
endometrial,2
luad,261


## Find how many cancers each protein was different in

In [8]:
def make_simple_change(change_val):
    """Reduce a change value to its direction.

    Parameters
    ----------
    change_val : number
        The value to classify.

    Returns
    -------
    int or float
        1 if ``change_val`` is positive, -1 if it is negative, 0 if it is
        zero, and NaN if ``change_val`` is NaN.
    """
    if change_val > 0:
        return 1
    if change_val < 0:
        return -1
    if change_val == 0:
        return 0
    # Every comparison above is False only for NaN. Return NaN explicitly
    # instead of falling off the end and returning None, which would turn the
    # resulting column into object dtype and break numeric aggregation.
    return float("nan")

# Add a column holding the output of make_simple_change for each "change" value
prots = prots.assign(
    simplified_change=lambda df: df["change"].apply(make_simple_change)
)

In [9]:
# Display the significant rows together with their simplified_change column
prots

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change,simplified_change
0,colon,PMPCA,,0.046910,0.301712,1
1,colon,PMPCB,,0.011985,0.298331,1
2,colon,SSBP1,,0.003918,0.519921,1
3,endometrial,RASA1,,0.034003,0.604804,1
4,endometrial,ZNF318,,0.034003,-0.425579,-1
5,luad,ABHD11,NP_683710.1|NP_683711.1|NP_001138836.1|NP_0013...,0.032506,1.571193,1
6,luad,ACAD8,NP_055199.1,0.032176,1.661435,1
7,luad,ACAP1,NP_055531.1,0.006885,-1.166400,-1
8,luad,ACAP2,NP_036419.3,0.020454,-0.576080,-1
9,luad,ACP6,NP_057445.4|NP_001310554.1,0.022059,1.310097,1


In [10]:
# Summarise each protein across cancer types:
#   cancers          - sorted list of the distinct cancer types it appears in
#   mean_simp_change - mean of its simplified_change values
#   num_cancers      - length of the cancers list
# The aggregation is named by string ("mean") because passing numpy callables
# such as np.mean to groupby.agg is deprecated in recent pandas versions.
prots_summary = (
    prots
    .groupby("protein")
    .agg(
        cancers=("cancer_type", lambda x: sorted(x.unique())),
        mean_simp_change=("simplified_change", "mean"),
    )
    .assign(
        num_cancers=lambda df: df["cancers"].apply(len),
        # Temporary key: the concatenated cancer names, used to order proteins
        # that appear in the same number of cancers
        tmp_sort=lambda df: df["cancers"].apply(lambda x: "".join(x)),
    )
    # Proteins found in the most cancers come first
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RASA1,"[endometrial, luad]",1,2
PMPCA,[colon],1,1
PMPCB,[colon],1,1
SSBP1,[colon],1,1
ZNF318,[endometrial],-1,1
ABHD11,[luad],1,1
ACAD8,[luad],1,1
ACAP1,[luad],-1,1
ACAP2,[luad],-1,1
ACP6,[luad],1,1


In [11]:
# Show only the proteins whose cancers entry contains "gbm"
has_gbm = prots_summary["cancers"].apply(lambda cancers: "gbm" in cancers)
prots_summary[has_gbm]

Unnamed: 0_level_0,cancers,mean_simp_change,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [13]:
# Rows of the significant-protein table that belong to the "luad" cancer type
lp = prots.loc[prots["cancer_type"].eq("luad")]

In [17]:
# Count the "luad" rows whose simplified change is positive
lp["simplified_change"].gt(0).sum()

91

## Save results

In [12]:
# Flatten each cancers entry into a single underscore-separated string for the
# TSV. Entries that are already strings are left alone so that re-running this
# cell does not join the individual characters of an already-joined value.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(
        lambda x: x if isinstance(x, str) else "_".join(x)
    )
)

# Write the summary as a tab-separated file named after the event
output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")