# Find proteins outside the 8q loss event (trans) that are commonly different between samples with and without the event

# Only looking at interacting proteins

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
CHROMOSOME = "8"
ARM = "q"
TRANS_OR_CIS = "trans"

ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects_filtered.tsv"

ttest_results = pd.\
read_csv(ttest_results_file, sep="\t").\
rename(columns={"Name": "protein"}).\
set_index("protein")

In [3]:
ttest_results

Unnamed: 0_level_0,lscc_Database_ID,brca_Database_ID,luad_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,brca_diff,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAR2,NP_001258803.1,NP_001258803.1,NP_001258803.1,NP_001258803,0.090443,0.029068,0.990972,0.966588,0.461274,0.942127,0.265935,0.195941,-0.010725,-0.010739,0.190339,-0.061427
ABL1,NP_009297.2|NP_005148.2,NP_009297.2|NP_005148.2,,NP_005148,0.528499,0.903333,0.366032,0.667328,,0.923138,-0.130589,-0.024027,0.092682,-0.115708,,0.059847
ABLIM1,NP_001003407.1|NP_002304.3|NP_001309812.1|NP_0...,NP_001309811.1|NP_001309815.1,NP_002304.3|NP_001339371.1|NP_001309821.1|NP_0...,NP_001309811,0.659300,0.585003,0.933711,0.749487,0.465441,0.942127,-0.174142,-0.075404,0.038030,0.244712,-0.616826,-0.091972
ABLIM1,NP_001003407.1|NP_002304.3|NP_001309812.1|NP_0...,NP_001309811.1|NP_001309815.1,NP_002304.3|NP_001339371.1|NP_001309821.1|NP_0...,NP_001003407,0.659300,0.585003,0.933711,0.749487,0.465441,0.942127,-0.174142,-0.075404,0.038030,0.244712,-0.616826,-0.093721
ABLIM1,NP_001003407.1|NP_002304.3|NP_001309812.1|NP_0...,NP_001309811.1|NP_001309815.1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811,0.659300,0.585003,0.933711,0.749487,0.699201,0.942127,-0.174142,-0.075404,0.038030,0.244712,-0.226632,-0.091972
ABLIM1,NP_001003407.1|NP_002304.3|NP_001309812.1|NP_0...,NP_001309811.1|NP_001309815.1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001003407,0.659300,0.585003,0.933711,0.749487,0.699201,0.942127,-0.174142,-0.075404,0.038030,0.244712,-0.226632,-0.093721
ABLIM1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811.1|NP_001309815.1,NP_002304.3|NP_001339371.1|NP_001309821.1|NP_0...,NP_001309811,0.659300,0.585003,0.933711,0.937790,0.465441,0.942127,-0.174142,-0.075404,0.038030,-0.069924,-0.616826,-0.091972
ABLIM1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811.1|NP_001309815.1,NP_002304.3|NP_001339371.1|NP_001309821.1|NP_0...,NP_001003407,0.659300,0.585003,0.933711,0.937790,0.465441,0.942127,-0.174142,-0.075404,0.038030,-0.069924,-0.616826,-0.093721
ABLIM1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811.1|NP_001309815.1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811,0.659300,0.585003,0.933711,0.937790,0.699201,0.942127,-0.174142,-0.075404,0.038030,-0.069924,-0.226632,-0.091972
ABLIM1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001309811.1|NP_001309815.1,NP_001309811.1|NP_001309813.1|NP_001339369.1|N...,NP_001003407,0.659300,0.585003,0.933711,0.937790,0.699201,0.942127,-0.174142,-0.075404,0.038030,-0.069924,-0.226632,-0.093721


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- p_value

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

long_results = pd.DataFrame()

for cancer_type in cancer_types:
    cancer_df = ttest_results.\
    loc[:, ttest_results.columns.str.startswith(cancer_type)].\
    dropna(axis="index", how="all").\
    reset_index(drop=False)
    
    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)
        
    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = cancer_df.rename(columns={
        f"{cancer_type}_pvalue": "adj_p",
        f"{cancer_type}_diff": "change"
    }).\
    assign(cancer_type=cancer_type)
    
    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]
    
    # Append to the overall dataframe
    long_results = long_results.append(cancer_df)

# Drop duplicate rows and reset the index
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,AAR2,NP_001258803.1,0.090443,0.265935
1,brca,ABL1,NP_009297.2|NP_005148.2,0.528499,-0.130589
2,brca,ADSL,NP_000017.1|NP_001304852.1|NP_001116850.1,0.515643,0.165103
3,brca,AGO2,NP_036286.2|NP_001158095.1,0.002397,0.450616
4,brca,AGR2,NP_006399.1,0.005402,-1.883850
5,brca,AJUBA,NP_116265.1|NP_001276026.1,0.991367,-0.005706
6,brca,AKAP5,NP_004848.3,0.086401,-0.866100
7,brca,AKT1,NP_005154.2,0.001363,-0.668985
8,brca,AKT1S1,NP_001092102.1|NP_115751.3,0.013274,-0.575202
9,brca,ALDH1B1,NP_000683.3,0.803995,0.109488


## Select the proteins with a significant change

In [6]:
prots = long_results[long_results["adj_p"] <= 0.05].reset_index(drop=True)

In [7]:
prots.groupby("cancer_type").count()[["protein"]]

Unnamed: 0_level_0,protein
cancer_type,Unnamed: 1_level_1
brca,69
colon,21
hnscc,4
lscc,2
luad,3
ovarian,4


## Find how many cancers each protein was different in

In [8]:
prots_summary = prots.groupby("protein").agg(**{
    "cancers": ("cancer_type", lambda x: x.sort_values().drop_duplicates(keep="first").tolist())
})

prots_summary = prots_summary.\
assign(
    num_cancers=prots_summary["cancers"].apply(len),
    tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x))
).\
sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True]).\
drop(columns="tmp_sort")

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
PTK2,"[brca, colon, luad]",3
AGO2,"[brca, colon]",2
FBL,"[brca, colon]",2
SMC1A,"[brca, colon]",2
TIMM13,"[brca, colon]",2
MAPKAPK2,"[brca, hnscc]",2
SURF2,"[brca, luad]",2
YWHAB,"[colon, ovarian]",2
RASAL3,"[hnscc, lscc]",2
AGR2,[brca],1


In [9]:
prots_summary["num_cancers"].value_counts().sort_index(ascending=False)

3     1
2     8
1    84
Name: num_cancers, dtype: int64

## Save results

In [10]:
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(lambda x: "_".join(x))
)

output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}_filtered.tsv"
prots_summary.to_csv(output_file, sep="\t")