# Find proteins outside the 8q gain event (trans) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Which copy-number event, and which class of effects, this notebook summarizes.
CHROMOSOME = "8"
ARM = "q"
TRANS_OR_CIS = "trans"

# The input file name is derived from the settings above.
ttest_results_file = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}effects.tsv"

# Read the tab-separated results and index the rows by protein name
# (the file's "Name" column is renamed to "protein" first).
ttest_results = (
    pd.read_csv(ttest_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Display the wide-format results table loaded above (indexed by protein).
ttest_results

Unnamed: 0_level_0,lscc_Database_ID,luad_Database_ID,brca_Database_ID,ovarian_Database_ID,brca_pvalue,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,brca_diff,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1BG,NP_570602.2,NP_570602.2,NP_570602.2,NP_570602,0.067466,0.183228,0.699736,0.794905,0.980069,0.910369,-0.604953,-0.363979,-0.075722,0.147324,0.018228,-0.096252
A2M,NP_000005.2|NP_001334353.1|NP_001334354.1,NP_000005.2|NP_001334353.1|NP_001334354.1|K4JD...,NP_000005.2,NP_000005,0.983949,0.569279,0.833774,0.868751,0.814395,0.987964,-0.010696,-0.153202,-0.073016,-0.139115,0.181837,0.022644
A2ML1,NP_653271.2|NP_001269353.1,,NP_653271.2|NP_001269353.1,,0.127918,,0.647205,0.604040,,,1.331073,,0.241978,1.267434,,
AAAS,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_056480.1|NP_001166937.1,NP_056480,0.210818,0.609299,0.881926,0.772112,0.350913,0.873217,0.182912,0.089587,0.020537,-0.075273,0.208068,0.045529
AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417.2|NP_001306769.1|NP_001306768.1,NP_076417,0.138973,0.908476,0.749539,0.915220,0.673566,0.926280,-0.601837,0.022990,0.088849,-0.092006,0.326426,-0.068586
AAGAB,NP_078942.3|NP_001258814.1,NP_078942.3|NP_001258814.1,NP_078942.3|NP_001258815.1,NP_078942,0.075670,0.322962,0.881614,0.839262,0.973398,0.884892,-0.409365,-0.144167,0.034488,-0.091437,-0.020575,-0.064130
AAK1,NP_055726.3,,NP_055726.3,NP_055726,0.176410,0.213117,0.447466,0.396367,,0.774287,-0.215584,-0.110944,-0.078541,-0.261571,,-0.111739
AAMDC,NP_001303889.1|NP_001350493.1,NP_001303889.1|NP_001350493.1|NP_001303886.1|N...,NP_078960.1|NP_001303886.1|NP_001303887.1,NP_078960,0.031293,0.295965,0.935797,0.653037,0.758739,0.865407,-0.917606,0.183176,-0.026586,-0.278158,-0.227490,-0.132402
AAMP,NP_001289474.1|NP_001078.2,NP_001289474.1|NP_001078.2,NP_001289474.1|NP_001078.2,NP_001078,0.985558,0.989918,0.879966,0.357651,0.627132,0.794877,-0.005741,-0.002691,-0.034102,-0.404214,-0.200570,-0.112696
AAR2,NP_001258803.1,NP_001258803.1,NP_001258803.1,NP_001258803,0.088193,0.065394,0.858159,0.337295,0.530154,0.775761,0.257924,0.193667,0.034607,-0.290468,0.177188,-0.107719


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change (from the `<cancer_type>_diff` column)
- adj_p (from the `<cancer_type>_pvalue` column)

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# Cancer types are the column-name prefixes, i.e. the text before the first "_".
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Collect one long-format frame per cancer type and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing
# a frame inside a loop copies the accumulated data on every iteration.
long_columns = ["cancer_type", "protein", "Database_ID", "adj_p", "change"]
cancer_dfs = []

for cancer_type in cancer_types:
    # Match on "<cancer_type>_" rather than the bare prefix so that a cancer type
    # whose name is a prefix of another one cannot pick up the other's columns.
    cancer_df = (
        ttest_results
        .loc[:, ttest_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")  # drop proteins with no data for this cancer type
        .reset_index(drop=False)          # bring "protein" back as a regular column
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue and diff columns to not have the cancer type,
    # and record the cancer type in its own column instead
    cancer_df = (
        cancer_df
        .rename(columns={
            f"{cancer_type}_pvalue": "adj_p",
            f"{cancer_type}_diff": "change",
        })
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns and queue the frame for concatenation
    cancer_dfs.append(cancer_df[long_columns])

# Stack the per-cancer frames; fall back to an empty frame with the expected
# columns if the input had no cancer types at all (pd.concat rejects an empty list).
long_results = pd.concat(cancer_dfs) if cancer_dfs else pd.DataFrame(columns=long_columns)

# Drop duplicate rows and reset the index.
# NOTE(review): keep=False removes *every* copy of a row that appears more than
# once, not just the extra copies -- confirm that this is the intended behavior.
long_results = long_results.loc[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [5]:
# Display the reshaped long-format table (one row per cancer type and protein).
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,brca,A1BG,NP_570602.2,0.067466,-0.604953
1,brca,A2M,NP_000005.2,0.983949,-0.010696
2,brca,A2ML1,NP_653271.2|NP_001269353.1,0.127918,1.331073
3,brca,AAAS,NP_056480.1|NP_001166937.1,0.210818,0.182912
4,brca,AACS,NP_076417.2|NP_001306769.1|NP_001306768.1,0.138973,-0.601837
5,brca,AAGAB,NP_078942.3|NP_001258815.1,0.075670,-0.409365
6,brca,AAK1,NP_055726.3,0.176410,-0.215584
7,brca,AAMDC,NP_078960.1|NP_001303886.1|NP_001303887.1,0.031293,-0.917606
8,brca,AAMP,NP_001289474.1|NP_001078.2,0.985558,-0.005741
9,brca,AAR2,NP_001258803.1,0.088193,0.257924


## Select the proteins with a significant change

In [6]:
# Keep only the rows whose p-value is at or below the 0.05 cutoff,
# then renumber the rows from zero.
is_significant = long_results["adj_p"].le(0.05)
prots = long_results.loc[is_significant].reset_index(drop=True)

In [7]:
# Number of significant proteins found in each cancer type
prots.groupby("cancer_type")[["protein"]].count()

Unnamed: 0_level_0,protein
cancer_type,Unnamed: 1_level_1
brca,1265
colon,95
hnscc,1
lscc,7
luad,27


## Find how many cancers each protein was different in

In [8]:
def _sorted_unique_cancers(cancer_col):
    """Return the distinct values of a cancer_type column as a sorted list."""
    return cancer_col.sort_values().drop_duplicates(keep="first").tolist()


# One row per protein, holding the list of cancer types it was significant in
cancers_by_protein = prots.groupby("protein").agg(
    cancers=("cancer_type", _sorted_unique_cancers)
)

# Add the number of cancer types per protein, then order the table by that count
# (largest first). Ties are ordered by the concatenated cancer names, which are
# held in a temporary helper column that is dropped again after sorting.
prots_summary = (
    cancers_by_protein
    .assign(
        num_cancers=cancers_by_protein["cancers"].apply(len),
        tmp_sort=cancers_by_protein["cancers"].apply(lambda names: "".join(names)),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
UQCRH,"[brca, colon, luad]",3
AKAP1,"[brca, colon]",2
ANKFY1,"[brca, colon]",2
COQ5,"[brca, colon]",2
COQ8A,"[brca, colon]",2
DAP3,"[brca, colon]",2
DDX27,"[brca, colon]",2
DNAJC19,"[brca, colon]",2
EIF6,"[brca, colon]",2
FBL,"[brca, colon]",2


In [9]:
# Count how many proteins share each combination of cancer types.
# The "cancers" entries are Python lists, which are unhashable, and
# value_counts() needs hashable values. Join each list into a single
# comma-separated string before counting; sort_index() then orders the
# combinations alphabetically.
prots_summary["cancers"].apply(lambda names: ", ".join(names)).value_counts().sort_index()

[brca]                 1220
[brca, colon]            34
[brca, colon, luad]       1
[brca, hnscc]             1
[brca, lscc]              1
[brca, luad]              5
[colon]                  59
[colon, luad]             1
[lscc]                    6
[luad]                   20
Name: cancers, dtype: int64

## Save results

In [10]:
# Flatten each list of cancer types into one "_"-separated string so that it is
# written to the TSV as plain text. Values that are already strings are left
# untouched, so re-running this cell does not re-join them character by character.
prots_summary = prots_summary.assign(
    cancers=prots_summary["cancers"].apply(
        lambda x: x if isinstance(x, str) else "_".join(x)
    )
)

# The output file name mirrors the input naming (chromosome, arm, cis/trans).
output_file = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
prots_summary.to_csv(output_file, sep="\t")