# Find proteins within the 8p loss event (cis) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Tab-separated t-test results; the "Name" column holds the protein identifiers.
ttest_results_file = "8p_ciseffects_ttest.tsv"

# Load the table and index it by protein name.
ttest_results = (
    pd.read_csv(ttest_results_file, sep="\t")
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

In [3]:
# Display the loaded wide-format table (one row per protein, per-cancer columns)
ttest_results

Unnamed: 0_level_0,lscc_Database_ID,luad_Database_ID,ovarian_Database_ID,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ADAM9,NP_003807.1,NP_003807.1,NP_003807,9.217138e-02,3.874248e-01,7.234107e-01,5.381204e-01,0.316109,-0.132762,0.127473,0.205015,-0.197411,-0.119232
AGPAT5,NP_060831.2,NP_060831.2,NP_060831,1.784798e-03,4.144620e-04,2.428473e-04,3.542617e-04,0.008375,-0.306640,-0.289336,-0.801230,-0.911537,-0.348131
ANK1,NP_065210.2|NP_065209.2|NP_000028.3|NP_065208....,NP_065208.2|NP_065209.2|NP_000028.3|NP_0011359...,NP_065209,8.376555e-01,9.754607e-01,8.937133e-01,1.813711e-01,0.710495,0.032974,0.005145,0.087647,-0.485350,0.098023
ANK1,NP_065210.2|NP_065209.2|NP_000028.3|NP_065208....,NP_065208.2|NP_065209.2|NP_000028.3|NP_0011359...,NP_000028,8.376555e-01,9.754607e-01,8.937133e-01,1.813711e-01,0.710495,0.032974,0.005145,0.087647,-0.485350,0.097773
AP3M2,NP_001127768.1,NP_001127768.1,NP_006794,8.648390e-01,9.475867e-01,5.817922e-01,5.377699e-02,0.050197,0.015910,0.008195,-0.118796,-0.293145,-0.148108
ARHGEF10,NP_001295082.1,NP_001295082.1|NP_055444.2|NP_001295081.1,NP_001295081,6.060274e-01,3.776415e-02,4.732373e-01,3.859399e-03,0.121318,-0.089861,-0.173877,-0.205605,-0.539466,-0.218074
ASAH1,NP_004306.3|NP_808592.2|NP_001350672.1|NP_0011...,NP_808592.2|NP_004306.3|NP_001350672.1|NP_0011...,NP_004306,1.813987e-02,7.027072e-02,9.435357e-01,1.342303e-01,0.061973,-0.273454,-0.216390,0.024729,-0.572110,-0.257519
ASH2L,NP_004665.2|NP_001269201.1,NP_004665.2|NP_001269201.1,NP_004665,6.201417e-01,5.212684e-01,9.169348e-01,6.991324e-02,0.485797,-0.030861,0.071438,0.047895,-0.280663,-0.062874
ATP6V1B2,NP_001684.2,NP_001684.2,NP_001684,5.145222e-09,1.604904e-02,4.471684e-02,4.437654e-05,0.020202,-0.306285,-0.153127,-0.307374,-0.578047,-0.164293
BAG4,NP_004865.1|NP_001191807.1,NP_004865.1|NP_001191807.1,NP_001191807,2.427596e-03,2.752325e-02,9.435357e-01,5.059892e-02,0.224783,-0.576897,0.275669,0.041599,-0.398664,-0.211627


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- adj_p (the p-value column from the t-test results)

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
# Cancer types are the column-name prefixes, i.e. everything before the first
# underscore ("lscc_pvalue" -> "lscc").
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

# Collect one long-format table per cancer type and concatenate them once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame inside a loop is quadratic anyway.
cancer_dfs = []

for cancer_type in cancer_types:
    # Slice out this cancer type's columns. Matching on "<cancer_type>_" (with the
    # underscore) avoids also picking up columns of a cancer type that merely
    # shares the same prefix. Proteins with no values at all for this cancer type
    # are dropped, and "protein" is moved from the index back into a column.
    cancer_df = (
        ttest_results
        .loc[:, ttest_results.columns.str.startswith(f"{cancer_type}_")]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the pvalue and diff columns to not have the cancer type,
    # and record the cancer type in its own column instead
    cancer_df = (
        cancer_df
        .rename(columns={
            f"{cancer_type}_pvalue": "adj_p",
            f"{cancer_type}_diff": "change",
        })
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]

    cancer_dfs.append(cancer_df)

# Stack the per-cancer tables into one long table
long_results = pd.concat(cancer_dfs)

# Remove rows that are exact duplicates of another row, then reset the index.
# keep=False marks *every* member of a group of identical rows, so all copies are
# removed (not just the extras).
# NOTE(review): if the intent is to keep one copy of each duplicated row, this
# should be long_results.drop_duplicates() -- confirm before changing, since it
# would alter the downstream results.
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
# Display the reshaped long-format table (one row per cancer type / protein entry)
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,colon,ADAM9,,9.217138e-02,-0.132762
1,colon,AGPAT5,,1.784798e-03,-0.306640
2,colon,AP3M2,,8.648390e-01,0.015910
3,colon,ARHGEF10,,6.060274e-01,-0.089861
4,colon,ASAH1,,1.813987e-02,-0.273454
5,colon,ASH2L,,6.201417e-01,-0.030861
6,colon,ATP6V1B2,,5.145222e-09,-0.306285
7,colon,BAG4,,2.427596e-03,-0.576897
8,colon,BIN3,,6.222756e-02,-0.131983
9,colon,CCDC25,,6.453610e-03,-0.281196


## Select the proteins with a significant change

In [6]:
# Keep only the rows whose p-value is at or below the 0.05 cutoff
prots = (
    long_results
    .loc[long_results["adj_p"].le(0.05)]
    .reset_index(drop=True)
)

In [7]:
# Number of significant protein rows in each cancer type
prots.groupby("cancer_type")[["protein"]].count()

Unnamed: 0_level_0,protein
cancer_type,Unnamed: 1_level_1
colon,28
hnscc,24
lscc,29
luad,39
ovarian,20


## Find how many cancers each protein was different in

In [8]:
# For each protein: the sorted list of distinct cancer types in which it was
# significant, plus how many there are. Rows are ordered by number of cancers
# (descending), with ties broken by the concatenated cancer names (ascending);
# that helper key is dropped once the sort is done.
prots_summary = (
    prots
    .groupby("protein")
    .agg(cancers=("cancer_type", lambda types: sorted(set(types))))
    .assign(
        num_cancers=lambda df: df["cancers"].map(len),
        tmp_sort=lambda df: df["cancers"].map("".join),
    )
    .sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True])
    .drop(columns="tmp_sort")
)

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
AGPAT5,"[colon, hnscc, lscc, luad, ovarian]",5
ATP6V1B2,"[colon, hnscc, lscc, luad, ovarian]",5
CCDC25,"[colon, hnscc, lscc, luad, ovarian]",5
CHMP7,"[colon, hnscc, lscc, luad, ovarian]",5
ERI1,"[colon, hnscc, lscc, luad, ovarian]",5
PPP2CB,"[colon, hnscc, lscc, luad, ovarian]",5
PPP2R2A,"[colon, hnscc, lscc, luad, ovarian]",5
VPS37A,"[colon, hnscc, lscc, luad, ovarian]",5
XPO7,"[colon, hnscc, lscc, luad, ovarian]",5
FAM160B2,"[colon, hnscc, lscc, luad]",4


## Info for most common proteins

- [AGPAT5](https://www.uniprot.org/uniprot/Q9NUQ2)  (colon, hnscc, lscc, luad, ovarian)     5
    - Converts 1-acyl-sn-glycerol-3-phosphate (lysophosphatidic acid or LPA) into 1,2-diacyl-sn-glycerol-3-phosphate (phosphatidic acid or PA) by incorporating an acyl moiety at the sn-2 position of the glycerol backbone (PubMed:21173190). Acts on LPA containing saturated or unsaturated fatty acids C15:0-C20:4 at the sn-1 position using C18:1-CoA as the acyl donor (PubMed:21173190). Also acts on lysophosphatidylethanolamine using oleoyl-CoA, but not arachidonoyl-CoA, and lysophosphatidylinositol using arachidonoyl-CoA, but not oleoyl-CoA (PubMed:21173190). Activity toward lysophosphatidylglycerol not detectable (PubMed:21173190).
- ATP6V1B2 	(colon, hnscc, lscc, luad, ovarian) 	5
- CCDC25 	(colon, hnscc, lscc, luad, ovarian) 	5
- CHMP7 	(colon, hnscc, lscc, luad, ovarian) 	5
- [ERI1](https://www.uniprot.org/uniprot/Q8IV48)    (colon, hnscc, lscc, luad, ovarian)     5
    - RNA exonuclease that binds to the 3'-end of histone mRNAs and degrades them, suggesting that it plays an essential role in histone mRNA decay after replication. The 2'- and 3'-hydroxyl groups at the last nucleotide of the histone 3'-end are required for efficient degradation of RNA substrates. Also able to degrade the 3'-overhangs of short interfering RNAs (siRNAs) in vitro, suggesting a possible role as regulator of RNA interference (RNAi). Binding requires the 5'-ACCCA-3' sequence present in the stem-loop structure. Able to bind other mRNAs. Required for 5.8S rRNA 3'-end processing. Also binds to 5.8S ribosomal RNA. Binds with high affinity to the stem-loop structure of replication-dependent histone pre-mRNAs.
- [PPP2CB](https://www.uniprot.org/uniprot/P62714)  (colon, hnscc, lscc, luad, ovarian)     5
    - PP2A can modulate the activity of phosphorylase B kinase, casein kinase 2, mitogen-stimulated S6 kinase, and MAP-2 kinase.
- [PPP2R2A](https://www.uniprot.org/uniprot/P63151)     (colon, hnscc, lscc, luad, ovarian)     5
    - The B regulatory subunit might modulate substrate selectivity and catalytic activity, and also might direct the localization of the catalytic enzyme to a particular subcellular compartment.
- [VPS37A](https://www.uniprot.org/uniprot/Q8NEZ2)  (colon, hnscc, lscc, luad, ovarian)     5
    - Component of the ESCRT-I complex, a regulator of vesicular trafficking process. Required for the sorting of endocytic ubiquitinated cargos into multivesicular bodies. May be involved in cell growth and differentiation.
- [XPO7](https://www.uniprot.org/uniprot/Q9UIA9)    (colon, hnscc, lscc, luad, ovarian)  5
    - Mediates the nuclear export of proteins (cargos) with broad substrate specificity. In the nucleus binds cooperatively to its cargo and to the GTPase Ran in its active GTP-bound form. Docking of this trimeric complex to the nuclear pore complex (NPC) is mediated through binding to nucleoporins. Upon transit of a nuclear export complex into the cytoplasm, disassembling of the complex and hydrolysis of Ran-GTP to Ran-GDP (induced by RANBP1 and RANGAP1, respectively) cause release of the cargo from the export receptor. XPO7 then returns to the nuclear compartment and mediates another round of transport. The directionality of nuclear export is thought to be conferred by an asymmetric distribution of the GTP- and GDP-bound forms of Ran between the cytoplasm and nucleus.