# Find proteins within the 8q gain event (cis) that are commonly different between samples with and without the event

## Setup

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
ttest_results_file = "8q_ciseffects_ttest.tsv"

ttest_results = pd.\
read_csv(ttest_results_file, sep="\t").\
rename(columns={"Name": "protein"}).\
set_index("protein")

In [3]:
ttest_results

Unnamed: 0_level_0,lscc_Database_ID,luad_Database_ID,ovarian_Database_ID,colon_pvalue,hnscc_pvalue,lscc_pvalue,luad_pvalue,ovarian_pvalue,colon_diff,hnscc_diff,lscc_diff,luad_diff,ovarian_diff
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AGO2,NP_036286.2|NP_001158095.1,NP_036286.2|NP_001158095.1,NP_036286,7.966714e-05,0.155522,0.895833,0.000319,0.021877,0.198152,0.064190,-0.025176,0.507758,0.158962
ANXA13,NP_001003954.1|NP_004297.2,,NP_001003954,2.286427e-01,0.625139,0.644770,,0.584759,0.256949,-0.209491,0.326287,,-0.095858
ARFGEF1,NP_006412.2,NP_006412.2,NP_006412,9.296295e-07,0.000191,0.015962,0.000019,0.147768,0.406884,0.211536,0.496405,0.852162,0.151355
ARMC1,NP_060590.1|NP_001273631.1,NP_060590.1|NP_001273631.1,NP_060590,7.909853e-01,0.184984,0.485272,0.107070,0.406482,0.018855,0.072794,0.127354,0.273590,0.076160
ASPH,NP_004309.2|NP_001158222.1|NP_115855.1|NP_0011...,NP_004309.2|NP_001158222.1|NP_115855.1|NP_0011...,NP_004309,5.145319e-02,0.854530,0.595837,0.064947,0.104947,0.159204,0.025505,0.246483,0.718922,0.168159
ASPH,NP_004309.2|NP_001158222.1|NP_115855.1|NP_0011...,NP_004309.2|NP_001158222.1|NP_115855.1|NP_0011...,NP_001158224,5.145319e-02,0.854530,0.595837,0.064947,0.521497,0.159204,0.025505,0.246483,0.718922,-0.087785
ATAD2,NP_054828.2,NP_054828.2|NP_001341036.1|NP_060022.2|NP_0012...,NP_054828,2.065439e-01,0.275093,0.502809,0.000945,0.968678,0.206485,0.140799,0.313909,1.055920,0.010878
ATP6V1C1,NP_001686.1,NP_001686.1,NP_001686,4.928422e-01,0.287678,0.503641,0.307830,0.000730,0.040628,0.060945,0.133239,0.197309,0.339252
ATP6V1H,NP_057025.2|NP_998784.1,NP_057025.2|NP_998784.1,NP_998785,2.534536e-01,0.994458,0.331302,0.030695,0.192540,-0.073698,-0.002028,-0.188046,-0.324389,0.109146
BOP1,NP_056016.1,NP_056016.1,NP_056016,7.601546e-05,0.014225,0.475816,0.008061,0.586095,0.253560,0.165231,0.185525,0.452211,0.053596


## Reshape the input dataframe
We want to get our table to have these columns:
- cancer_type
- protein
- Database_ID
- change
- p_value

Since some cancer types have database IDs and some don't, we'll slice out and reshape the info for each cancer type individually.

In [4]:
cancer_types = sorted(ttest_results.columns.to_series().str.split("_", n=1, expand=True)[0].unique())

long_results = pd.DataFrame()

for cancer_type in cancer_types:
    cancer_df = ttest_results.\
    loc[:, ttest_results.columns.str.startswith(cancer_type)].\
    dropna(axis="index", how="all").\
    reset_index(drop=False)
    
    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)
        
    # Rename the pvalue and diff columns to not have the cancer type
    cancer_df = cancer_df.rename(columns={
        f"{cancer_type}_pvalue": "adj_p",
        f"{cancer_type}_diff": "change"
    }).\
    assign(cancer_type=cancer_type)
    
    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p", "change"]]
    
    # Append to the overall dataframe
    long_results = long_results.append(cancer_df)

# Drop duplicate rows and reset the index
long_results = long_results[~long_results.duplicated(keep=False)].\
reset_index(drop=True)

In [5]:
long_results

Unnamed: 0,cancer_type,protein,Database_ID,adj_p,change
0,colon,AGO2,,7.966714e-05,0.198152
1,colon,ANXA13,,2.286427e-01,0.256949
2,colon,ARFGEF1,,9.296295e-07,0.406884
3,colon,ARMC1,,7.909853e-01,0.018855
4,colon,ATAD2,,2.065439e-01,0.206485
5,colon,ATP6V1C1,,4.928422e-01,0.040628
6,colon,ATP6V1H,,2.534536e-01,-0.073698
7,colon,BOP1,,7.601546e-05,0.253560
8,colon,C8orf33,,2.797495e-03,0.291128
9,colon,C8orf82,,4.198027e-07,0.455687


## Select the proteins with a significant change

In [6]:
prots = long_results[long_results["adj_p"] <= 0.05].reset_index(drop=True)

In [9]:
prots.groupby("cancer_type").count()["protein"]

cancer_type
colon      58
hnscc      44
lscc       24
luad       66
ovarian    58
Name: protein, dtype: int64

## Find how many cancers each protein was different in

In [7]:
prots_summary = prots.groupby("protein").agg(**{
    "cancers": ("cancer_type", lambda x: x.sort_values().drop_duplicates(keep="first").tolist())
})

prots_summary = prots_summary.\
assign(
    num_cancers=prots_summary["cancers"].apply(len),
    tmp_sort=prots_summary["cancers"].apply(lambda x: "".join(x))
).\
sort_values(by=["num_cancers", "tmp_sort"], ascending=[False, True]).\
drop(columns="tmp_sort")

prots_summary

Unnamed: 0_level_0,cancers,num_cancers
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
CPNE3,"[colon, hnscc, lscc, luad, ovarian]",5
MTDH,"[colon, hnscc, lscc, luad, ovarian]",5
NUDCD1,"[colon, hnscc, lscc, luad, ovarian]",5
OTUD6B,"[colon, hnscc, lscc, luad, ovarian]",5
POP1,"[colon, hnscc, lscc, luad, ovarian]",5
PTK2,"[colon, hnscc, lscc, luad, ovarian]",5
RIDA,"[colon, hnscc, lscc, luad, ovarian]",5
RMDN1,"[colon, hnscc, lscc, luad, ovarian]",5
STK3,"[colon, hnscc, lscc, luad, ovarian]",5
YTHDF3,"[colon, hnscc, lscc, luad, ovarian]",5
