# Assign overlap p values to individual proteins and perform multiple testing correction

In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest

In [2]:
# Every combination of chromosome, arm and cis/trans option below names one
# pair of input files and one "group" label of the form
# "<chromosome><arm>_<cis|trans>", e.g. "8p_cis".
CHROMOSOMES = ["8"]
ARMS = ["p", "q"]
TRANS_OR_CIS_OPTS = ["cis", "trans"]

## Load files

In [3]:
# Read the per-group summary and p value tables, tagging every row with the
# group it came from so the two tables can later be joined per group.
#
# The frames are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
overlap_frames = []
pval_frames = []

for CHROMOSOME in CHROMOSOMES:
    for ARM in ARMS:
        for TRANS_OR_CIS in TRANS_OR_CIS_OPTS:
            # Same label is used in both file names and in the "group" column.
            group_label = f"{CHROMOSOME}{ARM}_{TRANS_OR_CIS}"

            overlaps = pd.read_csv(
                f"pancancer_summary_{group_label}.tsv", sep="\t"
            ).assign(group=group_label)

            pvals = pd.read_csv(
                f"overlap_pvals_{group_label}.tsv", sep="\t"
            ).assign(group=group_label)

            overlap_frames.append(overlaps)
            pval_frames.append(pvals)

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the previous append-based loop produced when nothing was loaded.
# Row indices are kept as read (no ignore_index), matching the old behaviour.
all_overlaps = pd.concat(overlap_frames) if overlap_frames else pd.DataFrame()
all_pvals = pd.concat(pval_frames) if pval_frames else pd.DataFrame()

## Merge in the _p_ values for each protein

We exclude proteins that were different in only one cancer type, because a single cancer type has no overlap to test.

In [4]:
# Keep only proteins found in more than one cancer type, then attach the
# p value whose overlap size (within the same group) equals the number of
# cancer types the protein was found in. Rows without a match are dropped.
multi_cancer_overlaps = all_overlaps[all_overlaps["num_cancers"] > 1]

all_overlaps = pd.merge(
    left=multi_cancer_overlaps,
    right=all_pvals,
    how="inner",
    left_on=["num_cancers", "group"],
    right_on=["overlap_size", "group"],
)

## Apply multiple testing correction

In [5]:
# Single significance threshold: used both as the FDR level passed to the
# correction and as the cutoff applied to the adjusted p values, so the two
# can never drift apart.
ALPHA = 0.05

# Benjamini-Hochberg FDR correction across all rows of all groups at once.
# Only adj_pvals is used below; the other return values are kept under their
# original names in case other cells refer to them.
reject, adj_pvals, alpha_sidak, alpha_bonf = statsmodels.stats.multitest.multipletests(
    pvals=all_overlaps["pvals"],
    alpha=ALPHA,
    method="fdr_bh",
)

# Attach the adjusted p values, keep the rows at or below the threshold, and
# order them by adjusted p value, then protein, then group.
all_overlaps = (
    all_overlaps
    .assign(adj_p=adj_pvals)
    .loc[lambda df: df["adj_p"] <= ALPHA]
    .sort_values(by=["adj_p", "protein", "group"])
)

## Split table by group (chromosome arm and cis/trans)

In [6]:
# Map each group label to its own table, re-indexed from zero.
groups = dict(
    (label, table.reset_index(drop=True))
    for label, table in all_overlaps.groupby("group")
)

# Remove the row limit so displayed tables are never truncated.
pd.set_option("display.max_rows", None)

## Print results

In [7]:
# Rows that passed the adjusted p value filter for group "8p_cis"
# (chromosome 8, arm p, cis), using the label assigned when the files were loaded.
groups["8p_cis"]

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,group,overlap_size,pvals,adj_p
0,AGPAT5,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
1,ATP6V1B2,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
2,CHMP7,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
3,ERI1,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
4,KIF13B,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
5,MSRA,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
6,PPP2R2A,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
7,VPS37A,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096
8,XPO7,brca_colon_hnscc_lscc_luad_ovarian,-1.0,6,8p_cis,6,0.005844,0.036096


In [8]:
# Rows that passed the adjusted p value filter for group "8p_trans"
# (chromosome 8, arm p, trans), using the label assigned when the files were loaded.
groups["8p_trans"]

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,group,overlap_size,pvals,adj_p
0,ATP6V1E1,colon_lscc_luad,-1.0,3,8p_trans,3,1.9e-05,0.000565
1,ATP6V1H,colon_lscc_luad,-1.0,3,8p_trans,3,1.9e-05,0.000565
2,CNOT8,brca_lscc_luad,1.0,3,8p_trans,3,1.9e-05,0.000565
3,ATP6V1A,colon_luad,-1.0,2,8p_trans,2,0.006707,0.036096
4,ATP6V1G1,colon_lscc,-1.0,2,8p_trans,2,0.006707,0.036096
5,CRELD2,colon_lscc,-1.0,2,8p_trans,2,0.006707,0.036096
6,CYB5R4,lscc_luad,-1.0,2,8p_trans,2,0.006707,0.036096
7,CYP7B1,hnscc_lscc,-1.0,2,8p_trans,2,0.006707,0.036096
8,DNAJC19,colon_lscc,1.0,2,8p_trans,2,0.006707,0.036096
9,HIST1H1D,hnscc_luad,-1.0,2,8p_trans,2,0.006707,0.036096


In [9]:
# Rows that passed the adjusted p value filter for group "8q_cis"
# (chromosome 8, arm q, cis), using the label assigned when the files were loaded.
groups["8q_cis"]

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,group,overlap_size,pvals,adj_p
0,CPNE3,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
1,NUDCD1,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
2,OTUD6B,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
3,POP1,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
4,RIDA,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
5,RMDN1,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
6,STK3,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096
7,YWHAZ,brca_colon_hnscc_lscc_luad_ovarian,1.0,6,8q_cis,6,0.004409,0.036096


In [10]:
# Rows that passed the adjusted p value filter for group "8q_trans"
# (chromosome 8, arm q, trans), using the label assigned when the files were loaded.
groups["8q_trans"]

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,group,overlap_size,pvals,adj_p
0,STAU2,brca_colon_hnscc_lscc_luad,1.0,5,8q_trans,5,0.0,0.0
1,YTHDF3,brca_colon_hnscc_lscc_luad,1.0,5,8q_trans,5,0.0,0.0
2,ARFGEF1,brca_colon_hnscc_luad,1.0,4,8q_trans,4,9e-06,0.000367
3,CYC1,brca_colon_lscc_luad,1.0,4,8q_trans,4,9e-06,0.000367
4,HGH1,brca_colon_hnscc_luad,1.0,4,8q_trans,4,9e-06,0.000367
5,HSF1,brca_hnscc_lscc_luad,1.0,4,8q_trans,4,9e-06,0.000367
6,MTFR1,brca_hnscc_lscc_luad,1.0,4,8q_trans,4,9e-06,0.000367
7,AGO2,brca_colon_luad,1.0,3,8q_trans,3,0.001316,0.014431
8,ANKFY1,brca_colon_lscc,-1.0,3,8q_trans,3,0.001316,0.014431
9,BOP1,brca_colon_lscc,1.0,3,8q_trans,3,0.001316,0.014431


## Save results

In [11]:
# Write the filtered table for all groups as one tab-separated file,
# leaving out the DataFrame row index.
all_overlaps.to_csv(
    "sig_diff_all_overlaps.tsv",
    sep="\t",
    index=False,
)