# Assign overlap p values to individual proteins and perform multiple testing correction

In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest

In [2]:
# Run parameters. All three values are interpolated into the names of the
# input files read below and of the output file written at the end, so
# changing them selects a different set of files.
CHROMOSOME = "8"
ARM = "p"
# NOTE(review): presumably selects cis vs. trans effects -- confirm against
# the upstream notebooks that produce the input files.
TRANS_OR_CIS = "cis"

In [3]:
# Build the input file names from the run parameters, then read both
# tab-separated tables.
summary_path = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
pvals_path = f"overlap_pvals_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"

overlaps = pd.read_csv(summary_path, sep="\t")
pvals = pd.read_csv(pvals_path, sep="\t")

## Merge in the _p_ values for each protein

We exclude proteins that were different in only one cancer type, since a single cancer type cannot form an overlap.

In [4]:
# Keep only proteins that appear in more than one cancer type, then look up
# the p value that corresponds to each protein's overlap size.
#
# Note: this rebinds `overlaps`, so the loading cell must be re-run before
# this cell is run a second time.
overlaps = (
    overlaps[overlaps["num_cancers"] > 1]
    .merge(
        right=pvals,
        left_on="num_cancers",
        right_on="overlap_size",
        # Inner join: proteins whose overlap size has no row in `pvals`
        # are dropped.
        how="inner",
        # Each overlap size must map to exactly one p value. Without this
        # check, a duplicated `overlap_size` in `pvals` would silently
        # duplicate protein rows and inflate the number of tests that the
        # multiple testing correction accounts for.
        validate="many_to_one",
    )
)

## Apply multiple testing correction

In [5]:
# Run the "fdr_bh" correction over every protein's p value at alpha = 0.05.
correction = statsmodels.stats.multitest.multipletests(
    pvals=overlaps["pvals"],
    alpha=0.05,
    method="fdr_bh",
)

# multipletests returns four values; only the adjusted p values are used below.
reject, adj_pvals, alpha_sidak, alpha_bonf = correction

# Attach the adjusted p values as a new `adj_p` column.
overlaps = overlaps.assign(adj_p=adj_pvals)

In [6]:
# Show the merged table, including the adjusted p values, as the cell output.
overlaps

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,overlap_size,pvals,adj_p
0,AGPAT5,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
1,ATP6V1B2,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
2,CHMP7,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
3,ERI1,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
4,PPP2CB,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
5,PPP2R2A,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
6,SARAF,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
7,VPS37A,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
8,XPO7,brca_colon_hnscc_lscc_luad_ovarian,-1,6,6,0.006474,0.028773
9,FAM160B2,brca_colon_hnscc_lscc_luad,-1,5,5,0.055178,0.122617


In [7]:
# Write the table with adjusted p values to a tab-separated file named after
# the run parameters; the row index is not written.
output_path = f"sig_diff_overlaps_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
overlaps.to_csv(output_path, sep="\t", index=False)