# Assign overlap _p_ values to individual proteins and perform multiple-testing correction

In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest

In [2]:
# Parameters that select which input tables are read and how the output table
# is named: the file names used below are built as
# "<prefix>_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv".
CHROMOSOME: str = "8"
ARM: str = "q"
TRANS_OR_CIS: str = "trans"  # presumably either "trans" or "cis" -- confirm against the available input files

In [3]:
# Tab-separated inputs for the configured chromosome arm and event type:
# the per-protein summary table and the table of overlap p values.
summary_path = f"pancancer_summary_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
pvals_path = f"overlap_pvals_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"

# read_table uses a tab separator by default.
overlaps = pd.read_table(summary_path)
pvals = pd.read_table(pvals_path)

## Merge in the _p_ values for each protein

We will exclude proteins that were different in only one cancer type, because they have no overlap across cancer types.

In [4]:
# Keep only proteins that were different in more than one cancer type, then
# attach the p value whose "overlap_size" equals the protein's "num_cancers".
# The inner join drops any protein whose overlap size has no row in `pvals`.
overlaps = (
    overlaps
    .loc[overlaps["num_cancers"] > 1]
    .merge(
        right=pvals,
        left_on="num_cancers",
        right_on="overlap_size",
        how="inner",
        # Every overlap size must map to exactly one p value. Without this
        # check a duplicated "overlap_size" row in `pvals` would silently
        # duplicate proteins and change the number of tests being corrected;
        # with it, pandas raises MergeError instead.
        validate="many_to_one",
    )
)

## Apply multiple testing correction

In [5]:
# Correct the per-protein p values with the "fdr_bh" method at alpha = 0.05.
# multipletests returns a 4-tuple; only the adjusted p values are used here.
correction_result = statsmodels.stats.multitest.multipletests(
    overlaps["pvals"],
    alpha=0.05,
    method="fdr_bh",
)
reject, adj_pvals, alpha_sidak, alpha_bonf = correction_result

# Store the adjusted p values alongside the raw ones in a new "adj_p" column.
overlaps = overlaps.assign(adj_p=adj_pvals)

In [6]:
# Lift the row limit so the full table is rendered (this setting stays in
# effect for the rest of the session), then display it as the cell output.
pd.set_option("display.max_rows", None)
overlaps

Unnamed: 0,protein,cancers,mean_simp_change,num_cancers,overlap_size,pvals,adj_p
0,ARFGEF1,brca_colon_hnscc_luad,1,4,4,6.371091e-07,1.3e-05
1,HGH1,brca_colon_hnscc_luad,1,4,4,6.371091e-07,1.3e-05
2,STAU2,brca_colon_hnscc_luad,1,4,4,6.371091e-07,1.3e-05
3,YTHDF3,brca_colon_hnscc_luad,1,4,4,6.371091e-07,1.3e-05
4,MRPL15,brca_colon_hnscc,1,3,3,0.0001586402,0.000793
5,MRPL39,brca_colon_hnscc,1,3,3,0.0001586402,0.000793
6,MRPL45,brca_colon_hnscc,1,3,3,0.0001586402,0.000793
7,AGO2,brca_colon_luad,1,3,3,0.0001586402,0.000793
8,CYC1,brca_colon_luad,1,3,3,0.0001586402,0.000793
9,PTK2,brca_colon_luad,1,3,3,0.0001586402,0.000793


In [7]:
# Write the table with adjusted p values as a tab-separated file, without the
# row index.
output_path = f"sig_diff_overlaps_{CHROMOSOME}{ARM}_{TRANS_OR_CIS}.tsv"
overlaps.to_csv(output_path, sep="\t", index=False)