In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import fdrcorrection
from statsmodels.stats.proportion import proportion_confint
from tqdm import tqdm
from collections import defaultdict

# Scenario keys in the order they should appear in summary tables.
# (Defined exactly once; a second identical assignment used to follow `pathes`.)
PATHES_ORDER = ["path_big", "path_medium", "path_small", "path_random"]
ITER = 30  # number of replicates read per scenario
PVAL_NAME = 'pval_adj'  # column thresholded for significance; alternative: 'empPvalue'
ALPHA = 0.05  # significance threshold applied to PVAL_NAME

# Target gene-set name looked up in the results of each scenario.
pathes = {
    "path_big": "KEGG_FOCAL_ADHESION",
    "path_medium": "KEGG_PPAR_SIGNALING_PATHWAY",
    "path_small": "KEGG_STEROID_BIOSYNTHESIS",
    # NOTE(review): presumably a placeholder that matches no real gene set,
    # so every significant hit in this scenario is a false positive — confirm.
    "path_random": "XXXXX",
}

def return_stats_pascal(data, pathname, ALPHA, pval_col=None):
    """Score one result table against the gene set that is expected to be found.

    Parameters
    ----------
    data : pandas.DataFrame
        Result table with a ``Name`` column and the p-value column ``pval_col``.
    pathname : str
        Name of the target gene set.
    ALPHA : float
        Significance threshold; rows whose p-value is strictly below it are hits.
    pval_col : str, optional
        Column holding the p-values to threshold. Defaults to the module-level
        ``PVAL_NAME`` (looked up at call time), which preserves the previous
        behaviour for existing callers.

    Returns
    -------
    dict
        ``"avg#"`` – number of distinct significant names in this table;
        ``"FPR"``  – 1 if there is at least one hit and the target is not
        among the hits, otherwise 0;
        ``"TPR"``  – 1 if the target is among the hits, otherwise 0.
    """
    if pval_col is None:
        pval_col = PVAL_NAME

    # A set is used, so repeated names are counted once.
    significant = set(data.loc[data[pval_col] < ALPHA, "Name"])
    target_found = pathname in significant

    return {
        "avg#": len(significant),
        "FPR": int(bool(significant) and not target_found),
        "TPR": int(target_found),
    }

def aggregate_summaries(sum1, order=None):
    """Average the per-replicate indicators for each scenario and count replicates.

    Parameters
    ----------
    sum1 : pandas.DataFrame
        One row per replicate; the first index level identifies the scenario.
    order : sequence of str, optional
        Scenario labels in the desired row order. Defaults to the module-level
        ``PATHES_ORDER`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        One row per scenario with the column means of ``sum1`` plus a ``size``
        column holding the number of rows in each group. The index is an
        ordered ``CategoricalIndex`` sorted according to ``order``.
    """
    if order is None:
        order = PATHES_ORDER

    by_path = sum1.groupby(level=0)
    aggsum1 = by_path.mean()
    # Attach the group sizes directly (aligned on the group labels) instead of
    # going through reset_index()/merge(on="index"), which only worked when the
    # index level was unnamed and no column was called "index".
    aggsum1["size"] = by_path.size()

    # The index name "index" is what the previous reset_index/set_index round
    # trip produced; it is kept so the returned frame looks the same as before.
    aggsum1.index = pd.CategoricalIndex(
        aggsum1.index, categories=list(order), ordered=True, name="index"
    )
    return aggsum1.sort_index()

def get_confidences(sum1):
    """Build a table of Wilson confidence intervals for the FPR and TPR columns.

    For every scenario in ``PATHES_ORDER`` the rows of ``sum1`` whose first
    index level equals that scenario are selected; the column sum is passed to
    ``proportion_confint`` as the success count and the number of selected
    rows as the number of observations.

    Returns a DataFrame with one row per scenario and the columns
    ``FPR_confidence`` and ``TPR_confidence``; each cell holds the value
    returned by ``proportion_confint``.
    """

    def _wilson(rows, column):
        # Successes = sum of the 0/1 column, trials = number of selected rows.
        return proportion_confint(rows[column].sum(), rows.shape[0], method='wilson')

    intervals = {}
    for scenario in PATHES_ORDER:
        scenario_rows = sum1[sum1.index.isin([scenario], level=0)]
        intervals[scenario] = {
            f"{metric}_confidence": _wilson(scenario_rows, metric)
            for metric in ('FPR', 'TPR')
        }

    # Outer keys become columns, so transpose to get one row per scenario.
    return pd.DataFrame(intervals).T

In [53]:
# --- PASCAL: score every simulated replicate, then summarise per scenario ---
model = 'PASCAL'
results_TPR = {}
results_FPR = {}

# Keyed by (scenario, replicate number); each value is the dict of indicators
# returned by return_stats_pascal for that replicate.
summary_table1 = {}
for pathname in pathes:
    for i in range(ITER):
        index = f"{pathname}_{i}"
        file_name = f"output/SIM_{index}_{index}_gwas_rsid.PathwaySet--msigBIOCARTA_KEGG_REACTOME--sum.txt"

        data = pd.read_csv(file_name, sep='\t')
        # Element [1] of fdrcorrection's result is stored as the adjusted p-value column.
        data['pval_adj'] = fdrcorrection(data.empPvalue)[1]
        cur_res = return_stats_pascal(data, pathes[pathname], ALPHA)
        summary_table1[pathname, i] = cur_res

# Rows = (scenario, replicate), columns = indicators.
sum1 = pd.DataFrame(summary_table1).T
aggsum1 = aggregate_summaries(sum1)
aggsum1_conf = get_confidences(sum1)
aggregated = aggsum1.merge(aggsum1_conf, left_index=True, right_index=True)
display(aggregated)

# TPR is recorded for the three scenarios with a named target gene set,
# FPR only for the "path_random" scenario.
for path in ["path_small", "path_medium", "path_big"]:
    results_TPR[(model, path)] = (
        aggregated.TPR[path],
        *aggregated.TPR_confidence[path],
    )
for path in ["path_random"]:
    results_FPR[(model, path)] = (
        aggregated.FPR[path],
        *aggregated.FPR_confidence[path],
    )

Unnamed: 0,avg#,FPR,TPR,size,FPR_confidence,TPR_confidence
path_big,2.066667,0.166667,0.066667,30,"(0.07336542371848548, 0.33564350506416035)","(0.018477023791270378, 0.21323458362616926)"
path_medium,0.1,0.066667,0.0,30,"(0.018477023791270378, 0.21323458362616926)","(0.0, 0.1135133931739688)"
path_small,0.2,0.033333,0.133333,30,"(0.005908590381612455, 0.16670390991409176)","(0.05309655484054743, 0.296813266820363)"
path_random,0.0,0.0,0.0,30,"(0.0, 0.1135133931739688)","(0.0, 0.1135133931739688)"


In [54]:
# Display both result dicts; keys are (model, scenario) and each value is the
# estimate followed by the unpacked confidence-interval entries.
results_TPR, results_FPR

({('PASCAL', 'path_small'): (0.13333333333333333,
   0.05309655484054743,
   0.296813266820363),
  ('PASCAL', 'path_medium'): (0.0, 0.0, 0.1135133931739688),
  ('PASCAL', 'path_big'): (0.06666666666666667,
   0.018477023791270378,
   0.21323458362616926)},
 {('PASCAL', 'path_random'): (0.0, 0.0, 0.1135133931739688)})