In [1]:
import cnvutils

import numpy as np
import os

from cnvutils.filenames import (
    get_ttest_results_path,
)

from cnvutils.constants import (
    SIG_CUTOFF,
)

In [2]:
def _get_protein_n(ttest_df):
    return ttest_df.\
    notna().\
    any(axis=1).\
    reset_index(drop=False).\
    pivot(index="protein", columns="group", values=0).\
    any(axis=1).\
    sum()

In [3]:
def groups_overlaps(
    chromosome,
    arm,
    gain_or_loss,
    cis_or_trans,
    proteomics_or_transcriptomics,
    comparison,
    source,
    level=None,
    data_dir=os.path.join(os.getcwd(), "..", "data"),
):
    
    if comparison == "has_event":
        comparison_name = "has_vs_not_has_event"
        groups = ["tumor", "normal"]
        
    elif comparison == "tumor":
        comparison_name = "tumor_vs_normal"
        groups = ["has_event", "not_has_event"]

    all_ttest = pd.DataFrame()
    for group in groups:
        
        group_path = get_ttest_results_path(
            data_dir=data_dir,
            source=source,
            level=level,
            chromosome=chromosome,
            arm=arm,
            gain_or_loss=gain_or_loss,
            cis_or_trans=cis_or_trans,
            proteomics_or_transcriptomics=proteomics_or_transcriptomics,
            group=group,
            comparison_name=comparison_name,
        )
        
        group_ttest = pd.\
        read_csv(group_path, sep="\t").\
        dropna(subset="adj_p")
        
        # We're going to combine adj_p and change for isoforms, weighted by sample
        # size (n) for each isoform
        
        # 1. Calculate sample sizes for each isoform
        group_ttest = group_ttest.assign(
            n=group_ttest["tumor_sample_size"] + group_ttest["not_tumor_sample_size"],
        ).\
        drop(columns=["tumor_sample_size", "not_tumor_sample_size"])
        
        # 2. Calculate the overall sample size for all isoforms in each protein
        group_ns = group_ttest.\
        groupby(["cancer_type", "protein"]).\
        apply(lambda x: x["n"].sum()).\
        rename("group_n")
        
        group_ttest = group_ttest.merge(
            right=group_ns,
            on=["protein", "cancer_type"],
            how="outer",
        )
        
        # 3. Calculate weights by dividing the sample size for each isoform by the 
        # overall sample size for the protein. Note that in this way, the weights within
        # each protein sum to 1.
        group_ttest = group_ttest.assign(weight=group_ttest["n"] / group_ttest["group_n"])
        
        # 4. Multiply the weights by the adj_p and change columns
        group_ttest = group_ttest.assign(
            adj_p=group_ttest["adj_p"] * group_ttest["weight"],
            change=group_ttest["change"] * group_ttest["weight"],
        )
        
        # 5. Sum the weighted adj_p and change columns to get the mean for isoforms
        # Proteins with only one form will be unaffected 
        # Because the weights were normalized to sum to 1 within each protein, we
        # don't have to divide the sums by the number of isoforms per protein
        group_ttest = group_ttest.\
        groupby(["protein", "cancer_type"])[["adj_p", "change"]].\
        sum().\
        reset_index(drop=False)
        
        # To capture whether a protein is up- or down-regulated, multiply p
        # values by the sign of the associated change
        group_ttest = group_ttest.assign(
            sign_adj_p=group_ttest["adj_p"] * np.sign(group_ttest["change"])
        )

        # Pivot the table to have one row per protein and one column per cancer
        group_ttest = group_ttest.pivot(
            index="protein",
            columns="cancer_type",
            values="sign_adj_p",
        ).\
        reset_index(drop=False).\
        assign(group=group)
        
        group_ttest.columns.name = None
        
        all_ttest = pd.concat([all_ttest, group_ttest])
        
    all_ttest = all_ttest.\
    reset_index(drop=True).\
    set_index(["group", "protein"])

    # Count how many proteins only show up in has_event or not_has_event, across
    # all cancer types aggregated together
    all_sig = ((
            (all_ttest <= SIG_CUTOFF) &
            (all_ttest >= -SIG_CUTOFF)
        ).\
        sum(axis=1).\
        rename("sig_count").\
        reset_index(drop=False).\
        pivot(
            index="protein",
            columns="group",
            values="sig_count",
    ) > 0)
    
    has_uniq = all_sig["has_event"] & ~all_sig["not_has_event"]
    not_has_uniq = ~all_sig["has_event"] & all_sig["not_has_event"]
    shared = all_sig["has_event"] & all_sig["not_has_event"]
    neither = ~all_sig["has_event"] & ~all_sig["not_has_event"]
    
    all_counts = pd.DataFrame({
        "has_uniq": [has_uniq.sum()],
        "not_has_uniq": [not_has_uniq.sum()],
        "shared": shared.sum(),
        "neither": neither.sum(),
        "n": _get_protein_n(all_ttest),
    }, index=["all"])
    
    # Do that for each cancer type
    for cancer_type in all_ttest.columns:
        
        cancer_ttest = all_ttest[[cancer_type]]
#         return cancer_ttest
        cancer_sig = (
            (cancer_ttest <= SIG_CUTOFF) &
            (cancer_ttest >= -SIG_CUTOFF)
        ).\
        dropna().\
        reset_index(drop=False).\
        pivot(
            index="protein",
            columns="group",
            values=cancer_type,
        ).\
        fillna(False)
        
        has_uniq = cancer_sig["has_event"] & ~cancer_sig["not_has_event"]
        not_has_uniq = ~cancer_sig["has_event"] & cancer_sig["not_has_event"]
        shared = cancer_sig["has_event"] & cancer_sig["not_has_event"]
        neither = ~cancer_sig["has_event"] & ~cancer_sig["not_has_event"]

        cancer_counts = pd.DataFrame({
            "has_uniq": [has_uniq.sum()],
            "not_has_uniq": [not_has_uniq.sum()],
            "shared": shared.sum(),
            "neither": neither.sum(),
            "n": _get_protein_n(cancer_ttest),
        }, index=[cancer_type])
        
        all_counts = pd.concat([all_counts, cancer_counts])
        
    all_counts = all_counts.assign(
        has_uniq_prop=all_counts["has_uniq"] / all_counts["n"],
        not_has_prop=all_counts["not_has_uniq"] / all_counts["n"],
        shared_prop=all_counts["shared"] / all_counts["n"],
        neither_prop=all_counts["neither"] / all_counts["n"],
    )
    
    return all_counts

res = groups_overlaps(
    chromosome=8,
    arm="p",
    gain_or_loss="loss",
    cis_or_trans="cis",
    proteomics_or_transcriptomics="proteomics",
    comparison="tumor",
    source="gistic",
    level="gene",
)

res

Unnamed: 0,has_uniq,not_has_uniq,shared,neither,n,has_uniq_prop,not_has_prop,shared_prop,neither_prop
all,2,20,112,8,142,0.014085,0.140845,0.788732,0.056338
coad,11,18,39,74,91,0.120879,0.197802,0.428571,0.813187
hnscc,11,24,50,57,122,0.090164,0.196721,0.409836,0.467213
lscc,13,16,78,35,129,0.100775,0.124031,0.604651,0.271318
luad,14,38,59,31,129,0.108527,0.294574,0.457364,0.24031
ov,15,0,0,127,80,0.1875,0.0,0.0,1.5875
pdac,1,61,14,66,120,0.008333,0.508333,0.116667,0.55


In [4]:
res.drop(index=["all", "ov"]).mean()

has_uniq          10.000000
not_has_uniq      31.400000
shared            48.000000
neither           52.600000
n                118.200000
has_uniq_prop      0.085736
not_has_prop       0.264292
shared_prop        0.403418
neither_prop       0.468406
dtype: float64