In [1]:
import cnvutils

import numpy as np
import os

from cnvutils.filenames import (
    get_ttest_results_path,
)

from cnvutils.constants import (
    SIG_CUTOFF,
)

In [8]:
def groups_overlaps(
    chromosome,
    arm,
    gain_or_loss,
    cis_or_trans,
    proteomics_or_transcriptomics,
    comparison,
    source,
    level=None,
    data_dir=os.path.join(os.getcwd(), "..", "data"),
):
    
    if comparison == "has_event":
        comparison_name = "has_vs_not_has_event"
        groups = ["tumor", "normal"]
        
    elif comparison == "tumor":
        comparison_name = "tumor_vs_normal"
        groups = ["has_event", "not_has_event"]

    all_ttest = pd.DataFrame()
    for group in groups:
        
        group_path = get_ttest_results_path(
            data_dir=data_dir,
            source=source,
            level=level,
            chromosome=chromosome,
            arm=arm,
            gain_or_loss=gain_or_loss,
            cis_or_trans=cis_or_trans,
            proteomics_or_transcriptomics=proteomics_or_transcriptomics,
            group=group,
            comparison_name=comparison_name,
        )
        
        group_ttest = pd.\
        read_csv(group_path, sep="\t").\
        dropna(subset="adj_p")
        
        # We're going to combine adj_p and change for isoforms, weighted by sample
        # size (n) for each isoform
        
        # 1. Calculate sample sizes for each isoform
        group_ttest = group_ttest.assign(
            n=group_ttest["tumor_sample_size"] + group_ttest["not_tumor_sample_size"],
        ).\
        drop(columns=["tumor_sample_size", "not_tumor_sample_size"])
        
        # 2. Calculate the overall sample size for all isoforms in each protein
        group_ns = group_ttest.\
        groupby(["cancer_type", "protein"]).\
        apply(lambda x: x["n"].sum()).\
        rename("group_n")
        
        group_ttest = group_ttest.merge(
            right=group_ns,
            on=["protein", "cancer_type"],
            how="outer",
        )
        
        # 3. Calculate weights by dividing the sample size for each isoform by the 
        # overall sample size for the protein. Note that in this way, the weights within
        # each protein sum to 1.
        group_ttest = group_ttest.assign(weight=group_ttest["n"] / group_ttest["group_n"])
        
        # 4. Multiply the weights by the adj_p and change columns
        group_ttest = group_ttest.assign(
            adj_p=group_ttest["adj_p"] * group_ttest["weight"],
            change=group_ttest["change"] * group_ttest["weight"],
        )
        
        # 5. Sum the weighted adj_p and change columns to get the mean for isoforms
        # Proteins with only one form will be unaffected 
        # Because the weights were normalized to sum to 1 within each protein, we
        # don't have to divide the sums by the number of isoforms per protein
        group_ttest = group_ttest.\
        groupby(["protein", "cancer_type"])[["adj_p", "change"]].\
        sum().\
        reset_index(drop=False)
        
        # To capture whether a protein is up- or down-regulated, multiply p
        # values by the sign of the associated change
        group_ttest = group_ttest.assign(
            sign_adj_p=group_ttest["adj_p"] * np.sign(group_ttest["change"])
        )

        # Pivot the table to have one row per protein and one column per cancer
        group_ttest = group_ttest.pivot(
            index="protein",
            columns="cancer_type",
            values="sign_adj_p",
        ).\
        reset_index(drop=False).\
        assign(group=group)
        
        group_ttest.columns.name = None
        
        all_ttest = pd.concat([all_ttest, group_ttest])
        
    all_ttest = all_ttest.\
    reset_index(drop=True).\
    set_index(["group", "protein"])
    
    # Count how many proteins only show up in has_event or not_has_event, across
    # all cancer types aggregated together
    all_sig_count = (
        (all_ttest <= SIG_CUTOFF) &
        (all_ttest >= -SIG_CUTOFF)
    ).\
    sum(axis=1).\
    rename("sig_count").\
    reset_index(drop=False).\
    pivot(
        index="protein",
        columns="group",
        values="sig_count",
    ).\
    isna().\
    sum().\
    rename("all")
    
    all_sig_props = all_sig_count #/ all_ttest.shape[0]
    all_sig_props = all_sig_props.to_frame().transpose()
    
    # Do that for each cancer type
    for cancer_type in all_ttest.columns:
        
        cancer_ttest = all_ttest[[cancer_type]]
        
        cancer_sig_count = (
            (cancer_ttest <= SIG_CUTOFF) &
            (cancer_ttest >= -SIG_CUTOFF)
        ).\
        dropna().\
        reset_index(drop=False).\
        pivot(
            index="protein",
            columns="group",
            values=cancer_type,
        ).\
        isna().\
        sum().\
        rename(cancer_type)
        
        cancer_sig_prop = cancer_sig_count# / cancer_ttest.shape[0]
        
        all_sig_props = pd.concat([all_sig_props, cancer_sig_prop.to_frame().transpose()])
    
    return all_sig_props

res = groups_overlaps(
    chromosome=8,
    arm="p",
    gain_or_loss="loss",
    cis_or_trans="cis",
    proteomics_or_transcriptomics="proteomics",
    comparison="tumor",
    source="gistic",
    level="gene",
)

res

                                  coad
group         protein                 
has_event     ADAM28               NaN
              ADAM9       4.970439e-01
              ADAMDEC1   -1.422175e-06
              ADGRA2      9.329758e-01
              AGPAT5      3.484816e-01
              ANGPT2     -6.729069e-04
              ARHGEF10   -6.122925e-03
              ASAH1      -6.403538e-03
              ASH2L       9.589616e-06
              ATP6V1B2   -2.435455e-03
              BAG4       -6.719475e-01
              BIN3       -4.259459e-09
              BMP1        9.904116e-01
              BNIP3L               NaN
              BRF2       -1.852613e-01
              CCAR2       6.694730e-02
              CCDC25     -4.308169e-04
              CDCA2                NaN
              CHMP7      -1.626773e-06
              CLDN23               NaN
              CLN8                 NaN
              CLU        -1.333792e-06
              CNOT7       5.694846e-01
              CSGALNACT1 

group,has_event,not_has_event
all,8,0
coad,8,0
hnscc,8,0
lscc,8,0
luad,8,0
ov,8,0
pdac,8,0
