# Enrichment NB 2: Within each group, calculate an adjusted p value for whether each protein is differentially expressed
1. Load saved tables
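2. For each event (gain, loss, both), split the tumor samples into the groups with and without that event
3. Test each protein in each group for differential expression against the normal samples (unpaired). The run below uses Welch's t-test; a permutation test is available as an alternative.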
4. Create a table with these columns:
    - group
    - cancer_type
    - protein
    - protein_str
    - change
    - p_val
    - adj_p
5. Save tables

## Setup

In [1]:
import os

import cptac.utils as ut
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import statsmodels
import statsmodels.stats.multitest

In [2]:
# Input directory holding the per-cancer-type proteomics/event tables, and the
# output directory for the differential expression results.
proteomics_dir = "proteomics_tables"
diff_expr_dir = "diff_expr"

# Create the output directory if needed. exist_ok avoids the check-then-create
# race of testing os.path.isdir first and then calling os.mkdir.
os.makedirs(diff_expr_dir, exist_ok=True)

In [3]:
# Cancer types analysed in this notebook. The other cohorts are left out:
#   brca                     - no normal samples
#   ccrcc, endometrial, gbm  - no event table
cancer_types = ["colon", "hnscc", "lscc", "luad", "ovarian"]

## Load the table for each cancer type and split out samples with and without each event

In [4]:
# One dict per sample group, each mapping cancer type -> dataframe of the tumor
# samples that belong to that group (event status columns removed)
with_gain, without_gain = {}, {}
with_loss, without_loss = {}, {}
with_both, without_both = {}, {}

# Normal samples for each cancer type (event status columns removed)
all_normal = {}

# The event status columns, which are dropped from every table that is saved
event_cols = ["gain_event", "loss_event"]

for cancer_type in cancer_types:

    # First try reading the table as though it has two header rows
    df_path = os.path.join(proteomics_dir, f"{cancer_type}_prot_event.tsv.gz")
    df = pd.read_csv(df_path, sep="\t", header=[0, 1], index_col=0, low_memory=False)

    if df.columns.names[1] == "Database_ID":
        # Two-level columns: flatten them into tuples, except where the second
        # level is the "nan" filler (the gain_event and loss_event columns),
        # in which case only the first level is kept
        tuples = [
            x[0] if x[1] == "nan" else (x[0], x[1])
            for x in df.columns.to_flat_index().tolist()
        ]
        df.columns = pd.Index(tuples)
    else:
        # The table has a single header row, so read it again that way
        df = pd.read_csv(df_path, sep="\t", index_col=0, low_memory=False)

    # Rows whose sample ID ends in ".N" are treated as normal, the rest as tumor
    normal_rows = df.index.str.endswith(".N")
    tumor = df[~normal_rows]
    normal = df[normal_rows].drop(columns=event_cols)
    all_normal[cancer_type] = normal

    # Tumor samples can only be grouped on an event whose status is not NaN
    gain_known = tumor["gain_event"].notna()
    loss_known = tumor["loss_event"].notna()

    gains = tumor[gain_known].astype({"gain_event": bool})
    losses = tumor[loss_known].astype({"loss_event": bool})

    # Samples with and without a gain event
    with_gain[cancer_type] = gains[gains["gain_event"]].drop(columns=event_cols)
    without_gain[cancer_type] = gains[~gains["gain_event"]].drop(columns=event_cols)

    # Samples with and without a loss event
    with_loss[cancer_type] = losses[losses["loss_event"]].drop(columns=event_cols)
    without_loss[cancer_type] = losses[~losses["loss_event"]].drop(columns=event_cols)

    # Samples with both events, versus samples missing at least one of them;
    # this needs the status of both events to be known
    both = tumor[gain_known & loss_known].astype({"gain_event": bool, "loss_event": bool})
    has_both = both["gain_event"] & both["loss_event"]

    with_both[cancer_type] = both[has_both].drop(columns=event_cols)
    without_both[cancer_type] = both[~has_both].drop(columns=event_cols)

## For each group, see which proteins are differentially expressed compared to normal

In [5]:
def compare_tumor_expr(tumor, normal, permutation, equal_var, num_permutations=10000):
    """Test whether tumor and normal have significantly different expression.

    Each protein (column) is tested independently. Proteins whose values are all null in either
    dataframe are skipped and do not appear in the output.

    Parameters:
    tumor (pandas.DataFrame): A dataframe where the rows are samples, and the columns are proteins.
    normal (pandas.DataFrame): A dataframe where the rows are samples, and the columns are proteins. Needs to
        have the same columns as the tumor dataframe.
    permutation (bool): If True, test with cptac.utils.permutation_test_means. If False, test with
        scipy.stats.ttest_ind (null values are omitted).
    equal_var (bool): If True, and permutation is False, then use Student's 2 sample independent t-test. If
        False, and permutation is False, use Welch's approximate t-test. If permutation is True, this parameter
        is irrelevant.
    num_permutations (int): Number of permutations passed to the permutation test. Only used when permutation
        is True. Previously this was read from an undefined global, so the permutation branch raised a
        NameError.

    Returns:
    results (pandas.DataFrame): A dataframe with a row for each tested protein, and a "protein" column
        identifying the protein, a "change" column with the change for that protein, and a "p_val" column with
        the p value for that change. For the t-test, the change is mean(tumor) - mean(normal) with null values
        ignored; for the permutation test, it is whatever cptac.utils.permutation_test_means returns as its
        first value. The p value can be NaN when a group has too few non-null values for the t-test.
    """

    proteins = []
    changes = []
    p_vals = []

    for protein in tumor.columns:

        # Get the data for the protein
        protein_tumor = tumor[protein]
        protein_normal = normal[protein]

        # Skip it if either array is all null, since there is nothing to compare
        if protein_tumor.isna().all() or protein_normal.isna().all():
            continue

        if permutation:
            change, p_val = ut.permutation_test_means(
                protein_tumor,
                protein_normal,
                num_permutations=num_permutations)
        else:
            # Only the p value is needed; the t statistic is discarded
            _, p_val = scipy.stats.ttest_ind(
                protein_tumor,
                protein_normal,
                equal_var=equal_var,
                nan_policy="omit"
            )

            # Series.mean skips null values, matching nan_policy="omit" above
            change = protein_tumor.mean() - protein_normal.mean()

        proteins.append(protein)
        changes.append(change)
        p_vals.append(p_val)

    results = pd.DataFrame({
        "protein": proteins,
        "change": changes,
        "p_val": p_vals})

    return results

In [6]:
# Map each group's name to its dict of per-cancer-type tumor dataframes.
# The names become the values of the "group" column in the results table.
groups = dict(
    with_gain=with_gain,
    without_gain=without_gain,
    with_loss=with_loss,
    without_loss=without_loss,
    with_both=with_both,
    without_both=without_both,
)


In [7]:
# Compare every group's tumor samples against the normal samples of the same
# cancer type, and stack the per-comparison tables into one results table.
# The tables are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulated table on every iteration.
result_frames = []

for group, group_dict in groups.items():

    for cancer_type, cancer_df in group_dict.items():
        normal_df = all_normal[cancer_type]

        # Welch's t-test (no permutation test, unequal variances)
        cancer_results = compare_tumor_expr(
            tumor=cancer_df,
            normal=normal_df,
            permutation=False,
            equal_var=False
        )

        # Label the rows; inserting at position 0 twice leaves the columns
        # ordered as group, cancer_type, then the test results
        cancer_results.insert(0, "cancer_type", cancer_type)
        cancer_results.insert(0, "group", group)

        result_frames.append(cancer_results)

        print(f"Finished {group} - {cancer_type}")

# pd.concat raises on an empty list, so fall back to an empty table
all_results = pd.concat(result_frames) if result_frames else pd.DataFrame()

Finished with_gain - colon
Finished with_gain - hnscc
Finished with_gain - lscc
Finished with_gain - luad
Finished with_gain - ovarian
Finished without_gain - colon
Finished without_gain - hnscc
Finished without_gain - lscc
Finished without_gain - luad
Finished without_gain - ovarian
Finished with_loss - colon
Finished with_loss - hnscc
Finished with_loss - lscc
Finished with_loss - luad
Finished with_loss - ovarian
Finished without_loss - colon
Finished without_loss - hnscc
Finished without_loss - lscc
Finished without_loss - luad
Finished without_loss - ovarian
Finished with_both - colon
Finished with_both - hnscc
Finished with_both - lscc
Finished with_both - luad
Finished with_both - ovarian
Finished without_both - colon
Finished without_both - hnscc
Finished without_both - lscc
Finished without_both - luad
Finished without_both - ovarian


In [8]:
# For rows with tuples as the protein names, split out just the protein names
# (the first element of the tuple). Other names are kept unchanged.
# isinstance is the idiomatic type check and also accepts tuple subclasses.
protein_str = all_results["protein"].apply(lambda x: x[0] if isinstance(x, tuple) else x)
all_results = all_results.assign(protein_str=protein_str)

## Multiple testing correction

In [9]:
# Benjamini-Hochberg FDR correction across every test in the results table.
#
# Only the non-NaN p values are corrected. The t-test can yield a NaN p value
# (e.g. when a group has too few non-null values for a protein), and a single
# NaN passed to multipletests turns every adjusted p value into NaN. Rows with
# a NaN p value keep a NaN adjusted p value and are never marked as rejected.
# When there are no NaN p values, the result is the same as correcting the
# whole column at once.
raw_p = all_results["p_val"].to_numpy(dtype=float)
testable = ~np.isnan(raw_p)

reject_null = np.zeros(raw_p.shape, dtype=bool)
adj_p = np.full(raw_p.shape, np.nan)
alpha_sidak = np.nan
alpha_bonf = np.nan

if testable.any():
    testable_reject, testable_adj_p, alpha_sidak, alpha_bonf = statsmodels.stats.multitest.multipletests(
        pvals=raw_p[testable],
        alpha=0.05,
        method="fdr_bh")

    reject_null[testable] = testable_reject
    adj_p[testable] = testable_adj_p

# adj_p is assigned by position. reset_index then replaces the repeating
# per-comparison row labels with a unique index, keeping the old labels as an
# "index" column.
all_results = all_results.\
    assign(adj_p=adj_p).\
    reset_index()

## Save results

In [10]:
# Write the full results table to the output directory as a gzipped,
# tab-separated file
results_file_path = os.path.join(diff_expr_dir, "all_results.tsv.gz")
all_results.to_csv(
    path_or_buf=results_file_path,
    sep="\t",
    compression="gzip",
)