# Enrichment NB 01: Calculate differentially expressed proteins in event groups

1. Group samples into those with the arm level event and those without it
    1. Load proteomics tables
    2. Read in the has_event tables
    3. Join into the proteomics tables
2. Within each group, calculate an adjusted p value for whether each protein is differentially expressed
    1. Load saved tables
    2. Slice out the "with" and "without" groups
    3. Test each protein in each group for differential expression against the normal samples (unpaired), using either a permutation test or a t-test (which? The code below currently uses Welch's t-test)
    4. Create a table with these columns:
        - group
        - cancer_type
        - protein
        - protein_str
        - change
        - p_val
        - adj_p
3. Look for pathways enriched with proteins that are only shown differentially expressed in one group
    1. Load saved table
    2. For each cancer type:
        1. Separately select proteins that only have a significant difference in samples with the event, and proteins that only have a significant difference in samples without the event.
        2. Also select proteins that are significantly upregulated in samples with the event but downregulated in samples without the event, and vice versa
        3. For each group, run an enrichment analysis
            1. Which algorithm?
            2. Which gene set library? Probably either GO or Reactome.
            3. Rank by difference in expression (should we take absolute value? Check)

## Setup

In [1]:
import os

import cptac
import cptac.utils as ut
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import statsmodels
import statsmodels.stats.multitest

In [2]:
# Significance cutoff applied to the adjusted p values in Part 3
alpha = 0.05

# Directory holding the "<cancer_type>_has_event.tsv" tables, and the
# directory that this notebook writes its result tables into
events_dir = ".."
output_dir = "results_01_grouped_expr"

# exist_ok=True makes this a no-op when the directory is already present,
# without a separate (race-prone) existence check beforehand
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Map each cancer type to its cptac dataset class. Only the classes are stored
# here; each dataset is instantiated later, one at a time, to save RAM.
#
# Cancer types left out, and why:
#   brca        -- no normal samples
#   ccrcc       -- no event table
#   endometrial -- no event table
#   gbm         -- no event table
dss = dict(
    colon=cptac.Colon,
    hnscc=cptac.Hnscc,
    lscc=cptac.Lscc,
    luad=cptac.Luad,
    ovarian=cptac.Ovarian,
)

## Part 1

### Load and join tables for each cancer type

In [4]:
def load_and_join_prot(dataset, event_table_dir):
    """Load one cancer type's proteomics table and join its event table onto it.

    The dataset is instantiated inside this function, not by the caller, so
    that it passes out of scope when the function ends and can be garbage
    collected, which saves RAM.

    Parameters:
    dataset (callable): A cptac dataset class (e.g. cptac.Colon). It is called as dataset(no_internet=True).
    event_table_dir (str): Directory containing the tab-separated file "<cancer_type>_has_event.tsv", where
        <cancer_type> is the value returned by the dataset's get_cancer_type().

    Returns:
    joined (pandas.DataFrame): The proteomics table, left-joined on its index with the columns of the event
        table. If the proteomics table has two column levels, the event columns are given the levels
        ("Name", "Database_ID"), with the string "nan" as a filler in the second level.
    """

    ds = dataset(no_internet=True)
    prot = ds.get_proteomics()

    event_table_path = os.path.join(event_table_dir, f"{ds.get_cancer_type()}_has_event.tsv")
    event_table = pd.read_csv(event_table_path, sep="\t", index_col=0)

    # Add a filler second column level to the event data if the protein table
    # has two levels, so that the two tables can be joined
    if prot.columns.nlevels == 2:
        event_table.columns.name = "Name"
        event_table = event_table.transpose().reset_index(drop=False)

        # A scalar is broadcast to every row, so this works for any number of
        # event columns rather than exactly two
        event_table.insert(1, "Database_ID", "nan")

        event_table = event_table.\
            set_index(["Name", "Database_ID"]).\
            transpose()

    joined = prot.join(other=event_table, how="left")

    return joined

In [5]:
# Joined proteomics + event table for every cancer type, keyed by cancer type
prot_and_events = {
    cancer_type: load_and_join_prot(dataset=dataset_func, event_table_dir=events_dir)
    for cancer_type, dataset_func in dss.items()
}

Loading lscc v3.2.             



Loading luad v3.1.1..       



                               

## Part 2

### Split out samples with and without each event

In [6]:
# Names of the event flag columns that were joined onto the proteomics tables
event_cols = ["gain_event", "loss_event"]

# One dictionary per sample group, each keyed by cancer type
with_gain, without_gain = {}, {}
with_loss, without_loss = {}, {}
with_both, without_both = {}, {}
all_normal = {}

for cancer_type in dss:

    table = prot_and_events[cancer_type]

    # Two-level column headers are flattened (in place) into tuples; the event
    # columns, whose second level is the "nan" filler, become plain strings
    if table.columns.nlevels == 2:
        flat_cols = []
        for col in table.columns.to_flat_index().tolist():
            if col[1] != "nan":
                flat_cols.append(tuple([col[0], col[1]]))
            else:
                flat_cols.append(col[0])
        table.columns = pd.Index(flat_cols)

    # Normal samples are the ones whose index label ends with ".N"
    is_normal = table.index.str.endswith(".N")
    tumor_rows = table[~is_normal]
    all_normal[cancer_type] = table[is_normal].drop(columns=event_cols)

    # Tumor samples with a missing event status are left out of that event's groups
    gain_known = pd.notnull(tumor_rows["gain_event"])
    loss_known = pd.notnull(tumor_rows["loss_event"])

    # Gain event: samples with it vs. samples without it
    gain_rows = tumor_rows[gain_known].astype({"gain_event": bool})
    has_gain = gain_rows["gain_event"]
    with_gain[cancer_type] = gain_rows[has_gain].drop(columns=event_cols)
    without_gain[cancer_type] = gain_rows[~has_gain].drop(columns=event_cols)

    # Loss event: samples with it vs. samples without it
    loss_rows = tumor_rows[loss_known].astype({"loss_event": bool})
    has_loss = loss_rows["loss_event"]
    with_loss[cancer_type] = loss_rows[has_loss].drop(columns=event_cols)
    without_loss[cancer_type] = loss_rows[~has_loss].drop(columns=event_cols)

    # Both events: samples with both vs. samples lacking at least one of them,
    # restricted to samples where both statuses are known
    both_rows = tumor_rows[gain_known & loss_known].\
        astype({"gain_event": bool, "loss_event": bool})
    has_both = both_rows["gain_event"] & both_rows["loss_event"]
    with_both[cancer_type] = both_rows[has_both].drop(columns=event_cols)
    without_both[cancer_type] = both_rows[~has_both].drop(columns=event_cols)

### For each group, see which proteins are differentially expressed compared to normal

In [7]:
def compare_tumor_expr(tumor, normal, permutation, equal_var, num_permutations=None):
    """Test whether tumor and normal have significantly different expression.
    
    Parameters:
    tumor (pandas.DataFrame): A dataframe where the rows are samples, and the columns are proteins.
    normal (pandas.DataFrame): A dataframe where the rows are samples, and the columns are proteins. Needs to 
        have the same columns as the tumor dataframe.
    permutation (bool): If True, test with permutation tests. If False, test with scipy.stats.ttest_ind.
    equal_var (bool): If True, and permutation is False, then use Student's 2 sample independent t-test. If 
        False, and permutation is False, use Welch's approximate t-test. If permutation is True, this parameter 
        is irrelevant.
    num_permutations (int, optional): Number of permutations to run when permutation is True. If None, a 
        module-level NUM_PERMUTATIONS is used when one is defined, and 10000 otherwise. Ignored when 
        permutation is False.
        
    Returns:
    results (pandas.DataFrame): A dataframe with a row for each tested protein, and the columns "protein" 
        (the column label of the protein), "change" (for the t-tests, np.mean(tumor) - np.mean(normal); for the 
        permutation test, the first value returned by cptac.utils.permutation_test_means), "p_val" (the p value 
        for that change), and "t_stat" (the t statistic, or NaN when permutation is True). Proteins that are 
        all null in either table, or whose t-test gives no usable p value, are left out.
    """
    
    if permutation and num_permutations is None:
        # This function used to read a module-level NUM_PERMUTATIONS that the file never 
        # defines, which raised a NameError. Still honor that name if a caller has set it.
        num_permutations = globals().get("NUM_PERMUTATIONS", 10000)
    
    proteins = []
    changes = []
    p_vals = []
    t_stats = []
    
    for protein in tumor.columns:
        
        # Get the data for the protein
        protein_tumor = tumor[protein]
        protein_normal = normal[protein]
        
        # Skip it if either array is all null
        if protein_tumor.isna().all() or protein_normal.isna().all():
            continue
        
        if permutation:
            change, p_val = ut.permutation_test_means(
                protein_tumor,
                protein_normal,
                num_permutations=num_permutations)
            
            t_stat = np.nan
        else:
            t_stat, p_val = scipy.stats.ttest_ind(
                protein_tumor,
                protein_normal,
                equal_var=equal_var,
                nan_policy="omit"
            )
            
            # Skip proteins without enough values. Depending on the scipy version, the p value
            # then comes back either masked (printed as "--") or as NaN; a NaN left in the 
            # results would also spoil any later multiple testing correction.
            # TODO: Should we set a cutoff of a minimum number of reads for each protein that we include?
            if np.ma.is_masked(p_val) or np.isnan(p_val):
                continue
            
            # Store plain floats rather than 0-d (masked) arrays
            t_stat = float(t_stat)
            p_val = float(p_val)
            
            # np.mean on a pandas Series skips missing values
            change = np.mean(protein_tumor) - np.mean(protein_normal)
        
        proteins.append(protein)
        changes.append(change)
        p_vals.append(p_val)
        t_stats.append(t_stat)
        
    results = pd.DataFrame({
        "protein": proteins,
        "change": changes, 
        "p_val": p_vals,
        "t_stat": t_stats
    })
    
    return results

In [8]:
# Every sample group to test, keyed by the group name that is written to the
# results table; each value is a dictionary of dataframes keyed by cancer type
groups = dict(
    with_gain=with_gain,
    without_gain=without_gain,
    with_loss=with_loss,
    without_loss=without_loss,
    with_both=with_both,
    without_both=without_both,
)


In [9]:
# Collect the per-group results in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# also re-copied the whole table on every iteration.
result_frames = []

for group, group_dict in groups.items():
    
    for cancer_type, cancer_df in group_dict.items():
        
        # Compare this group's tumor samples to the same cancer type's normal
        # samples with Welch's t-test
        cancer_results = compare_tumor_expr(
            tumor=cancer_df,
            normal=all_normal[cancer_type],
            permutation=False,
            equal_var=False
        )
        
        # Label the rows; inserting at position 0 twice leaves "group" first
        # and "cancer_type" second
        cancer_results.insert(0, "cancer_type", cancer_type)
        cancer_results.insert(0, "group", group)
        
        result_frames.append(cancer_results)
        
        print(f"Finished {group} - {cancer_type}")

# ignore_index gives the combined table one unique row label per row, instead
# of repeating each per-group 0..n-1 index
if result_frames:
    all_results = pd.concat(result_frames, ignore_index=True)
else:
    all_results = pd.DataFrame()

Finished with_gain - colon
Finished with_gain - hnscc
Finished with_gain - lscc
Finished with_gain - luad
Finished with_gain - ovarian
Finished without_gain - colon
Finished without_gain - hnscc
Finished without_gain - lscc
Finished without_gain - luad
Finished without_gain - ovarian
Finished with_loss - colon
Finished with_loss - hnscc
Finished with_loss - lscc
Finished with_loss - luad
Finished with_loss - ovarian
Finished without_loss - colon
Finished without_loss - hnscc
Finished without_loss - lscc
Finished without_loss - luad
Finished without_loss - ovarian
Finished with_both - colon
Finished with_both - hnscc
Finished with_both - lscc
Finished with_both - luad
Finished with_both - ovarian
Finished without_both - colon
Finished without_both - hnscc
Finished without_both - lscc
Finished without_both - luad
Finished without_both - ovarian


In [10]:
# For rows with tuples as the protein identifiers, keep only the first element
# of the tuple (taken to be the protein name); other identifiers are kept as is
protein_str = all_results["protein"].apply(lambda x: x[0] if isinstance(x, tuple) else x)
all_results = all_results.assign(protein_str=protein_str)

### Multiple testing correction

In [11]:
# Benjamini-Hochberg FDR correction over every test from all groups and cancer
# types at once. The shared alpha from the setup cell is used so that the
# reject_null flags always match the cutoff applied later in Part 3.
reject_null, adj_p, alpha_sidak, alpha_bonf = statsmodels.stats.multitest.multipletests(
    pvals=all_results["p_val"],
    alpha=alpha,
    method="fdr_bh")

all_results = all_results.assign(adj_p=adj_p)

### Save all group results

In [12]:
# Write the full results table as a gzipped, tab-separated file, leaving out
# the row index
all_results_path = os.path.join(output_dir, "all_results.tsv.gz")
all_results.to_csv(
    all_results_path,
    index=False,
    sep="\t",
    compression="gzip",
)

## Part 3

### Keep only proteins whose adjusted p value for differential expression is at or below the cutoff

In [13]:
# Keep only the rows whose adjusted p value is at or below the cutoff
is_significant = all_results["adj_p"] <= alpha
all_results = all_results[is_significant]

### Find which proteins were uniquely differentially expressed in each group (compared to the group's complement)

In [14]:
# Each event group paired with its complement
group_pairs = [
    ("with_gain", "without_gain"),
    ("with_loss", "without_loss"),
    ("with_both", "without_both"),
]

# Collect the pieces in a list and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0)
unique_frames = []

for cancer_type in all_results["cancer_type"].unique():
    
    # Get data for this cancer type
    cancer_df = all_results[all_results["cancer_type"] == cancer_type]
    
    for with_name, without_name in group_pairs:
        
        with_df = cancer_df[cancer_df["group"] == with_name]
        without_df = cancer_df[cancer_df["group"] == without_name]
        
        # Proteins that are significantly changed (in either direction) in samples 
        # with the event, but not in samples without it
        only_with = with_df[~with_df["protein"].isin(without_df["protein"])]
        
        # Proteins that are significantly changed (in either direction) in samples 
        # without the event, but not in samples with it
        only_without = without_df[~without_df["protein"].isin(with_df["protein"])]
        
        # Same output order as before: "with" first, then "without", for each pair
        for frame in (only_with, only_without):
            if not frame.empty:
                unique_frames.append(frame)

# NOTE: this rebinds the name "groups", which earlier held the dictionary of 
# sample groups; the saving step below expects the dataframe under this name.
if unique_frames:
    groups = pd.concat(unique_frames)
else:
    # Nothing was unique to any group: keep an empty table that still has the
    # result columns, so that later steps can look them up
    groups = all_results.iloc[:0].copy()

### Save results of proteins unique to each group

In [15]:
# Add a leading "<cancer_type>_<group>" column, which is easier to group on
# in the next step
cancer_type_group = groups["cancer_type"] + "_" + groups["group"]
groups.insert(0, "cancer_type_group", cancer_type_group)

# Write the table as a gzipped, tab-separated file, leaving out the row index
groups_file = os.path.join(output_dir, "unique_results.tsv.gz")
groups.to_csv(
    groups_file,
    index=False,
    sep="\t",
    compression="gzip",
)