# Equivalence tests for trans genes

Here we look for genes OUTSIDE the event discussed that are NOT affected by the arm-level event.

## Setup (import necessary packages)

We will start by importing necessary packages and collecting all of the proteomics data we will need to run the tests.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import cptac
from scipy import stats
import cnvutils
import cptac.utils
import statsmodels.stats.power
import statsmodels.stats.weightstats
import statsmodels.stats.multitest

In [2]:
# These variables specify which chromosome and arm we're working on, and whether to do cis or trans effects
params = cnvutils.load_params(os.path.join("..", "data", "params.json"))
CHROMOSOME = params["CHROMOSOME"]
ARM = params["ARM"]
CIS_OR_TRANS = "trans"

# Single switch for which data source is used. It is passed to load_tables below, and the
# cell that reads the "has_event" files uses it to pick the "harmonized" (True) or "AWG"
# (False) file name, so the two choices can never disagree.
PANCAN = False

# Get data tables
CANCER_TYPES = params["CANCER_TYPES"]
data_types = ["proteomics"]
tables = cnvutils.load_tables(CANCER_TYPES, data_types, pancan=PANCAN)
proteomics = tables["proteomics"]

# Get event location data
event_metadata = cnvutils.load_params(os.path.join("..", "data", "event_metadata.json"))
EVENT_START = event_metadata["START"]
EVENT_END = event_metadata["END"]

# Name of the boolean column (in the has_event files) marking samples that carry the event
EVENT_COLUMN = "event"

                                            

## Select the proteins we're interested in

If we're looking at cis effects, we select proteins within the event. If we're looking at trans effects, we select proteins outside of the event.

Note that the cnvutils.get_event_genes function uses Ensembl gene IDs for the Database_ID column, while the proteomics dataframes that have a Database_ID column use RefSeq protein IDs. So, when we're selecting the genes we want, we ignore the Database_ID column if it is present, and just use gene names.

In [3]:
# Look up the genes on the requested side of the event (inside it for cis, outside it
# for trans) and keep each gene name once.
event_genes = cnvutils.get_event_genes(
    chrm=CHROMOSOME,
    event_start=EVENT_START,
    event_end=EVENT_END,
    cis_or_trans=CIS_OR_TRANS
)
selected_genes = event_genes["Name"].drop_duplicates(keep="first")

# Restrict every proteomics table to the selected genes. The tables are stored
# transposed (proteins as rows) after this cell.
for cancer_type, table in proteomics.items():
    by_protein = table.transpose()

    # Tables with a multi-level index are matched on the "Name" level only; any
    # other level (e.g. Database_ID) is ignored.
    if by_protein.index.nlevels == 1:
        is_selected = by_protein.index.isin(selected_genes)
    else:
        is_selected = by_protein.index.isin(selected_genes, level="Name")

    proteomics[cancer_type] = by_protein[is_selected]

## Append Event Data

We now append the data from the event table that should have been created in a previous notebook.

In [4]:
# For each cancer type: the boolean event column, indexed like the joined proteomics table
has_event = dict()
for cancer_type in proteomics.keys():

    # The selection cell left the tables with proteins as rows; transpose back so that
    # proteins are columns and the table can be joined to the event table by row index.
    df = proteomics[cancer_type]
    df = df.transpose()

    # NOTE: PANCAN must be defined before this cell runs; it selects the file suffix.
    event = pd.read_csv(
        os.path.join(
            "..", 
            "data", 
            f"chr{CHROMOSOME}_{cancer_type}_has_event_{'harmonized' if PANCAN else 'AWG'}.tsv"
        ), 
        sep='\t', 
        index_col=0,
        dtype={EVENT_COLUMN: bool}
    )

    # The join aligns on index values, not on the index name, so the event table's index
    # is used as read. (A bare `event.index.rename(...)` returns a new Index and changes
    # nothing unless its result is assigned, so no such call is made here.)
    # how="inner" keeps only rows present in both tables.
    df = df.join(other=event, how="inner")

    has_event[cancer_type] = df[EVENT_COLUMN]
    proteomics[cancer_type] = df

  sort=sort,


## Run equivalence tests

To determine the upper and lower bounds for our equivalence tests, we use power calculations to estimate, for each gene, the minimum effect size we could have detected in the first place. Note that the power calculation is for Student's t-test, while our TOST equivalence tests use Welch's t-test. This is acceptable because Student's t-test is more powerful than Welch's, so the minimum detectable effect size will be underestimated; that makes the equivalence bounds narrower (stricter) rather than wider, so it does not hurt our accuracy. If the bounds end up being too strict, we'll alter our approach.

In [5]:
# Wide table of adjusted p-values, built up one cancer type at a time: one row per
# protein and one "<cancer_type>_pvalue" column per cancer type.
results_df = None

for cancer_type, prot_df in proteomics.items():

    # Row masks for the two groups. They do not depend on the protein being tested,
    # so they are computed once per cancer type.
    with_event = prot_df[EVENT_COLUMN]
    without_event = ~with_event

    tested = []      # proteins whose TOST returned a p-value
    raw_pvals = []   # the matching unadjusted p-values, in the same order
    untestable = []  # proteins whose TOST returned a null p-value

    # Every column except the event column is a protein to test
    is_event_column = prot_df.columns == EVENT_COLUMN
    for prot in prot_df.columns[~is_event_column]:

        # Selecting with a one-element list and then taking the first column always
        # yields a Series, even when the column label is not unique.
        in_event = prot_df.loc[with_event, [prot]].iloc[:, 0].dropna()
        out_event = prot_df.loc[without_event, [prot]].iloc[:, 0].dropna()

        # Minimum detectable effect size, used to set the TOST bounds.
        # Rearranged from the two-sample Student's t-test power formula in The Analysis
        # of Biological Data by Whitlock and Schluter, 2nd edition (2015), Roberts and
        # Company Publishers, pg. 444:
        #     sample_size = 16 * (stdev / min_effect) ^ 2
        # =>  min_effect  = 4 * stdev / sqrt(sample_size)
        #
        # It is computed separately for each group from that group's own standard
        # deviation and sample size (the group locations may differ even when the
        # spreads are similar, and the group sizes usually differ), then averaged.
        per_group_effects = [
            4 * np.std(group) / np.sqrt(group.size)
            for group in (in_event, out_event)
        ]
        min_effect = np.mean(per_group_effects)

        # Equivalence margin: 1.25 times the minimum effect, symmetric around zero
        bound = 1.25 * min_effect

        # TOST equivalence test. usevar="unequal" selects Welch's t test, which does
        # not assume equal variance between the groups.
        tost = statsmodels.stats.weightstats.ttost_ind(
            x1=in_event,
            x2=out_event,
            low=-bound,
            upp=bound,
            usevar="unequal"
        )
        pval = tost[0]  # the remaining elements (the two one-sided results) are unused

        if pd.isnull(pval):
            untestable.append(prot)
        else:
            tested.append(prot)
            raw_pvals.append(pval)

    # Multiple testing correction (Benjamini-Hochberg FDR) across this cancer type's
    # proteins; element 1 of the returned tuple holds the corrected p-values.
    adjusted_pvals = statsmodels.stats.multitest.multipletests(
        pvals=raw_pvals,
        alpha=0.05,
        method="fdr_bh"
    )[1]

    pvalue_column = f"{cancer_type}_pvalue"
    results = pd.DataFrame({"Comparison": tested, pvalue_column: adjusted_pvals})
    results = results.set_index("Comparison")

    if isinstance(results.index[0], tuple):
        # Column labels were tuples: split them into a gene name and a
        # cancer-specific database ID, and index by both.
        id_column = f"{cancer_type}_Database_ID"
        label_parts = pd.DataFrame(
            results.index.values.tolist(),
            index=results.index
        )
        results[["Name", id_column]] = label_parts
        results = results.set_index(["Name", id_column])
    else:
        results.index.name = "Name"

    # NOTE(review): DataFrame.join defaults to how="left", so proteins that are missing
    # from the first cancer type's results are not added by later cancer types --
    # confirm that this is intended.
    results_df = results if results_df is None else results_df.join(results)

  return self.sum / self.sum_weights
  return self.sumsquares / self.sum_weights
  return np.sqrt(d1._var / (d1.nobs - 1) + d2._var / (d2.nobs - 1))
  sem1 = d1._var / (d1.nobs - 1)
  sem2 = d2._var / (d2.nobs - 1)


## Reformat and save results

In [6]:
# Flatten the index: the gene name becomes the only index level ("protein"), and any
# other index levels (the per-cancer Database_ID levels) become ordinary columns.
results_df = (
    results_df
    .reset_index(drop=False)
    .rename(columns={"Name": "protein"})
    .set_index("protein")
)

# Build one long-format table per cancer type and concatenate them once at the end.
# (DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; collecting
# the pieces and calling pd.concat works on old and new versions alike.)
cancer_frames = []

for cancer_type in CANCER_TYPES:
    # Keep only this cancer type's columns, and drop proteins with no values in them
    cancer_df = (
        results_df
        .loc[:, results_df.columns.str.startswith(cancer_type)]
        .dropna(axis="index", how="all")
        .reset_index(drop=False)
    )

    # If the cancer type has database IDs, make a separate column that has them.
    # If not, create a column of NaNs (so that the tables all match)
    if f"{cancer_type}_Database_ID" in cancer_df.columns:
        cancer_df = cancer_df.rename(columns={f"{cancer_type}_Database_ID": "Database_ID"})
    else:
        cancer_df = cancer_df.assign(Database_ID=np.nan)

    # Rename the p-value column to not have the cancer type, and record the cancer
    # type in its own column instead
    cancer_df = (
        cancer_df
        .rename(columns={f"{cancer_type}_pvalue": "adj_p"})
        .assign(cancer_type=cancer_type)
    )

    # Reorder the columns
    cancer_df = cancer_df[["cancer_type", "protein", "Database_ID", "adj_p"]]

    cancer_frames.append(cancer_df)

# pd.concat raises on an empty list, so fall back to an empty table in that case
long_results = pd.concat(cancer_frames) if cancer_frames else pd.DataFrame()

# Remove duplicated rows and reset the index.
# keep=False marks *every* copy of a duplicated row, so all copies are removed.
# NOTE(review): confirm that dropping all copies (rather than keeping one) is intended.
long_results = long_results[~long_results.duplicated(keep=False)].reset_index(drop=True)

In [7]:
# Write the long-format results as a tab-separated file in the "data" directory that
# sits next to this notebook's directory ("../data"). The file name encodes the
# chromosome, the arm, and whether these are cis or trans results.
# Change save_path to write somewhere else.
output_name = f"{CHROMOSOME}{ARM}_{CIS_OR_TRANS}_equiv.tsv"
save_path = os.path.join("..", "data", output_name)
long_results.to_csv(save_path, sep="\t", index=False)