In [3]:
import pandas as pd
import numpy as np
import scipy.stats
import collections

import CPTAC.Endometrial as en

Welcome to the CPTAC data service package. Available datasets may be
viewed using CPTAC.list_data(). In order to access a specific data
set, import a CPTAC subfolder using either 'import CPTAC.Dataset' or
'from CPTAC import Dataset'.
******
Version: 0.3.1
******
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [8]:
'''
@Param df:
    A dataframe containing the label column, and one or more real valued comparison columns.
    
@Param label_column:
    The name of the label column. This column must be in the dataframe, and must contain exactly 2 unique values.
    
@Param comparison_columns:
    A list of columns on which t-tests will be performed. Each column must be in the dataframe, and must be real valued.

@Param alpha (default = .05):
    Significance level. Will be adjusted using bonferroni correction if more than 1 comparison is done.
    
@Param verbose (default = False):
    Boolean. If true, will print p-value of every comparison, whether or not it meets significance cutoff.

@Return:
    A pandas dataframe of column names and corresponding p-values which were determined to be significant in 
    the comparison, sorted by significance (smallest p-values at the head). The 2 columns of the dataframe are 
    'Comparison' and 'P_Value'.
    Returns None if dataframe was not formatted properly, or if no comparison was significant.
    
This method takes as a parameter a dataframe. Must be formatted in the following way. 1 column declared as the label column, with 
the name of this column passed in as the second parameter. The Label column must contain exactly 2 unique entries,
and every row in the dataframe must have one of these 2 values in this column. The remaining columns will be real 
valued columns on which t-tests will be done. A list of real valued columns on which to do t-tests will be passed in 
as the third parameter. No t-test will be done on columns not included in this list.

The wrap_ttest method will then compare the two groups, as partitioned by the two values in the Label column, and 
perform t-tests for each real valued column in the passed in list, generating a p-value.
The cutoff for significance will be determined using a bonferroni correction, and the significant columns, 
with their p-values, will be returned as a dataframe, sorted by p-value.
'''

def wrap_ttest(df, label_column, comparison_columns, alpha=.05, verbose=False):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
        
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]
        
        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)
        
        '''Use a bonferroni correction to adjust for multiple testing by altering the p-value needed for acceptance'''
        bonferroni_cutoff = alpha/number_of_comparisons
        
        '''Store significant comparisons with their p-values in a dictionary'''
        significant_comparisons = {}
        
        '''Loop through each comparison column, perform the t-test, and determine whether it meets the significance cutoff'''
        for column in comparison_columns:
            stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
            if verbose:
                print(column, ": ", pval)
            if pval <= bonferroni_cutoff:
                significant_comparisons[column] = pval
        
        '''If no comparison met the significance cutoff, notify that no comparison was signficant, and return None'''
        if len(significant_comparisons) == 0:
            print("No significant comparisons.")
            return None
        
            '''If one or more comparison did meet the significance cutoff, sort the dictionary by significance and return it to the caller'''
        else:
            '''Sort dictionary to list smallest p-values first'''
            sorted_significant_comparisons = sorted(significant_comparisons.items(), key=lambda kv: kv[1])
            '''Format as a dataframe and return to caller'''
            sorted_significant_comparisons_df = pd.DataFrame.from_dict(sorted_significant_comparisons)
            sorted_significant_comparisons_df.columns = ['Comparison', 'P_Value']
            return sorted_significant_comparisons_df
    
    except:
        print("Incorrectly Formatted Dataframe!")
        return None

## Test 1

In [12]:
proteomics = en.get_proteomics()

genedf = en.append_mutations_to_omics(mutation_genes=['ARID1A'], omics_df=proteomics, omics_genes=['ARID1A', 'ACTL6A','PTEN','PIK3CA','KMT2D','DPF2'])

genedf = genedf.loc[genedf['Sample_Status'] == 'Tumor']

#print(genedf)

for ind, row in genedf.iterrows():
    if 'Wildtype_Tumor' not in row['ARID1A_Mutation']:
        genedf.at[ind,'Label'] = 'Mutated'
    else:
        genedf.at[ind,'Label'] = 'Wildtype'
        

genedf = genedf.drop('ARID1A_Mutation', axis=1)
genedf = genedf.drop('ARID1A_Location', axis=1)
genedf = genedf.drop('Sample_Status', axis=1)

func_results = wrap_ttest(genedf, 'Label', ['ARID1A_proteomics','PTEN_proteomics','KMT2D_proteomics','ACTL6A_proteomics'])
func_results.head()

Unnamed: 0,Comparison,P_Value
0,ARID1A_proteomics,1.025018e-10
1,PTEN_proteomics,0.0004310897
2,ACTL6A_proteomics,0.002747935


## Test 2

In [13]:
transcriptomics = en.get_transcriptomics()

transdf = en.append_mutations_to_omics(mutation_genes=['PTEN'], omics_df=transcriptomics, omics_genes=['ARID1A', 'ACTL6A','PTEN','PIK3CA','KMT2D','DPF2'])
transdf.head()

transdf = transdf.loc[transdf['Sample_Status'] == 'Tumor']

cols = list(transdf.columns)
cols.remove('Sample_Status')
cols.remove('PTEN_Location')
cols.remove('PTEN_Mutation')

for ind, row in transdf.iterrows():
    if 'Wildtype_Tumor' not in row['PTEN_Mutation'] :
        transdf.at[ind,'Label'] = 'Mutated'
    else:
        transdf.at[ind,'Label'] = 'Wildtype'

transdf.head()


func_results = wrap_ttest(transdf, 'Label', cols)
print(func_results)

               Comparison       P_Value
0  PIK3CA_transcriptomics  3.971222e-07
1  ACTL6A_transcriptomics  5.347264e-05


## Test 3

In [14]:
phos = en.get_phosphoproteomics()

phosdf = en.append_mutations_to_omics(mutation_genes=['PIK3CA'], omics_df=phos, omics_genes=['ARID1A', 'ACTL6A','PTEN','PIK3CA','KMT2D','DPF2'])
phosdf.head()

cols = list(phosdf.columns)
cols.remove('Sample_Status')
cols.remove('PIK3CA_Location')
cols.remove('PIK3CA_Mutation')

for ind, row in phosdf.iterrows():
    if row['PIK3CA_Mutation'] != 'Wildtype_Tumor':
        phosdf.at[ind,'Label'] = 'Mutated'
    else:
        phosdf.at[ind,'Label'] = 'Wildtype'

phosdf.head()

func_results = wrap_ttest(phosdf, 'Label', cols)
print(func_results)

Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.
None
