In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


In [2]:
# Instantiate the cptac Endometrial dataset object; the cells below query it
# through get_genotype_all_vars() and get_proteomics().
en = cptac.Endometrial()

                                                

In [63]:
# PIK3CA hotspot substitutions of interest. Defined once so the genotype query
# and the Location filter below cannot drift apart.
hotspots = ['E542K', 'E545K', 'H1047R']

# Genotype table for PIK3CA; the code below relies on it providing the
# Mutation, Mutation_Status and Location columns.
mut = en.get_genotype_all_vars("PIK3CA", mutation_hotspot=hotspots)
prot = en.get_proteomics(tissue_type="tumor")

# Join on the index (DataFrame.join defaults to a left join on the index).
joined = mut.join(prot)

# Keep rows whose Location string mentions any of the hotspots. A single
# alternation pattern replaces three chained str.contains calls; na=False
# makes rows with a missing Location count as "no match" instead of producing
# a non-boolean mask.
hotspot_pattern = '|'.join(re.escape(site) for site in hotspots)
en_hotspot_df = joined[joined.Location.str.contains(hotspot_pattern, na=False)]

wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the same rows in the same order on every pandas version.
endo_hotspot = pd.concat([en_hotspot_df, wildtype])

# Drop the genotype columns that are not needed for the t-tests, leaving the
# "Mutation" label column followed by the proteomics columns.
prot_and_mutations = endo_hotspot.drop(columns=["Mutation_Status", "Location"])
prot_and_mutations



Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,-0.339,0.412,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00032,Missense_Mutation,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,-0.0479,0.419,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00362,Missense_Mutation,-0.924,-0.445,1.57,-0.678,0.173,0.436,0.227,-0.00248,0.479,...,-0.0496,,0.108,-0.153,,0.109,-0.336,-0.822,-0.0338,0.121
C3L-00601,Missense_Mutation,-0.454,-0.242,,,0.258,0.219,-0.249,-1.33,0.204,...,-0.158,,0.667,1.3,0.441,0.13,-0.0659,-0.923,-0.163,0.112
C3L-00605,Missense_Mutation,-0.24,0.594,3.4,0.154,0.0932,0.283,-0.0789,-0.611,0.37,...,-0.0511,-0.0136,0.214,0.683,0.433,-0.208,-0.347,-0.911,0.0692,-0.232
C3L-00921,Missense_Mutation,-1.19,-1.19,3.66,1.27,-0.105,-0.0021,0.0813,-0.691,-0.295,...,0.097,0.254,0.151,0.285,-0.239,-0.0681,-0.218,-0.0873,-0.127,0.374
C3L-00947,Missense_Mutation,-0.485,0.917,0.304,-0.00675,0.178,-0.423,0.224,-0.247,0.0787,...,0.0429,-0.212,0.169,0.521,-0.147,0.354,0.17,-1.01,-0.11,0.131
C3N-00323,Missense_Mutation,-0.817,-1.13,,,0.102,0.371,0.644,0.281,0.114,...,-0.0222,,-0.279,0.423,,0.294,-0.13,-0.106,-0.299,0.385
C3N-00324,Missense_Mutation,-0.449,-0.44,1.15,,-0.645,0.316,0.0809,-0.47,0.804,...,0.0449,0.0452,-0.152,-0.197,0.295,-0.188,-0.265,0.25,-0.113,-0.302
C3N-00383,Missense_Mutation,0.0729,0.567,1.71,,-0.278,0.646,0.378,-0.0146,1.17,...,0.279,-0.378,0.0125,-0.305,,0.149,-0.0526,-0.0311,0.432,0.00801


In [56]:
# Inline version of the per-column t-test: compare every non-label column
# between the two groups found in "Mutation", then apply a Benjamini-Hochberg
# FDR correction (method='fdr_bh') at alpha = 0.05.
label_values = prot_and_mutations["Mutation"].unique()

# Partition dataframe into two sets, one for each of the first two unique
# values from the label column.
partition1 = prot_and_mutations.loc[prot_and_mutations["Mutation"] == label_values[0]]
partition2 = prot_and_mutations.loc[prot_and_mutations["Mutation"] == label_values[1]]

# Use all columns except the label column.
comparison_columns = list(prot_and_mutations.columns)
comparison_columns.remove("Mutation")

number_of_comparisons = len(comparison_columns)

# Store the tested column names and their p-values in two parallel lists.
comparisons = []
pvals = []

# Loop through each comparison column, perform the t-test, and record the
# p-value. times_through counts the columns visited (0 if there are none).
times_through = 0
for times_through, column in enumerate(comparison_columns, start=1):
    group1 = partition1[column].dropna(axis=0)
    group2 = partition2[column].dropna(axis=0)

    # Skip columns with one or zero non-missing observations in either group.
    if len(group1) <= 1 or len(group2) <= 1:
        continue

    stat, pval = scipy.stats.ttest_ind(group1, group2)
    comparisons.append(column)
    pvals.append(pval)

# Correct for multiple testing to determine if each comparison meets the new cutoff.
results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=.05, method='fdr_bh')
reject = results[0]

# Build the results frame in one shot from the comparisons that were rejected.
# (Row-by-row DataFrame.append is quadratic and was removed in pandas 2.0.)
# P_Value holds the raw t-test p-values, not the corrected ones.
significant = [i for i, is_rejected in enumerate(reject) if is_rejected]
results_df = pd.DataFrame({
    'Comparison': [comparisons[i] for i in significant],
    'P_Value': [pvals[i] for i in significant],
})

# Sort dataframe by ascending p-value.
results_df = results_df.sort_values(by='P_Value', ascending=True)
results_df = results_df.reset_index(drop=True)

In [64]:
def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, correction_method='bonferroni'):
    """Run an independent two-sample t-test on each column and correct for multiple testing.

    Rows of ``df`` are split into two groups by the two unique values of
    ``label_column``. For every comparison column, missing values are dropped
    per group and ``scipy.stats.ttest_ind`` is run; columns with one or zero
    remaining observations in either group are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label column and the columns to test.
    label_column : hashable
        Name of the column whose two unique values define the groups.
    comparison_columns : sequence, optional
        Columns to test. When None or empty, every column except
        ``label_column`` is tested.
    alpha : float, default .05
        Cutoff passed to ``statsmodels.stats.multitest.multipletests``.
    return_all : bool, default False
        If True, return every tested column; otherwise only the columns whose
        null hypothesis was rejected after correction.
    correction_method : str, default 'bonferroni'
        ``method`` argument forwarded to ``multipletests``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Comparison`` and ``P_Value`` sorted by ascending p-value with
        a fresh integer index. ``P_Value`` holds the raw (uncorrected) t-test
        p-values. None is returned when the label column does not have exactly
        two unique values, when no column could be tested, when there is
        nothing to report, or when an error occurs (the error is printed).
    """
    try:
        # Verify precondition that label column exists and has exactly 2 unique values.
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None

        # Partition dataframe into two sets, one for each of the two unique label values.
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        # If no comparison columns are specified, use all columns except the label column.
        # The explicit None/len check (instead of `not comparison_columns`) also accepts
        # pandas Index / numpy array arguments, whose truth value is ambiguous.
        if comparison_columns is None or len(comparison_columns) == 0:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        # Tested column names and their p-values, kept as parallel lists.
        comparisons = []
        pvals = []

        # Loop through each comparison column, perform the t-test, and record the p-value.
        for column in comparison_columns:
            group1 = partition1[column].dropna(axis=0)
            group2 = partition2[column].dropna(axis=0)

            # Skip columns with one or zero non-missing observations in either group.
            if len(group1) <= 1 or len(group2) <= 1:
                continue

            stat, pval = scipy.stats.ttest_ind(group1, group2)
            comparisons.append(column)
            pvals.append(pval)

        # Nothing was testable, so there is nothing to correct or report.
        if not pvals:
            return None

        # Correct for multiple testing to determine if each comparison meets the new cutoff.
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]

        # Unless everything was requested, keep only the rejected comparisons.
        if not return_all:
            kept = [i for i, is_rejected in enumerate(reject) if is_rejected]
            comparisons = [comparisons[i] for i in kept]
            pvals = [pvals[i] for i in kept]

        # Build the frame once. Row-by-row DataFrame.append is quadratic and was
        # removed in pandas 2.0, where it would raise and hide every significant hit.
        results_df = pd.DataFrame({'Comparison': comparisons, 'P_Value': pvals})

        # Sort dataframe by ascending p-value.
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        # If results df is not empty, return it, else return None.
        if len(results_df) > 0:
            return results_df
        return None

    except Exception as err:
        # Keep the best-effort contract (print + None) but let KeyboardInterrupt /
        # SystemExit propagate and show what actually went wrong.
        print("Incorrectly Formatted Dataframe!")
        print("  Cause: {}: {}".format(type(err).__name__, err))
        return None

In [68]:
# Test every protein column between the two "Mutation" groups using the 'fdr_bh'
# correction with a very permissive alpha of 0.9; only comparisons that pass the
# corrected cutoff are returned (None if there are none).
wrap_ttest(prot_and_mutations, "Mutation", correction_method="fdr_bh", alpha=.9)

In [69]:
# Same comparison with the function's default 'bonferroni' correction, again at
# alpha = 0.9; returns None when no comparison passes the corrected cutoff.
wrap_ttest(prot_and_mutations, "Mutation", alpha=.9)

In [34]:
# return_all=True lists every tested column with its raw (uncorrected) t-test
# p-value, sorted ascending, regardless of significance.
wrap_ttest(prot_and_mutations, "Mutation", return_all=True) 

Unnamed: 0,Comparison,P_Value
0,SLC25A14,0.000645
1,CORO7-PAM16,0.000877
2,RIPK1,0.001035
3,LYRM9,0.001181
4,GAS6,0.001208
...,...,...
10664,MYOC,0.999707
10665,STK35,0.999726
10666,GSTT2B,0.999833
10667,CA4,0.999840
