# Create tables with p-value and change in medians for multiple cancers 

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu
root = R'~\Github\WhenMutationsDontMatter\PTEN\Figure_2\csv'

In [2]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator


def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, 
               correction_method='bonferroni', mincount=3, pval_return_corrected=True):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
       
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        '''If no comparison columns specified, use all columns except the specified labed column'''
        if not comparison_columns:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)

        '''Store comparisons and p-values in two arrays'''
        comparisons = []
        pvals = []
       
        '''Loop through each comparison column, perform the t-test, and record the p-val'''
       
        for column in comparison_columns:
            if len(partition1[column].dropna(axis=0)) <= mincount:
                continue
            elif len(partition2[column].dropna(axis=0)) <= mincount:
                continue
            else:
                stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
                comparisons.append(column)
                pvals.append(pval)
               
        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]

        '''Format results in a pandas dataframe'''
        results_df = pd.DataFrame(columns=['Comparison','P_Value'])

        '''If return all, add all comparisons and p-values to dataframe'''
        if return_all:
            if pval_return_corrected:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = results[1]

            else:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = pvals

            '''Else only add significant comparisons'''
        else:
            for i in range(0, len(reject)):
                if reject[i]:
                    if pval_return_corrected:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':results[1][i]}, ignore_index=True)
                    else:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':pvals[i]}, ignore_index=True)


        '''Sort dataframe by ascending p-value'''
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        '''If results df is not empty, return it, else return None'''
        if len(results_df) > 0:
            return results_df
        else:
            return None


    except:
        print("Incorrectly Formatted Dataframe!")
        return None



In [3]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot)
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = cancer_object.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [4]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        if prot+'_proteomics' in del_med.index and prot+'_proteomics' in wt_med.index:
            dif = del_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
            med_dict[prot+'_proteomics'] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [5]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



version 3scc v3.2.......                 
Checking that ovarian index is up-to-date...



                                            

In [5]:
ls = cptac.Lscc()

version 3scc v3.2.......                 
                            



# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [6]:
gene = 'PTEN'
g_prot = g.get_proteomics()
g_prot_list = list(g_prot.columns)

g_del_wt = all_prot_format_df(g, g_prot_list)
g_del_wt = g_del_wt.dropna(axis='columns', thresh = 10)
g_del_wt.head()



Name,A1BG_proteomics,A2M_proteomics,AAAS_proteomics,AACS_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,AAMP_proteomics,...,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.07763,0.487228,-0.254208,-0.144373,0.551881,-0.025276,-0.467451,-0.089511,-0.078806,0.329603,...,-0.047437,-0.105908,-0.347076,,0.459635,0.079452,-0.784983,-0.488441,0.16799,Deletion
C3L-00365,-0.145975,0.798796,0.184242,-0.470603,,0.390211,0.245466,-0.609998,0.118625,-0.086927,...,0.161975,-0.213093,0.235571,,0.107421,0.048724,0.138403,-0.290141,0.405037,Deletion
C3L-00674,0.821991,1.09647,-0.094421,-0.106304,0.084578,0.176402,-0.248151,0.014061,-0.699773,-0.638462,...,-0.065534,-0.306717,0.879991,,0.883564,-0.172222,0.011876,-0.131889,-0.503581,Deletion
C3L-00677,-0.064567,0.129385,0.047751,-0.118187,0.237434,,0.303847,0.322163,-0.555479,-0.363414,...,-0.254535,0.463653,0.58023,0.503044,-0.604986,0.178077,-0.720059,-0.150197,-0.268715,Deletion
C3L-01040,-0.763691,-1.031834,-0.217194,-0.695701,0.184173,-0.474816,-0.051789,0.344842,-0.642746,0.068863,...,-0.092502,0.010639,-0.465079,,-0.500083,0.112651,1.00466,-0.230304,-0.102416,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [7]:
g_cols = list(g_del_wt.columns[:-1])

# Get only sig genes
g_sig = u.wrap_ttest(g_del_wt, 'Mutation', g_cols, correction_method = 'fdr_bh')
# Create list of sig genes
if g_sig is not None:
    g_sig_list = list(g_sig.Comparison)
else: 
    g_sig_list = None
print('significant pvals: \n',g_sig)

# Get all pvals
#g_pval = u.wrap_ttest(g_del_wt, 'Mutation', g_cols, return_all = True, correction_method = 'fdr_bh')
g_pval = wrap_ttest(g_del_wt, 'Mutation', g_cols, return_all = True, correction_method = 'fdr_bh', min_count = 5)
g_pval = g_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
                Comparison       P_Value
0        ARMH3_proteomics  5.396032e-11
1         CUTC_proteomics  1.593480e-10
2      PIP4K2A_proteomics  1.009419e-09
3         CUL2_proteomics  1.122076e-09
4         GDI2_proteomics  1.302273e-09
...                   ...           ...
1895  KIAA1522_proteomics  8.588845e-03
1896      NOB1_proteomics  8.592904e-03
1897      MSH3_proteomics  8.603975e-03
1898      FPR2_proteomics  8.615010e-03
1899      FBP2_proteomics  8.625413e-03

[1900 rows x 2 columns]


TypeError: wrap_ttest() got an unexpected keyword argument 'min_count'

Part 3: Create the differences in median df. (median of wildtype tumors - median of cnv deletions)

In [None]:
g_med = get_change_in_medians_df(g, "Gbm", g_del_wt, g_prot_list)

Part 4: Merge the p-values and the differences in median dfs.

In [None]:
g_merged = g_pval.merge(g_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(g_cols))

# Create csv
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
g_merged.to_csv(root+R'\gbm_pval_medians.csv',index=False)
g_merged

# Repeat for other cancers.

# Ovarian

In [None]:
gene = 'PTEN'
o_prot = o.get_proteomics()
o_prot = u.reduce_multiindex(o_prot, levels_to_drop = 1)
o_prot_list = list(o_prot.columns)

o_del_wt = all_prot_format_df(o, o_prot_list)
o_del_wt = o_del_wt.dropna(axis='columns', thresh = 10)

There are isoforms in ovarian. Create unique columns names by adding a number to differentiate. 

In [None]:
cols = pd.Series(o_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
o_del_wt.columns=cols

In [None]:
o_cols = list(o_del_wt.columns[:-1])

# Get only sig sites
o_sig = wrap_ttest(o_del_wt, 'Mutation', o_cols, correction_method = 'fdr_bh', min_count = min_num)
if o_sig is not None:
    o_sig_list = list(o_sig.Comparison)
else: 
    o_sig_list = None
print('significant pvals: \n',o_sig)

# Get all pvals
o_pval = wrap_ttest(o_del_wt, 'Mutation', o_cols, return_all = True, correction_method = 'fdr_bh', min_count = min_num)
o_pval = o_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Ov_P_Value'})

In [None]:
o_med = get_change_in_medians_df(o, "Ov", o_del_wt, o_prot_list)

In [None]:
o_merged = o_pval.merge(o_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(o_cols))

# Create csv
o_merged.to_csv(root+R'\ov_pval_medians.csv',index=False)
o_merged

# Breast

In [None]:
gene = 'PTEN'
b_prot = b.get_proteomics()
b_prot = u.reduce_multiindex(b_prot, levels_to_drop = 1)
b_prot_list = list(b_prot.columns)

b_del_wt = all_prot_format_df(b, b_prot_list)
b_del_wt = b_del_wt.dropna(axis='columns', thresh = 10)

In [None]:
# Differentiate duplicate column names
cols = pd.Series(b_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
b_del_wt.columns=cols

In [None]:
b_cols = list(b_del_wt.columns[:-1])

# Get only sig sites
b_sig = wrap_ttest(b_del_wt, 'Mutation', b_cols, correction_method = 'fdr_bh', min_count = min_num)
if b_sig is not None:
    b_sig_list = list(b_sig.Comparison)
else: 
    b_sig_list = None
print('significant pvals: \n',b_sig)

# Get all pvals
b_pval = wrap_ttest(b_del_wt, 'Mutation', b_cols, correction_method = 'fdr_bh',
                      return_all = True, min_count = min_num)
b_pval = b_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Brca_P_Value'})

In [None]:
b_med = get_change_in_medians_df(b, "Brca", b_del_wt, b_prot_list)

In [None]:
b_merged = b_pval.merge(b_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(b_cols))

# Create csv
b_merged.to_csv(root+R'\brca_pval_medians.csv',index=False)
b_merged.head()

# Colon

In [None]:
gene = 'PTEN'
c_prot = col.get_proteomics()
c_prot_list = list(c_prot.columns)

c_del_wt = all_prot_format_df(col, c_prot_list)
c_del_wt = c_del_wt.dropna(axis='columns', thresh=10)

In [None]:
# Differentiate duplicate column names
cols = pd.Series(c_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
c_del_wt.columns=cols

In [None]:
c_cols = list(c_del_wt.columns[:-1])

# Get only sig sites
c_sig = u.wrap_ttest(c_del_wt, 'Mutation', c_cols, correction_method = 'fdr_bh')
if c_sig is not None:
    c_sig_list = list(c_sig.Comparison)
else: 
    c_sig_list = None
print('significant pvals: \n',c_sig)

# Get all pvals
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', c_cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

In [None]:
c_med = get_change_in_medians_df(col, "Colon", c_del_wt, c_prot_list)

In [None]:
c_merged = c_pval.merge(c_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(c_cols))

# Create csv
c_merged.to_csv(root+R'\colon_pval_medians.csv',index=False)
c_merged

# Hnscc

In [None]:
gene = 'PTEN'
h_prot = h.get_proteomics()
h_prot_list = list(h_prot.columns)

h_del_wt = all_prot_format_df(h, h_prot_list)
h_del_wt = h_del_wt.dropna(axis='columns', how='all')
h_del_wt = h_del_wt.dropna(axis = 'columns',thresh = 10)

In [None]:
h_cols = list(h_del_wt.columns[:-1])

# Get only sig sites
h_sig = wrap_ttest(h_del_wt, 'Mutation', h_cols, correction_method = 'fdr_bh', min_count = min_num)
if h_sig is not None:
    h_sig_list = list(h_sig.Comparison)
else: 
    h_sig_list = None
print('significant pvals: \n',h_sig)

# Get all pvals
h_pval = wrap_ttest(h_del_wt, 'Mutation', h_cols, return_all = True, correction_method = 'fdr_bh', min_count = min_num)
h_pval = h_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Hnscc_P_Value'})

In [None]:
h_med = get_change_in_medians_df(h, "Hnscc", h_del_wt, h_prot_list)

In [None]:
h_merged = h_pval.merge(h_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(h_cols))

# Create csv
h_merged.to_csv(root+R'\hnscc_pval_medians.csv',index=False)
h_merged

# Luad

In [None]:
gene = 'PTEN'
l_prot = l.get_proteomics()
l_prot = u.reduce_multiindex(l_prot, levels_to_drop = 1)
l_prot_list = list(l_prot.columns)

l_del_wt = all_prot_format_df(l, l_prot_list)
l_del_wt = l_del_wt.dropna(axis='columns', thresh=10)

In [None]:
# Differentiate duplicate column names
cols = pd.Series(l_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
l_del_wt.columns=cols

In [None]:
l_cols = list(l_del_wt.columns[:-2])

# Get only sig sites
l_sig = wrap_ttest(l_del_wt, 'Mutation', l_cols, correction_method = 'fdr_bh', min_count = min_num)
if l_sig is not None:
    l_sig_list = list(l_sig.Comparison)
else: 
    l_sig_list = None
print('significant pvals: \n',l_sig)

# Get all pvals
l_pval = wrap_ttest(l_del_wt, 'Mutation', l_cols, return_all = True, correction_method = 'fdr_bh', min_count = min_num)
l_pval = l_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Luad_P_Value'})

In [None]:
l_med = get_change_in_medians_df(l, "Luad", l_del_wt, l_prot_list)

In [None]:
l_merged = l_pval.merge(l_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(l_cols))

# Create csv
l_merged.to_csv(root+R'\luad_pval_medians.csv',index=False)
l_merged.head()

# Lscc

In [6]:
gene = 'PTEN'
ls_prot = ls.get_proteomics()
ls_prot = u.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_del_wt = all_prot_format_df(ls, ls_prot_list)
ls_del_wt = ls_del_wt.dropna(axis='columns', thresh=10)



In [7]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [11]:
min_num = 5

In [13]:
ls_cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = wrap_ttest(ls_del_wt, 'Mutation', ls_cols, correction_method = 'fdr_bh', mincount = min_num)
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', ls_sig)

# Get all pvals

ls_pval = wrap_ttest(ls_del_wt, 'Mutation', ls_cols, return_all = True, correction_method = 'fdr_bh', mincount = min_num)
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})

significant pvals: 
           Comparison   P_Value
0   ATAD1_proteomics  0.000006
1   BTAF1_proteomics  0.001136
2    PTEN_proteomics  0.042649
3  VPS26A_proteomics  0.042649


In [None]:
ls_med = get_change_in_medians_df(ls, "Lscc", ls_del_wt, ls_prot_list)

In [None]:
ls_merged = ls_pval.merge(ls_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(ls_cols))

# Create csv
ls_merged.to_csv(root+R'\lscc_pval_medians.csv',index=False)
ls_merged.head()

# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [14]:
en = cptac.Endometrial()

                                                

In [24]:
gene = 'PTEN'
prot = en.get_proteomics()
e_prot_list = list(prot.columns)

mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = e_prot_list)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
#trunc_wt = trunc_wt.dropna(axis = 'columns',thresh = 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [25]:
trunc_wt = trunc_wt.drop(columns='PTEN_proteomics')

In [27]:
e_cols = list(trunc_wt.columns[:-1])

# Get only sig sites
e_sig = wrap_ttest(trunc_wt, 'Mutation', e_cols, correction_method = 'fdr_bh', mincount=5)
if e_sig is not None:
    e_sig_list = list(e_sig.Comparison)
else: 
    e_sig_list = None
print('significant pvals: \n',e_sig)

# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', e_cols, return_all = True, correction_method = 'fdr_bh', mincount = 5)
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

significant pvals: 
             Comparison   P_Value
0     NOL10_proteomics  0.000915
1    TOPBP1_proteomics  0.001242
2     UTP25_proteomics  0.001242
3      ABT1_proteomics  0.001242
4     L1CAM_proteomics  0.001590
..                 ...       ...
410  TFIP11_proteomics  0.048745
411    TAF2_proteomics  0.048780
412   YWHAE_proteomics  0.048805
413    KLC4_proteomics  0.048898
414  SEC24C_proteomics  0.048898

[415 rows x 2 columns]


TypeError: wrap_ttest() got an unexpected keyword argument 'mincount'

Differences in median with adaption to trunctation mutations.

In [None]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

for prot in e_prot_list:
    if prot+'_proteomics' in trunc_med.index and prot+'_proteomics' in wt_med.index:
        dif = trunc_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
        en_d[prot+'_proteomics'] = dif

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})

In [None]:
e_merged = e_pval.merge(en_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(e_cols))

# Create csv
e_merged.to_csv(root+R'\endo_pval_medians.csv',index=False)
e_merged

# Get a list of significant genes in at least one cancer

In [None]:
# Keep only genes significant in > 1 cancer
all_sig = []

sig_lists = [g_sig_list, e_sig_list, b_sig_list, o_sig_list, c_sig_list, h_sig_list, l_sig_list, ls_sig_list]

for next_list in sig_lists:
    if next_list is not None:
        all_sig.append(next_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # remove duplicates
print('Number of significant in >= 1 cancer:', len(sig))

In [None]:
s = pd.Series(sig)
s = s.replace(to_replace = r'_proteomics$', value = '', regex = True)
s.to_csv(root+R'\list_sig_one_cancer.csv', index=False)

# Get a list of significant genes in multiple cancers

In [None]:
# sig in multiple (more than 1) cancers
mult = list(set(i for i in flat_list if flat_list.count(i) > 1)) # Keep genes sig in more than 1 cancer
print('Number of significant in mult cancers:', len(mult))


In [None]:
m = pd.Series(mult)
m = m.replace(to_replace = r'_proteomics$', value = '', regex = True)
m.to_csv(root+R'\list_sig_multiple_cancers.csv', index=False)

# Number of significant comparisons for each cancer

In [None]:
sig_dict = {'Gbm': g_sig_list, 'En': e_sig_list, 'Brca': b_sig_list, 'Ov': o_sig_list, 'Colon': c_sig_list,
            'Hnscc': h_sig_list, 'Luad': l_sig_list, 'Lscc': ls_sig_list}
print('Number of significant tests:\n')

for cancer in sig_dict:
    if sig_dict[cancer] is not None: 
        print(cancer+':', len(sig_dict[cancer]), '\n')
    

# Total proteins tested

In [None]:
protein_dict = {'Gbm': len(g_cols), 'En': len(e_cols), 'Brca': len(b_cols), 'Ov': len(o_cols), 'Colon': len(c_cols),
            'Hnscc': len(h_cols), 'Luad': len(l_cols), 'Lscc': len(ls_cols)}

print('Total proteins tested (protein dropped if < 10 samples with nonNaN data)\n')

for cancer in protein_dict:
    if protein_dict[cancer] is not None: 
        print(cancer+':', protein_dict[cancer], '\n')

# Lowest p-value accepted 

In [None]:
low_pval_dict = {'Gbm': g_sig['P_Value'].iloc[-1], 'En': e_sig['P_Value'].iloc[-1], 'Brca': b_sig['P_Value'].iloc[-1], 
                 'Ov': o_sig['P_Value'].iloc[-1], 'Hnscc': h_sig['P_Value'].iloc[-1], 
                 'Luad': l_sig['P_Value'].iloc[-1], 'Lscc': ls_sig['P_Value'].iloc[-1]}

print('Lowest p-value accepted using FDR_BH correction method: \n')

for cancer in low_pval_dict:
    print(cancer, ':', low_pval_dict[cancer], '\n')
