# Create tables with p-value and change in medians for multiple cancers 

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p
root = R'~\Github\WhenMutationsDontMatter\PTEN\Figure_2\csv'

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator


def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, 
               correction_method='bonferroni', mincount=3, pval_return_corrected=True):
    """Run an independent two-sample t-test on each comparison column.

    The rows of `df` are split into two groups by the two unique values of
    `label_column`; every comparison column is then tested between the groups
    and the p-values are corrected for multiple testing.

    Parameters
    df: DataFrame. One label column plus the columns to test.
    label_column: Str. Column that must hold exactly 2 unique values.
    comparison_columns: List-like or None. Columns to test. When None or
        empty, every column except `label_column` is used.
    alpha: Float. Cutoff passed to the multiple-testing correction.
    return_all: Bool. True returns every tested column; False returns only
        the columns the correction marks as significant.
    correction_method: Str. `method` passed to
        statsmodels.stats.multitest.multipletests.
    mincount: Int. A column is skipped when either group has `mincount` or
        fewer non-missing values.
    pval_return_corrected: Bool. True reports corrected p-values, False
        reports the raw ones.

    Returns a DataFrame with columns 'Comparison' and 'P_Value' sorted by
    ascending p-value, or None when the input is unusable or no row remains.
    """
    try:
        # Precondition: the label column exists and has exactly 2 unique values.
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None

        # Partition the dataframe into one set per label value.
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        # Explicit None/length test: truthiness of a pandas Index or numpy
        # array raises, so `not comparison_columns` rejected valid input.
        if comparison_columns is None or len(comparison_columns) == 0:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        # Tested column names and their raw p-values, kept in lockstep.
        comparisons = []
        pvals = []
        for column in comparison_columns:
            group1 = partition1[column].dropna(axis=0)
            group2 = partition2[column].dropna(axis=0)
            # Skip columns without enough observations in either group.
            if len(group1) <= mincount or len(group2) <= mincount:
                continue
            _, pval = scipy.stats.ttest_ind(group1, group2)
            comparisons.append(column)
            pvals.append(pval)

        # Nothing was testable: report that directly instead of handing an
        # empty list to the correction step.
        if not pvals:
            print("No comparison column had more than mincount values in both groups.")
            return None

        # Correct for multiple testing; results[0] flags rejected nulls and
        # results[1] holds the corrected p-values.
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]
        corrected_pvals = results[1]

        results_df = pd.DataFrame({
            'Comparison': comparisons,
            'P_Value': corrected_pvals if pval_return_corrected else pvals,
        })

        # Keep only significant comparisons unless everything was requested.
        # (Boolean masking replaces row-by-row DataFrame.append, which no
        # longer exists in pandas 2.x.)
        if not return_all:
            results_df = results_df[reject]

        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        return results_df if len(results_df) > 0 else None

    except Exception as err:
        # Still best-effort (callers receive None), but report the real cause
        # and let KeyboardInterrupt/SystemExit propagate.
        print("Incorrectly Formatted Dataframe!", type(err).__name__ + ":", err)
        return None



In [22]:
def all_prot_format_df(cancer_object, mir_list, gene_in = 'PTEN', utils = u):
    """Build a tumor-only table of miRNA values plus a 'Mutation' label column.

    Only rows whose label is wildtype or 'Deletion' are kept, and wildtype is
    always reported as 'Wildtype_Tumor'.

    Parameters
    cancer_object: Object. Loaded cptac cancer data set.
    mir_list: List. Column names requested from the 'miRNA' table.
    gene_in: Str. Gene whose genotype labels the samples (default 'PTEN').
    utils: Module. Provides reduce_multiindex (defaults to cptac.utils).

    Returns a DataFrame (an independent copy) with the requested omics columns
    and a 'Mutation' column holding 'Wildtype_Tumor' or 'Deletion'.
    """
    mut_type = cancer_object.get_genotype_all_vars(gene_in)

    # Plain equality: the previous `not in ('luad')` was a substring test on
    # a string, not membership in a tuple.
    if cancer_object.get_cancer_type() != 'luad':
        omics_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'miRNA', omics_genes = mir_list)
        # Reduce a multiindex
        if isinstance(omics_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            omics_and_mutations = utils.reduce_multiindex(omics_and_mutations, levels_to_drop = 1)
        # Drop Normal samples
        omics_and_mutations = omics_and_mutations[omics_and_mutations.Sample_Status == "Tumor"]

        # Merge the Mutation column from get_genotype_all_vars (includes cnv)
        mut_type = mut_type[['Mutation']]
        # NOTE(review): assumes the last four joined columns are the
        # mutation/sample-status columns - confirm against the cptac version.
        omics_df = omics_and_mutations.iloc[:,:-4]
        merged = omics_df.join(mut_type)

        # Keep only Wildtype and deletion; copy so the result is not a view
        compare = ['Wildtype_Tumor','Deletion']
        del_wt = merged[merged['Mutation'].isin(compare)].copy()

    # Luad has no somatic mutations for the gene, which changes some things
    else:
        # get_genotype_all_vars adds cnv data under a column named after the gene
        mut_type = mut_type.drop(columns= gene_in)
        # Cannot join to somatic mutations here, so join CNV to the omics instead
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='miRNA',genes1= gene_in,
            genes2= mir_list)
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        # Derived from gene_in rather than the hard-coded 'PTEN_CNV'
        omics = omics.drop(columns=gene_in + '_CNV')
        # Get only tumor samples (ids taken from the tumor proteomics table)
        tumor_prot = cancer_object.get_proteomics(tissue_type='tumor')
        tumor_ids = list(tumor_prot.index)
        omics = omics[omics.index.isin(tumor_ids)]
        # Merge the Mutation column from get_genotype_all_vars (includes cnv)
        merged = omics.join(mut_type)
        # Keep only No_Mutation (wildtype) and deletion. Copy before relabeling
        # to avoid assigning into a slice (SettingWithCopyWarning).
        compare = ['No_Mutation','Deletion']
        del_wt = merged[merged['Mutation'].isin(compare)].copy()
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [5]:
def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    '''
    Params
    cancer_object: Object. Variable for the loaded cancer data set (not read by this function).
    cancer_name: Str. name to add to the created dataframe.
    del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN,
        labeled "Deletion" / "Wildtype_Tumor" in a Mutation column.
    all_prot_list: List. Column names to report; names missing from the medians are skipped.

    Returns a dataframe with columns 'Proteomics' and '<cancer_name>_Median' holding the
    difference in medians between PTEN del and wt (del - wt): + is mutant up compared
    to wt, - is mutant down.
    '''
    is_deletion = del_wt_df.Mutation == "Deletion"
    is_wildtype = del_wt_df.Mutation == "Wildtype_Tumor"

    # numeric_only=True leaves the string Mutation column out of the reduction;
    # without it pandas >= 2.0 raises instead of silently dropping the column.
    del_med = del_wt_df[is_deletion].median(numeric_only=True)
    wt_med = del_wt_df[is_wildtype].median(numeric_only=True)

    med_dict = {
        prot: del_med[prot] - wt_med[prot]
        for prot in all_prot_list
        if prot in del_med.index and prot in wt_med.index
    }

    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})

    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [6]:
# Instantiate one cptac data set object per cancer type. The short names
# bound here are the handles the later cells use.
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

version 3scc v3.2.......                        
                                            

# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [83]:
gene = 'PTEN'
# NOTE(review): this section is headed "Gbm" and fills g_* variables, but the
# data set selected here is `en` (Endometrial) - confirm which one is intended.
cob = en
gm = cob.get_miRNA()
#gm = u.reduce_multiindex(gm, levels_to_drop = 1)

# Request every miRNA column, then keep only wildtype/deletion tumor samples.
gm_list = list(gm.columns)
g_del_wt = all_prot_format_df(cob, gm_list)
# Not the last expression of the cell, so this count is computed but not shown.
g_del_wt.Mutation.value_counts()
g_del_wt

Name,hsa-let-7a-2-3p_miRNA,hsa-let-7a-3p_miRNA,hsa-let-7a-5p_miRNA,hsa-let-7b-3p_miRNA,hsa-let-7b-5p_miRNA,hsa-let-7c-3p_miRNA,hsa-let-7c-5p_miRNA,hsa-let-7d-3p_miRNA,hsa-let-7d-5p_miRNA,hsa-let-7e-3p_miRNA,...,hsa-miR-9902_miRNA,hsa-miR-9903_miRNA,hsa-miR-9983-3p_miRNA,hsa-miR-9985_miRNA,hsa-miR-9986_miRNA,hsa-miR-99a-3p_miRNA,hsa-miR-99a-5p_miRNA,hsa-miR-99b-3p_miRNA,hsa-miR-99b-5p_miRNA,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00098,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor
C3L-00139,1.71,8.11,15.61,8.28,12.23,2.92,13.54,8.31,11.7,5.6,...,0.0,1.48,0.67,4.27,0.0,3.81,12.41,9.19,12.91,Wildtype_Tumor
C3L-00358,,,,,,,,,,,...,,,,,,,,,,Deletion
C3L-00361,0.48,8.65,16.3,9.29,13.39,2.86,14.75,8.43,11.75,4.08,...,0.0,0.48,0.0,2.98,0.0,3.99,11.66,7.77,11.9,Wildtype_Tumor
C3L-00362,0.86,8.02,15.84,8.38,12.57,2.24,14.01,7.1,11.1,3.47,...,0.0,2.19,0.0,4.85,0.0,3.49,12.56,6.77,11.17,Wildtype_Tumor
C3L-00932,1.11,7.76,16.52,8.82,14.06,3.68,14.97,8.18,12.45,4.89,...,0.0,0.85,0.0,5.09,0.0,4.82,14.07,6.98,11.96,Wildtype_Tumor
C3L-00947,1.19,10.04,16.3,10.21,14.37,4.83,15.04,10.36,12.86,4.66,...,0.0,0.0,0.0,4.35,0.0,6.35,10.94,11.47,11.55,Wildtype_Tumor
C3L-00963,0.85,7.45,15.7,7.86,12.49,2.3,13.76,7.74,11.79,5.24,...,0.0,1.39,0.0,5.06,0.0,4.15,13.06,8.35,12.51,Deletion
C3L-01246,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor
C3L-01925,2.69,6.92,15.33,7.97,11.96,2.53,13.3,7.72,11.59,4.76,...,0.0,2.87,0.42,5.04,0.0,3.67,13.96,8.82,12.99,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [87]:
# NOTE(review): the full column list is commented out and a single miRNA is
# tested instead - looks like exploratory state; confirm before reuse.
#g_cols = list(g_del_wt.columns[:-1])
g_cols = ['hsa-miR-93-5p_miRNA']
#g_cols = ['hsa-miR-25-5p_miRNA']

# Get all pvals
# Columns with no values at all are dropped before testing.
g_del_wt = g_del_wt.dropna(axis = 'columns', how='all')
# wrap_ttest returns None (after printing a message) when it cannot produce results.
g_pval = wrap_ttest(g_del_wt, 'Mutation', g_cols, return_all = True, correction_method = 'fdr_bh', mincount = 3)
# NOTE(review): the Part 4 merge joins on a 'Proteomics' column, which only
# exists if the rename below is applied.
#g_pval = g_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

Incorrectly Formatted Dataframe!


In [86]:
g_pval


Part 3: Create the differences in median df (median of cnv deletions - median of wildtype tumors), so a positive value means the protein is higher in samples with the deletion.

In [9]:
# Difference in medians for the columns in g_cols, stored under 'Gbm_Median'.
# The first argument (the data set object) is not read by get_change_in_medians_df.
g_med = get_change_in_medians_df(g, "Gbm", g_del_wt, g_cols)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Part 4: Merge the p-values and the differences in median dfs.

In [10]:
# Outer-join p-values with median differences, then strip the '_proteomics'
# suffix from the string cells of the merged table.
g_merged = (
    g_pval
    .merge(g_med, on='Proteomics', how='outer')
    .replace(to_replace = r'_proteomics', value = '', regex = True)
)

# Create csv
gbm_csv_path = root+R'\Gbm_pval_medians.csv'
g_merged.to_csv(gbm_csv_path, index=False)
g_merged.head()

Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median
0,ARMH3,5.766739e-07,-0.405134
1,CUTC,8.514758e-07,-0.553255
2,CUL2,2.783477e-06,-0.586396
3,PIP4K2A,2.783477e-06,-0.838882
4,GDI2,2.783477e-06,-0.610188


# Repeat for other cancers.

In [11]:
# Maps the label used in column and file names to the loaded data set object.
# Gbm and Endometrial are not included here; they have their own sections.
cancer_objects = {'Hnscc':h, 'Luad':l, 'Lscc':ls, 'Brca':b, 'Colon':col, 'Ov':o}

In [12]:
import warnings
warnings.filterwarnings('ignore')

gene = 'PTEN'
merged_dfs = {}
# Passed to wrap_ttest as mincount; a column is skipped when either group has
# this many or fewer values.
min_num = 5

for cancer, cancer_object in cancer_objects.items():

    prot = cancer_object.get_proteomics()
    prot = prot.drop(columns = gene)
    # Reduce a multiindex
    if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
        prot = u.reduce_multiindex(prot, levels_to_drop = 1)
    prot_list = list(prot.columns)

    # Format df for t-test
    del_wt = all_prot_format_df(cancer_object, prot_list, gene_in = gene)
    #del_wt = del_wt.dropna(axis='columns', thresh = 10)

    # Rename duplicate columns (isoforms): the first occurrence keeps its
    # name, later ones get '_isoform_<k>' where k counts repeats from 1.
    col_names = pd.Series(del_wt.columns[:])
    dup_rank = col_names.groupby(col_names).cumcount()
    col_names = col_names.where(dup_rank == 0, col_names + '_isoform_' + dup_rank.astype(str))
    del_wt.columns = col_names # rename the columns with the cols list

    # T-test on every column except the trailing Mutation label
    cols_in = list(del_wt.columns[:-1])

    # Get all pvals
    all_pval = wrap_ttest(del_wt, 'Mutation', cols_in, return_all = True, correction_method = 'fdr_bh', mincount = min_num)
    if all_pval is None:
        # wrap_ttest returns None when it has nothing to report; skip this
        # cancer instead of failing on None.rename below.
        print(cancer + ': wrap_ttest returned no results, skipping.', '\n')
        continue
    all_pval = all_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': cancer+'_P_Value'})

    # Get difference in medians
    delta_median_df = get_change_in_medians_df(cancer_object, cancer, del_wt, cols_in)

    # Merge pval_df and delta_median_df
    pval_medians_df = all_pval.merge(delta_median_df, on='Proteomics', how='outer')
    pval_medians_df = pval_medians_df.replace(to_replace = r'_proteomics', value = '', regex = True)
    merged_dfs[cancer] = pval_medians_df # testing purposes

    # Create csv
    pval_medians_df.to_csv(root+'\\'+cancer+'_pval_medians.csv', index=False)
    print(pval_medians_df.head(), '\n')

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


  Proteomics  Hnscc_P_Value  Hnscc_Median
0        NLN       0.000012      0.431666
1     AKR1C3       0.000054      0.951905
2     AKR1C1       0.000240      1.961262
3       ADI1       0.000416      0.513198
4      EPHX1       0.000678      0.871688 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


  Proteomics  Luad_P_Value  Luad_Median
0       ATE1      0.003010     -0.88365
1      SHOC2      0.003209     -0.60600
2       EGFR      0.003209      2.32970
3     LANCL2      0.004599      1.21475
4    CWF19L1      0.005583     -0.57265 





  Proteomics  Lscc_P_Value  Lscc_Median
0      ATAD1      0.000006     -1.34185
1      BTAF1      0.001136     -0.59625
2     VPS26A      0.047096     -0.45300
3     KIF20B      0.064640     -1.13105
4        SLK      0.083806     -0.61615 





  Proteomics  Brca_P_Value  Brca_Median
0     TMSB10      0.131051      0.87630
1       DRG1      0.131051      0.75135
2      EIF4H      0.131051      0.41295
3      RIPK2      0.131051      0.82670
4      MIEF1      0.131051      0.84135 



  return np.nanmean(a, axis, out=out, keepdims=keepdims)


  Proteomics  Colon_P_Value  Colon_Median
0       DFFA       0.268924      -0.19930
1       GBF1       0.295685      -0.12630
2    SEC14L2       0.295685      -0.66800
3       WAPL       0.295685      -0.19605
4    STK11IP       0.345534      -0.31250 



  return np.nanmean(a, axis, out=out, keepdims=keepdims)


  Proteomics  Ov_P_Value  Ov_Median
0      MMS19    0.000110  -0.324897
1     PI4K2A    0.322853  -0.298399
2      RACK1    0.322853  -0.191250
3      PGAM2    0.355275  -0.292221
4      RRP12    0.355275  -0.305705 



# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frameshift). Different code is needed to create the data frame for Endometrial.

In [13]:
gene = 'PTEN'
prot = en.get_proteomics()
prot = prot.drop(columns = 'PTEN')
e_prot_list = list(prot.columns)

mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = e_prot_list)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare: wildtype versus the truncation-type mutations
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
# Copy the filtered rows so the relabeling below writes to an independent
# frame rather than a slice of `merged` (avoids SettingWithCopyWarning).
trunc_wt = merged[get].copy()
# Collapse every non-wildtype label that was kept into 'Truncation'
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# Every column except the last one (the Mutation label) is tested.
e_cols = list(trunc_wt.columns[:-1])

# Get all pvals
# wrap_ttest returns None when it has no results, in which case the rename
# below fails.
e_pval = wrap_ttest(trunc_wt, 'Mutation', e_cols, return_all = True, correction_method = 'fdr_bh', mincount = 5)
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

Differences in median, adapted for truncation mutations (median of truncations - median of wildtype tumors).

In [15]:
# Split the samples by label and take per-column medians.
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
# numeric_only=True leaves the string Mutation column out of the reduction;
# without it pandas >= 2.0 raises instead of silently dropping the column.
trunc_med = t.median(numeric_only=True)
wt_med = wt.median(numeric_only=True)

# Difference in medians (truncation - wildtype) for each tested column.
# The comprehension keeps its loop variable local, so the global `prot`
# is no longer overwritten.
en_d = {
    prot_name: trunc_med[prot_name] - wt_med[prot_name]
    for prot_name in e_cols
    if prot_name in trunc_med.index and prot_name in wt_med.index
}

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [16]:
# Outer-join p-values with median differences, then strip the '_proteomics'
# suffix from the string cells of the merged table.
e_merged = (
    e_pval
    .merge(en_med, on='Proteomics',how='outer')
    .replace(to_replace = r'_proteomics', value = '', regex = True)
)

# Create csv
en_csv_path = root+R'\En_pval_medians.csv'
e_merged.to_csv(en_csv_path,index=False)
e_merged.head()

Unnamed: 0,Proteomics,En_P_Value,En_Median
0,NOL10,0.000915,-0.442
1,ABT1,0.001242,-0.4851
2,TOPBP1,0.001242,-0.61795
3,UTP25,0.001242,-0.7345
4,RIF1,0.00159,-0.5464


# Get number of significant genes in each cancer

In [17]:
# Add Gbm and En to merged_dfs dictionary
# The keys double as the prefix of each table's '<key>_P_Value' column,
# which the significance count relies on.
merged_dfs['Gbm'] = g_merged
merged_dfs['En'] = e_merged

In [18]:
# Corrected p-values come from wrap_ttest, so the same 0.05 cutoff is applied
# to every cancer's '<cancer>_P_Value' column.
all_sig = []

for cancer, df in merged_dfs.items():
    below_cutoff = df[cancer+'_P_Value'] < 0.05
    sig_df = df.loc[below_cutoff]
    print(cancer, 'sig comparisons:', len(sig_df))
    all_sig.append(list(sig_df['Proteomics']))

# Flatten the per-cancer lists into one list (repeats across cancers kept)
flat_list = []
for sublist in all_sig:
    flat_list.extend(sublist)

# Remove duplicates
sig = list(set(flat_list))
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

Hnscc sig comparisons: 570
Luad sig comparisons: 111
Lscc sig comparisons: 3
Brca sig comparisons: 0
Colon sig comparisons: 0
Ov sig comparisons: 1
Gbm sig comparisons: 1900
En sig comparisons: 415

Number of significant comparisons in at least 1 cancer: 2630


In [None]:
# check

In [19]:
# check
# Spot check of the Luad table. Only the final expression is displayed; the
# two bare expressions before it are evaluated and discarded.
lu = merged_dfs['Luad']
len(lu.Proteomics.unique())
sl = lu.loc[lu['Luad_P_Value'] < 0.05]
list(sl.Proteomics)
lu.loc[lu['Proteomics'] == 'PIK3R1_isoform_1'] #ITGAL_isoform_1, PIK3R1_isoform_1, RABGAP1L_isoform_1

Unnamed: 0,Proteomics,Luad_P_Value,Luad_Median
103,PIK3R1_isoform_1,0.046156,-0.5345


In [20]:
# check
# Manual re-computation of one median difference as a sanity check.
# NOTE(review): `all_prot` is not a parameter of all_prot_format_df as defined
# in this notebook (cancer_object, mir_list, gene_in, utils); this call looks
# like it was written for an earlier definition - confirm before re-running.
pr = all_prot_format_df(l, all_prot = ['RABGAP1L'])

col_names = pd.Series(pr.columns)

# Same duplicate-column renaming as the per-cancer loop: the first occurrence
# keeps its name, later ones get '_isoform_<i>'.
for dup in col_names[col_names.duplicated()].unique(): 
    col_names[col_names[col_names == dup].index.values.tolist()] = [dup + '_isoform_' + str(i) if i != 0 else dup for i in range(sum(col_names == dup))]
pr.columns = col_names # rename the columns with the cols list

pr = pr[['RABGAP1L_proteomics_isoform_1', 'Mutation']]
delt = pr.loc[pr['Mutation'] == 'Deletion']
wt = pr.loc[pr['Mutation'] != 'Deletion']

# float() requires each median() result to hold exactly one value.
dm = float(delt.median())
wtm = float(wt.median())


# Printed as deletion median - wildtype median
print(dm, '-', wtm, '=', dm - wtm)



-0.3073 - 0.10500000000000001 = -0.4123


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# check

# Get a list of significant genes in at least one cancer

In [25]:
# `sig` is the de-duplicated list of names significant in at least one cancer.
s = pd.Series(sig)
print('Number of significant genes in at least one cancer:', len(s))
s.to_csv(root+R'\list_sig_one_cancer.csv', index=False)

Number of significant genes in at least one cancer: 2630


# Get a list of significant genes in multiple cancers

In [26]:
# sig in multiple (more than 1) cancers
# Count every name once with Counter instead of calling flat_list.count()
# per element, which rescans the whole list each time (quadratic).
from collections import Counter

sig_counts = Counter(flat_list)
# Keep names that occur more than once in flat_list, in first-seen order
mult = [name for name, n_hits in sig_counts.items() if n_hits > 1]
print('Number of significant genes in mult cancers:', len(mult))

Number of significant genes in mult cancers: 332


In [23]:
# Strip a trailing '_proteomics' suffix from each name, then write the list out.
m = pd.Series(mult).replace(to_replace = r'_proteomics$', value = '', regex = True)
mult_csv_path = root+R'\list_sig_multiple_cancers.csv'
m.to_csv(mult_csv_path, index=False)