# Check num sig genes, pvals, and change in medians in csv files

Check values in all_proteins.csv and all_heatmap.csv (sig_pval_heatmap.csv and mult_sig_pval_heatmap.csv are derived from all_heatmap.csv).

In [206]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 
import math

import cptac
import cptac.utils as u

In [420]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad', 'endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)
        
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    elif cancer_object.get_cancer_type() in ('luad'): 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = cancer_object.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')
    
    elif cancer_object.get_cancer_type() in ('endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot,
            tissue_type = 'tumor') # drop Normal samples
        
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep two values to compare
        compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
                        mut_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_proteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [432]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    if  cancer_object.get_cancer_type() in ('endometrial'):
        d = del_wt_df[del_wt_df.Mutation == "Truncation"]
        wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    else:
        d = del_wt_df[del_wt_df.Mutation == "Deletion"]
        wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        if prot in del_med.index and prot in wt_med.index:
            dif = del_med[prot] - wt_med[prot]
            med_dict[prot] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

In [354]:
# Adds '_i' to duplicate col names where i is a number (increases with each duplicate)

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [4]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



                                            

In [73]:
# Tables to check
all_proteins = pd.read_csv(r'../Make_Tables/csv/all_proteins.csv')
all_heatmap = pd.read_csv(r'../Make_Tables/csv/all_heatmap.csv') 
# sig_pval_heatmap and mult_sig_pval_heatmap derived from all_heatmap

# Test cancers

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [418]:
# Replace with cancer of choice
# Names in csv to use for cancer variable: 'Gbm', 'Hnscc', 'Lscc', 'Luad', 'Brca', 'Ov', 'En', 'Colon'
cancer = 'Lscc'
cancer_obj = ls

gene = 'PTEN'
prot = cancer_obj.get_proteomics()
if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
    prot = u.reduce_multiindex(prot, levels_to_drop = 1)

prot = prot.drop(columns = 'PTEN') # cis effect
prot_list = list(prot.columns)

del_wt = all_prot_format_df(cancer_obj, prot_list)
del_wt.head()



Name,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AAED1,AAGAB,AAK1,...,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00081,-3.1102,-6.4487,-1.1714,0.266,0.7587,-0.5222,-6.2401,0.7268,0.9413,0.0487,...,,0.524,0.8369,,1.8136,-0.1484,-0.7831,-0.5483,-0.5135,Deletion
C3L-00415,-2.6522,-4.5841,6.3064,,0.2618,2.7781,6.6933,-0.2847,-0.0385,-0.2198,...,0.6812,0.2781,-0.5418,,0.7894,-0.6121,-1.5266,0.1942,-0.0953,Deletion
C3L-00445,-0.6754,-3.6368,-5.2459,2.5505,0.2253,1.7921,-0.9698,-1.808,1.7646,-0.1414,...,,0.5222,1.4278,2.7775,1.4203,0.7517,-0.8849,-0.7053,0.7442,Wildtype_Tumor
C3L-00568,-1.9178,-3.4358,-6.3256,-0.9436,0.6271,2.2628,-5.112,-0.0101,1.589,0.6312,...,0.0548,1.4713,2.5429,0.6555,-0.3754,-0.5175,-0.8949,0.3471,0.4404,Deletion
C3L-00603,-1.7088,-1.6799,-4.8142,,0.837,1.4581,-2.0167,-1.5169,0.1562,0.2286,...,-0.1679,1.3386,3.0732,0.4694,1.3476,-0.0376,-1.2996,0.1218,0.5527,Wildtype_Tumor


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [419]:
prot_and_isoforms = list(del_wt[:-1])

# Get all pvals
pval = u.wrap_ttest(del_wt, 'Mutation', prot_and_isoforms, return_all = True, 
                    correction_method = 'fdr_bh', pval_return_corrected = True, mincount = 5)

In [431]:
# Check num sig genes 
sig = pval.loc[pval['P_Value'] < 0.05]
print('Num sig genes for '+cancer+':', len(sig))

check_count = {'Gbm': 1900, 'Hnscc': 570, 'En':415, 'Luad': 111, 'Lscc': 3, 'Ov': 1, 'Brca': 0, 'Colon': 0}
if check_count[cancer] == len(sig):
    print('Sig counts match')
else:
    print('Different counts')

Num sig genes for Lscc: 3
Sig counts match


Part 3: Create the differences in median df. (median of wildtype tumors - median of cnv deletions)

In [448]:
prot_and_isoforms = list(del_wt[:-1])
med = get_change_in_medians_df(cancer_obj, cancer, del_wt, prot_and_isoforms)

In [435]:
# test get_change_in_medians_df
# tested genes: RFC1, MMS19, HERC4, MAD2L1, TOP2A, TP53, DNM1L_1
trans_gene = 'DNM1L_1' # change to test different genes

In [449]:
del_wt_df = del_wt[[trans_gene, 'Mutation']].dropna(axis = 'index', how = 'any')
print('Mutation with actual measurements for', trans_gene+':\n', del_wt_df.Mutation.value_counts())

if cancer_obj.get_cancer_type() in ('endometrial'):
    d = del_wt_df[del_wt_df.Mutation == "Truncation"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
else:
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
del_med = d.median()
wt_med = wt.median()

# Correlation: + is mutant up compared to wt, - is mutant down
test_med = del_med[trans_gene] - wt_med[trans_gene]
print('\n', del_med[trans_gene], '-', wt_med[trans_gene], ' = ', 
      float(del_med[trans_gene]) - float(wt_med[trans_gene]))

Mutation with actual measurements for DNM1L_1:
 Deletion          52
Wildtype_Tumor    47
Name: Mutation, dtype: int64

 0.37395 - 0.4234  =  -0.049449999999999994


In [450]:
# From function
func_med = med.loc[med['Proteomics'] == trans_gene]
from_func_med = float(func_med[cancer+'_Median']) 

# Check if get_change_in_medians_df gets same change in median as when calculated above
if from_func_med == test_med:
    print('test get_change_in_medians_df passed')
else:
    print('FAILED: check function')

# From wrap_ttest
gene_pval = pval.loc[pval['Comparison'] == trans_gene]
from_wrap_ttest_pval = float(gene_pval.P_Value)

test get_change_in_medians_df passed


In [455]:
# check same pval and change in medians as in csv files
# test all_proteins.csv
cancer_df = all_proteins[['Proteomics', cancer+'_P_Value', cancer+'_Median']]
gene_proteins = cancer_df.loc[cancer_df['Proteomics'] == trans_gene]

print('Test all_proteins.csv')
from_proteins_pval = float(gene_proteins[cancer+'_P_Value'])
from_proteins_med = float(gene_proteins[cancer+'_Median'])
# Check t-test
if math.isclose(from_wrap_ttest_pval, from_proteins_pval, rel_tol=0.01): # true if floats within 0.01
    print(True, 'pvals match')
else:
    print(False)
# Check change in medians
if math.isclose(test_med, from_proteins_med, rel_tol=0.01):
    print(True, 'change in medians match\n')
else:
    print(False)
print(gene_proteins, '\n')



# test all_heatmap.csv
gene_heatmap = all_heatmap.loc[all_heatmap['Proteomics'] == trans_gene]
gene_heatmap = gene_heatmap.loc[gene_heatmap['Cancer'] == cancer]

print('Test all_heatmap.csv')
from_heatmap_pval = float(gene_heatmap.P_Value)
from_heatmap_med = float(gene_proteins[cancer+'_Median'])
# Check t-test
if math.isclose(from_wrap_ttest_pval, from_heatmap_pval, rel_tol=0.01):
    print(True, 'pvals match')
else:
    print(False)
# Check change in medians
if math.isclose(test_med, from_heatmap_med, rel_tol=0.01): 
    print(True, 'change in medians match\n')
else:
    print(False)
print(gene_heatmap, '\n')

Test all_proteins.csv


TypeError: cannot convert the series to <class 'float'>

In [456]:
gene_proteins

Unnamed: 0,Proteomics,Lscc_P_Value,Lscc_Median
