# Check differences in t-test results between Lscc versions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re

import cptac
import cptac.utils as u

import plot_utils as p

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]

    return del_wt


# Check version 1.0

In [3]:
ls_v1 = cptac.Lscc(version= '1.0')

Loading lscc v1.0.                       



                         



In [4]:
gene = 'PTEN'
ls_prot_v1 = ls_v1.get_proteomics()
ls_prot_v1 = u.reduce_multiindex(ls_prot_v1, levels_to_drop = 1)
ls_prot_list = list(ls_prot_v1.columns)

ls_del_wt = all_prot_format_df(ls_v1, ls_prot_list)
#ls_del_wt = ls_del_wt.dropna(axis='columns', how='all') # count all proteins

  return array(a, dtype, copy=False, order=order)


In [5]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [6]:
print('total_proteins_tested = ', len(ls_prot_list))
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})

total_proteins_tested =  11295
significant pvals: 
             Comparison   P_Value
0      RFC3_proteomics  0.004902
1     CPSF2_proteomics  0.005941
2     WDR33_proteomics  0.013929
3       SLK_proteomics  0.013929
4     CPSF1_proteomics  0.013929
..                 ...       ...
281   ABCC2_proteomics  0.048308
282   VPS35_proteomics  0.048709
283  WRAP53_proteomics  0.048709
284  BICDL2_proteomics  0.048709
285   MED22_proteomics  0.048907

[286 rows x 2 columns]


In [7]:
ls_sig.loc[ls_sig['Comparison'] == 'MSH2_proteomics']

Unnamed: 0,Comparison,P_Value
13,MSH2_proteomics,0.019573


# Check version 3.2

In [8]:
ls_v3 = cptac.Lscc(version= '3.2')

Loading lscc v3.2.                       



                            



In [61]:
gene = 'PTEN'
ls_prot = ls_v3.get_proteomics()
ls_prot = u.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_v3_del_wt = all_prot_format_df(ls_v3, ls_prot_list)
ls_v3_del_wt = ls_del_wt.dropna(axis='columns', how='all') # count all proteins
len(ls_v3_del_wt.columns)

  return array(a, dtype, copy=False, order=order)


11576

In [62]:
# Differentiate duplicate column names
cols = pd.Series(ls_v3_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_v3_del_wt.columns=cols

In [63]:
print('total_proteins_tested = ', len(ls_prot_list))
cols = list(ls_v3_del_wt.columns[:-2])

# Get all pvals
ls_v3_pval = u.wrap_ttest(ls_v3_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_v3_sig = ls_v3_pval.loc[ls_v3_pval['P_Value'] < 0.05]

total_proteins_tested =  11575


In [64]:
ls_v3_del_wt['ATAD1_proteomics'].dropna()

Patient_ID
C3L-00081    0.0719
C3L-00415   -0.8259
C3L-00445    0.1230
C3L-00568   -1.3617
C3L-00603    0.6922
              ...  
C3N-03886   -1.0619
C3N-04124    0.4544
C3N-04127   -0.7996
C3N-04155    0.1896
C3N-04162    1.2685
Name: ATAD1_proteomics, Length: 97, dtype: float64

# v 3.2.1

In [13]:
ls = cptac.Lscc()

                                         



In [25]:
ls.version()

'3.2.1'

In [65]:
gene = 'PTEN'
ls_prot = ls.get_proteomics()
ls_prot = u.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_del_wt = all_prot_format_df(ls, ls_prot_list)
ls_del_wt = ls_del_wt.dropna(axis='columns', how='all') # count all proteins

  return array(a, dtype, copy=False, order=order)


In [66]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [67]:
print('total_proteins_tested = ', len(ls_prot_list))
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', len(ls_sig))

#ls_sig

total_proteins_tested =  11575
significant pvals: 
 59


In [73]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Deletion','Wildtype_Tumor'), 
    binary_col = 'Mutation'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        omics_cols = set(omics_cols)
        if binary_col in (omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    
    print('genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

In [76]:
find_few_data_genes(ls_del_wt, 5)

genes with not enough data:  1 / 11575


['TRMT2B_proteomics']

In [77]:
find_few_data_genes(ls_v3_del_wt, 5)

genes with not enough data:  1 / 11575


['TRMT2B_proteomics']

In [68]:
# check different number of measurements between v 3.2 and 3.2.1

In [69]:
g_list = list(ls_sig.Comparison) #[['Comparison']].replace('_proteomics', '', regex = True).Comparison)

sig_ls_v3 = ls_v3_del_wt[ls_v3_del_wt.columns[ls_v3_del_wt.columns.isin(g_list)]]
sig_ls = ls_del_wt[ls_del_wt.columns[ls_del_wt.columns.isin(g_list)]]

for g in g_list:
    g_ls_v3 = len(sig_ls_v3[[g]].dropna())
    g_ls = len(sig_ls[[g]].dropna())
    print(g, g_ls_v3, g_ls)
    

PTEN_proteomics 97 97
ATE1_proteomics 97 97
ATAD1_proteomics 97 97
INTS2_proteomics 97 97
FAM45A_proteomics 97 97
CCT7_proteomics 97 97
NPLOC4_proteomics 97 97
PCBP1_proteomics 97 97
INTS10_proteomics 97 97
CCT2_proteomics 97 97
CCDC186_proteomics 97 97
FASN_proteomics 97 97
CCT3_proteomics 97 97
CCSER2_proteomics 97 97
WDR91_proteomics 97 97
EDC3_proteomics 97 97
SLK_proteomics 97 97
HACD3_proteomics 97 97
R3HDM1_proteomics 97 97
NCAPG_proteomics 97 97
RNF10_proteomics 89 89
CCT6A_proteomics 97 97
RTKN_proteomics 97 97
POC1B_proteomics 83 83
RFC3_proteomics 97 97
COA5_proteomics 97 97
TCP1_proteomics 97 97
VPS26A_proteomics 97 97
PPP3CB_proteomics 97 97
DSG2_proteomics 97 97
STOML2_proteomics 97 97
PIAS2_proteomics 94 94
DIP2B_proteomics 97 97
TXNL1_proteomics 97 97
RASGRP4_proteomics 97 97
COLGALT2_proteomics 81 81
ZSWIM8_proteomics 97 97
DDX39A_proteomics 97 97
SYTL3_proteomics 94 94
VCP_proteomics 97 97
FADS2_proteomics 97 97
TIMM21_proteomics 97 97
MSH2_proteomics 97 97
WASHC2C_pr

In [70]:
ls_v3_del_wt['ATAD1_proteomics'].dropna()

Patient_ID
C3L-00081    0.0719
C3L-00415   -0.8259
C3L-00445    0.1230
C3L-00568   -1.3617
C3L-00603    0.6922
              ...  
C3N-03886   -1.0619
C3N-04124    0.4544
C3N-04127   -0.7996
C3N-04155    0.1896
C3N-04162    1.2685
Name: ATAD1_proteomics, Length: 97, dtype: float64

In [71]:
ls_del_wt['ATAD1_proteomics'].dropna()

Patient_ID
C3L-00081    0.0719
C3L-00415   -0.8259
C3L-00445    0.1230
C3L-00568   -1.3617
C3L-00603    0.6922
              ...  
C3N-03886   -1.0619
C3N-04124    0.4544
C3N-04127   -0.7996
C3N-04155    0.1896
C3N-04162    1.2685
Name: ATAD1_proteomics, Length: 97, dtype: float64

# Check protein count

In [17]:
ls_prot_series_v1 = pd.Series(ls_prot_v1.columns).value_counts()
total_unique_proteins_v1 = len(ls_prot_series_v1)
print('Total Unique Proteins for version 1.0:  ', total_unique_proteins_v1)
ls_prot_series_v1

Total Unique Proteins for version 1.0:   10863


POSTN       5
PLEC        5
PML         4
PUF60       4
HLA-DRB1    4
           ..
SLC43A3     1
THBS1       1
SPRYD3      1
SLF2        1
SERAC1      1
Name: Name, Length: 10863, dtype: int64

In [18]:
ls_prot_series = pd.Series(ls_prot.columns).value_counts()
total_unique_proteins_v3 = len(ls_prot_series)
print('Total Unique Proteins for version 3.2:  ', total_unique_proteins_v3)
ls_prot_series

Total Unique Proteins for version 3.2:   11128


PLEC        5
POSTN       5
PML         4
PUF60       4
HLA-DRB1    4
           ..
LONP1       1
PRKRA       1
TMC4        1
NFKBIB      1
SERAC1      1
Name: Name, Length: 11128, dtype: int64

In [19]:
change = total_unique_proteins_v3 - total_unique_proteins_v1
print('total_unique_proteins_v3 - total_unique_proteins_v1 = ', change)

total_unique_proteins_v3 - total_unique_proteins_v1 =  265


In [20]:
change_prot_iso = len(ls_prot.columns) - len(ls_prot_v1.columns) 
print('total_proteins_isoforms_v3 - total_proteins_isoforms_v1 = ', change_prot_iso)

total_proteins_isoforms_v3 - total_proteins_isoforms_v1 =  280


# Get list of added proteins

In [21]:
ls_prot_df = pd.DataFrame(ls_prot.columns)
ls_prot_df_v1 = pd.DataFrame(ls_prot_v1.columns)

In [22]:
# Use indicator to note which df the protein comes from 
# Keep only proteins found in one df
added_prot = ls_prot_df.merge(ls_prot_df_v1,indicator = True, how='left').loc[lambda x : x['_merge']!='both']
added_prot.head(2)

Unnamed: 0,Name,_merge
90,ABRAXAS2,left_only
198,ACTR8,left_only


In [23]:
added_prot_list = list(added_prot.Name)
added_prot_list

['ABRAXAS2',
 'ACTR8',
 'AFTPH',
 'AKAP9',
 'ANAPC1',
 'ANAPC4',
 'ANAPC5',
 'ANKHD1-EIF4EBP3',
 'AP1G1',
 'AP3B1',
 'AP3D1',
 'AP4M1',
 'ARF1',
 'ARID1B',
 'ARIH2',
 'BABAM1',
 'BABAM2',
 'BAG6',
 'BIRC6',
 'BOD1L1',
 'BRAP',
 'BRK1',
 'C2CD5',
 'CABIN1',
 'CAPZA1',
 'CCZ1',
 'CDC23',
 'CDC37',
 'CDKN2AIP',
 'CEP192',
 'CEP350',
 'CHD2',
 'CIAO1',
 'CIC',
 'CLTC',
 'CNOT10',
 'CNOT3',
 'CNOT9',
 'COG1',
 'COG4',
 'COG5',
 'COG6',
 'COG7',
 'COG8',
 'COPS2',
 'COPS3',
 'COPS4',
 'CSNK2A2',
 'CSTF2T',
 'CYFIP1',
 'DAZAP1',
 'DCP1A',
 'DDB1',
 'DDX1',
 'DDX19A',
 'DDX39B',
 'DHX15',
 'DIS3L2',
 'DYNC1H1',
 'DYNC1LI1',
 'EIF2B3',
 'EIF2B4',
 'EIF2B5',
 'EIF3C',
 'EIF3D',
 'EIF3E',
 'EIF3F',
 'EIF3H',
 'EIF4E2',
 'EIF4G2',
 'EIPR1',
 'ELOB',
 'EP400',
 'FAF1',
 'FAM120A',
 'FAM208A',
 'GCC1',
 'GET4',
 'GRIPAP1',
 'GSK3A',
 'GTPBP1',
 'HCFC1',
 'HDAC3',
 'HEATR5B',
 'HERC1',
 'HGS',
 'HNRNPA2B1',
 'HNRNPH1',
 'HNRNPH2',
 'HPS3',
 'ING3',
 'INO80',
 'INTS11',
 'INTS4',
 'IPO13',
 'KDM5A',
 

In [24]:
# Check if protein in earlier version 1.0
ls_prot_v1 = ls_v1.get_proteomics()
ls_prot_v1 = u.reduce_multiindex(ls_prot_v1, levels_to_drop = 1)
test_gene = added_prot_list[0]
print('test gene: ', test_gene)

print('should show key error, none are in the columns for lscc v1.0')
ls_prot_v1.loc[:, added_prot_list] 

test gene:  ABRAXAS2
should show key error, none are in the columns for lscc v1.0




KeyError: "None of [Index(['ABRAXAS2', 'ACTR8', 'AFTPH', 'AKAP9', 'ANAPC1', 'ANAPC4', 'ANAPC5',\n       'ANKHD1-EIF4EBP3', 'AP1G1', 'AP3B1',\n       ...\n       'WAC', 'WAPL', 'WDR26', 'WDR59', 'WDR61', 'WDR82', 'XIAP', 'ZC3H7A',\n       'ZMIZ1', 'ZSWIM8'],\n      dtype='object', name='Name', length=265)] are in the [columns]"