# Check differences in t-test results between Lscc versions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]

    return del_wt


# Check version 1.0

In [3]:
ls_v1 = cptac.Lscc(version= '1.0')

Loading lscc v1.0.                       



                         



In [4]:
gene = 'PTEN'
ls_prot_v1 = ls_v1.get_proteomics()
ls_prot_v1 = u.reduce_multiindex(ls_prot_v1, levels_to_drop = 1)
ls_prot_list = list(ls_prot_v1.columns)

ls_del_wt = all_prot_format_df(ls_v1, ls_prot_list)
#ls_del_wt = ls_del_wt.dropna(axis='columns', how='all') # count all proteins



In [5]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [6]:
print('total_proteins_tested = ', len(ls_prot_list))
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})

total_proteins_tested =  11295
significant pvals: 
             Comparison       P_Value
0      RFC3_proteomics  4.340033e-07
1     CPSF2_proteomics  1.052148e-06
2       SLK_proteomics  4.777268e-06
3     CPSF1_proteomics  5.111559e-06
4     WDR33_proteomics  6.166593e-06
..                 ...           ...
281   ABCC2_proteomics  1.206208e-03
282   VPS35_proteomics  1.223632e-03
283  BICDL2_proteomics  1.225388e-03
284  WRAP53_proteomics  1.229143e-03
285   MED22_proteomics  1.238478e-03

[286 rows x 2 columns]


In [7]:
ls_sig.loc[ls_sig['Comparison'] == 'MSH2_proteomics']

Unnamed: 0,Comparison,P_Value
9,MSH2_proteomics,2.2e-05


# Check version 3.2

In [8]:
ls = cptac.Lscc()

version 3scc v3.2.......                 
                            



In [9]:
gene = 'PTEN'
ls_prot = ls.get_proteomics()
ls_prot = u.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_del_wt = all_prot_format_df(ls, ls_prot_list)
#ls_del_wt = ls_del_wt.dropna(axis='columns', how='all') # count all proteins



In [10]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [11]:
print('total_proteins_tested = ', len(ls_prot_list))
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})

total_proteins_tested =  11575
significant pvals: 
           Comparison       P_Value
0   ATAD1_proteomics  5.610837e-10
1   BTAF1_proteomics  1.962971e-07
2  VPS26A_proteomics  1.220741e-05
3    PTEN_proteomics  1.473964e-05


In [12]:
ls_pval.loc[ls_pval['Proteomics'] == 'MSH2_proteomics']

Unnamed: 0,Proteomics,Lscc_P_Value
1536,MSH2_proteomics,0.138684


# Check protein count

In [13]:
ls_prot_series_v1 = pd.Series(ls_prot_v1.columns).value_counts()
total_unique_proteins_v1 = len(ls_prot_series_v1)
print('Total Unique Proteins for version 1.0:  ', total_unique_proteins_v1)
ls_prot_series_v1

Total Unique Proteins for version 1.0:   10863


POSTN       5
PLEC        5
PUF60       4
HLA-DRB1    4
PML         4
           ..
COPZ1       1
AKAP17A     1
CXCL1       1
INTS7       1
RGS10       1
Name: Name, Length: 10863, dtype: int64

In [14]:
ls_prot_series = pd.Series(ls_prot.columns).value_counts()
total_unique_proteins_v3 = len(ls_prot_series)
print('Total Unique Proteins for version 3.2:  ', total_unique_proteins_v3)
ls_prot_series

Total Unique Proteins for version 3.2:   11128


PLEC        5
POSTN       5
HLA-DRB1    4
PML         4
PUF60       4
           ..
KLRG2       1
FMNL2       1
ZNF146      1
KLHL25      1
RGS10       1
Name: Name, Length: 11128, dtype: int64

In [15]:
change = total_unique_proteins_v3 - total_unique_proteins_v1
print('total_unique_proteins_v3 - total_unique_proteins_v1 = ', change)

total_unique_proteins_v3 - total_unique_proteins_v1 =  265


In [16]:
change_prot_iso = len(ls_prot.columns) - len(ls_prot_v1.columns) 
print('total_proteins_isoforms_v3 - total_proteins_isoforms_v1 = ', change_prot_iso)

total_proteins_isoforms_v3 - total_proteins_isoforms_v1 =  280


# Get list of added proteins

In [31]:
ls_prot_df = pd.DataFrame(ls_prot.columns)
ls_prot_df_v1 = pd.DataFrame(ls_prot_v1.columns)

In [60]:
# Use indicator to note which df the protein comes from 
# Keep only proteins found in one df
added_prot = ls_prot_df.merge(ls_prot_df_v1,indicator = True, how='left').loc[lambda x : x['_merge']!='both']
added_prot.head(2)

Unnamed: 0,Name,_merge
90,ABRAXAS2,left_only
198,ACTR8,left_only


In [58]:
added_prot_list = list(added_prot.Name)
added_prot_list

['ABRAXAS2',
 'ACTR8',
 'AFTPH',
 'AKAP9',
 'ANAPC1',
 'ANAPC4',
 'ANAPC5',
 'ANKHD1-EIF4EBP3',
 'AP1G1',
 'AP3B1',
 'AP3D1',
 'AP4M1',
 'ARF1',
 'ARID1B',
 'ARIH2',
 'BABAM1',
 'BABAM2',
 'BAG6',
 'BIRC6',
 'BOD1L1',
 'BRAP',
 'BRK1',
 'C2CD5',
 'CABIN1',
 'CAPZA1',
 'CCZ1',
 'CDC23',
 'CDC37',
 'CDKN2AIP',
 'CEP192',
 'CEP350',
 'CHD2',
 'CIAO1',
 'CIC',
 'CLTC',
 'CNOT10',
 'CNOT3',
 'CNOT9',
 'COG1',
 'COG4',
 'COG5',
 'COG6',
 'COG7',
 'COG8',
 'COPS2',
 'COPS3',
 'COPS4',
 'CSNK2A2',
 'CSTF2T',
 'CYFIP1',
 'DAZAP1',
 'DCP1A',
 'DDB1',
 'DDX1',
 'DDX19A',
 'DDX39B',
 'DHX15',
 'DIS3L2',
 'DYNC1H1',
 'DYNC1LI1',
 'EIF2B3',
 'EIF2B4',
 'EIF2B5',
 'EIF3C',
 'EIF3D',
 'EIF3E',
 'EIF3F',
 'EIF3H',
 'EIF4E2',
 'EIF4G2',
 'EIPR1',
 'ELOB',
 'EP400',
 'FAF1',
 'FAM120A',
 'FAM208A',
 'GCC1',
 'GET4',
 'GRIPAP1',
 'GSK3A',
 'GTPBP1',
 'HCFC1',
 'HDAC3',
 'HEATR5B',
 'HERC1',
 'HGS',
 'HNRNPA2B1',
 'HNRNPH1',
 'HNRNPH2',
 'HPS3',
 'ING3',
 'INO80',
 'INTS11',
 'INTS4',
 'IPO13',
 'KDM5A',
 

In [55]:
# Check if protein in earlier version 1.0
ls_prot_v1 = ls_v1.get_proteomics()
ls_prot_v1 = u.reduce_multiindex(ls_prot_v1, levels_to_drop = 1)
test_gene = added_prot_list[0]
print('test gene: ', test_gene)

print('should show key error, none are in the columns for lscc v1.0')
ls_prot_v1.loc[:, added_prot_list] 

test gene:  ABRAXAS2
should show key error, none are in the columns for lscc v1.0




KeyError: "None of [Index(['ABRAXAS2', 'ACTR8', 'AFTPH', 'AKAP9', 'ANAPC1', 'ANAPC4', 'ANAPC5',\n       'ANKHD1-EIF4EBP3', 'AP1G1', 'AP3B1',\n       ...\n       'WAC', 'WAPL', 'WDR26', 'WDR59', 'WDR61', 'WDR82', 'XIAP', 'ZC3H7A',\n       'ZMIZ1', 'ZSWIM8'],\n      dtype='object', name='Name', length=265)] are in the [columns]"