In [1]:
import os
import numpy as np
import pandas as pd 

%matplotlib inline
import seaborn as sns 
from matplotlib import pyplot as plt

from pingouin import compute_effsize
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

In [2]:
get_data_path = lambda folders, fname: os.path.normpath(os.environ['DATA_PATH']+'/'+'/'.join(folders) +'/'+ fname)
get_tables_path = lambda folders, fname: os.path.normpath('../local_data/' + '/'.join(folders) + '/' + fname)
get_figure_path = lambda fname: os.path.normpath('../local_data/figures/' + fname)

In [3]:
file_tcga_survival_analysis = get_data_path(['usc_xena_browser'], 'Survival_SupplementalTable_S1_20171025_xena_sp')
file_cptac_sample_info = get_tables_path(['processed_data'], 'CPTAC_sample_info.parquet')
file_vae_100_encoded_features = get_tables_path(['processed_data'], 'encoded_rnaseq_vae100_cptac.parquet')
file_vae_500_encoded_features = get_tables_path(['processed_data'], 'encoded_rnaseq_vae500_cptac.parquet')
file_vae_1000_encoded_features = get_tables_path(['processed_data'], 'encoded_rnaseq_vae1000_cptac.parquet')

In [4]:
cptac_sample_info = pd.read_parquet(file_cptac_sample_info).rename(columns={'Gender':'Sex'})
cptac_sample_info['Study'] = cptac_sample_info['Study'].apply(lambda x: 'CPTAC-'+x)
cptac_sample_info[:2]

Unnamed: 0,Sex,Age,Study
C3L-00004,male,72.0,CPTAC-ccRCC
C3L-00010,male,30.0,CPTAC-ccRCC


In [5]:
survival_analysis_data = pd.read_csv(file_tcga_survival_analysis, sep='\t', 
                                     usecols=['sample', 'cancer type abbreviation', 
                                              'gender', 'new_tumor_event_type']).set_index('sample')
survival_analysis_data.rename(columns={'cancer type abbreviation': 'Study', 
                                       'gender': 'Sex'}, inplace=True)
survival_analysis_data['Study'] = survival_analysis_data['Study'].apply(lambda x: 'TCGA-'+x)
print("Dimensions: ", survival_analysis_data.shape)
survival_analysis_data[:2]

Dimensions:  (12591, 3)


Unnamed: 0_level_0,Study,Sex,new_tumor_event_type
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-OR-A5J1-01,TCGA-ACC,MALE,Distant Metastasis
TCGA-OR-A5J2-01,TCGA-ACC,FEMALE,Distant Metastasis


In [6]:
survival_analysis_data['Sex'].replace({"MALE": "male",  "FEMALE": "female"}, inplace=True)

In [7]:
combined_sample = pd.concat([cptac_sample_info[['Sex', 'Study']], survival_analysis_data[['Sex', 'Study']]])
combined_sample[:2]

Unnamed: 0,Sex,Study
C3L-00004,male,CPTAC-ccRCC
C3L-00010,male,CPTAC-ccRCC


In [8]:
combined_sample = combined_sample[~combined_sample.index.duplicated(keep='first')]

In [9]:
vae_100_encoded_features = pd.read_parquet(file_vae_100_encoded_features)
print(vae_100_encoded_features.shape)
vae_100_encoded_features[:2]

(12915, 100)


sample_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
X05BR044,8.833497,3.561182,1.77851,0.0,0.695813,10.420034,0.0,0.0,3.694162,0.158421,...,0.79455,0.202841,0.0,0.0,0.0,0.962331,0.0,0.0,8.418137,2.354422
X06BR006,8.827207,3.075365,0.0,0.0,0.146443,9.978468,0.0,0.0,0.734273,0.0,...,0.0,0.155483,0.0,0.0,0.0,0.939598,0.751575,1.725533,7.726731,0.642764


In [10]:
vae_500_encoded_features = pd.read_parquet(file_vae_500_encoded_features)
print(vae_500_encoded_features.shape)
vae_500_encoded_features[:2]

(12915, 500)


sample_id,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
X05BR044,0.350045,0.0,0.441421,0.0,1.948918,5.326907,0.0,0.0,2.499535,0.0,...,0.0,5.753693,2.258451,1.168775,0.0,0.0,0.049345,0.0,0.0,0.0
X06BR006,0.449813,0.0,0.261759,0.0,1.28763,2.808841,0.0,0.0,0.163546,0.0,...,0.0,4.974125,2.090433,1.25725,0.0,0.910411,0.27513,0.0,0.0,0.0


In [11]:
vae_1000_encoded_features = pd.read_parquet(file_vae_1000_encoded_features)
print(vae_100_encoded_features.shape)
vae_1000_encoded_features[:2]

(12915, 100)


sample_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
X05BR044,0.0,0.0,0.0,0.853952,0.0,0.0,0.0,0.70587,0.0,0.0,...,0.0,0.0,0.74215,3.545775,0.0,0.0,0.0,0.0,0.0,0.0
X06BR006,0.0,0.0,0.0,0.389183,0.0,0.0,0.0,1.168161,0.0,0.0,...,1.208577,0.0,0.0,2.147605,0.296745,0.0,0.0,0.0,0.0,0.0


In [12]:
common_patients = np.intersect1d(vae_100_encoded_features.index, combined_sample.index)
print("Common patients: ", len(common_patients))

Common patients:  12915


In [13]:
vae100_samples = pd.concat([combined_sample, vae_100_encoded_features], axis=1, join='inner')
print(vae100_samples.shape)
vae100_samples[:2]

(12915, 102)


Unnamed: 0,Sex,Study,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
C3L-00004,male,CPTAC-ccRCC,8.323169,0.721695,0.0,1.13097,4.164046,1.284208,6.744878,2.125343,...,0.0,3.657236,6.068415,0.0,0.0,6.764137,0.151883,0.0,0.0,0.0
C3L-00010,male,CPTAC-ccRCC,7.937684,1.398556,0.0,1.509008,4.320913,0.912725,6.286304,2.330709,...,0.0,4.267029,6.26283,0.0,0.0,7.388425,0.054899,0.0,0.0,0.0


In [14]:
vae500_samples = pd.concat([combined_sample, vae_500_encoded_features], axis=1, join='inner')
print(vae500_samples.shape)
vae500_samples[:2]

(12915, 502)


Unnamed: 0,Sex,Study,1,2,3,4,5,6,7,8,...,491,492,493,494,495,496,497,498,499,500
C3L-00004,male,CPTAC-ccRCC,0.699492,0.0,4.937079,0.0,3.314665,0.65847,0.0,0.0,...,0.0,7.317772,0.209015,1.303783,0.0,0.0,1.208687,0.274448,0.0,0.787194
C3L-00010,male,CPTAC-ccRCC,0.389071,0.0,5.26749,0.0,2.208476,0.390722,0.0,0.0,...,0.0,7.376022,0.429111,1.607995,0.0,0.0,1.238013,0.0,0.0,0.0


In [15]:
vae1000_samples = pd.concat([combined_sample, vae_1000_encoded_features], axis=1, join='inner')
print(vae1000_samples.shape)
vae1000_samples[:2]

(12915, 1002)


Unnamed: 0,Sex,Study,1,2,3,4,5,6,7,8,...,991,992,993,994,995,996,997,998,999,1000
C3L-00004,male,CPTAC-ccRCC,0.0,0.0,0.0,0.173331,1.082308,0.417386,0.0,1.654185,...,0.090461,0.0,0.0,0.0,3.564837,0.0,1.739093,0.274997,0.0,0.0
C3L-00010,male,CPTAC-ccRCC,0.0,0.0,0.0,0.140758,1.764406,0.0,0.0,0.956764,...,0.0,0.0,0.201519,0.0,3.317901,0.0,1.347144,0.178014,0.0,0.0


In [16]:
SEX = 'Sex'; VAE100 = 'VAE100'; VAE500 = 'VAE500'; VAE1000 = 'VAE1000'
results = []

def get_label_column(data, cancerType):
    conditions = [
        data['Study'].eq(cancerType),
        data['Study'].ne(cancerType)]
    choices = [cancerType, 'Other']
    data.insert(loc = 0, column = 'label', value=np.select(conditions, choices, default=0))
    data = data.drop(columns=['Study']).rename(columns={'label': 'Study'})
    data['Study'] = pd.Categorical(data['Study'], ordered=True, categories=['Other', cancerType])
    return(data.sort_values('Study'))


def perform_stat_test(vae_encodings, vae_encoding, test_gender=True, study=None):
    if(test_gender):
        vae_encodings_subset = vae_encodings.drop(columns='Study')
        stat_test_results = pd.DataFrame.from_dict({column: mannwhitneyu(vae_encodings_subset[vae_encodings_subset['Sex'] == 'female'][column],
                                                                         vae_encodings_subset[vae_encodings_subset['Sex'] == 'male'][column]) for column in vae_encodings_subset.columns[1:]}, orient='index')
        
        stat_test_results['pval_adjusted'] = fdrcorrection(stat_test_results['pvalue'])[1]
        stat_test_results['CLES'] = pd.Series({column: compute_effsize(vae_encodings_subset[vae_encodings_subset['Sex'] == 'female'][column], 
                                                                                vae_encodings_subset[vae_encodings_subset['Sex'] == 'male'][column], eftype='CLES') for column in vae_encodings_subset.columns[1:]})
        interested_result = stat_test_results[stat_test_results['pval_adjusted'] <= 0.0001].sort_values('CLES', 
                                                                                                        ascending=False)[:2]
        
        interested_result['Test'] = SEX
        interested_result['VAE-encodings'] = vae_encoding
        results.append(interested_result.drop(columns=['statistic']).reset_index().rename(columns={'index': 'Encoding'}))
    
    else:
        vae_encodings_subset = vae_encodings.drop(columns='Sex')
        vae_encodings_subset = get_label_column(vae_encodings_subset, study)
        stat_test_results = pd.DataFrame.from_dict({column: mannwhitneyu(vae_encodings_subset[vae_encodings_subset['Study'] == study][column], 
                                                                         vae_encodings_subset[vae_encodings_subset['Study'] == 'Other'][column]) for column in vae_encodings_subset.columns[1:]}, 
                                                   orient='index')
        
        stat_test_results['pval_adjusted'] = fdrcorrection(stat_test_results['pvalue'])[1]
        stat_test_results['CLES'] = pd.Series({column: compute_effsize(vae_encodings_subset[vae_encodings_subset['Study'] == study][column], 
                                                        vae_encodings_subset[vae_encodings_subset['Study'] == 'Other'][column], eftype='CLES') \
                                               for column in vae_encodings_subset.columns[1:]})
        interested_result = stat_test_results[stat_test_results['pval_adjusted'] <= 0.0001].sort_values('CLES', 
                                                                                                        ascending=False)[:2]
        
        interested_result['Test'] = study
        interested_result['VAE-encodings'] = vae_encoding
        results.append(interested_result.drop(columns=['statistic']).reset_index().rename(columns={'index': 'Encoding'}))
        print('Completed ' + study)

In [17]:
perform_stat_test(vae100_samples, VAE100)

In [18]:
perform_stat_test(vae500_samples, VAE500)

In [19]:
perform_stat_test(vae1000_samples, VAE1000)

In [20]:
for study in vae100_samples.Study.unique():
    perform_stat_test(vae100_samples, VAE100, test_gender=False, study=study)

Completed CPTAC-ccRCC
Completed CPTAC-UCEC
Completed CPTAC-BrCa2020
Completed CPTAC-GBM
Completed CPTAC-HNSCC
Completed CPTAC-LSCC
Completed CPTAC-LUAD
Completed CPTAC-OvCa2020
Completed CPTAC-Pdac
Completed CPTAC-BrCa2016
Completed CPTAC-OvCa2016
Completed TCGA-ACC
Completed TCGA-BLCA
Completed TCGA-BRCA
Completed TCGA-CESC
Completed TCGA-CHOL
Completed TCGA-COAD
Completed TCGA-DLBC
Completed TCGA-ESCA
Completed TCGA-GBM
Completed TCGA-HNSC
Completed TCGA-KICH
Completed TCGA-KIRC
Completed TCGA-KIRP
Completed TCGA-LAML
Completed TCGA-LGG
Completed TCGA-LIHC
Completed TCGA-LUAD
Completed TCGA-LUSC
Completed TCGA-MESO
Completed TCGA-OV
Completed TCGA-PAAD
Completed TCGA-PCPG
Completed TCGA-PRAD
Completed TCGA-READ
Completed TCGA-SARC
Completed TCGA-SKCM
Completed TCGA-STAD
Completed TCGA-TGCT
Completed TCGA-THCA
Completed TCGA-THYM
Completed TCGA-UCEC
Completed TCGA-UCS
Completed TCGA-UVM


In [21]:
for study in vae500_samples.Study.unique():
    perform_stat_test(vae500_samples, VAE500, test_gender=False, study=study)

Completed CPTAC-ccRCC
Completed CPTAC-UCEC
Completed CPTAC-BrCa2020
Completed CPTAC-GBM
Completed CPTAC-HNSCC
Completed CPTAC-LSCC
Completed CPTAC-LUAD
Completed CPTAC-OvCa2020
Completed CPTAC-Pdac
Completed CPTAC-BrCa2016
Completed CPTAC-OvCa2016
Completed TCGA-ACC
Completed TCGA-BLCA
Completed TCGA-BRCA
Completed TCGA-CESC
Completed TCGA-CHOL
Completed TCGA-COAD
Completed TCGA-DLBC
Completed TCGA-ESCA
Completed TCGA-GBM
Completed TCGA-HNSC
Completed TCGA-KICH
Completed TCGA-KIRC
Completed TCGA-KIRP
Completed TCGA-LAML
Completed TCGA-LGG
Completed TCGA-LIHC
Completed TCGA-LUAD
Completed TCGA-LUSC
Completed TCGA-MESO
Completed TCGA-OV
Completed TCGA-PAAD
Completed TCGA-PCPG
Completed TCGA-PRAD
Completed TCGA-READ
Completed TCGA-SARC
Completed TCGA-SKCM
Completed TCGA-STAD
Completed TCGA-TGCT
Completed TCGA-THCA
Completed TCGA-THYM
Completed TCGA-UCEC
Completed TCGA-UCS
Completed TCGA-UVM


In [22]:
for study in vae1000_samples.Study.unique():
    perform_stat_test(vae1000_samples, VAE1000, test_gender=False, study=study)

Completed CPTAC-ccRCC
Completed CPTAC-UCEC
Completed CPTAC-BrCa2020
Completed CPTAC-GBM
Completed CPTAC-HNSCC
Completed CPTAC-LSCC
Completed CPTAC-LUAD
Completed CPTAC-OvCa2020
Completed CPTAC-Pdac
Completed CPTAC-BrCa2016
Completed CPTAC-OvCa2016
Completed TCGA-ACC
Completed TCGA-BLCA
Completed TCGA-BRCA
Completed TCGA-CESC
Completed TCGA-CHOL
Completed TCGA-COAD
Completed TCGA-DLBC
Completed TCGA-ESCA
Completed TCGA-GBM
Completed TCGA-HNSC
Completed TCGA-KICH
Completed TCGA-KIRC
Completed TCGA-KIRP
Completed TCGA-LAML
Completed TCGA-LGG
Completed TCGA-LIHC
Completed TCGA-LUAD
Completed TCGA-LUSC
Completed TCGA-MESO
Completed TCGA-OV
Completed TCGA-PAAD
Completed TCGA-PCPG
Completed TCGA-PRAD
Completed TCGA-READ
Completed TCGA-SARC
Completed TCGA-SKCM
Completed TCGA-STAD
Completed TCGA-TGCT
Completed TCGA-THCA
Completed TCGA-THYM
Completed TCGA-UCEC
Completed TCGA-UCS
Completed TCGA-UVM


In [23]:
final_result = pd.concat(results).reset_index(drop=True)
final_result

Unnamed: 0,Encoding,pvalue,pval_adjusted,CLES,Test,VAE-encodings
0,98,1.745198e-267,8.725991e-266,0.663401,Sex,VAE100
1,63,3.933228e-236,9.833071e-235,0.659358,Sex,VAE100
2,282,0.000000e+00,0.000000e+00,0.770936,Sex,VAE500
3,280,2.621794e-270,2.184829e-268,0.666796,Sex,VAE500
4,77,1.187159e-258,3.957197e-256,0.675303,Sex,VAE1000
...,...,...,...,...,...,...
265,849,0.000000e+00,0.000000e+00,0.918938,TCGA-UCEC,VAE1000
266,450,4.636563e-44,4.215057e-42,0.970730,TCGA-UCS,VAE1000
267,883,2.548567e-46,2.548567e-44,0.969546,TCGA-UCS,VAE1000
268,596,1.076359e-90,4.305436e-89,0.995249,TCGA-UVM,VAE1000


In [24]:
# final_result.to_parquet(get_tables_path(['results'], 'VAE_Encodings_StatResults.parquet'))

In [30]:
final_result.to_csv(get_tables_path(['results'], 'TableS4.csv'))

In [31]:
vae100_samples.to_csv(get_tables_path(['results'], 'TableS1.csv'))
vae500_samples.to_csv(get_tables_path(['results'], 'TableS2.csv'))
vae1000_samples.to_csv(get_tables_path(['results'], 'TableS3.csv'))