In [1]:
from pathlib import Path

import cptac
import cptac.utils as ut
import pandas as pd

In [2]:
# Instantiate one cptac dataset object per cancer type used in this notebook.
# The constructors run left to right, in the same order as before.
ccrcc, en, luad, hnscc, lscc, pdac = (
    cptac.Ccrcc(),
    cptac.Endometrial(),
    cptac.Luad(),
    cptac.Hnscc(),
    cptac.Lscc(),
    cptac.Pdac(),
)

Checking that pdac index is up-to-date...       



                                         



In [3]:
# Keep each dataset next to its display label so the two lists below can never
# drift out of step; `cancers` and `cancer_names` are the positional views.
_labelled_datasets = [
    (ccrcc, 'CCRCC'),
    (en, 'Endometrial'),
    (luad, 'LUAD'),
    (hnscc, 'HNSCC'),
    (lscc, 'LSCC'),
    (pdac, 'PDAC'),
]
cancers = [dataset for dataset, _ in _labelled_datasets]
cancer_names = [label for _, label in _labelled_datasets]

In [4]:
def _stage_lookup(groups):
    """Invert ``{label: [raw values]}`` into a flat ``{raw value: label}`` dict."""
    return {raw: label for label, raws in groups.items() for raw in raws}


# Raw clinical stage value -> harmonised stage label, one table per source.
# Any raw value that is not listed is mapped to NaN by `_tumor_stage_table`.
# NOTE(review): the lung table lists '1'/1 and '3'/3 but not '2'/2, '3B' or
# '4'/4 -- presumably only values present in the data were listed; confirm.
LUNG_STAGE_LABELS = _stage_lookup({
    'Stage I': ['1A', '1B', '1', 1, 'I', 'IA', 'IB'],
    'Stage II': ['2A', '2B', 'II', 'IIA', 'IIB'],
    'Stage III': ['3A', '3', 3, 'III', 'IIIA', 'IIIB'],
    'Stage IV': ['IV'],
})
ENDOMETRIAL_STAGE_LABELS = _stage_lookup({
    'Stage I': ['IA', 'IB'],
    'Stage II': ['II'],
    'Stage III': ['IIIA', 'IIIB', 'IIIC2', 'IIIC1'],
    'Stage IV': ['IVB', 'IV'],
})
PDAC_STAGE_LABELS = _stage_lookup({
    'Stage I': ['Stage IA', 'Stage IB'],
    'Stage II': ['Stage IIA', 'Stage IIB'],
    'Stage III': ['Stage III'],
    'Stage IV': ['Stage IV'],
})

# (dataset, clinical column holding the stage, relabelling table or None).
# `None` means the clinical values are used exactly as they are.
STAGE_SOURCES = [
    (ccrcc, 'tumor_stage_pathological', None),
    (luad, 'Stage', LUNG_STAGE_LABELS),
    (lscc, 'Stage', LUNG_STAGE_LABELS),
    (en, 'FIGO_stage', ENDOMETRIAL_STAGE_LABELS),
    (hnscc, 'patho_staging_curated', None),
    (pdac, 'tumor_stage_pathological', PDAC_STAGE_LABELS),
]


def _to_long(omics_df, value_name):
    """Melt a wide samples-by-genes table into Patient_ID / Gene / value rows.

    Multi-level column headers are first reduced by dropping the
    ``Database_ID`` level. The frame that was passed in is not modified.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Wide table as returned by a cptac ``get_*`` call.
    value_name : str
        Name of the measurement column in the long result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Patient_ID``, ``Gene`` and ``value_name``.
    """
    if isinstance(omics_df.columns, pd.MultiIndex):
        omics_df = ut.reduce_multiindex(df=omics_df, levels_to_drop='Database_ID')
    # `assign` works on a copy, so the caller's frame keeps its original columns.
    return omics_df.assign(Patient_ID=omics_df.index).melt(
        id_vars='Patient_ID', var_name='Gene', value_name=value_name
    )


def _tumor_stage_table(cancer):
    """Return a ``Stage`` / ``Patient_ID`` table for the tumor samples of `cancer`.

    Parameters
    ----------
    cancer : cptac dataset object
        Must be one of the datasets listed in ``STAGE_SOURCES``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stage`` and ``Patient_ID`` with a fresh positional index.

    Raises
    ------
    ValueError
        If `cancer` has no entry in ``STAGE_SOURCES``. Failing loudly here
        prevents a stage table from a previous dataset being reused by mistake.
    """
    for dataset, column, labels in STAGE_SOURCES:
        if dataset is cancer:
            break
    else:
        raise ValueError('No stage column configured for dataset {!r}'.format(cancer))

    clinical = cancer.get_clinical('tumor')
    # Build a new frame instead of assigning into a slice of the clinical table.
    stage = (
        clinical[[column]]
        .rename(columns={column: 'Stage'})
        .assign(Patient_ID=clinical.index)
        .reset_index(level=0, drop=True)
    )
    if labels is not None:
        stage['Stage'] = [labels.get(raw, float('NaN')) for raw in stage['Stage']]
    return stage


cancer_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Normal samples: every row is labelled with the pseudo-stage 'Normal'.
    prot_normal_df = _to_long(cancer.get_proteomics('normal'), 'Proteomic')
    trans_normal_df = _to_long(cancer.get_transcriptomics('normal'), 'Transcriptomics')
    prot_normal_df['Stage'] = 'Normal'
    trans_normal_df['Stage'] = 'Normal'

    stage = _tumor_stage_table(cancer)

    # Tumor samples: the stage comes from the clinical table.
    prot_tumor_df = _to_long(cancer.get_proteomics('tumor'), 'Proteomic')
    trans_tumor_df = _to_long(cancer.get_transcriptomics('tumor'), 'Transcriptomics')

    # No `on=` is given, so each merge joins on the columns both frames share:
    # Patient_ID for the stage merges, and Patient_ID / Gene / Stage for the
    # final transcript-protein merge.
    trans_tumor_df = pd.merge(trans_tumor_df, stage, how='outer')
    prot_tumor_df = pd.merge(prot_tumor_df, stage, how='outer')
    prot_df = pd.merge(prot_tumor_df, prot_normal_df, how='outer')
    trans_df = pd.merge(trans_normal_df, trans_tumor_df, how='outer')
    cancer_df = pd.merge(trans_df, prot_df, how='outer')
    cancer_df['Cancer'] = cancer_name
    cancer_dfs.append(cancer_df)

# Each per-cancer frame keeps its own positional index, so index labels repeat
# across cancers in the concatenated table.
prot_trans_stages_df = pd.concat(cancer_dfs)
prot_trans_stages_df



Unnamed: 0,Patient_ID,Gene,Transcriptomics,Stage,Proteomic,Cancer
0,C3L-00004.N,A1BG,0.859821,Normal,0.291127,CCRCC
1,C3L-00010.N,A1BG,0.824992,Normal,-0.579658,CCRCC
2,C3L-00011.N,A1BG,0.673249,Normal,-0.096200,CCRCC
3,C3L-00026.N,A1BG,3.043751,Normal,-0.209196,CCRCC
4,C3L-00079.N,A1BG,0.801038,Normal,-0.395915,CCRCC
...,...,...,...,...,...,...
5148696,C3N-03839.N,ZZZ3,,Normal,,PDAC
5148697,C3N-03840.N,ZZZ3,,Normal,,PDAC
5148698,C3N-03884.N,ZZZ3,,Normal,,PDAC
5148699,C3N-04119.N,ZZZ3,,Normal,,PDAC


In [5]:
# Keep only complete rows: any row with a missing value in any column
# (measurement, stage, ...) is discarded.
prot_trans_stages_df = prot_trans_stages_df.dropna(axis='index', how='any')
prot_trans_stages_df

Unnamed: 0,Patient_ID,Gene,Transcriptomics,Stage,Proteomic,Cancer
0,C3L-00004.N,A1BG,0.859821,Normal,0.291127,CCRCC
1,C3L-00010.N,A1BG,0.824992,Normal,-0.579658,CCRCC
2,C3L-00011.N,A1BG,0.673249,Normal,-0.096200,CCRCC
3,C3L-00026.N,A1BG,3.043751,Normal,-0.209196,CCRCC
4,C3L-00079.N,A1BG,0.801038,Normal,-0.395915,CCRCC
...,...,...,...,...,...,...
4517166,C3N-04284,ZW10,10.161812,Stage III,24.909713,PDAC
4517167,C3N-04284,ZWILCH,8.849157,Stage III,20.684593,PDAC
4517173,C3N-04284,ZYG11B,11.665104,Stage III,20.666625,PDAC
4517174,C3N-04284,ZYX,10.118784,Stage III,27.897518,PDAC


In [6]:
def _transcript_protein_r(group, min_periods=10):
    """Pearson r between a group's ``Transcriptomics`` and ``Proteomic`` columns.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one (Cancer, Stage, Gene) group, restricted to the two
        numeric measurement columns.
    min_periods : int, default 10
        Minimum number of paired observations required; below that the
        result is NaN.

    Returns
    -------
    float
        The off-diagonal cell of the 2x2 correlation matrix.
    """
    return group.corr(method='pearson', min_periods=min_periods).loc[
        'Proteomic', 'Transcriptomics'
    ]


df = prot_trans_stages_df.drop(columns='Patient_ID')
grouped_stages = df.groupby(['Cancer', 'Stage', 'Gene'])

# Only the two numeric columns are handed to `corr`, so the result does not
# depend on pandas silently discarding the string columns, nor on the
# auto-generated `level_3` index name.
corr_df = (
    grouped_stages[['Transcriptomics', 'Proteomic']]
    .apply(_transcript_protein_r)
    .rename('PearsonR')
    .reset_index()
    .dropna()  # groups with fewer than 10 samples (or undefined r) are removed
)
corr_df


Unnamed: 0,Cancer,Stage,Gene,PearsonR
1,CCRCC,Normal,A1BG,0.328964
3,CCRCC,Normal,A1CF,0.745763
5,CCRCC,Normal,A2M,0.287323
9,CCRCC,Normal,AAAS,0.156649
11,CCRCC,Normal,AACS,0.707274
...,...,...,...,...
611575,PDAC,Stage III,ZWILCH,0.385813
611583,PDAC,Stage III,ZYG11B,0.171495
611585,PDAC,Stage III,ZYX,0.480944
611587,PDAC,Stage III,ZZEF1,0.097533


In [7]:
# `to_csv` does not create missing directories, so make sure `data/` exists
# before writing; an existing directory is left untouched.
output_csv = Path('data/Cancer_stages_correlations.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)
corr_df.to_csv(output_csv, index = False)