In [1]:
import cptac
import cptac.utils as ut
import scipy
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import numpy
import math
import pandas as pd
import statsmodels.stats.multitest as ssm
sns.set(style = 'white')



In [2]:
cptac.download("ccrcc")
cptac.download("endometrial")
cptac.download("luad")
cptac.download("hnscc")
cptac.download("lscc")
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

                                                



In [3]:
cancers = [ccrcc, en, luad, hnscc, lscc]
cancer_names = ['CCRCC', 'Endometrial', 'LUAD', 'HNSCC', 'LSCC']

In [4]:
cancer_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    prot_normal_df = cancer.get_proteomics('normal')
    if isinstance(prot_normal_df.columns, pd.MultiIndex):
        prot_normal_df = ut.reduce_multiindex(df= prot_normal_df, levels_to_drop = 'Database_ID')
    trans_normal_df = cancer.get_transcriptomics('normal')
    if isinstance(trans_normal_df.columns, pd.MultiIndex):
        trans_normal_df = ut.reduce_multiindex(df = trans_normal_df, levels_to_drop='Database_ID')
    prot_normal_df['Patient_ID'] = prot_normal_df.index
    trans_normal_df['Patient_ID'] = trans_normal_df.index
    prot_normal_df = prot_normal_df.melt(id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Proteomic')
    trans_normal_df = trans_normal_df.melt(id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Transcriptomics')
    prot_normal_df['Stage'] = ['Normal'] * len(prot_normal_df)
    trans_normal_df['Stage'] = ['Normal'] * len(trans_normal_df)
    if cancer == ccrcc:
        stage = cancer.get_clinical('tumor')[['tumor_stage_pathological']]
        stage['Patient_ID'] = stage.index
        stage = stage.rename(columns = {'tumor_stage_pathological': 'Stage'})
        stage = stage.reset_index(level = 0, drop = True)
    elif cancer == luad or cancer == lscc:
        stage = cancer.get_clinical('tumor')[['Stage']]
        stage['Patient_ID'] = stage.index
        stage = stage.reset_index(level = 0, drop = True)
        stages = []
        for s in stage['Stage']:
            if s in ['1A', '1B', '1', 1, 'I', 'IA', 'IB']:
                stages.append('Stage I')
            elif s in ['2A', '2B', 'II', 'IIA', 'IIB']:
                stages.append('Stage II')
            elif s in ['3A', '3', 3, 'III', 'IIIA', 'IIIB']:
                stages.append('Stage III')
            elif s in ['IV']:
                stages.append('Stage IV')
            else:
                stages.append(float('NaN'))
        stage['Stage'] = stages
    elif cancer == en:
        stage = cancer.get_clinical('tumor')[['FIGO_stage']]
        stage['Patient_ID'] = stage.index
        stage = stage.rename(columns = {'FIGO_stage': 'Stage'})
        stage = stage.reset_index(level = 0, drop = True)
        stages = []
        for s in stage['Stage']:
            if s in ['IA', 'IB']:
                stages.append('Stage I')
            elif s in ['II']:
                stages.append('Stage II')
            elif s in ['IIIA', 'IIIB', 'IIIC2', 'IIIC1']:
                stages.append('Stage III')
            elif s in ['IVB', 'IV']:
                stages.append('Stage IV')
            else:
                stages.append(float('NaN'))
        stage['Stage'] = stages
    elif cancer == hnscc:
        stage = cancer.get_clinical('tumor')[['patho_staging_curated']]
        stage['Patient_ID'] = stage.index
        stage = stage.rename(columns = {'patho_staging_curated': 'Stage'})
        stage = stage.reset_index(level = 0, drop = True)
    prot_tumor_df = cancer.get_proteomics('tumor')
    if isinstance(prot_tumor_df.columns, pd.MultiIndex):
        prot_tumor_df = ut.reduce_multiindex(df= prot_tumor_df, levels_to_drop = 'Database_ID')
    trans_tumor_df = cancer.get_transcriptomics('tumor')
    if isinstance(trans_tumor_df.columns, pd.MultiIndex):
        trans_tumor_df = ut.reduce_multiindex(df = trans_tumor_df, levels_to_drop='Database_ID')
    prot_tumor_df['Patient_ID'] = prot_tumor_df.index
    trans_tumor_df['Patient_ID'] = trans_tumor_df.index
    prot_tumor_df = prot_tumor_df.melt(id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Proteomic')
    trans_tumor_df = trans_tumor_df.melt(id_vars = 'Patient_ID', var_name = 'Gene', value_name = 'Transcriptomics')
    trans_tumor_df = pd.merge(trans_tumor_df, stage, how = 'outer')
    prot_tumor_df = pd.merge(prot_tumor_df, stage, how = 'outer')
    prot_df = pd.merge(prot_tumor_df, prot_normal_df, how = 'outer')
    trans_df = pd.merge(trans_normal_df, trans_tumor_df, how = 'outer')
    cancer_df = pd.merge(trans_df, prot_df,how = 'outer')
    cancer_df['Cancer'] = [cancer_name] * len(cancer_df)
    cancer_dfs.append(cancer_df)
prot_trans_stages_df = pd.concat(cancer_dfs)
prot_trans_stages_df



Unnamed: 0,Patient_ID,Gene,Transcriptomics,Stage,Proteomic,Cancer
0,C3L-00004.N,A1BG,0.859821,Normal,0.291127,CCRCC
1,C3L-00010.N,A1BG,0.824992,Normal,-0.579658,CCRCC
2,C3L-00011.N,A1BG,0.673249,Normal,-0.096200,CCRCC
3,C3L-00026.N,A1BG,3.043751,Normal,-0.209196,CCRCC
4,C3L-00079.N,A1BG,0.801038,Normal,-0.395915,CCRCC
...,...,...,...,...,...,...
4598431,C3L-02646.N,ZZZ3,,Normal,0.402600,LSCC
4598432,C3N-03072.N,ZZZ3,,Normal,1.336400,LSCC
4598433,C3N-03662.N,ZZZ3,,Normal,0.726400,LSCC
4598434,C3N-03886.N,ZZZ3,,Normal,0.359300,LSCC


In [5]:
prot_trans_stages_df = prot_trans_stages_df.dropna()
prot_trans_stages_df

Unnamed: 0,Patient_ID,Gene,Transcriptomics,Stage,Proteomic,Cancer
0,C3L-00004.N,A1BG,0.859821,Normal,0.291127,CCRCC
1,C3L-00010.N,A1BG,0.824992,Normal,-0.579658,CCRCC
2,C3L-00011.N,A1BG,0.673249,Normal,-0.096200,CCRCC
3,C3L-00026.N,A1BG,3.043751,Normal,-0.209196,CCRCC
4,C3L-00079.N,A1BG,0.801038,Normal,-0.395915,CCRCC
...,...,...,...,...,...,...
4491060,C3N-04162,ZXDC,17.376300,Stage II,0.450600,LSCC
4491062,C3N-04162,ZYG11B,17.324700,Stage II,-0.583000,LSCC
4491063,C3N-04162,ZYX,18.029000,Stage II,-2.542300,LSCC
4491064,C3N-04162,ZZEF1,17.693600,Stage II,-0.589400,LSCC


In [10]:
prot_trans_stages_df.groupby(['Cancer', 'Stage', 'Gene'])


Unnamed: 0,Patient_ID,Gene,Transcriptomics,Stage,Proteomic,Cancer
0,C3L-00004.N,A1BG,0.859821,Normal,0.291127,CCRCC
1,C3L-00010.N,A1BG,0.824992,Normal,-0.579658,CCRCC
2,C3L-00011.N,A1BG,0.673249,Normal,-0.096200,CCRCC
3,C3L-00026.N,A1BG,3.043751,Normal,-0.209196,CCRCC
4,C3L-00079.N,A1BG,0.801038,Normal,-0.395915,CCRCC
...,...,...,...,...,...,...
4491060,C3N-04162,ZXDC,17.376300,Stage II,0.450600,LSCC
4491062,C3N-04162,ZYG11B,17.324700,Stage II,-0.583000,LSCC
4491063,C3N-04162,ZYX,18.029000,Stage II,-2.542300,LSCC
4491064,C3N-04162,ZZEF1,17.693600,Stage II,-0.589400,LSCC
