In [1]:
import pandas as pd
import sys, os
# Put the parent of the working directory on the import path; the
# `Delta_Correlation` import that follows presumably lives there (confirm).
# The file name passed to `realpath` is only a placeholder: a relative path is
# resolved against the current working directory, so `currentdir` is the
# directory the kernel is running in.
currentdir = os.path.dirname(os.path.realpath('make_regression_df.py'))
parentdir = os.path.dirname(currentdir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.append(parentdir)
import Delta_Correlation as dc

In [2]:
# Load the cancer datasets through the project helper. The two returned
# sequences are consumed pairwise, as (dataset, name), by the staging loop.
cancers, cancer_names = dc.load_cancers()

                                                



In [3]:
def _labels_by_raw_value(groups):
    """Invert ``{label: [raw values]}`` into ``{raw value: label}``."""
    return {raw: label for label, raws in groups.items() for raw in raws}


# Clinical column that holds the pathological stage for each supported cancer.
STAGE_COLUMNS = {
    'CCRCC': 'tumor_stage_pathological',
    'LUAD': 'Stage',
    'LSCC': 'Stage',
    'Endometrial': 'FIGO_stage',
    'HNSCC': 'patho_staging_curated',
    'PDAC': 'tumor_stage_pathological',
}

# NOTE(review): '2'/2 (and '3B', '4') are absent here although '1'/1 and '3'/3
# are present; confirm against the raw clinical values whether that is intended.
_LUNG_STAGE_LABELS = _labels_by_raw_value({
    'Stage I': ['1A', '1B', '1', 1, 'I', 'IA', 'IB'],
    'Stage II': ['2A', '2B', 'II', 'IIA', 'IIB'],
    'Stage III': ['3A', '3', 3, 'III', 'IIIA', 'IIIB'],
    'Stage IV': ['IV'],
})

# Raw stage value -> harmonized 'Stage I'..'Stage IV' label, per cancer.
# Cancers without an entry (CCRCC, HNSCC) have their values passed through
# unchanged; presumably those columns already use the harmonized labels.
STAGE_RECODING = {
    'LUAD': _LUNG_STAGE_LABELS,
    'LSCC': _LUNG_STAGE_LABELS,
    'Endometrial': _labels_by_raw_value({
        'Stage I': ['IA', 'IB'],
        'Stage II': ['II'],
        'Stage III': ['IIIA', 'IIIB', 'IIIC2', 'IIIC1'],
        'Stage IV': ['IVB', 'IV'],
    }),
    'PDAC': _labels_by_raw_value({
        'Stage IA': 'Stage I',
        'Stage IB': 'Stage I',
        'Stage IIA': 'Stage II',
        'Stage IIB': 'Stage II',
        'Stage III': 'Stage III',
        'Stage IV': 'Stage IV',
    }.items().__class__ and {
        'Stage I': ['Stage IA', 'Stage IB'],
        'Stage II': ['Stage IIA', 'Stage IIB'],
        'Stage III': ['Stage III'],
        'Stage IV': ['Stage IV'],
    }),
}


def get_stage_table(cancer, cancer_name):
    """Return one row per tumor sample with columns ``Stage`` and ``Patient_ID``.

    Parameters
    ----------
    cancer : dataset object exposing ``get_clinical('tumor')``.
    cancer_name : str
        Key into ``STAGE_COLUMNS``; selects the clinical stage column and,
        where one exists, the recoding table.

    Returns
    -------
    pandas.DataFrame
        Fresh frame with a default integer index. ``Stage`` is NaN for raw
        values that are missing or not covered by the recoding table.

    Raises
    ------
    ValueError
        If ``cancer_name`` has no configured stage column. Previously such a
        cancer silently reused the stage table of the preceding iteration.
    """
    if cancer_name not in STAGE_COLUMNS:
        raise ValueError(
            'No stage column configured for cancer %r; add it to STAGE_COLUMNS'
            % (cancer_name,)
        )
    stage_column = STAGE_COLUMNS[cancer_name]
    clinical = cancer.get_clinical('tumor')
    # rename() returns a new frame, so the column assignments below never
    # write into a slice of the clinical table (no SettingWithCopyWarning).
    stage = (
        clinical[[stage_column]]
        .rename(columns={stage_column: 'Stage'})
        .assign(Patient_ID=clinical.index)
        .reset_index(drop=True)
    )
    raw_to_label = STAGE_RECODING.get(cancer_name)
    if raw_to_label is not None:
        stage['Stage'] = [
            raw_to_label.get(raw, float('nan')) for raw in stage['Stage']
        ]
    return stage


cancer_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    stage = get_stage_table(cancer, cancer_name)
    # Tumor samples whose stage is missing or unrecognized. They must not be
    # confused with normal-tissue samples, which are simply absent from the
    # tumor clinical table.
    unstaged_tumor_ids = stage.loc[stage['Stage'].isna(), 'Patient_ID']
    stage = stage.dropna()

    cancer_df = dc.get_prot_trans_df(cancer).dropna()
    # A left join is enough: stage rows without any omics measurement would
    # only produce all-NaN rows that were dropped afterwards anyway.
    cancer_df = pd.merge(cancer_df, stage, how='left', on='Patient_ID')
    # Fix: previously every row without a stage was relabelled 'Normal',
    # which silently moved unstaged tumor samples into the normal group.
    cancer_df = cancer_df[~cancer_df['Patient_ID'].isin(unstaged_tumor_ids)]
    cancer_df = cancer_df.assign(
        Stage=cancer_df['Stage'].fillna('Normal'),
        Cancer=cancer_name,
    )
    cancer_dfs.append(cancer_df)

prot_trans_stages_df = pd.concat(cancer_dfs)
prot_trans_stages_df



Unnamed: 0,Patient_ID,Gene,Proteomics,Tissue,Transcriptomics,Stage,Cancer
0,C3L-00004,A1BG,-0.304302,Tumor,0.995336,Stage III,CCRCC
1,C3L-00004,A1CF,0.641447,Tumor,16.677828,Stage III,CCRCC
2,C3L-00004,A2M,-0.000025,Tumor,353.263362,Stage III,CCRCC
3,C3L-00004,AAAS,0.207831,Tumor,15.831130,Stage III,CCRCC
4,C3L-00004,AACS,-0.364128,Tumor,2.938550,Stage III,CCRCC
...,...,...,...,...,...,...,...
2125631,C3N-04162.N,ZXDC,-2.053400,Normal,17.869000,Normal,LSCC
2125632,C3N-04162.N,ZYG11B,0.404400,Normal,18.023000,Normal,LSCC
2125633,C3N-04162.N,ZYX,0.503600,Normal,19.709300,Normal,LSCC
2125634,C3N-04162.N,ZZEF1,0.969900,Normal,18.249100,Normal,LSCC


In [4]:
# Spearman correlation between protein and transcript abundance for every
# (Cancer, Stage, Gene) group; groups with fewer than 10 samples yield NaN
# (via min_periods) and are dropped.
df = prot_trans_stages_df.drop(columns='Patient_ID')
grouped_stages = df.groupby(['Cancer', 'Stage', 'Gene'])
corr_df = (
    # Select the two numeric columns explicitly: DataFrame.corr no longer
    # silently ignores string columns (Tissue, Gene, ...) on pandas >= 2.0.
    grouped_stages[['Proteomics', 'Transcriptomics']]
    # .iloc[0, 1] is the off-diagonal cell of the 2x2 correlation matrix.
    .apply(lambda pair: pair.corr(method='spearman', min_periods=10).iloc[0, 1])
    # NOTE: the column is called 'PearsonR' although it holds Spearman
    # coefficients; the name is kept so readers of the exported CSV keep working.
    .reset_index(name='PearsonR')
    .dropna()
)
corr_df


Unnamed: 0,Cancer,Stage,Gene,PearsonR
0,CCRCC,Normal,A1BG,0.340967
1,CCRCC,Normal,A1CF,0.719744
2,CCRCC,Normal,A2M,0.194680
4,CCRCC,Normal,AAAS,0.119972
5,CCRCC,Normal,AACS,0.765320
...,...,...,...,...
259834,LUAD,Stage III,ZXDC,0.218182
259835,LUAD,Stage III,ZYG11B,0.620879
259836,LUAD,Stage III,ZYX,0.604396
259837,LUAD,Stage III,ZZEF1,0.604396


In [5]:
# Write the per-(Cancer, Stage, Gene) correlations to a CSV in the current
# working directory, without the row index. Note that the 'PearsonR' column
# holds Spearman coefficients (see method='spearman' in the cell above).
corr_df.to_csv('Cancer_stages_correlations_spearman.csv', index = False)