In [1]:
import os

import cptac
import cptac.utils as ut
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.multitest as ssm



In [2]:
# Create one cptac dataset object per cohort used in this notebook.
# The constructors run left to right, in the same order as before.
ccrcc, en, luad, hnscc, lscc = (
    cptac.Ccrcc(),
    cptac.Endometrial(),
    cptac.Luad(),
    cptac.Hnscc(),
    cptac.Lscc(),
)

                                                



In [3]:
# Keep each display label next to its dataset object so the two parallel
# lists below cannot drift out of step when a cohort is added or removed.
cohort_pairs = [
    ('CCRCC', ccrcc),
    ('Endometrial', en),
    ('LUAD', luad),
    ('HNSCC', hnscc),
    ('LSCC', lscc),
]
cancers = [dataset for _, dataset in cohort_pairs]
cancer_names = [label for label, _ in cohort_pairs]

In [4]:
def _tidy_omics(wide_df, value_name, tissue, cancer_name):
    """Reshape one wide omics table into long form and label it.

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Table as returned by ``get_proteomics`` / ``get_transcriptomics``;
        its index supplies the ``Patient_ID`` column and each remaining
        column becomes a ``Gene`` value.
    value_name : str
        Name of the column that receives the melted measurements.
    tissue : str
        Value written to every row of the ``Tissue`` column.
    cancer_name : str
        Value written to every row of the ``Cancer`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Patient_ID``, ``Gene``, ``<value_name>``, ``Tissue``,
        ``Cancer`` (in that order), one row per (patient, gene) pair.
    """
    # Some cohorts return MultiIndex columns; drop the 'Database_ID' level so
    # every cohort is keyed by the same flat column labels before melting.
    if isinstance(wide_df.columns, pd.MultiIndex):
        wide_df = ut.reduce_multiindex(df=wide_df, levels_to_drop='Database_ID')
    return (
        wide_df
        # assign() works on a copy, so the frame handed back by cptac is not
        # modified in place.
        .assign(Patient_ID=wide_df.index)
        .melt(id_vars='Patient_ID', var_name='Gene', value_name=value_name)
        # Scalars broadcast to every row; no need to build per-row lists.
        .assign(Tissue=tissue, Cancer=cancer_name)
    )


# Collect one long-format frame per (cohort, tissue) and concatenate once at
# the end instead of growing a DataFrame inside the loop.
prot_frames = []
trans_frames = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Tumor first, then normal: the same per-cohort order as the frames were
    # appended before.
    for tissue in ('tumor', 'normal'):
        prot_frames.append(
            _tidy_omics(cancer.get_proteomics(tissue), 'Proteomic', tissue, cancer_name)
        )
        trans_frames.append(
            _tidy_omics(cancer.get_transcriptomics(tissue), 'Transcriptomics', tissue, cancer_name)
        )

# NOTE: the row index is not reset, so index labels repeat across the
# concatenated pieces.
prot_df = pd.concat(prot_frames)
trans_df = pd.concat(trans_frames)

    



In [5]:
# Discard every row that has a missing value in any column
# (these are the defaults of dropna, spelled out).
prot_df = prot_df.dropna(axis=0, how='any')
prot_df

Unnamed: 0,Patient_ID,Gene,Proteomic,Tissue,Cancer
0,C3L-00004,A1BG,-0.304302,tumor,CCRCC
1,C3L-00010,A1BG,1.195915,tumor,CCRCC
2,C3L-00011,A1BG,-0.286155,tumor,CCRCC
3,C3L-00026,A1BG,0.135730,tumor,CCRCC
4,C3L-00079,A1BG,-0.123959,tumor,CCRCC
...,...,...,...,...,...
1145920,C3N-03886.N,ZZZ3,0.359300,normal,LSCC
1145921,C3N-04124.N,ZZZ3,-0.087900,normal,LSCC
1145922,C3N-04127.N,ZZZ3,0.137700,normal,LSCC
1145923,C3N-04155.N,ZZZ3,0.809200,normal,LSCC


In [6]:
# Discard every row that has a missing value in any column
# (these are the defaults of dropna, spelled out).
trans_df = trans_df.dropna(axis=0, how='any')
trans_df

Unnamed: 0,Patient_ID,Gene,Transcriptomics,Tissue,Cancer
0,C3L-00004,A1BG,0.995336,tumor,CCRCC
1,C3L-00010,A1BG,0.679400,tumor,CCRCC
2,C3L-00011,A1BG,0.354549,tumor,CCRCC
3,C3L-00026,A1BG,2.543775,tumor,CCRCC
4,C3L-00079,A1BG,4.355205,tumor,CCRCC
...,...,...,...,...,...
2048443,C3N-03880.N,ZZZ3,17.705300,normal,LSCC
2048444,C3N-03882.N,ZZZ3,17.762500,normal,LSCC
2048445,C3N-04124.N,ZZZ3,17.979700,normal,LSCC
2048446,C3N-04127.N,ZZZ3,17.735200,normal,LSCC


In [7]:
# Per-(Cancer, Gene) two-sample t-test of proteomic abundance, normal vs tumor.
# Normal is passed first, so a positive t_score means the normal mean is higher.
# pval is the raw p-value; no multiple-testing correction is applied in this cell.

# With fewer than two observations in a tissue, that tissue's sample variance
# is undefined. Such groups get NaN explicitly instead of being handed to
# SciPy, which is presumably where the RuntimeWarnings in the saved output
# of this cell came from.
MIN_SAMPLES_PER_TISSUE = 2

prot_group = prot_df.groupby(['Cancer', 'Gene'])
rows = []
for (cancer, gene), group in prot_group:
    normal = group.loc[group.Tissue == 'normal', 'Proteomic']
    tumor = group.loc[group.Tissue == 'tumor', 'Proteomic']
    if min(len(normal), len(tumor)) < MIN_SAMPLES_PER_TISSUE:
        t, pval = float('nan'), float('nan')
    else:
        t, pval = stats.ttest_ind(normal, tumor)
    rows.append({'Cancer': cancer, 'Gene': gene, 't_score': t, 'pval': pval})

# Naming the columns keeps the schema stable even if no group was produced.
prot_t_test_df = pd.DataFrame(rows, columns=['Cancer', 'Gene', 't_score', 'pval'])

# Make sure the output folder exists so a run from a fresh checkout does not
# fail at the write step.
os.makedirs('data', exist_ok=True)
prot_t_test_df.to_csv('data/proteomics_t_tests.csv', index=False)
prot_t_test_df

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  cond2 = cond0 & (x <= _a)


Unnamed: 0,Cancer,Gene,t_score,pval
0,CCRCC,A1BG,-4.357646,2.139405e-05
1,CCRCC,A1CF,7.793847,3.999889e-13
2,CCRCC,A2M,-7.416771,3.757940e-12
3,CCRCC,A4GALT,-1.538266,1.678723e-01
4,CCRCC,AAAS,-11.785243,1.787427e-24
...,...,...,...,...
55417,LUAD,ZXDC,-0.739588,4.606404e-01
55418,LUAD,ZYG11B,9.172638,4.505852e-17
55419,LUAD,ZYX,22.336080,2.816184e-57
55420,LUAD,ZZEF1,14.761766,2.955253e-34


In [8]:
# Per-(Cancer, Gene) two-sample t-test of transcript abundance, normal vs tumor.
# Normal is passed first, so a positive t_score means the normal mean is higher.
# pval is the raw p-value; no multiple-testing correction is applied in this cell.

# With fewer than two observations in a tissue, that tissue's sample variance
# is undefined. Such groups get NaN explicitly instead of being handed to
# SciPy, which is presumably where the RuntimeWarnings in the saved output
# of this cell came from.
MIN_SAMPLES_PER_TISSUE = 2

trans_group = trans_df.groupby(['Cancer', 'Gene'])
rows = []
for (cancer, gene), group in trans_group:
    normal = group.loc[group.Tissue == 'normal', 'Transcriptomics']
    tumor = group.loc[group.Tissue == 'tumor', 'Transcriptomics']
    if min(len(normal), len(tumor)) < MIN_SAMPLES_PER_TISSUE:
        t, pval = float('nan'), float('nan')
    else:
        t, pval = stats.ttest_ind(normal, tumor)
    rows.append({'Cancer': cancer, 'Gene': gene, 't_score': t, 'pval': pval})

# Naming the columns keeps the schema stable even if no group was produced.
trans_t_test_df = pd.DataFrame(rows, columns=['Cancer', 'Gene', 't_score', 'pval'])

# Make sure the output folder exists so a run from a fresh checkout does not
# fail at the write step.
os.makedirs('data', exist_ok=True)
trans_t_test_df.to_csv('data/transcriptomics_t_tests.csv', index=False)
trans_t_test_df

  cond2 = cond0 & (x <= _a)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Cancer,Gene,t_score,pval
0,CCRCC,A1BG,-5.468802,1.471386e-07
1,CCRCC,A1CF,5.126843,7.454574e-07
2,CCRCC,A2M,-6.159303,4.526458e-09
3,CCRCC,A2ML1,2.163946,3.176444e-02
4,CCRCC,A3GALT2,-1.573925,1.172320e-01
...,...,...,...,...
125674,LUAD,ZYG11A,-16.499280,1.031040e-39
125675,LUAD,ZYG11B,6.577774,3.763714e-10
125676,LUAD,ZYX,9.108707,6.873646e-17
125677,LUAD,ZZEF1,11.350264,1.425407e-23
