In [1]:
import cptac
import pandas as pd
import scipy.stats as stats
import cptac.utils as ut
import scipy
import statsmodels.stats.multitest as ssm
import numpy as np

In [2]:
# One cptac dataset object per cancer type analysed below. Each object is later
# queried for its proteomics and transcriptomics tables.
# NOTE(review): constructing these appears to trigger an index check / data load
# (see the "Checking that pdac index is up-to-date" output) — expect this cell to be slow.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()
pdac = cptac.Pdac()

Checking that pdac index is up-to-date...       



                                         



In [6]:
# Label -> dataset object. The label is what ends up in the 'Cancer' column of
# the result tables. Building both sequences from one mapping keeps each dataset
# paired with its own label.
datasets_by_name = {
    'CCRCC': ccrcc,
    'Endometrial': en,
    'LUAD': luad,
    'HNSCC': hnscc,
    'LSCC': lscc,
    'PDAC': pdac,
}
cancers = list(datasets_by_name.values())
cancer_names = list(datasets_by_name.keys())

In [7]:
def log2_fold_change(tumor, normal):
    """Return log2 of the absolute ratio of the tumor mean to the normal mean.

    Parameters
    ----------
    tumor, normal : array-like of numbers
        Measurements for the two groups; each is reduced with ``np.mean``.

    Returns
    -------
    float
        ``log2(|mean(tumor) / mean(normal)|)``, or NaN when either mean is
        exactly zero.

    Notes
    -----
    The absolute value of the ratio is taken before the logarithm, so the
    result is defined when the two means have opposite signs, but the sign of
    the ratio itself is discarded.
    """
    tumor_mean = np.mean(tumor)
    normal_mean = np.mean(normal)
    # A zero mean on either side would give log2(0) or a division by zero.
    if tumor_mean == 0 or normal_mean == 0:
        return float('Nan')
    ratio_magnitude = abs(tumor_mean / normal_mean)
    return np.log2(ratio_magnitude)

In [8]:
# Tumor-vs-normal differential protein abundance, computed per gene within each
# cancer type: Wilcoxon rank-sum test, FDR-adjusted p-value and log2 fold change.
tissue_types = ('tumor', 'normal')
diff_expression_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Long format: one row per (Patient_ID, Gene) measurement, labelled by tissue.
    tissue_dfs = []
    for tissue in tissue_types:
        tissue_df = cancer.get_proteomics(tissue)
        if isinstance(tissue_df.columns, pd.MultiIndex):
            tissue_df = ut.reduce_multiindex(df=tissue_df, levels_to_drop='Database_ID')
        # Non-inplace reset_index so the frame handed back by cptac is not mutated.
        tissue_df = tissue_df.reset_index().melt(
            id_vars='Patient_ID', var_name='Gene', value_name='Proteomic')
        tissue_df['Tissue'] = tissue
        tissue_dfs.append(tissue_df)
    # The two tables can never share a row (their 'Tissue' values differ), so
    # stacking them gives the same rows as an outer merge on every column, without
    # the cost of joining on all columns (including the float measurements).
    prot_df = pd.concat(tissue_dfs, ignore_index=True).dropna()

    # Keep only genes measured in both tissues. With an empty sample the rank-sum
    # test returns NaN and emits RuntimeWarnings, and such genes were dropped from
    # the result anyway.
    tissues_seen = prot_df.groupby('Gene')['Tissue'].nunique()
    testable_genes = tissues_seen.index[tissues_seen == len(tissue_types)]
    prot_df = prot_df[prot_df['Gene'].isin(testable_genes)]
    if prot_df.empty:
        print(f'{cancer_name}: no gene has both tumor and normal measurements; skipped')
        continue
    by_gene = prot_df.groupby('Gene')

    fold_change = by_gene.apply(
        lambda gene_df: log2_fold_change(
            gene_df.loc[gene_df['Tissue'] == 'tumor', 'Proteomic'],
            gene_df.loc[gene_df['Tissue'] == 'normal', 'Proteomic']))
    # Name the Series explicitly instead of relying on DataFrame(series, columns=...).
    fold_change = fold_change.rename('Log2_fold_change').reset_index()

    ranksums = by_gene.apply(
        lambda gene_df: stats.ranksums(
            gene_df.loc[gene_df['Tissue'] == 'tumor', 'Proteomic'],
            gene_df.loc[gene_df['Tissue'] == 'normal', 'Proteomic']))
    ranksums = pd.DataFrame.from_records(
        ranksums, index=ranksums.index, columns=['statistic', 'pval'])
    ranksums = ranksums.reset_index().dropna()
    # Multiple-testing correction is applied within each cancer type, not across
    # them; element [1] of the return value holds the adjusted p-values.
    ranksums['FDR'] = ssm.fdrcorrection(ranksums['pval'])[1]
    ranksums['Cancer'] = cancer_name

    cancer_diff_df = pd.merge(ranksums, fold_change, on='Gene')
    diff_expression_dfs.append(cancer_diff_df)
diff_expression_df = pd.concat(diff_expression_dfs)
diff_expression_df.to_csv('data/Proteomics_differential_expression_df.csv', index = False)
diff_expression_df

  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)
  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)
  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)
  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)


Unnamed: 0,Gene,statistic,pval,FDR,Cancer,Log2_fold_change
0,A1BG,4.062020,4.864990e-05,7.977389e-05,CCRCC,0.846625
1,A1CF,-6.998855,2.580621e-12,6.376757e-12,CCRCC,-0.218309
2,A2M,6.557556,5.469684e-11,1.261222e-10,CCRCC,-1.256118
3,A4GALT,0.979796,3.271869e-01,3.660405e-01,CCRCC,3.045874
4,AAAS,9.324065,1.119664e-20,4.340440e-20,CCRCC,4.527650
...,...,...,...,...,...,...
11626,ZXDC,2.500000,1.241933e-02,2.168582e-02,PDAC,0.068020
11627,ZYG11B,1.778682,7.529194e-02,1.102090e-01,PDAC,0.005597
11628,ZYX,9.327386,1.085139e-20,2.743751e-19,PDAC,0.036329
11629,ZZEF1,-4.140393,3.467108e-05,8.841467e-05,PDAC,-0.008661


In [9]:
# Tumor-vs-normal differential transcript abundance, computed per gene within
# each cancer type: Wilcoxon rank-sum test, FDR-adjusted p-value and log2 fold change.
tissue_types = ('tumor', 'normal')
diff_expression_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Long format: one row per (Patient_ID, Gene) measurement, labelled by tissue.
    tissue_dfs = []
    for tissue in tissue_types:
        tissue_df = cancer.get_transcriptomics(tissue)
        if isinstance(tissue_df.columns, pd.MultiIndex):
            tissue_df = ut.reduce_multiindex(df=tissue_df, levels_to_drop='Database_ID')
        # Non-inplace reset_index so the frame handed back by cptac is not mutated.
        tissue_df = tissue_df.reset_index().melt(
            id_vars='Patient_ID', var_name='Gene', value_name='Transcriptomics')
        tissue_df['Tissue'] = tissue
        tissue_dfs.append(tissue_df)
    # The two tables can never share a row (their 'Tissue' values differ), so
    # stacking them gives the same rows as an outer merge on every column, without
    # the cost of joining on all columns (including the float measurements).
    rna_df = pd.concat(tissue_dfs, ignore_index=True).dropna()

    # Keep only genes measured in both tissues. With an empty sample the rank-sum
    # test returns NaN and emits RuntimeWarnings, and such genes were dropped from
    # the result anyway.
    tissues_seen = rna_df.groupby('Gene')['Tissue'].nunique()
    testable_genes = tissues_seen.index[tissues_seen == len(tissue_types)]
    rna_df = rna_df[rna_df['Gene'].isin(testable_genes)]
    if rna_df.empty:
        print(f'{cancer_name}: no gene has both tumor and normal measurements; skipped')
        continue
    by_gene = rna_df.groupby('Gene')

    fold_change = by_gene.apply(
        lambda gene_df: log2_fold_change(
            gene_df.loc[gene_df['Tissue'] == 'tumor', 'Transcriptomics'],
            gene_df.loc[gene_df['Tissue'] == 'normal', 'Transcriptomics']))
    # Name the Series explicitly instead of relying on DataFrame(series, columns=...).
    fold_change = fold_change.rename('Log2_fold_change').reset_index()

    ranksums = by_gene.apply(
        lambda gene_df: stats.ranksums(
            gene_df.loc[gene_df['Tissue'] == 'tumor', 'Transcriptomics'],
            gene_df.loc[gene_df['Tissue'] == 'normal', 'Transcriptomics']))
    ranksums = pd.DataFrame.from_records(
        ranksums, index=ranksums.index, columns=['statistic', 'pval'])
    ranksums = ranksums.reset_index().dropna()
    # Multiple-testing correction is applied within each cancer type, not across
    # them; element [1] of the return value holds the adjusted p-values.
    ranksums['FDR'] = ssm.fdrcorrection(ranksums['pval'])[1]
    ranksums['Cancer'] = cancer_name

    cancer_diff_df = pd.merge(ranksums, fold_change, on='Gene')
    diff_expression_dfs.append(cancer_diff_df)
diff_expression_df = pd.concat(diff_expression_dfs)
# NOTE(review): this table is written to the working directory, whereas the
# proteomics table is written under data/ — confirm which location downstream
# code expects before changing either path.
diff_expression_df.to_csv('Transcriptomics_differential_expression_df.csv', index = False)
diff_expression_df

Unnamed: 0,Gene,statistic,pval,FDR,Cancer,Log2_fold_change
0,A1BG,6.602420,4.044989e-11,9.028157e-11,CCRCC,0.881967
1,A1CF,-5.201398,1.977946e-07,3.574768e-07,CCRCC,-0.641470
2,A2M,5.970422,2.366402e-09,4.783680e-09,CCRCC,0.514646
3,A2ML1,-4.798710,1.596912e-06,2.739452e-06,CCRCC,-0.304295
4,A3GALT2,1.037483,2.995107e-01,3.425340e-01,CCRCC,0.314083
...,...,...,...,...,...,...
28052,ZYG11A,-0.120468,9.041126e-01,1.000000e+00,PDAC,-0.001721
28053,ZYG11B,-0.983820,3.252042e-01,4.961530e-01,PDAC,-0.008388
28054,ZYX,2.730602,6.321883e-03,1.768955e-02,PDAC,0.043200
28055,ZZEF1,-6.806426,1.000530e-11,1.711700e-09,PDAC,-0.087991
