In [1]:
import cptac
import pandas as pd
import scipy.stats as stats
import cptac.utils as ut
import scipy
import statsmodels.stats.multitest as ssm
import numpy as np

In [2]:
# Instantiate one cptac dataset object per cancer cohort analysed below.
# Each object is later queried with get_proteomics('tumor' / 'normal').
# NOTE(review): constructing these presumably loads the underlying data, so this
# cell may be slow on first run -- confirm against the cptac documentation.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

                                                



In [3]:
# Keep each display label next to its dataset object in a single mapping so the
# two parallel lists used by later cells are always derived in the same order.
datasets_by_name = {
    'CCRCC': ccrcc,
    'Endometrial': en,
    'LUAD': luad,
    'HNSCC': hnscc,
    'LSCC': lscc,
}
cancer_names = list(datasets_by_name.keys())
cancers = list(datasets_by_name.values())

In [4]:
def log2_fold_change(tumor, normal):
    """Return log2 of the absolute ratio between the tumor and normal means.

    Parameters
    ----------
    tumor, normal : array-like of numbers
        Measurements for the two groups; each is reduced with ``np.mean``.

    Returns
    -------
    float
        ``log2(|mean(tumor) / mean(normal)|)``, or NaN when either mean is
        exactly zero (the ratio or its logarithm would be undefined).

    Notes
    -----
    The absolute value is taken before the logarithm, so the sign of the
    ratio is discarded; only its magnitude is reported.
    """
    mean_tumor = np.mean(tumor)
    mean_normal = np.mean(normal)

    # Guard first: a zero mean on either side gives log2(0) or a division by zero.
    if mean_tumor == 0 or mean_normal == 0:
        return float('nan')

    return np.log2(abs(mean_tumor / mean_normal))

In [5]:
def _long_proteomics(cancer, tissue):
    """Return one tissue type's proteomics table in long format.

    Parameters
    ----------
    cancer : object
        cptac dataset object exposing ``get_proteomics``.
    tissue : str
        Passed straight to ``get_proteomics`` ('tumor' or 'normal') and also
        written into the ``Tissue`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Patient_ID``, ``Gene``, ``Proteomic`` and ``Tissue``; one row
        per (patient, protein column) pair, missing measurements included.
    """
    prot_wide = cancer.get_proteomics(tissue)
    if isinstance(prot_wide.columns, pd.MultiIndex):
        # Drop the 'Database_ID' column level so columns are identified by the
        # remaining level only (melted into 'Gene' below).
        prot_wide = ut.reduce_multiindex(df=prot_wide, levels_to_drop='Database_ID')
    # reset_index() returns a new frame, so the table handed back by cptac is
    # never mutated in place.
    prot_long = prot_wide.reset_index().melt(
        id_vars='Patient_ID', var_name='Gene', value_name='Proteomic')
    prot_long['Tissue'] = tissue
    return prot_long


def _differential_expression(prot_df, cancer_name):
    """Compute per-gene tumor-vs-normal statistics for one cohort.

    Parameters
    ----------
    prot_df : pandas.DataFrame
        Long-format measurements with ``Gene``, ``Proteomic`` and ``Tissue``
        ('tumor' / 'normal') columns and no missing values.
    cancer_name : str
        Label stored in the ``Cancer`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``statistic``, ``pval``, ``FDR``, ``Cancer`` and
        ``Log2_fold_change``; one row per gene measured in both tissues, in
        sorted gene order with a fresh 0..n-1 index.
    """
    per_gene = []
    for gene, gene_df in prot_df.groupby('Gene'):
        tumor_vals = gene_df.loc[gene_df['Tissue'] == 'tumor', 'Proteomic']
        normal_vals = gene_df.loc[gene_df['Tissue'] == 'normal', 'Proteomic']
        # A gene measured in only one tissue cannot be tested: the rank-sum
        # z-score would be 0/0 (NaN plus RuntimeWarnings), so skip it up front.
        if tumor_vals.empty or normal_vals.empty:
            continue
        statistic, pval = stats.ranksums(tumor_vals, normal_vals)
        per_gene.append(
            (gene, statistic, pval, log2_fold_change(tumor_vals, normal_vals)))

    result = pd.DataFrame(
        per_gene, columns=['Gene', 'statistic', 'pval', 'Log2_fold_change'])
    # Only an undefined test result disqualifies a gene; an undefined fold
    # change (zero mean) is kept as NaN.
    result = result.dropna(subset=['statistic', 'pval']).reset_index(drop=True)
    # Benjamini-Hochberg correction is applied within this cohort only.
    result['FDR'] = ssm.fdrcorrection(result['pval'])[1]
    result['Cancer'] = cancer_name
    return result[['Gene', 'statistic', 'pval', 'FDR', 'Cancer', 'Log2_fold_change']]


diff_expression_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    prot_normal_df = _long_proteomics(cancer, 'normal')
    prot_tumor_df = _long_proteomics(cancer, 'tumor')
    # Stack the two tissues row-wise. (An outer merge with no `on` would join on
    # every shared column, float measurements included, just to achieve this.)
    prot_df = pd.concat([prot_tumor_df, prot_normal_df], ignore_index=True).dropna()
    diff_expression_dfs.append(_differential_expression(prot_df, cancer_name))

# The per-cohort index is kept as-is (it restarts at 0 for every cancer) so the
# leading index column of the CSV stays the same as before.
diff_expression_df = pd.concat(diff_expression_dfs)
diff_expression_df.to_csv('data/Proteomics_differential_expression_df.csv')
diff_expression_df

  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)
  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)
  z = (s - expected) / np.sqrt(n1*n2*(n1+n2+1)/12.0)
  cond2 = cond0 & (x <= _a)


Unnamed: 0,Gene,statistic,pval,FDR,Cancer,Log2_fold_change
0,A1BG,4.062020,4.864990e-05,7.977389e-05,CCRCC,0.846625
1,A1CF,-6.998855,2.580621e-12,6.376757e-12,CCRCC,-0.218309
2,A2M,6.557556,5.469684e-11,1.261222e-10,CCRCC,-1.256118
3,A4GALT,0.979796,3.271869e-01,3.660405e-01,CCRCC,3.045874
4,AAAS,9.324065,1.119664e-20,4.340440e-20,CCRCC,4.527650
...,...,...,...,...,...,...
11123,ZXDC,9.587427,9.030866e-22,3.677112e-21,LSCC,-2.914588
11124,ZYG11B,-9.284609,1.623017e-20,6.101665e-20,LSCC,-1.544463
11125,ZYX,-11.867668,1.742578e-32,2.798182e-31,LSCC,-0.724833
11126,ZZEF1,-10.255579,1.117044e-24,5.770876e-24,LSCC,-1.760497
