In [1]:
import cptac
import pandas as pd
import scipy.stats as stats
import cptac.utils as ut
import scipy
import statsmodels.stats.multitest as ssm
import numpy as np

In [None]:
# Instantiate one cptac dataset object per cancer type analysed in this notebook.
# The cells below call get_proteomics / get_transcriptomics on these objects.
# NOTE(review): each constructor appears to load (and possibly download) its
# dataset, so this cell can be slow on a fresh kernel — confirm.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()
pdac = cptac.Pdac()

Loading lscc v3.2.1......                       

In [None]:
# Dataset objects and their display labels, declared pairwise so the two
# parallel lists consumed by the loops below cannot drift out of step.
cancers, cancer_names = map(list, zip(
    (ccrcc, 'CCRCC'),
    (en, 'Endometrial'),
    (luad, 'LUAD'),
    (hnscc, 'HNSCC'),
    (lscc, 'LSCC'),
    (pdac, 'PDAC'),
))

In [None]:
def log2_fold_change(tumor, normal):
    """Return log2 of the absolute ratio between the tumor and normal means.

    Parameters
    ----------
    tumor, normal : array-like of numbers
        Measurements for one gene in tumor and in normal samples. Each is
        collapsed to its arithmetic mean with ``np.mean``.

    Returns
    -------
    float
        ``log2(|mean(tumor) / mean(normal)|)``, or NaN when either mean is
        exactly zero (the ratio, or its logarithm, would be undefined).

    Notes
    -----
    The absolute value means a negative ratio yields the same result as the
    positive ratio of equal magnitude, so the sign of the ratio is lost.
    NOTE(review): if the inputs are already log-transformed abundances, a
    ratio of means is not a conventional fold change — confirm intent.
    """
    mean_tumor, mean_normal = np.mean(tumor), np.mean(normal)
    # Guard both ends: a zero denominator cannot be divided by, and a zero
    # numerator would send the logarithm to -inf.
    if mean_tumor == 0 or mean_normal == 0:
        return float('Nan')
    return np.log2(abs(mean_tumor / mean_normal))

In [None]:
# Proteomics differential expression, tumor vs. normal, for every dataset in
# `cancers`. Per gene this computes log2_fold_change and a rank-sum test
# (scipy.stats.ranksums), then corrects the p-values within each cancer using
# statsmodels' fdrcorrection. All cancers are stacked into one table and
# written to CSV.
# NOTE(review): when reduce_multiindex drops 'Database_ID', columns that share
# a gene name would be pooled into one gene group below — confirm intended.
diff_expression_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Normal samples: wide table -> long (Patient_ID, Gene, Proteomic) rows.
    prot_normal_df = cancer.get_proteomics('normal')
    if isinstance(prot_normal_df.columns, pd.MultiIndex):
        prot_normal_df = ut.reduce_multiindex(df=prot_normal_df, levels_to_drop='Database_ID')
    # reset_index() without inplace leaves the frame returned by cptac untouched.
    prot_normal_df = (
        prot_normal_df
        .reset_index()
        .melt(id_vars='Patient_ID', var_name='Gene', value_name='Proteomic')
        .assign(Tissue='normal')
    )

    # Tumor samples: same reshaping.
    prot_tumor_df = cancer.get_proteomics('tumor')
    if isinstance(prot_tumor_df.columns, pd.MultiIndex):
        prot_tumor_df = ut.reduce_multiindex(df=prot_tumor_df, levels_to_drop='Database_ID')
    prot_tumor_df = (
        prot_tumor_df
        .reset_index()
        .melt(id_vars='Patient_ID', var_name='Gene', value_name='Proteomic')
        .assign(Tissue='tumor')
    )

    # Stack the two tissues row-wise. An outer merge with no `on` would join on
    # every shared column (including the float measurement) only to achieve
    # this same stacking, at much higher cost.
    prot_df = pd.concat([prot_tumor_df, prot_normal_df], ignore_index=True)
    prot_df = prot_df.dropna()  # drop missing measurements

    # Per-gene fold change between tumor and normal.
    fold_change = prot_df.groupby('Gene').apply(
        lambda df: log2_fold_change(df[df['Tissue'] == 'tumor'].Proteomic,
                                    df[df['Tissue'] == 'normal'].Proteomic))
    fold_change = fold_change.rename('Log2_fold_change').reset_index()

    # Per-gene rank-sum test; each result is a (statistic, p-value) pair.
    ranksums = prot_df.groupby('Gene').apply(lambda df: stats.ranksums(
        df[df['Tissue'] == 'tumor'].Proteomic,
        df[df['Tissue'] == 'normal'].Proteomic))
    # Hand from_records a plain list of tuples rather than a Series of tuples.
    ranksums = pd.DataFrame.from_records(
        ranksums.tolist(), index=ranksums.index, columns=['statistic', 'pval'])
    # Genes whose test could not be evaluated (NaN) are removed before the
    # multiple-testing correction.
    ranksums = ranksums.reset_index().dropna()
    ranksums = ranksums.assign(
        FDR=ssm.fdrcorrection(ranksums['pval'])[1],
        Cancer=cancer_name,
    )

    # Inner join on the gene key: only genes with a valid test are kept.
    diff_expression_df = pd.merge(ranksums, fold_change, on='Gene')
    diff_expression_dfs.append(diff_expression_df)

# ignore_index gives the combined table unique row labels instead of repeating
# each cancer's own 0..n range.
diff_expression_df = pd.concat(diff_expression_dfs, ignore_index=True)
# to_csv does not create directories: 'data/' must already exist.
# NOTE(review): the transcriptomics cell writes its CSV to the working
# directory rather than 'data/' — confirm which location is intended.
diff_expression_df.to_csv('data/Proteomics_differential_expression_df.csv', index = False)
diff_expression_df

In [None]:
# Transcriptomics differential expression, tumor vs. normal, for every dataset
# in `cancers`. Per gene this computes log2_fold_change and a rank-sum test
# (scipy.stats.ranksums), then corrects the p-values within each cancer using
# statsmodels' fdrcorrection. All cancers are stacked into one table and
# written to CSV.
# NOTE(review): the `prot_*` names below hold transcriptomics data, and this
# cell rebinds `diff_expression_dfs` / `diff_expression_df` from the proteomics
# cell. The names are kept so any later cell that reads them still works;
# consider renaming.
diff_expression_dfs = []
for cancer, cancer_name in zip(cancers, cancer_names):
    # Normal samples: wide table -> long (Patient_ID, Gene, Transcriptomics) rows.
    prot_normal_df = cancer.get_transcriptomics('normal')
    if isinstance(prot_normal_df.columns, pd.MultiIndex):
        prot_normal_df = ut.reduce_multiindex(df=prot_normal_df, levels_to_drop='Database_ID')
    # reset_index() without inplace leaves the frame returned by cptac untouched.
    prot_normal_df = (
        prot_normal_df
        .reset_index()
        .melt(id_vars='Patient_ID', var_name='Gene', value_name='Transcriptomics')
        .assign(Tissue='normal')
    )

    # Tumor samples: same reshaping.
    prot_tumor_df = cancer.get_transcriptomics('tumor')
    if isinstance(prot_tumor_df.columns, pd.MultiIndex):
        prot_tumor_df = ut.reduce_multiindex(df=prot_tumor_df, levels_to_drop='Database_ID')
    prot_tumor_df = (
        prot_tumor_df
        .reset_index()
        .melt(id_vars='Patient_ID', var_name='Gene', value_name='Transcriptomics')
        .assign(Tissue='tumor')
    )

    # Stack the two tissues row-wise. An outer merge with no `on` would join on
    # every shared column (including the float measurement) only to achieve
    # this same stacking, at much higher cost.
    prot_df = pd.concat([prot_tumor_df, prot_normal_df], ignore_index=True)
    prot_df = prot_df.dropna()  # drop missing measurements

    # Per-gene fold change between tumor and normal.
    fold_change = prot_df.groupby('Gene').apply(
        lambda df: log2_fold_change(df[df['Tissue'] == 'tumor'].Transcriptomics,
                                    df[df['Tissue'] == 'normal'].Transcriptomics))
    fold_change = fold_change.rename('Log2_fold_change').reset_index()

    # Per-gene rank-sum test; each result is a (statistic, p-value) pair.
    ranksums = prot_df.groupby('Gene').apply(lambda df: stats.ranksums(
        df[df['Tissue'] == 'tumor'].Transcriptomics,
        df[df['Tissue'] == 'normal'].Transcriptomics))
    # Hand from_records a plain list of tuples rather than a Series of tuples.
    ranksums = pd.DataFrame.from_records(
        ranksums.tolist(), index=ranksums.index, columns=['statistic', 'pval'])
    # Genes whose test could not be evaluated (NaN) are removed before the
    # multiple-testing correction.
    ranksums = ranksums.reset_index().dropna()
    ranksums = ranksums.assign(
        FDR=ssm.fdrcorrection(ranksums['pval'])[1],
        Cancer=cancer_name,
    )

    # Inner join on the gene key: only genes with a valid test are kept.
    diff_expression_df = pd.merge(ranksums, fold_change, on='Gene')
    diff_expression_dfs.append(diff_expression_df)

# ignore_index gives the combined table unique row labels instead of repeating
# each cancer's own 0..n range.
diff_expression_df = pd.concat(diff_expression_dfs, ignore_index=True)
# NOTE(review): this CSV goes to the working directory, while the proteomics
# cell writes under 'data/' — confirm which location is intended.
diff_expression_df.to_csv('Transcriptomics_differential_expression_df.csv', index = False)
diff_expression_df