In [1]:
import pandas as pd
import math
import scipy
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import cptac
import cptac.utils as ut
import copy
import numpy as np

In [2]:
# Datasets whose proteomics/transcriptomics tables are joined in the cells below.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()

# Datasets that are instantiated but not referenced again in the cells shown here.
brca = cptac.Brca()
colon = cptac.Colon()
gbm = cptac.Gbm()
ovarian = cptac.Ovarian()

                                                



# Load the dataframes that need to be corrected

In [3]:
# Previously computed result tables that the cells below filter and write back.
perm_corrected = pd.read_csv("full_10k_permutation_corrected.csv")
corr_diff = pd.read_csv("corr_diff.csv")
pval_df = pd.read_csv("p_val.csv")

# Loaded only: this table is not passed to fix_df or saved in the cells below
# (original note: "this one wait").
corr_df = pd.read_csv("correlations_dataframe.csv")

# The gene names are taken from the header row of gene_list.csv.
all_genes = pd.read_csv("gene_list.csv").columns.tolist()

# Get tumor and normal data to check minimum sample number

In [4]:
# ccRCC tumor samples: join proteomics with transcriptomics, drop the
# "Database_ID" column level, then keep only the first occurrence of each
# repeated column name.
ccrcc_prot_trans_tumor = ut.reduce_multiindex(
    df=ccrcc.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="tumor",
    ),
    levels_to_drop="Database_ID",
)
ccrcc_prot_trans_tumor = ccrcc_prot_trans_tumor.loc[
    :, ~ccrcc_prot_trans_tumor.columns.duplicated()
]

# ccRCC normal samples: same steps as for the tumor samples.
ccrcc_prot_trans_normal = ut.reduce_multiindex(
    df=ccrcc.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="normal",
    ),
    levels_to_drop="Database_ID",
)
ccrcc_prot_trans_normal = ccrcc_prot_trans_normal.loc[
    :, ~ccrcc_prot_trans_normal.columns.duplicated()
]





In [5]:
# Endometrial: joined proteomics + transcriptomics tables, one per tissue type.
# Unlike the ccRCC/LUAD/LSCC cells, no column level is dropped and no duplicate
# column names are removed here.
en_prot_trans_tumor = en.join_omics_to_omics(
    df1_name="proteomics",
    df2_name="transcriptomics",
    tissue_type="tumor",
)
en_prot_trans_normal = en.join_omics_to_omics(
    df1_name="proteomics",
    df2_name="transcriptomics",
    tissue_type="normal",
)




In [6]:
# LUAD tumor samples: join proteomics with transcriptomics, drop the
# "Database_ID" column level, then keep only the first occurrence of each
# repeated column name.
luad_prot_trans_tumor = ut.reduce_multiindex(
    df=luad.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="tumor",
    ),
    levels_to_drop="Database_ID",
)
luad_prot_trans_tumor = luad_prot_trans_tumor.loc[
    :, ~luad_prot_trans_tumor.columns.duplicated()
]

# LUAD normal samples: same steps as for the tumor samples.
luad_prot_trans_normal = ut.reduce_multiindex(
    df=luad.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="normal",
    ),
    levels_to_drop="Database_ID",
)
luad_prot_trans_normal = luad_prot_trans_normal.loc[
    :, ~luad_prot_trans_normal.columns.duplicated()
]





In [7]:
# HNSCC: joined proteomics + transcriptomics tables, one per tissue type.
# NOTE: the variable names contain "luad" although the data comes from the
# hnscc dataset; the names are kept because the tissue_list cell refers to them.
hnscc_prot_luad_tumor = hnscc.join_omics_to_omics(
    df1_name="proteomics",
    df2_name="transcriptomics",
    tissue_type="tumor",
)
hnscc_prot_luad_normal = hnscc.join_omics_to_omics(
    df1_name="proteomics",
    df2_name="transcriptomics",
    tissue_type="normal",
)





In [8]:
# LSCC tumor samples: join proteomics with transcriptomics, drop the
# "Database_ID" column level, then keep only the first occurrence of each
# repeated column name.
lscc_prot_trans_tumor = ut.reduce_multiindex(
    df=lscc.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="tumor",
    ),
    levels_to_drop="Database_ID",
)
lscc_prot_trans_tumor = lscc_prot_trans_tumor.loc[
    :, ~lscc_prot_trans_tumor.columns.duplicated()
]

# LSCC normal samples: same steps as for the tumor samples.
lscc_prot_trans_normal = ut.reduce_multiindex(
    df=lscc.join_omics_to_omics(
        df1_name="proteomics",
        df2_name="transcriptomics",
        tissue_type="normal",
    ),
    levels_to_drop="Database_ID",
)
lscc_prot_trans_normal = lscc_prot_trans_normal.loc[
    :, ~lscc_prot_trans_normal.columns.duplicated()
]





In [9]:
# Cancer labels, listed in the same order as the [tumor, normal] pairs below.
order = ["ccrcc", "en", "luad", "hnscc", "lscc"]

# One [tumor, normal] pair of joined tables per cancer type.
tissue_list = [
    [ccrcc_prot_trans_tumor, ccrcc_prot_trans_normal],
    [en_prot_trans_tumor, en_prot_trans_normal],
    [luad_prot_trans_tumor, luad_prot_trans_normal],
    [hnscc_prot_luad_tumor, hnscc_prot_luad_normal],
    [lscc_prot_trans_tumor, lscc_prot_trans_normal],
]


# Function that blanks out values for genes lacking enough samples

In [10]:
def _has_enough_samples(frame, column, min_samples):
    """Return True when `column` exists in `frame` with more than `min_samples` non-null rows."""
    return column in frame.columns and len(frame[column].dropna()) > min_samples


def fix_df(df, genes, tissues, order, min_samples=10):
    """Blank out per-cancer values for genes that lack enough samples.

    For every gene column of `df` and every cancer type, the stored value is
    kept only when the gene's "<gene>_proteomics" and "<gene>_transcriptomics"
    columns are present in both the tumor and the normal table and each of
    those four columns has more than `min_samples` non-null entries.
    Otherwise the value is replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Cancer" column and one column per gene. Rows are matched
        to `tissues` by position, not by label, so row k must belong to the
        cancer type of ``tissues[k]``.
    genes : list
        Unused; kept so existing calls keep working. The genes processed are
        the columns of `df` other than "Cancer".
    tissues : list
        One ``[tumor_table, normal_table]`` pair per cancer type.
    order : list
        Cancer labels written to the "Cancer" column of the result; must have
        one entry per pair in `tissues`.
    min_samples : int, optional
        Each of the four columns needs strictly more than this many non-null
        entries. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        A new table with a "Cancer" column followed by the gene columns of
        `df`, holding either the original value or NaN.
    """
    # set_index already returns a new frame, so the input is left untouched.
    correct_dp = df.set_index("Cancer")

    data = {"Cancer": order}
    for gene in correct_dp.columns:
        required_columns = (gene + "_proteomics", gene + "_transcriptomics")
        gene_values = correct_dp[gene]
        kept = []
        for position, pair in enumerate(tissues):
            tumor, normal = pair[0], pair[1]
            # The threshold applies to all four columns; previously only the
            # tumor proteomics count was compared against it, while the other
            # three merely had to be non-empty.
            enough = all(
                _has_enough_samples(frame, column, min_samples)
                for frame in (tumor, normal)
                for column in required_columns
            )
            kept.append(gene_values.iloc[position] if enough else np.nan)
        data[gene] = kept

    return pd.DataFrame(data)



# Dataframe output

In [12]:
# Mask under-sampled entries in each result table, then index the result by cancer label.
perm_corrected = fix_df(
    df=perm_corrected, genes=all_genes, tissues=tissue_list, order=order
).set_index("Cancer")
corr_diff = fix_df(
    df=corr_diff, genes=all_genes, tissues=tissue_list, order=order
).set_index("Cancer")
pval_df = fix_df(
    df=pval_df, genes=all_genes, tissues=tissue_list, order=order
).set_index("Cancer")


In [13]:
# Spell out the endometrial label in corr_diff (row position 1, "en" in `order`).
new_index_corr_diff = list(corr_diff.index)
new_index_corr_diff[1] = "endometrial"
# Assigning a bare list would reset the index name to None, so corr_diff.csv
# would be written without the "Cancer" header that fix_df looks up when the
# file is read back in. Rebuild the Index with its name preserved instead.
corr_diff.index = pd.Index(new_index_corr_diff, name=corr_diff.index.name)


In [14]:
# Write each corrected table back over the file it was loaded from.
for frame_to_save, csv_path in (
    (perm_corrected, "full_10k_permutation_corrected.csv"),
    (corr_diff, "corr_diff.csv"),
    (pval_df, "p_val.csv"),
):
    frame_to_save.to_csv(csv_path)
