In [1]:
import cptac
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd

In [2]:
def get_gene_list(tissue):
    """Return the genes that appear in both the proteomics and transcriptomics tables.

    Parameters
    ----------
    tissue : object exposing ``get_proteomics()`` and ``get_transcriptomics()``,
        each returning a DataFrame whose column labels identify genes. When the
        columns are a ``pd.MultiIndex`` only level 0 is used.

    Returns
    -------
    list
        Proteomics column labels that also occur among the transcriptomics
        column labels, without duplicates, in proteomics column order.
    """
    prot_genes = tissue.get_proteomics().columns
    if isinstance(prot_genes, pd.MultiIndex):
        prot_genes = prot_genes.get_level_values(0)

    trans_genes = tissue.get_transcriptomics().columns
    if isinstance(trans_genes, pd.MultiIndex):
        trans_genes = trans_genes.get_level_values(0)

    # dict.fromkeys drops repeated labels while keeping first-seen order, which
    # avoids re-scanning a growing result list for every column.
    return list(dict.fromkeys(gene for gene in prot_genes if gene in trans_genes))

In [3]:
def correlation_list(tissue, gene_list, valid_sample_number = 50):
    """Compute the Pearson correlation between proteomics and transcriptomics per gene.

    Parameters
    ----------
    tissue : object exposing ``get_proteomics()``, ``get_transcriptomics()`` and
        ``reduce_multiindex(df=..., levels_to_drop=...)``. Rows of the two tables
        are matched to each other by index label; columns are selected by gene.
    gene_list : iterable
        Column labels to correlate; each must exist in both tables.
    valid_sample_number : int, default 50
        Minimum number of non-missing rows a gene needs in each table
        (checked separately per table) to be considered.

    Returns
    -------
    (corr_list, pval_list) : tuple of two lists
        ``corr_list`` holds ``[gene, correlation]`` pairs and ``pval_list`` holds
        ``[gene, p-value]`` pairs, both in ``gene_list`` order. Genes that fail
        the sample threshold, have fewer than two paired observations, or yield
        a NaN correlation are omitted from both lists.
    """
    corr_list = []
    pval_list = []

    prot = tissue.get_proteomics()
    if isinstance(prot.columns, pd.MultiIndex):
        prot = tissue.reduce_multiindex(df = prot, levels_to_drop="Database_ID")

    trans = tissue.get_transcriptomics()
    if isinstance(trans.columns, pd.MultiIndex):
        trans = tissue.reduce_multiindex(df = trans, levels_to_drop="Database_ID")

    # Keep only the rows whose index label occurs in both tables. Both masks are
    # computed before either table is filtered.
    prot_shared = prot.index.isin(trans.index)
    trans_shared = trans.index.isin(prot.index)
    prot = prot.loc[prot_shared]
    trans = trans.loc[trans_shared]

    # pearsonr pairs its two inputs by position, not by label, so both tables
    # must list their rows in the same order. Reindexing needs unique labels;
    # with duplicated labels the tables are left in their own order.
    if prot.index.is_unique and trans.index.is_unique:
        trans = trans.reindex(prot.index)

    for gene in gene_list:
        prot_measurements = prot[gene]
        trans_measurements = trans[gene]

        # NOTE: when a gene label maps to several columns the selection is a
        # DataFrame, so this counts rows that are complete in every one of those
        # columns, even though only the first column is correlated below.
        prot_count = len(prot_measurements.dropna())
        trans_count = len(trans_measurements.dropna())
        if prot_count < valid_sample_number or trans_count < valid_sample_number:
            continue

        # Here we are only currently taking the first column of multi_indices
        if isinstance(prot_measurements, pd.DataFrame):
            prot_measurements = prot_measurements.iloc[:, 0]
        if isinstance(trans_measurements, pd.DataFrame):
            trans_measurements = trans_measurements.iloc[:, 0]

        # Collect the labels missing in either table and drop them from both.
        # Boolean masks avoid integer-key lookups on a label-indexed Series.
        nan_indices = set(prot_measurements.index[prot_measurements.isna().to_numpy()])
        nan_indices.update(trans_measurements.index[trans_measurements.isna().to_numpy()])
        nan_indices = list(nan_indices)
        prot_measurements = prot_measurements.drop(nan_indices)
        trans_measurements = trans_measurements.drop(nan_indices)

        # pearsonr raises on fewer than two observations; skip such genes
        # instead of aborting the whole tissue.
        if len(prot_measurements) < 2 or len(trans_measurements) < 2:
            continue

        correlation, pval = scipy.stats.pearsonr(prot_measurements, trans_measurements)
        if math.isnan(correlation):
            continue
        corr_list.append([gene, correlation])
        pval_list.append([gene, pval])
    return corr_list, pval_list

In [4]:
def ret_list(li):
    """Return the second element of every entry in ``li``, in order.

    ``li`` is a sequence of two-item entries such as ``[gene, value]``; the
    result is the list of the ``value`` items only.
    """
    return [entry[1] for entry in li]

In [5]:
# Download every dataset first, then instantiate one loader object per tissue.
for dataset_name in ("brca", "ccrcc", "colon", "endometrial", "gbm", "luad", "ovarian"):
    cptac.download(dataset_name)

brca, ccrcc, colon, en, gbm, luad, ovarian = (
    cptac.Brca(),
    cptac.Ccrcc(),
    cptac.Colon(),
    cptac.Endometrial(),
    cptac.Gbm(),
    cptac.Luad(),
    cptac.Ovarian(),
)

Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



                                            

In [6]:
# One gene list per tissue: the genes present in both its proteomics and
# transcriptomics tables.
(
    brca_gene_list,
    ccrcc_gene_list,
    colon_gene_list,
    gbm_gene_list,
    luad_gene_list,
    ovarian_gene_list,
    en_gene_list,
) = [
    get_gene_list(dataset)
    for dataset in (brca, ccrcc, colon, gbm, luad, ovarian, en)
]

In [7]:
# Per tissue: [gene, correlation] pairs and the matching [gene, p-value] pairs.
(
    (brca_corr, brca_pval),
    (ccrcc_corr, ccrcc_pval),
    (colon_corr, colon_pval),
    (en_corr, en_pval),
    (gbm_corr, gbm_pval),
    (luad_corr, luad_pval),
    (ovarian_corr, ovarian_pval),
) = [
    correlation_list(dataset, dataset_genes)
    for dataset, dataset_genes in (
        (brca, brca_gene_list),
        (ccrcc, ccrcc_gene_list),
        (colon, colon_gene_list),
        (en, en_gene_list),
        (gbm, gbm_gene_list),
        (luad, luad_gene_list),
        (ovarian, ovarian_gene_list),
    )
]



In [8]:
# Every gene that appears in at least one tissue's gene list, duplicates removed,
# in first-seen order. Despite its name, `common_genes` is the union of the
# per-tissue lists, not only the genes shared by all tissues.
genes = brca_gene_list + ccrcc_gene_list + colon_gene_list + en_gene_list + gbm_gene_list + luad_gene_list + ovarian_gene_list
# dict.fromkeys keeps the first occurrence of each gene without re-scanning a
# growing list for every element.
common_genes = list(dict.fromkeys(genes))

In [9]:
def fill_genes_nan(all_genes, tissue_corr):
    """Pad a tissue's ``[gene, value]`` pairs with NaN entries for its missing genes.

    Parameters
    ----------
    all_genes : iterable
        Genes that must be represented in the result.
    tissue_corr : list
        ``[gene, value]`` pairs available for one tissue. It is not modified.

    Returns
    -------
    list
        A new list holding every pair of ``tissue_corr`` plus a ``[gene, np.nan]``
        pair for each entry of ``all_genes`` not found in ``tissue_corr``, sorted
        by gene. The order therefore follows the sort, not ``all_genes``.
    """
    measured_genes = {gene for gene, _ in tissue_corr}
    placeholders = [[gene, np.nan] for gene in all_genes if gene not in measured_genes]
    # Build a fresh list so the caller's tissue_corr keeps only its real values.
    return sorted(list(tissue_corr) + placeholders)

In [11]:
brca_list_corr = fill_genes_nan(common_genes, brca_corr)
ccrcc_list_corr = fill_genes_nan(common_genes, ccrcc_corr)
colon_list_corr = fill_genes_nan(common_genes, colon_corr)
en_list_corr = fill_genes_nan(common_genes, en_corr)
gbm_list_corr = fill_genes_nan(common_genes, gbm_corr)
luad_list_corr = fill_genes_nan(common_genes, luad_corr)
ovarian_list_corr = fill_genes_nan(common_genes, ovarian_corr)

corr_pairs_by_tissue = {
    'BRCA': brca_list_corr, 'CCRCC': ccrcc_list_corr,
    'Colon': colon_list_corr, 'Endometrial': en_list_corr,
    'GBM': gbm_list_corr, 'LUAD': luad_list_corr, 'Ovarian': ovarian_list_corr,
}

# fill_genes_nan returns its pairs sorted by gene, while common_genes is in
# first-seen order. Look each value up by gene so that row k of every column
# really belongs to common_genes[k], instead of relying on the two orders matching.
correlation_columns = {}
for tissue_name, gene_value_pairs in corr_pairs_by_tissue.items():
    value_by_gene = {gene: value for gene, value in gene_value_pairs}
    correlation_columns[tissue_name] = [value_by_gene.get(gene, np.nan) for gene in common_genes]

correlation_df = pd.DataFrame(correlation_columns , columns = ['BRCA', 'CCRCC', 'Colon', 'Endometrial', 'GBM', 'LUAD', 'Ovarian'], index=common_genes)
correlation_df

Unnamed: 0,BRCA,CCRCC,Colon,Endometrial,GBM,LUAD,Ovarian
A1BG,0.131286,0.106042,,0.535767,-0.199334,-0.015535,
A2M,,0.862425,0.698025,,,,
A2ML1,0.422497,0.264356,-0.122255,0.203493,0.408460,0.806497,0.140341
AAAS,0.713289,,,0.818275,,,0.531368
AACS,,,,0.429483,,,
...,...,...,...,...,...,...,...
ZNF716,0.200364,0.140537,,0.256042,0.133355,0.147789,
ZNF75A,-0.029503,0.020117,,0.439872,0.632420,0.475587,0.371712
ZNF791,0.546670,0.344869,0.144728,0.648994,0.660884,0.559567,0.300631
ZSCAN25,0.280712,0.207168,0.087799,0.242913,0.242082,0.780117,0.218811


In [12]:
brca_list_pval = fill_genes_nan(common_genes, brca_pval)
ccrcc_list_pval = fill_genes_nan(common_genes, ccrcc_pval)
colon_list_pval = fill_genes_nan(common_genes, colon_pval)
en_list_pval = fill_genes_nan(common_genes, en_pval)
gbm_list_pval = fill_genes_nan(common_genes, gbm_pval)
luad_list_pval = fill_genes_nan(common_genes, luad_pval)
ovarian_list_pval = fill_genes_nan(common_genes, ovarian_pval)

pval_pairs_by_tissue = {
    'BRCA': brca_list_pval, 'CCRCC': ccrcc_list_pval,
    'Colon': colon_list_pval, 'Endometrial': en_list_pval,
    'GBM': gbm_list_pval, 'LUAD': luad_list_pval, 'Ovarian': ovarian_list_pval,
}

# fill_genes_nan returns its pairs sorted by gene, while common_genes is in
# first-seen order. Look each value up by gene so that row k of every column
# really belongs to common_genes[k], instead of relying on the two orders matching.
pval_columns = {}
for tissue_name, gene_value_pairs in pval_pairs_by_tissue.items():
    value_by_gene = {gene: value for gene, value in gene_value_pairs}
    pval_columns[tissue_name] = [value_by_gene.get(gene, np.nan) for gene in common_genes]

pval_df= pd.DataFrame(pval_columns , columns = ['BRCA', 'CCRCC', 'Colon', 'Endometrial', 'GBM', 'LUAD', 'Ovarian'], index=common_genes)
pval_df

Unnamed: 0,BRCA,CCRCC,Colon,Endometrial,GBM,LUAD,Ovarian
A1BG,1.657241e-01,1.508299e-01,,1.934760e-09,3.861768e-02,8.224935e-01,
A2M,,5.275037e-56,5.325574e-15,,,,
A2ML1,1.249853e-06,2.767257e-04,2.353794e-01,3.381290e-02,1.141651e-05,1.355506e-49,0.208548
AAAS,1.373971e-17,,,4.348735e-24,,,0.000013
AACS,,,,8.715025e-05,,,
...,...,...,...,...,...,...,...
ZNF716,4.909360e-02,1.165070e-01,,7.203451e-03,1.905127e-01,6.135719e-02,
ZNF75A,7.470119e-01,7.857789e-01,,1.703433e-06,2.114287e-13,2.630642e-13,0.000585
ZNF791,7.319264e-11,2.769505e-06,1.594617e-01,2.313176e-14,7.072062e-15,8.778764e-19,0.006064
ZSCAN25,1.736969e-03,4.662864e-03,3.949742e-01,1.092515e-02,1.159701e-02,1.900213e-44,0.048267
