In [1]:
import cptac
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics

In [2]:
def get_gene_list(tissue):
    """Return the genes present in both the proteomics and transcriptomics tables.

    Parameters
    ----------
    tissue : object
        Must provide ``get_proteomics()`` and ``get_transcriptomics()``, each
        returning a DataFrame whose columns are genes.

    Returns
    -------
    list
        Gene labels found in both tables, without duplicates, in the order
        they first appear in the proteomics columns.
    """
    prot = tissue.get_proteomics()
    if isinstance(prot.columns, pd.MultiIndex):
        # Only the first column level (the gene name) is compared.
        prot = prot.columns.get_level_values(0)
    trans = tissue.get_transcriptomics()
    if isinstance(trans.columns, pd.MultiIndex):
        trans = trans.columns.get_level_values(0)

    # Iterating a DataFrame yields its column labels and iterating an Index
    # yields its values, so both branches above end up as "gene labels" here.
    trans_genes = set(trans)

    # dict.fromkeys de-duplicates while keeping first-seen order; this replaces
    # the quadratic `gene not in gene_list` scan over a growing list.
    return list(dict.fromkeys(gene for gene in prot if gene in trans_genes))

In [3]:
def correlation_list(tissue, gene_list, valid_sample_number = 30, tissue_type = "both"):
    """Compute the per-gene Pearson correlation between protein and transcript levels.

    Parameters
    ----------
    tissue : object
        Must provide ``get_proteomics(tissue_type)`` and
        ``get_transcriptomics(tissue_type)`` returning DataFrames indexed by
        sample with one column per gene, plus
        ``reduce_multiindex(df=..., levels_to_drop=...)`` for tables whose
        columns are a MultiIndex.
    gene_list : iterable
        Column labels (genes) to correlate; each must exist in both tables.
    valid_sample_number : int, default 30
        A gene is skipped unless both its protein and its transcript column
        have at least this many non-NaN values among the shared samples.
    tissue_type : str, default "both"
        Passed unchanged to ``get_proteomics`` / ``get_transcriptomics``.

    Returns
    -------
    (corr_list, pval_list) : tuple of lists
        ``corr_list`` holds ``[gene, correlation]`` pairs and ``pval_list``
        holds ``[gene, p-value]`` pairs, in ``gene_list`` order. Genes that are
        skipped (too few values, fewer than two paired values, or a NaN
        correlation) appear in neither list.
    """
    corr_list = []
    pval_list = []

    prot = tissue.get_proteomics(tissue_type)
    if isinstance(prot.columns, pd.MultiIndex):
        prot = tissue.reduce_multiindex(df = prot, levels_to_drop="Database_ID")

    trans = tissue.get_transcriptomics(tissue_type)
    if isinstance(trans.columns, pd.MultiIndex):
        trans = tissue.reduce_multiindex(df = trans, levels_to_drop="Database_ID")

    # Keep only samples present in both tables AND put both tables in the same
    # row order, so that position k refers to the same sample in each. Merely
    # dropping the non-shared samples would pair values from different samples
    # whenever the two tables list their samples in a different order.
    # NOTE(review): assumes sample labels are unique within each table — confirm.
    shared_samples = prot.index.intersection(trans.index)
    prot = prot.loc[shared_samples]
    trans = trans.loc[shared_samples]

    for gene in gene_list:
        prot_measurements = prot[gene]
        trans_measurements = trans[gene]

        # A duplicated column label yields a DataFrame; only its first column
        # is used. This is done before counting so that the validity check
        # looks at the same values that are later correlated.
        if isinstance(prot_measurements, pd.DataFrame):
            prot_measurements = prot_measurements.iloc[:, 0]
        if isinstance(trans_measurements, pd.DataFrame):
            trans_measurements = trans_measurements.iloc[:, 0]

        # Series.count() is the number of non-NaN entries.
        if (prot_measurements.count() < valid_sample_number
                or trans_measurements.count() < valid_sample_number):
            continue

        # Work on positional arrays: the rows are already aligned, and this
        # avoids integer indexing into a label-indexed Series.
        prot_values = prot_measurements.to_numpy(dtype=float)
        trans_values = trans_measurements.to_numpy(dtype=float)
        paired = ~(np.isnan(prot_values) | np.isnan(trans_values))

        # pearsonr raises when given fewer than two observations.
        if paired.sum() < 2:
            continue

        correlation, pval = scipy.stats.pearsonr(prot_values[paired], trans_values[paired])
        # A constant column gives an undefined (NaN) correlation; skip it.
        if math.isnan(correlation):
            continue
        corr_list.append([gene, correlation])
        pval_list.append([gene, pval])
    return corr_list, pval_list

In [4]:
def ret_list(li):
    """Return the second element of every entry in ``li``, in order.

    ``li`` is expected to hold ``[gene, value]`` style entries; only the
    value at position 1 of each entry is kept.
    """
    return [entry[1] for entry in li]

In [7]:
# Load one cptac dataset object per cancer type.
# The download calls below are commented out; uncomment them if the data files
# still need to be fetched.
# cptac.download("brca")
# cptac.download("ccrcc")
# cptac.download("colon")
# cptac.download("endometrial")
# cptac.download("gbm")
# cptac.download("luad")
# cptac.download("ovarian")
# cptac.download("hnscc")
# cptac.download("lscc")
brca = cptac.Brca()
ccrcc = cptac.Ccrcc()
colon = cptac.Colon()
en = cptac.Endometrial()
gbm = cptac.Gbm()
luad = cptac.Luad()
ovarian = cptac.Ovarian()
# In the saved run, loading the hnscc and lscc datasets prompted for a password.
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

Password for hnscc dataset: ········                
Password for lscc dataset: ········       
Checking that luad index is up-to-date...       



Checking that ovarian index is up-to-date...



Checking that lscc index is up-to-date...   



                                         



In [8]:
# One list per cancer type of the genes found in both its proteomics and
# transcriptomics tables. The datasets are processed in the order given.
(
    brca_gene_list,
    ccrcc_gene_list,
    colon_gene_list,
    gbm_gene_list,
    luad_gene_list,
    ovarian_gene_list,
    en_gene_list,
    hnscc_gene_list,
    lscc_gene_list,
) = map(
    get_gene_list,
    (brca, ccrcc, colon, gbm, luad, ovarian, en, hnscc, lscc),
)

In [7]:
# Per-gene protein/transcript correlations and p-values for each cancer type,
# restricted to tumor samples (tissue_type = "tumor"). valid_sample_number is
# left at its default. Later cells read the *_corr and *_pval names defined
# here, so this cell must be run before them.
brca_corr, brca_pval = correlation_list(brca, brca_gene_list, tissue_type = "tumor")
ccrcc_corr, ccrcc_pval = correlation_list(ccrcc, ccrcc_gene_list, tissue_type = "tumor")
colon_corr, colon_pval = correlation_list(colon, colon_gene_list, tissue_type = "tumor")
en_corr, en_pval = correlation_list(en, en_gene_list, tissue_type = "tumor")
gbm_corr, gbm_pval = correlation_list(gbm, gbm_gene_list, tissue_type = "tumor")
luad_corr, luad_pval = correlation_list(luad,luad_gene_list, tissue_type = "tumor")
ovarian_corr, ovarian_pval = correlation_list(ovarian, ovarian_gene_list, tissue_type = "tumor")
hnscc_corr, hnscc_pval = correlation_list(hnscc, hnscc_gene_list, tissue_type = "tumor")
lscc_corr, lscc_pval = correlation_list(lscc, lscc_gene_list, tissue_type = "tumor")



In [9]:
# Sorted, de-duplicated UNION of the per-cancer gene lists. Despite the name,
# `common_genes` holds every gene that appears in at least one cancer type,
# not only the genes shared by all of them.
genes = (
    brca_gene_list + ccrcc_gene_list + colon_gene_list + en_gene_list + gbm_gene_list
    + luad_gene_list + ovarian_gene_list + hnscc_gene_list + lscc_gene_list
)
# A set removes duplicates in linear time; the previous `not in list` scan was
# quadratic in the number of genes.
common_genes = sorted(set(genes))

In [10]:
def fill_genes_nan(all_genes, tissue_corr):
    """Return ``tissue_corr`` padded with NaN entries for missing genes, sorted.

    Parameters
    ----------
    all_genes : iterable
        Every gene that must appear in the result.
    tissue_corr : list
        ``[gene, value]`` pairs for one tissue. It is not modified.

    Returns
    -------
    list
        A new sorted list containing the entries of ``tissue_corr`` plus a
        ``[gene, np.nan]`` entry for each gene of ``all_genes`` that had none.
    """
    # Set lookup instead of scanning a list for every gene.
    tissue_genes = {gene for gene, _ in tissue_corr}

    # Build a new list rather than appending to the caller's list, so the
    # caller's data is left untouched and re-running a cell is harmless.
    missing = [[gene, np.nan] for gene in all_genes if gene not in tissue_genes]
    return sorted(list(tissue_corr) + missing)

In [11]:
# Per-cancer correlation vectors, NaN-filled for genes without a value and
# sorted by gene so that they line up with `common_genes`.
# Requires the *_corr lists from the correlation cell to exist in the kernel.
brca_list_corr = ret_list(fill_genes_nan(common_genes, brca_corr))
ccrcc_list_corr = ret_list(fill_genes_nan(common_genes, ccrcc_corr))
colon_list_corr = ret_list(fill_genes_nan(common_genes, colon_corr))
en_list_corr = ret_list(fill_genes_nan(common_genes, en_corr))
gbm_list_corr = ret_list(fill_genes_nan(common_genes, gbm_corr))
luad_list_corr = ret_list(fill_genes_nan(common_genes, luad_corr))
ovarian_list_corr = ret_list(fill_genes_nan(common_genes, ovarian_corr))
hnscc_list_corr = ret_list(fill_genes_nan(common_genes, hnscc_corr))
lscc_list_corr = ret_list(fill_genes_nan(common_genes, lscc_corr))


def minmax(val_list):
    """Return ``(min, max)`` of ``val_list``, or ``(nan, nan)`` when it is empty."""
    if len(val_list) == 0:
        return np.nan, np.nan
    return min(val_list), max(val_list)


mean_list = []
std_list = []
sigma_list = []
range_list = []

# One tuple of nine correlations per gene, in `common_genes` order.
per_gene_corrs = zip(
    brca_list_corr, ccrcc_list_corr, colon_list_corr,
    en_list_corr, gbm_list_corr, luad_list_corr, ovarian_list_corr,
    hnscc_list_corr, lscc_list_corr,
)

for gene_corrs in per_gene_corrs:
    # Correlations actually available for this gene (NaN entries removed).
    row = [corr for corr in gene_corrs if not math.isnan(corr)]

    # Mean: NaN (not 0) when the gene has no correlation in any cancer type.
    mean = sum(row) / len(row) if row else np.nan
    mean_list.append(mean)

    # Sample standard deviation needs at least two values.
    std = statistics.stdev(row) if len(row) >= 2 else np.nan
    std_list.append(std)

    # 3-sigma flag: True when any value lies outside mean +/- 3 * std.
    # Computed fresh for every gene, so an empty row yields False instead of
    # reusing the previous gene's flag. Comparisons against NaN are False.
    # NOTE(review): with n <= 9 values and std computed from those same values,
    # no value can be more than (n - 1) / sqrt(n) = 8/3 ~ 2.67 standard
    # deviations from the mean, so this flag cannot become True. A
    # leave-one-out or robust criterion may have been intended — confirm.
    neg_sigma = mean - 3 * std
    pos_sigma = mean + 3 * std
    sigma_bool = any(corr > pos_sigma or corr < neg_sigma for corr in row)
    sigma_list.append(sigma_bool)

    # Range of the available correlations (NaN when there are none).
    min_value, max_value = minmax(row)
    range_list.append(max_value - min_value)


correlation_columns = {
    'BRCA': brca_list_corr,
    'CCRCC': ccrcc_list_corr,
    'Colon': colon_list_corr,
    'Endometrial': en_list_corr,
    'GBM': gbm_list_corr,
    'LUAD': luad_list_corr,
    'Ovarian': ovarian_list_corr,
    'HNSCC': hnscc_list_corr,
    'LSCC': lscc_list_corr,
    'Mean': mean_list,
    'Stnd Deviation': std_list,
    '+/- 3 sigma': sigma_list,
    'Range': range_list,
}

correlation_df = pd.DataFrame(
    correlation_columns,
    columns = ['BRCA', 'CCRCC', 'Colon', 'Endometrial', 'GBM', 'LUAD', 'Ovarian',
               'HNSCC', 'LSCC', 'Mean', 'Stnd Deviation', '+/- 3 sigma', 'Range'],
    index=common_genes,
)
correlation_df

NameError: name 'brca_corr' is not defined

In [11]:
# Genes whose 3-sigma flag is set; the boolean column is used directly as the row mask.
correlation_df.loc[correlation_df['+/- 3 sigma']]

Unnamed: 0,BRCA,CCRCC,Colon,Endometrial,GBM,LUAD,Ovarian,HNSCC,LSCC,Mean,Stnd Deviation,+/- 3 sigma,Range


In [12]:
# Per-cancer p-value vectors, NaN-filled for genes without a value and sorted
# by gene so that they line up with `common_genes`.
brca_list_pval = ret_list(fill_genes_nan(common_genes, brca_pval))
ccrcc_list_pval = ret_list(fill_genes_nan(common_genes, ccrcc_pval))
colon_list_pval = ret_list(fill_genes_nan(common_genes, colon_pval))
en_list_pval = ret_list(fill_genes_nan(common_genes, en_pval))
gbm_list_pval = ret_list(fill_genes_nan(common_genes, gbm_pval))
luad_list_pval = ret_list(fill_genes_nan(common_genes, luad_pval))
ovarian_list_pval = ret_list(fill_genes_nan(common_genes, ovarian_pval))
hnscc_list_pval = ret_list(fill_genes_nan(common_genes, hnscc_pval))
lscc_list_pval = ret_list(fill_genes_nan(common_genes, lscc_pval))

# Column label -> p-value vector; insertion order is the display order.
pval_columns = {
    'BRCA': brca_list_pval,
    'CCRCC': ccrcc_list_pval,
    'Colon': colon_list_pval,
    'Endometrial': en_list_pval,
    'GBM': gbm_list_pval,
    'LUAD': luad_list_pval,
    'Ovarian': ovarian_list_pval,
    'HNSCC': hnscc_list_pval,
    'LSCC': lscc_list_pval,
}

# Genes as rows, cancer types as columns.
pval_df = pd.DataFrame(pval_columns, columns=list(pval_columns), index=common_genes)
pval_df

Unnamed: 0,BRCA,CCRCC,Colon,Endometrial,GBM,LUAD,Ovarian,HNSCC,LSCC
A1BG,1.657241e-01,5.350095e-01,,1.084161e-04,4.651711e-02,4.807936e-01,,2.456948e-01,9.082049e-02
A2M,,3.432584e-39,5.325574e-15,,,,,,
A2ML1,1.249853e-06,9.150454e-01,2.353794e-01,8.172572e-01,1.612770e-03,4.015329e-05,0.208548,3.602656e-02,1.703835e-04
AAAS,1.373971e-17,,,9.999470e-23,,,0.000013,1.908976e-33,6.589307e-27
AACS,,,,2.594752e-05,,,,,1.805955e-02
...,...,...,...,...,...,...,...,...,...
ZNF782,4.909360e-02,2.958874e-01,,1.559574e-01,7.958310e-02,5.049816e-02,,3.776369e-07,3.654355e-10
ZNF85,7.470119e-01,1.109573e-02,,2.417794e-02,1.861457e-06,2.757689e-10,0.000585,5.811263e-04,3.793685e-05
ZNF880,7.319264e-11,2.569032e-02,1.594617e-01,2.798595e-08,3.274769e-12,6.578244e-06,0.006064,9.742303e-10,7.881491e-09
ZNF91,1.736969e-03,3.883803e-02,3.949742e-01,3.159430e-03,1.218953e-05,3.279695e-16,0.048267,3.547520e-10,3.444445e-13
