In [1]:
import cptac
import scipy
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
# from venn import venn
import numpy as np
import math
import pandas as pd
import statistics
import cptac.utils as ut
from functools import reduce



In [2]:
def get_gene_list(tissue):
    #Returns a list of the genes that are in both proteomics and transcriptomics
    gene_list = []
    prot = tissue.get_proteomics()
    if isinstance(prot.columns, pd.MultiIndex):
        prot = prot.columns.get_level_values(0)
    trans = tissue.get_transcriptomics()
    if isinstance(trans.columns, pd.MultiIndex):
        trans = trans.columns.get_level_values(0)
    for i in prot:
        if i in trans and i not in gene_list:
            gene_list.append(i)
    return gene_list


In [3]:
def fill_genes_nan(all_genes, tissue_corr):
    tissue_genes = []
    for i, j  in tissue_corr:
        tissue_genes.append(i)
        
    for i in all_genes:
        if i not in tissue_genes:
            tissue_corr.append([i, np.nan])
    end_list = sorted(tissue_corr)
           
    return end_list

In [4]:

def correlation_list(tissue, gene_list, valid_sample_number = 30, tissue_type = "both"):
    #Returns a list of lists containing the name of the gene and the correlation
    corr_list = []
    pval_list = []
    prot = tissue.get_proteomics(tissue_type)
    if isinstance(prot.columns, pd.MultiIndex):
        prot = cptac.utils.reduce_multiindex(prot, levels_to_drop = "Database_ID")
    
    trans = tissue.get_transcriptomics(tissue_type)
    if isinstance(trans.columns, pd.MultiIndex):
        trans = cptac.utils.reduce_multiindex(trans, levels_to_drop = "Database_ID")
        
    prot_index_values = list(prot.index.values)
    trans_index_values = list(trans.index.values)
    

    for found_index in prot_index_values:
        if found_index not in trans_index_values:
            prot = prot.drop(index=found_index)
    for found_index in trans_index_values:
        if found_index not in prot_index_values:
            trans = trans.drop(index=found_index)
        
    for gene in gene_list:
        prot_measurements = prot[gene]
        prot_count = len(prot_measurements.dropna())
        
        trans_measurements = trans[gene]
        trans_count = len(trans_measurements.dropna())
        
        if prot_count < valid_sample_number or trans_count < valid_sample_number:
            continue
            
        nan_indices = set()
        prot_indices = list(prot_measurements.index.values)
        trans_indices = list(trans_measurements.index.values)
        
        #Here we are only currently taking the first column of multi_indices
        if isinstance(prot_measurements, pd.core.frame.DataFrame): 
            prot_measurements = prot_measurements.iloc[:,0]
        if isinstance(trans_measurements, pd.core.frame.DataFrame):
            trans_measurements = trans_measurements.iloc[:,0]
        for i in range(len(prot_measurements)):
            if math.isnan(prot_measurements[i]):
                nan_indices.add(prot_indices[i])
                
        for i in range(len(trans_measurements)):
            if math.isnan(trans_measurements[i]):
                nan_indices.add(trans_indices[i])
        
        nan_indices = list(nan_indices)
        prot_measurements = prot_measurements.drop(nan_indices)
        trans_measurements = trans_measurements.drop(nan_indices)
        
        correlation,pval = scipy.stats.pearsonr(prot_measurements, trans_measurements)
        if math.isnan(correlation):
            continue
        corr_list.append([gene,correlation])
        pval_list.append([gene, pval])
    return corr_list, pval_list

In [5]:
def ret_list(li):
    #Returns a list of correlations from all genes
    ret_li = []
    for i in li:
        ret_li.append(i[1])   
    return ret_li

In [6]:
def intersection(lst1, lst2): 
    return list(set(lst1).intersection(lst2))

In [7]:
cptac.download("brca")
cptac.download("ccrcc")
cptac.download("colon")
cptac.download("endometrial")
cptac.download("gbm")
cptac.download("luad")
cptac.download("ovarian")
cptac.download("hnscc")
cptac.download("lscc")

brca = cptac.Brca()
ccrcc = cptac.Ccrcc()
colon = cptac.Colon()
en = cptac.Endometrial()
gbm = cptac.Gbm()
luad = cptac.Luad()
ovarian = cptac.Ovarian()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...   



                                         



In [8]:
brca_gene_list = get_gene_list(brca)
ccrcc_gene_list = get_gene_list(ccrcc)
colon_gene_list = get_gene_list(colon)
gbm_gene_list = get_gene_list(gbm)
luad_gene_list = get_gene_list(luad)
ovarian_gene_list = get_gene_list(ovarian)
en_gene_list = get_gene_list(en)
hnscc_gene_list = get_gene_list(hnscc)
lscc_gene_list = get_gene_list(lscc)

In [9]:
ccrcc_corr, ccrcc_pval = correlation_list(ccrcc, ccrcc_gene_list, tissue_type = "tumor")
en_corr, en_pval = correlation_list(en, en_gene_list, tissue_type = "tumor")
gbm_corr, gbm_pval = correlation_list(gbm, gbm_gene_list, tissue_type = "tumor")
luad_corr, luad_pval = correlation_list(luad,luad_gene_list, tissue_type = "tumor")
hnscc_corr, hnscc_pval = correlation_list(hnscc, hnscc_gene_list, tissue_type = "tumor")
lscc_corr, lscc_pval = correlation_list(lscc, lscc_gene_list, tissue_type = "tumor")



In [10]:
normal_ccrcc_corr, normal_ccrcc_pval = correlation_list(ccrcc, ccrcc_gene_list,valid_sample_number = 7, tissue_type="normal")
normal_en_corr, normal_en_pval = correlation_list(en, en_gene_list,valid_sample_number = 7, tissue_type="normal")
normal_gbm_corr, normal_gbm_pval = correlation_list(gbm, gbm_gene_list,valid_sample_number = 7, tissue_type="normal")
normal_luad_corr, normal_luad_pval = correlation_list(luad,luad_gene_list,valid_sample_number = 7, tissue_type="normal")
normal_hnscc_corr, normal_hnscc_pval = correlation_list(hnscc, hnscc_gene_list,valid_sample_number = 7, tissue_type="normal")
normal_lscc_corr, normal_lscc_pval = correlation_list(lscc, lscc_gene_list,valid_sample_number = 7, tissue_type="normal")



In [11]:
ccrcc_gene_list = list(dict(ccrcc_corr).keys())
en_gene_list = list(dict(en_corr).keys())
gbm_gene_list = list(dict(gbm_corr).keys())
luad_gene_list = list(dict(luad_corr).keys())
hnscc_gene_list = list(dict(hnscc_corr).keys())
lscc_gene_list = list(dict(lscc_corr).keys())
all_genes_list = [ccrcc_gene_list,en_gene_list, gbm_gene_list, luad_gene_list, hnscc_gene_list, lscc_gene_list]
all_corr_list = [['CCRCC', dict(ccrcc_corr)],['Endometrial', dict(en_corr)], ['GBM', dict(gbm_corr)], ['LUAD', dict(luad_corr)], ['HNSCC', dict(hnscc_corr)], ['LSCC', dict(lscc_corr)]]
all_normal_corr_list = [['CCRCC', dict(normal_ccrcc_corr)],['Endometial', dict(normal_en_corr)], ['GBM', dict(normal_gbm_corr)], ['LUAD', dict(normal_luad_corr)], ['HNSCC', dict(normal_hnscc_corr)], ['LSCC', dict(normal_lscc_corr)]]

In [12]:
from functools import reduce

In [13]:
common_genes = reduce(intersection, all_genes_list)

In [14]:
common_genes = (sorted(common_genes))

In [17]:
def gene_corr_finder(gene, genes, tumor_corr, normal_corr):
    tumor_tmp = {}
    normal_tmp = {}
    highest_difference = 0
    highest_tissue = ''
    for j, k in tumor_corr:
            tumor_tmp[j]=k[gene]
    for j, k in normal_corr:
            normal_tmp[j]=k[gene]
    tissues = sorted(intersection(list(tumor_tmp.keys()), list(normal_tmp.keys())))
    
    print('Tumor results vs Normal results:')
    for i in tissues:
        print(i+': ' + str(tumor_tmp[i]) + ' / ' + str(normal_tmp[i]))
        if tumor_tmp[i] - normal_tmp[i] < highest_difference:
            highest_difference = abs(tumor_tmp[i] - normal_tmp[i])
            highest_tissue = i
    print('Highest difference from all tissues is  ' + highest_tissue + ': ' +  str(highest_difference))

In [18]:
gene_corr_finder('A1BG', common_genes, all_corr_list, all_normal_corr_list)

Tumor results vs Normal results:
CCRCC: -0.05978060489338315 / 0.32896429972846275
GBM: -0.2005791612564875 / 0.09635238880893919
HNSCC: 0.11213286663631726 / 0.11478362220348606
LSCC: 0.12207189606386522 / -0.07487443905384072
LUAD: -0.06791985940991745 / -0.028770547941812778
Highest difference from all tissues is  LUAD: 0.03914931146810467
