In [1]:
import cptac
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics

In [2]:
# Fetch every cohort used in this notebook, in the same order as before.
for cohort_name in (
    "brca",
    "ccrcc",
    "colon",
    "endometrial",
    "gbm",
    "luad",
    "ovarian",
    "hnscc",
    "lscc",
):
    cptac.download(cohort_name)

# Instantiate one dataset object per cohort; later cells refer to these names.
brca = cptac.Brca()
ccrcc = cptac.Ccrcc()
colon = cptac.Colon()
en = cptac.Endometrial()
gbm = cptac.Gbm()
luad = cptac.Luad()
ovarian = cptac.Ovarian()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...   



                                         



In [11]:
def get_gene_list(tissue):
    """Return the genes present in both the proteomics and transcriptomics tables.

    Parameters
    ----------
    tissue : object
        Dataset object exposing ``get_proteomics()`` and
        ``get_transcriptomics()``, each returning a DataFrame whose columns
        are genes. When the columns are a ``pd.MultiIndex`` only level 0 is
        used as the gene label.

    Returns
    -------
    list
        Gene labels that occur in both tables, each listed once, in the order
        they first appear among the proteomics columns.
    """
    prot_genes = tissue.get_proteomics().columns
    if isinstance(prot_genes, pd.MultiIndex):
        prot_genes = prot_genes.get_level_values(0)

    trans_genes = tissue.get_transcriptomics().columns
    if isinstance(trans_genes, pd.MultiIndex):
        trans_genes = trans_genes.get_level_values(0)

    # Index membership is a hash lookup and dict.fromkeys de-duplicates while
    # keeping first-seen order, so this stays linear in the number of genes
    # instead of re-scanning a growing result list for every column.
    return list(dict.fromkeys(gene for gene in prot_genes if gene in trans_genes))


def correlation_list(tissue, gene_list, valid_sample_number = 30, tissue_type = "both"):
    """Compute the per-gene Pearson correlation between protein and transcript levels.

    Parameters
    ----------
    tissue : object
        Dataset object exposing ``get_proteomics(tissue_type)`` and
        ``get_transcriptomics(tissue_type)``; both return DataFrames with
        samples as rows and genes as columns.
    gene_list : iterable
        Genes to evaluate; each must be a column of both tables.
    valid_sample_number : int, default 30
        A gene is skipped when either table has fewer than this many
        non-missing measurements for it.
    tissue_type : str, default "both"
        Forwarded unchanged to the two ``get_*`` calls.

    Returns
    -------
    (corr_list, pval_list) : tuple of lists
        ``corr_list`` holds ``[gene, correlation]`` pairs and ``pval_list``
        holds ``[gene, p-value]`` pairs, in ``gene_list`` order. Genes that
        are skipped, or whose correlation is NaN, appear in neither list.
    """
    corr_list = []
    pval_list = []

    prot = tissue.get_proteomics(tissue_type)
    if isinstance(prot.columns, pd.MultiIndex):
        prot = cptac.utils.reduce_multiindex(prot, levels_to_drop = "Database_ID")

    trans = tissue.get_transcriptomics(tissue_type)
    if isinstance(trans.columns, pd.MultiIndex):
        trans = cptac.utils.reduce_multiindex(trans, levels_to_drop = "Database_ID")

    # Keep only the samples found in both tables AND put both tables in the
    # same row order. pearsonr pairs values by position, so merely dropping
    # unmatched rows is not enough when the two tables list samples in a
    # different order.
    shared_samples = prot.index.intersection(trans.index)
    prot = prot.loc[shared_samples]
    trans = trans.loc[shared_samples]

    for gene in gene_list:
        prot_measurements = prot[gene]
        trans_measurements = trans[gene]

        # A duplicated column name yields a DataFrame; only the first column
        # is used. Reduce before counting so the sample count below describes
        # the same values that are actually correlated.
        if isinstance(prot_measurements, pd.DataFrame):
            prot_measurements = prot_measurements.iloc[:, 0]
        if isinstance(trans_measurements, pd.DataFrame):
            trans_measurements = trans_measurements.iloc[:, 0]

        prot_count = prot_measurements.count()
        trans_count = trans_measurements.count()
        if prot_count < valid_sample_number or trans_count < valid_sample_number:
            continue

        # Positional mask of samples measured in both tables; this avoids
        # integer-key lookups on a label-indexed Series.
        prot_values = prot_measurements.to_numpy(dtype=float)
        trans_values = trans_measurements.to_numpy(dtype=float)
        paired = ~(pd.isna(prot_values) | pd.isna(trans_values))

        # pearsonr needs at least two paired observations.
        if paired.sum() < 2:
            continue

        correlation, pval = scipy.stats.pearsonr(prot_values[paired], trans_values[paired])
        if math.isnan(correlation):
            continue

        corr_list.append([gene, correlation])
        pval_list.append([gene, pval])
    return corr_list, pval_list
def ret_list(li):
    """Extract the second element (the correlation) of every entry in ``li``.

    ``li`` is a sequence of ``[gene, value]`` pairs; the values are returned
    as a new list in the same order.
    """
    return [entry[1] for entry in li]
def minmax(val_list):
    """Return ``(smallest, largest)`` of ``val_list``.

    An empty ``val_list`` yields ``(nan, nan)`` instead of raising.
    """
    if len(val_list) > 0:
        return (min(val_list), max(val_list))
    return np.nan, np.nan
def fill_genes_nan(all_genes, tissue_corr):
    """Pad a tissue's correlation list with NaN rows for genes it lacks.

    Parameters
    ----------
    all_genes : iterable
        Every gene that should appear in the result.
    tissue_corr : list
        ``[gene, correlation]`` pairs already computed for one tissue.
        This list is left unmodified.

    Returns
    -------
    list
        A new sorted list holding the entries of ``tissue_corr`` plus a
        ``[gene, nan]`` entry for each gene of ``all_genes`` not present in
        ``tissue_corr``.
    """
    # Set lookup keeps the membership test constant-time per gene.
    present_genes = {gene for gene, _ in tissue_corr}
    missing_rows = [[gene, np.nan] for gene in all_genes if gene not in present_genes]
    # Build a fresh list so the caller's tissue_corr is not grown as a side effect.
    return sorted(list(tissue_corr) + missing_rows)

In [12]:
# Genes measured in both the proteomics and transcriptomics tables of each cohort.
brca_gene_list = get_gene_list(brca)
ccrcc_gene_list = get_gene_list(ccrcc)
colon_gene_list = get_gene_list(colon)
gbm_gene_list = get_gene_list(gbm)
luad_gene_list = get_gene_list(luad)
ovarian_gene_list = get_gene_list(ovarian)
en_gene_list = get_gene_list(en)
hnscc_gene_list = get_gene_list(hnscc)
lscc_gene_list = get_gene_list(lscc)

# Keyword arguments shared by every tumor run and every normal run.
# Tumor runs keep correlation_list's default sample threshold; normal runs
# lower it to 7.
tumor_options = {"tissue_type": "tumor"}
normal_options = {"valid_sample_number": 7, "tissue_type": "normal"}

# Protein/transcript correlations and p-values on tumor samples.
brca_corr, brca_pval = correlation_list(brca, brca_gene_list, **tumor_options)
ccrcc_corr, ccrcc_pval = correlation_list(ccrcc, ccrcc_gene_list, **tumor_options)
colon_corr, colon_pval = correlation_list(colon, colon_gene_list, **tumor_options)
en_corr, en_pval = correlation_list(en, en_gene_list, **tumor_options)
gbm_corr, gbm_pval = correlation_list(gbm, gbm_gene_list, **tumor_options)
luad_corr, luad_pval = correlation_list(luad, luad_gene_list, **tumor_options)
ovarian_corr, ovarian_pval = correlation_list(ovarian, ovarian_gene_list, **tumor_options)
hnscc_corr, hnscc_pval = correlation_list(hnscc, hnscc_gene_list, **tumor_options)
lscc_corr, lscc_pval = correlation_list(lscc, lscc_gene_list, **tumor_options)

# The same computation on normal samples.
normal_brca_corr, normal_brca_pval = correlation_list(brca, brca_gene_list, **normal_options)
normal_ccrcc_corr, normal_ccrcc_pval = correlation_list(ccrcc, ccrcc_gene_list, **normal_options)
normal_colon_corr, normal_colon_pval = correlation_list(colon, colon_gene_list, **normal_options)
normal_en_corr, normal_en_pval = correlation_list(en, en_gene_list, **normal_options)
normal_gbm_corr, normal_gbm_pval = correlation_list(gbm, gbm_gene_list, **normal_options)
normal_luad_corr, normal_luad_pval = correlation_list(luad, luad_gene_list, **normal_options)
normal_ovarian_corr, normal_ovarian_pval = correlation_list(ovarian, ovarian_gene_list, **normal_options)
normal_hnscc_corr, normal_hnscc_pval = correlation_list(hnscc, hnscc_gene_list, **normal_options)
normal_lscc_corr, normal_lscc_pval = correlation_list(lscc, lscc_gene_list, **normal_options)



In [30]:
# Flatten the per-cohort correlation results into four parallel lists:
# gene, correlation, row label (cohort) and tissue type.
df_gene_list = []
df_correlation_list = []
df_index = []
tissue_type_list = []

# (row label, tissue type, [gene, correlation] pairs), listed in the order
# the rows are appended. The p-value cell walks its lists in this same order.
correlation_sources = [
    ("BRCA", "tumor", brca_corr),
    ("CCRCC", "tumor", ccrcc_corr),
    ("CCRCC", "normal", normal_ccrcc_corr),
    ("colon", "tumor", colon_corr),
    ("Endometrial", "tumor", en_corr),
    ("Endometrial", "normal", normal_en_corr),
    ("GBM", "tumor", gbm_corr),
    ("GBM", "normal", normal_gbm_corr),
    ("LUAD", "tumor", luad_corr),
    ("LUAD", "normal", normal_luad_corr),
    ("Ovarian", "tumor", ovarian_corr),
    ("HNSCC", "tumor", hnscc_corr),
    ("HNSCC", "normal", normal_hnscc_corr),
    ("LSCC", "tumor", lscc_corr),
    ("LSCC", "normal", normal_lscc_corr),
]

for cohort_label, sample_kind, corr_pairs in correlation_sources:
    for gene, correlation in corr_pairs:
        df_gene_list.append(gene)
        df_correlation_list.append(correlation)
        df_index.append(cohort_label)
        tissue_type_list.append(sample_kind)


In [34]:
# Collect the p-values in one flat list. The sources are walked in the same
# cohort / tissue-type order used for the gene and correlation lists, so the
# entries line up row for row.
df_pval_list = []

pval_sources = [
    brca_pval,
    ccrcc_pval,
    normal_ccrcc_pval,
    colon_pval,
    en_pval,
    normal_en_pval,
    gbm_pval,
    normal_gbm_pval,
    luad_pval,
    normal_luad_pval,
    ovarian_pval,
    hnscc_pval,
    normal_hnscc_pval,
    lscc_pval,
    normal_lscc_pval,
]

for pval_pairs in pval_sources:
    for gene, pval in pval_pairs:
        df_pval_list.append(pval)

In [39]:


# Column name -> values, one entry per row of the final table.
df_columns = {
    "Tissue Type": tissue_type_list,
    "Gene": df_gene_list,
    "Correlation": df_correlation_list,
    "P-value": df_pval_list,
}

In [36]:
# Sanity check: the p-value list and the tissue-type list should have the
# same number of entries, i.e. this difference should display 0.
len(df_pval_list) - len (tissue_type_list)

0

In [40]:
# Assemble the table: one row per (cohort, tissue type, gene), with the
# cohort label as the row index.
column_order = ["Tissue Type", "Gene", "Correlation", "P-value"]
df = pd.DataFrame(df_columns, index=df_index, columns=column_order)


In [41]:
# Display the assembled correlation table as the cell output.
df

Unnamed: 0,Tissue Type,Gene,Correlation,P-value
BRCA,tumor,A1BG,0.131286,1.657241e-01
BRCA,tumor,A2M,0.422497,1.249853e-06
BRCA,tumor,A2ML1,0.713289,1.373971e-17
BRCA,tumor,AAAS,0.086532,3.432747e-01
BRCA,tumor,AACS,0.747434,4.695304e-23
...,...,...,...,...
LSCC,normal,ZXDC,0.268095,1.257086e-02
LSCC,normal,ZYG11B,0.132199,2.040310e-01
LSCC,normal,ZYX,0.225703,2.872053e-02
LSCC,normal,ZZEF1,0.356916,4.137038e-04
