In [9]:
import cptac
import scipy
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics
import parse_correlations_dataframe as get_corr
import copy
import get_correlations
import cptac.utils as ut
import csv

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# analysis cells below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Instantiate one cptac dataset object per cancer type.
brca = cptac.Brca()
ccrcc = cptac.Ccrcc()
colon = cptac.Colon()
en = cptac.Endometrial()
gbm = cptac.Gbm()
luad = cptac.Luad()
ovarian = cptac.Ovarian()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

                                                

In [4]:
# Datasets iterated by the analysis loop. brca, colon, gbm and ovarian are
# instantiated in the previous cell but are not part of this list.
cancers = [ccrcc, en, luad, hnscc, lscc]

In [5]:
def get_all_gene_list(tissues):
    """Return the sorted union of the gene lists of every dataset in `tissues`.

    Each dataset contributes the genes returned by get_gene_list for it; a gene
    reported by several datasets appears only once in the result.
    """
    combined = set()
    for dataset in tissues:
        combined.update(get_gene_list(dataset))
    return sorted(combined)

In [6]:
def get_gene_list(tissue):
    """Return the genes present in both the proteomics and transcriptomics tables.

    Parameters
    ----------
    tissue : object exposing get_proteomics() and get_transcriptomics(), each
        returning a DataFrame whose columns are gene names (for MultiIndex
        columns only level 0 is used).

    Returns
    -------
    list
        Shared gene names, each listed once, in the order of their first
        appearance among the proteomics columns.
    """
    def _gene_names(frame):
        # With MultiIndex columns the gene name is the first level.
        columns = frame.columns
        if isinstance(columns, pd.MultiIndex):
            columns = columns.get_level_values(0)
        return columns

    prot_genes = _gene_names(tissue.get_proteomics())
    trans_genes = set(_gene_names(tissue.get_transcriptomics()))
    # dict.fromkeys de-duplicates while preserving order using hash lookups,
    # instead of scanning a growing result list for every column.
    return [gene for gene in dict.fromkeys(prot_genes) if gene in trans_genes]

In [7]:
# gene_list = get_all_gene_list(cancers)  # disabled: gene_list is read from gene_list.csv in the next cell

In [10]:
# Load the gene list from the first row of gene_list.csv; any further rows are ignored.
# NOTE(review): presumably this file is a saved result of get_all_gene_list(cancers) -- confirm.
with open('gene_list.csv', newline='') as in_file:
    reader = csv.reader(in_file)
    gene_list = list(reader)[0]

In [11]:
# Maps each dataset object to the short name used as the first entry ("Cancer"
# column) of its result rows in the analysis loop.
type_dict = {brca:"brca",ccrcc:"ccrcc",colon:"colon",en:"endometrial",gbm:"gbm",luad:"luad",
                  ovarian:"ovarian",hnscc:"hnscc",lscc:"lscc"}

In [14]:
# https://link.springer.com/article/10.3758/s13428-012-0289-7
def compare_correlations(r1, r2, n1, n2):
    """Two-sided p-value for the difference between two correlation coefficients.

    Both coefficients are Fisher-transformed (arctanh) and their difference is
    divided by sqrt(1/(n1-3) + 1/(n2-3)); the resulting z statistic is looked
    up in the standard normal distribution.

    Parameters
    ----------
    r1, r2 : correlation coefficients of the two groups.
    n1, n2 : number of observations behind r1 and r2.

    Returns
    -------
    The two-sided p-value, or NaN when either group has fewer than 4
    observations.
    """
    fisher_1 = np.arctanh(r1)
    fisher_2 = np.arctanh(r2)

    # The standard error below needs n - 3 > 0 for both groups.
    if n1 < 4 or n2 < 4:
        return np.nan

    std_err = math.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
    z_stat = (fisher_1 - fisher_2) / std_err
    return 2 * scipy.stats.norm.sf(abs(z_stat))

In [15]:
def permute(df,original_correlation, label_1, label_2, column_one, column_two, permutation_times):
    permutation_list = []
    permu_df = copy.deepcopy(df)

    for i in range(permutation_times):
        permu_df["Type"] = np.random.permutation(permu_df["Type"])
        permu_is_label_1 = permu_df["Type"] == label_1
        permu_is_label_2 = permu_df["Type"] == label_2
        label_1_correlation,label_1_pval = scipy.stats.pearsonr(permu_df[permu_is_label_1][column_one], permu_df[permu_is_label_1][column_two])
        label_2_correlation,label_2_pval = scipy.stats.pearsonr(permu_df[permu_is_label_2][column_one], permu_df[permu_is_label_2][column_two])
        delta = label_1_correlation - label_2_correlation
        permutation_list.append(delta)
        
    z_score = (original_correlation - np.mean(permutation_list)) / np.std(permutation_list)
    p_val = scipy.stats.norm.sf(abs(z_score))*2
    return p_val

In [17]:
def _join_omics(cancer, tissue_type):
    """Join transcriptomics to proteomics for one tissue type, with flat column labels.

    When the joined frame has MultiIndex columns the "Database_ID" level is
    dropped, which can leave several columns sharing the same name.
    """
    joined = cancer.join_omics_to_omics("transcriptomics", "proteomics",
                                        tissue_type=tissue_type, quiet=True)
    if isinstance(joined.columns, pd.MultiIndex):
        joined = ut.reduce_multiindex(df=joined, levels_to_drop="Database_ID", quiet=True)
    return joined


def _paired_measurements(joined_df, gene_trans, gene_prot):
    """Return a two-column (transcript, protein) frame for one gene, incomplete rows dropped."""
    pair_df = joined_df[[gene_trans, gene_prot]]
    trans_col = pair_df[gene_trans]
    prot_col = pair_df[gene_prot]
    trans_duplicated = isinstance(trans_col, pd.DataFrame)
    prot_duplicated = isinstance(prot_col, pd.DataFrame)
    # Dropping the Database_ID level can leave several columns with the same
    # name; selecting such a name yields a DataFrame, so keep its first column.
    if trans_duplicated or prot_duplicated:
        if trans_duplicated:
            trans_col = trans_col.iloc[:, 0]
        if prot_duplicated:
            prot_col = prot_col.iloc[:, 0]
        pair_df = pd.DataFrame({gene_trans: trans_col, gene_prot: prot_col})
    return pair_df.dropna()


# One row per cancer in each list: [cancer name, value for gene_list[0], gene_list[1], ...].
tot_diff_list = []   # tumor correlation minus normal correlation
tot_pval_list = []   # p-value from compare_correlations
tot_perm_list = []   # permutation p-value (currently always NaN, see below)
for cancer in cancers:
    cancer_diff_list = [type_dict[cancer]]
    cancer_pval_list = [type_dict[cancer]]
    cancer_perm_list = [type_dict[cancer]]

    tumor_cancer_df = _join_omics(cancer, "tumor")
    normal_cancer_df = _join_omics(cancer, "normal")

    for gene in gene_list:
        gene_trans = gene + "_transcriptomics"
        gene_prot = gene + "_proteomics"
        gene_in_tumor = gene_trans in tumor_cancer_df.columns and gene_prot in tumor_cancer_df.columns
        gene_in_normal = gene_trans in normal_cancer_df.columns and gene_prot in normal_cancer_df.columns

        if not (gene_in_tumor and gene_in_normal):
            # Pad with NaN so every row stays aligned with gene_list.
            cancer_diff_list.append(np.nan)
            cancer_pval_list.append(np.nan)
            cancer_perm_list.append(np.nan)
            continue

        tumor_df = _paired_measurements(tumor_cancer_df, gene_trans, gene_prot)
        num_tumor = len(tumor_df)
        # Off-diagonal entry of the 2x2 correlation matrix. A single positional
        # .iloc[0, 1] avoids integer-key lookup on a label-indexed Series.
        tumor_corr = tumor_df.corr().iloc[0, 1]

        normal_df = _paired_measurements(normal_cancer_df, gene_trans, gene_prot)
        num_normal = len(normal_df)
        normal_corr = normal_df.corr().iloc[0, 1]

        corr_diff = tumor_corr - normal_corr
        cancer_diff_list.append(corr_diff)

        gene_pval = compare_correlations(tumor_corr, normal_corr, num_tumor, num_normal)
        cancer_pval_list.append(gene_pval)
        # Placeholder while the permutation test below is disabled.
        cancer_perm_list.append(np.nan)

        # Permutation test (disabled). To re-enable, uncomment the lines below
        # and remove the NaN placeholder appended above.
        # if num_tumor < 4 or num_normal < 4 or gene_pval > .2:
        #     cancer_perm_list.append(np.nan)
        #     continue
        # tumor_label_list = ['tumor'] * len(tumor_df)
        # tumor_df["Type"] = tumor_label_list
        #
        # normal_label_list = ['normal'] * len(normal_df)
        # normal_df["Type"] = normal_label_list
        #
        # perm_list = [tumor_df,normal_df]
        # perm_df = pd.concat(perm_list)
        #
        # column_one = perm_df.columns[0]
        # column_two = perm_df.columns[1]
        # perm_val = permute(perm_df,corr_diff,"tumor","normal",column_one,column_two,1000)
        # cancer_perm_list.append(perm_val)

    tot_diff_list.append(cancer_diff_list)
    tot_pval_list.append(cancer_pval_list)
    tot_perm_list.append(cancer_perm_list)

In [18]:
# Column headers: the cancer name followed by one column per gene.
labels = ["Cancer", *gene_list]
# df  -> correlation differences, df2 -> p-values, df3 -> permutation p-values.
df, df2, df3 = (
    pd.DataFrame.from_records(rows, columns=labels)
    for rows in (tot_diff_list, tot_pval_list, tot_perm_list)
)

In [20]:
# Display the correlation-difference table (one row per cancer, one column per gene).
df

Unnamed: 0,Cancer,A1BG,A1CF,A2M,A2ML1,A4GALT,AAAS,AACS,AADAC,AADAT,...,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDA,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
0,ccrcc,-0.388745,0.146993,-0.277035,,-0.795957,-0.044291,-0.128218,1.196756,-0.174925,...,,0.269728,0.329727,-0.337301,,0.33747,0.109414,-0.223934,-0.138352,-0.220727
1,endometrial,0.061654,,-0.025535,0.40221,0.399061,0.663703,0.593777,,0.302879,...,0.177842,0.088061,0.759429,1.006282,,-0.267078,0.400272,0.147305,0.367791,0.342423
2,luad,-0.039149,,0.097233,,,0.206094,0.683092,-0.119746,0.630155,...,,0.557832,0.904669,0.645076,,0.087794,0.707991,0.472139,0.418589,0.661323
3,hnscc,-0.002651,0.407295,-0.194186,0.117838,0.555675,0.462864,0.024965,-0.333934,-0.294627,...,1.039965,0.395189,0.119698,1.079145,,0.591841,-0.453269,-0.010672,0.255505,-0.060708
4,lscc,0.196946,,-0.06263,0.649123,0.246831,0.304418,0.150665,-0.068258,,...,0.613891,0.491043,0.64643,0.838981,,0.33235,0.231167,0.315679,0.268239,0.446553


In [21]:
# Write the correlation differences to corr_diff.csv without the row index.
df.to_csv("corr_diff.csv",index=False)

In [22]:
# Write the p-values built from tot_pval_list to p_val.csv without the row index.
df2.to_csv("p_val.csv",index=False)

In [17]:
#  df3.to_csv("nathaniel_list_permutation_pval01.csv",index=False)