In [7]:
import cptac
import scipy
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics
import parse_correlations_dataframe as get_corr
import copy
import get_correlations
import cptac.utils as ut

In [107]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas/scipy in the cells below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Instantiate one cptac dataset object per cancer type (same construction
# order as before: brca, ccrcc, colon, en, gbm, luad, ovarian, hnscc, lscc).
brca, ccrcc, colon = cptac.Brca(), cptac.Ccrcc(), cptac.Colon()
en, gbm, luad = cptac.Endometrial(), cptac.Gbm(), cptac.Luad()
ovarian, hnscc, lscc = cptac.Ovarian(), cptac.Hnscc(), cptac.Lscc()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...   



                                         



In [66]:
# All nine loaded dataset objects, used to build the combined gene list.
cancers = [
    brca, ccrcc, colon,
    en, gbm, luad,
    ovarian, hnscc, lscc,
]

In [114]:
# Subset iterated by the tumor-vs-normal correlation loop; brca, colon and
# ovarian are left out (presumably for lack of normal samples - TODO confirm).
cancers2 = [
    ccrcc,
    en,
    gbm,
    luad,
    hnscc,
    lscc,
]

In [71]:
# Scratch cell: a throwaway one-element set used to try out the set API.
hi = {"yo"}

In [72]:
# Scratch cell: echo the throwaway set built in the previous cell.
hi

{'yo'}

In [82]:
def get_all_gene_list(tissues):
    """Pool the per-tissue gene lists into one sorted list without duplicates.

    Each dataset in ``tissues`` is passed to ``get_gene_list`` (genes found in
    both its proteomics and transcriptomics tables); the results are merged
    across tissues and returned in sorted order.
    """
    pooled_genes = set()
    for tissue in tissues:
        pooled_genes.update(get_gene_list(tissue))
    return sorted(pooled_genes)

In [83]:
def get_all_gene_list_old(tissues):
    """Return a sorted list of every gene found in proteomics OR transcriptomics.

    This is a union, not an intersection: a gene is included as soon as it
    appears as a column in either table of any dataset in ``tissues``.

    Parameters
    ----------
    tissues : iterable
        Dataset objects exposing ``get_proteomics()`` and
        ``get_transcriptomics()``, each returning a DataFrame whose columns
        are iterated as genes.

    Returns
    -------
    list
        The unique column labels across all tables, sorted.
    """
    whole_gene_list = set()
    for tissue in tissues:
        # Proteomics first, then transcriptomics; each table is loaded just
        # before it is processed.
        for load_table in (tissue.get_proteomics, tissue.get_transcriptomics):
            table = load_table()
            if isinstance(table.columns, pd.MultiIndex):
                # Drop the Database_ID level so the column labels are flattened.
                table = ut.reduce_multiindex(df=table, levels_to_drop="Database_ID", quiet=True)
            whole_gene_list.update(table.columns)
    return sorted(whole_gene_list)

In [84]:
def get_gene_list(tissue):
    """Return the genes present in both the proteomics and transcriptomics tables.

    Parameters
    ----------
    tissue : object
        Dataset exposing ``get_proteomics()`` and ``get_transcriptomics()``,
        each returning a DataFrame whose columns are genes. When the columns
        are a ``pd.MultiIndex`` only the first level is used as the gene name.

    Returns
    -------
    list
        Gene names in order of first appearance in the proteomics columns,
        without duplicates.
    """
    def _gene_names(table):
        # Level 0 of a MultiIndex is the gene name; plain columns are used as-is.
        columns = table.columns
        if isinstance(columns, pd.MultiIndex):
            return columns.get_level_values(0)
        return columns

    prot_genes = _gene_names(tissue.get_proteomics())
    trans_genes = set(_gene_names(tissue.get_transcriptomics()))
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # avoids the quadratic "not in list" scan for every gene.
    return list(dict.fromkeys(gene for gene in prot_genes if gene in trans_genes))

In [85]:
# Sorted, de-duplicated pool of the per-dataset gene lists over every dataset in `cancers`.
gene_list = get_all_gene_list(cancers)

In [60]:
# Maps each dataset object to the short name used to label its result rows.
type_dict = {
    brca: "brca",
    ccrcc: "ccrcc",
    colon: "colon",
    en: "endometrial",
    gbm: "gbm",
    luad: "luad",
    ovarian: "ovarian",
    hnscc: "hnscc",
    lscc: "lscc",
}

In [111]:
# https://link.springer.com/article/10.3758/s13428-012-0289-7
def compare_correlations(r1, r2, n1, n2):
    """Two-sided p-value for the difference between two correlation coefficients.

    Both coefficients are transformed with arctanh (Fisher z), their difference
    is divided by sqrt(1/(n1-3) + 1/(n2-3)), and the resulting z statistic is
    turned into a two-tailed p-value with the standard normal survival function.

    Returns NaN when either sample size is smaller than 4.
    """
    fisher_1, fisher_2 = np.arctanh(r1), np.arctanh(r2)

    # n - 3 must be positive for the standard error below to be defined.
    if n1 < 4 or n2 < 4:
        return np.nan

    std_error = math.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
    z_stat = (fisher_1 - fisher_2) / std_error
    return scipy.stats.norm.sf(abs(z_stat)) * 2

In [4]:
def find_mut_tumor(cancer_type, gene):
    """Build a long-format table of one gene's proteomics/transcriptomics by sample group.

    Tumor samples come from a ``multi_join`` of proteomics, transcriptomics and
    somatic mutation data; normal samples from a ``multi_join`` of proteomics
    and transcriptomics. Each tumor sample is labelled ``"mutation"`` when its
    mutation status is a string and ``"wt"`` otherwise; normal samples are
    labelled ``"normal"``.

    Parameters
    ----------
    cancer_type : object
        Dataset exposing ``multi_join(joining_dict, tissue_type=..., flatten=...)``.
    gene : str
        Gene to look up in every joined table.

    Returns
    -------
    pandas.DataFrame or float
        A frame with columns ``Type``, ``Proteomics`` and ``Transcriptomics``
        (rows containing missing values removed), or ``float("NaN")`` when
        there are fewer than two ``"wt"`` tumor samples, fewer than two normal
        samples, or the lookup fails with an ``Exception``.
    """
    try:
        gene_multi = cancer_type.multi_join(
            {'proteomics': gene, 'transcriptomics': gene, 'somatic_mutation': gene},
            tissue_type='tumor', flatten=True)
        normal = cancer_type.multi_join(
            {'proteomics': gene, 'transcriptomics': gene},
            tissue_type='normal', flatten=True)
        normal.columns = ['proteomics', 'transcriptomics']
        gene_multi.columns = ['proteomics', 'transcriptomics', 'mutation', 'location', 'mutation_status']

        # NOTE(review): any string status counts as mutated; confirm that
        # wild-type samples really carry a non-string (e.g. NaN) status here.
        group = ["mutation" if isinstance(status, str) else "wt"
                 for status in gene_multi['mutation_status']]
        if group.count('wt') < 2 or len(normal) < 2:
            return float("NaN")

        # Append the normal samples after the tumor samples, column by column.
        group.extend(['normal'] * len(normal))
        prot = list(gene_multi['proteomics']) + list(normal['proteomics'])
        trans = list(gene_multi['transcriptomics']) + list(normal['transcriptomics'])

        gene_df = pd.DataFrame({'Type': group, 'Proteomics': prot, 'Transcriptomics': trans})
        return gene_df.dropna()
    except Exception:
        # Best effort: a gene that cannot be joined is reported as NaN. Unlike
        # a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate so
        # a long-running loop over genes can still be interrupted.
        return float("NaN")

In [5]:
def get_df_with_type(df, input_type):
    """Return the rows of ``df`` whose 'Type' column equals ``input_type``."""
    matches_type = df['Type'] == input_type
    return df.loc[matches_type]

In [52]:
def permute(df, original_correlation, label_1, label_2, column_one, column_two, permutation_times):
    """Permutation-based p-value for a difference in Pearson correlation between two groups.

    The 'Type' labels of a copy of ``df`` are shuffled ``permutation_times``
    times. After each shuffle the Pearson correlation between ``column_one``
    and ``column_two`` is computed within the rows labelled ``label_1`` and
    within those labelled ``label_2``, and the difference (label_1 minus
    label_2) is recorded. ``original_correlation`` is then standardised against
    the mean and standard deviation of the recorded differences and converted
    to a two-tailed p-value with the standard normal survival function.

    ``df`` itself is not modified.
    """
    shuffled = copy.deepcopy(df)
    deltas = []

    def _group_r(label):
        # Pearson r between the two columns within the rows carrying `label`.
        rows = shuffled[shuffled["Type"] == label]
        r_value, _ = scipy.stats.pearsonr(rows[column_one], rows[column_two])
        return r_value

    for _ in range(permutation_times):
        shuffled["Type"] = np.random.permutation(shuffled["Type"])
        deltas.append(_group_r(label_1) - _group_r(label_2))

    z_score = (original_correlation - np.mean(deltas)) / np.std(deltas)
    return scipy.stats.norm.sf(abs(z_score)) * 2

In [115]:
# Tumor-vs-normal comparison of the transcriptomics/proteomics correlation, gene by gene.
# Each tot_* list collects one row per cancer: [cancer name, value for gene 1, value for gene 2, ...].
N_GENES_TO_TEST = 100  # number of leading entries of gene_list that are scanned
genes_to_test = gene_list[0:N_GENES_TO_TEST]


def _paired_omics_corr(cancer, gene, tissue_type):
    """Join one gene's transcriptomics and proteomics for `tissue_type`; return (df, n_rows, r)."""
    joined = cancer.join_omics_to_omics("transcriptomics", "proteomics", genes1=gene, genes2=gene,
                                        tissue_type=tissue_type, quiet=True)
    if isinstance(joined.columns, pd.MultiIndex):
        joined = ut.reduce_multiindex(df=joined, levels_to_drop="Database_ID", quiet=True)
    joined = joined.dropna()
    # Off-diagonal entry of the correlation matrix, addressed purely by position.
    # `.iloc[0][1]` would fall back to positional Series.__getitem__ on a
    # string-labelled Series, which pandas has deprecated.
    return joined, len(joined), joined.corr().iloc[0, 1]


tot_diff_list = []
tot_pval_list = []
tot_perm_list = []
for cancer in cancers2:
    cancer_name = type_dict[cancer]
    cancer_diff_list = [cancer_name]
    cancer_pval_list = [cancer_name]
    cancer_perm_list = [cancer_name]
    for gene in genes_to_test:
        tumor_df, num_tumor, tumor_corr = _paired_omics_corr(cancer, gene, "tumor")
        normal_df, num_normal, normal_corr = _paired_omics_corr(cancer, gene, "normal")

        # Without a correlation in both tissue types there is nothing to compare;
        # pad all three rows so every row keeps one entry per gene.
        if math.isnan(tumor_corr) or math.isnan(normal_corr):
            cancer_diff_list.append(np.nan)
            cancer_pval_list.append(np.nan)
            cancer_perm_list.append(np.nan)
            continue

        corr_diff = tumor_corr - normal_corr
        cancer_diff_list.append(corr_diff)

        gene_pval = compare_correlations(tumor_corr, normal_corr, num_tumor, num_normal)
        cancer_pval_list.append(gene_pval)

        #Here we do permutations. Cut down number?
        # Currently disabled: a NaN placeholder is stored instead of the permutation p-value.
#         tumor_label_list = ['tumor'] * len(tumor_df)
#         tumor_df["Type"] = tumor_label_list

#         normal_label_list = ['normal'] * len(normal_df)
#         normal_df["Type"] = normal_label_list

#         perm_list = [tumor_df,normal_df]
#         perm_df = pd.concat(perm_list)

#         column_one = perm_df.columns[0]
#         column_two = perm_df.columns[1]
#         perm_val = permute(perm_df,corr_diff,"tumor","normal",column_one,column_two,1000)
#         cancer_perm_list.append(perm_val)
        cancer_perm_list.append(np.nan)

    tot_diff_list.append(cancer_diff_list)
    tot_pval_list.append(cancer_pval_list)
    tot_perm_list.append(cancer_perm_list)

# One label per entry of each row: the leading cancer name plus every scanned gene.
# (A bare gene slice of a different length would either fail or shift the cancer
# name under the first gene column when the rows are turned into a DataFrame.)
labels = ["cancer"] + genes_to_test
# df = pd.DataFrame.from_records(tot_diff_list,columns=labels)
# df2 = pd.DataFrame.from_records(tot_pval_list,columns=labels)
# df3 = pd.DataFrame.from_records(tot_perm_list,columns=labels)

In [118]:
# Inspect the raw p-value rows; each inner list starts with the cancer name.
tot_pval_list

[['ccrcc',
  0.008435338335546692,
  0.001957931802613312,
  0.061197926734886665,
  nan,
  0.3472955578251835,
  0.7672479652906784,
  0.1477091655116799,
  0.08118479297357405,
  0.3707173051083895,
  0.7393233345698619,
  5.086837385676488e-05,
  0.0065422617666125536,
  nan,
  0.002565573021437835,
  0.08533261280297942,
  7.56080557759791e-05,
  0.3889350113319997,
  0.00013279970981353652,
  0.037154526031547064,
  0.6463004427661148,
  0.038238740202974734,
  0.7971671813076718,
  nan,
  4.6181623660180835e-06,
  0.20321413144849687,
  0.8215751669353271,
  nan,
  0.7851714173111022,
  0.3746835683280799,
  0.5666647164906664,
  0.02364378312375278,
  nan,
  0.607845947785254,
  0.061988250982784814,
  0.01884281936091846,
  0.2086424152144304,
  0.04567235058321581,
  0.6302826390389238,
  0.6301993394758363,
  0.020111369403424822,
  0.0005819933286459523,
  0.6405478660766019,
  nan,
  0.968995916648499,
  0.018363766239681414,
  0.4366969100683783,
  0.9137355673080259,
  0.

In [103]:
# NOTE(review): in the visible cells df3 is only assigned in a commented-out line,
# so on a fresh kernel this display would raise NameError; the output below is
# from an earlier run.
df3

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A4GALT
0,brca,,,,
1,ccrcc,0.049344,0.000831,0.019891,
2,colon,,,,
3,endometrial,0.775861,,0.912147,0.01393949
4,gbm,0.380497,,0.72014,
5,luad,0.77103,,0.009256,
6,ovarian,,,,
7,hnscc,0.991894,0.788576,0.211574,0.0004487003
8,lscc,0.140028,,0.098622,1.278508e-22


In [None]:
        
        # Join omics to omics (proteomics to transcriptomics): one call for tumor, one for normal.
        # Join one column from proteomics and one from transcriptomics. Make one normal frame and one tumor frame; each will have 2 columns.
        # Here we call Humberto's function as well, giving us a p-value: make a dataframe with multijoin that has a label for
        # whether each sample is tumor or normal (3 columns in total: transcriptomics, proteomics, and tumor/normal; delta correlation is corr1 - corr2).
        # Getting the correlation numbers is just a stats call.
        # If Nathaniel's p-value looks bad, then don't give it to Humberto's function. Otherwise, pass it in.
#         corr_1,num_samples_1 = get_single_gene_correlations(gene, [cancer],input_tissue_type = "tumor")
#         corr_2,num_samples_2 = get_single_gene_correlations(gene,[cancer],input_tissue_type = "normal")
#             check mut vs wt tumor:

In [None]:
# Create 3 giant dataframes: the difference in correlation, Nathaniel's p-value from the z-score, and Humberto's permutation-based p-value.