In [1]:
import cptac
import scipy
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics
import parse_correlations_dataframe as get_corr
import copy

In [2]:
# Fetch the endometrial dataset through cptac and instantiate its dataset
# object; `en` is the dataset used by the exploratory cells that follow.
cptac.download("endometrial")
en = cptac.Endometrial()

                                                

In [3]:
def find_mut_tumor(cancer_type, gene):
    """Build a per-sample table of protein and transcript levels for one gene.

    Tumor samples are labelled ``'mutation'`` when their mutation-status entry
    is a string and ``'wt'`` otherwise; normal samples are appended with the
    label ``'normal'``.

    Parameters
    ----------
    cancer_type : object
        Dataset object exposing
        ``multi_join(mapping, tissue_type=..., flatten=...)``.
    gene : str
        Gene name passed to ``multi_join`` for every requested data type.

    Returns
    -------
    pandas.DataFrame or float
        A frame with columns ``Type``, ``Proteomics`` and ``Transcriptomics``
        with incomplete rows dropped. ``float('nan')`` is returned instead
        when there are fewer than two ``'wt'`` tumor samples, fewer than two
        normal samples, or when building the table raises an ordinary
        exception (e.g. the join fails or yields an unexpected number of
        columns).
    """
    try:
        gene_multi = cancer_type.multi_join(
            {'proteomics': gene, 'transcriptomics': gene, 'somatic_mutation': gene},
            tissue_type='tumor',
            flatten=True,
        )
        normal = cancer_type.multi_join(
            {'proteomics': gene, 'transcriptomics': gene},
            tissue_type='normal',
            flatten=True,
        )
        # Renaming raises if the join returned a different number of columns;
        # that case is reported to the caller as NaN by the handler below.
        normal.columns = ['proteomics', 'transcriptomics']
        gene_multi.columns = ['proteomics', 'transcriptomics', 'mutation', 'location', 'mutation_status']

        trans = list(gene_multi['transcriptomics'])
        prot = list(gene_multi['proteomics'])

        # NOTE(review): assumes non-mutated tumors carry a non-string
        # (e.g. NaN) mutation status -- confirm against the cptac join output.
        group = [
            'mutation' if isinstance(status, str) else 'wt'
            for status in gene_multi['mutation_status']
        ]

        # These counts are taken before rows with missing values are dropped,
        # so the returned frame may still hold fewer than two rows per group.
        if group.count('wt') < 2 or len(normal) < 2:
            return float("NaN")

        group.extend(['normal'] * len(normal))
        prot.extend(list(normal['proteomics']))
        trans.extend(list(normal['transcriptomics']))

        gene_df = pd.DataFrame({'Type': group, 'Proteomics': prot, 'Transcriptomics': trans})
        return gene_df.dropna()
    except Exception:
        # Best-effort contract: any ordinary failure yields the NaN sentinel.
        # KeyboardInterrupt / SystemExit are intentionally NOT swallowed, so a
        # long-running loop over genes can still be interrupted.
        return float("NaN")
   

In [4]:
def get_df_with_type(df,input_type):
    """Return the rows of ``df`` whose ``'Type'`` column equals ``input_type``."""
    matches = df['Type'].eq(input_type)
    return df[matches]

In [5]:
# Worked example: build the Type / Proteomics / Transcriptomics table for TP53
# in the endometrial dataset. find_mut_tumor returns NaN instead of a frame
# when it cannot build the table.
df = find_mut_tumor(en, 'TP53')



In [6]:
# Display the TP53 table (a bare last expression renders as the cell output).
df

Unnamed: 0,Type,Proteomics,Transcriptomics
0,mutation,0.2950,11.62
1,wt,0.2770,11.83
2,wt,-0.8710,10.73
3,wt,-0.3430,10.95
4,mutation,3.0100,10.69
...,...,...,...
115,normal,0.7950,9.83
116,normal,0.6270,10.64
119,normal,-0.2370,9.62
120,normal,-0.7290,10.45


In [7]:
# Keep only the rows labelled 'normal' from the TP53 table.
df2 = get_df_with_type(df,"normal")

In [8]:
# Display the normal-only subset.
df2

Unnamed: 0,Type,Proteomics,Transcriptomics
95,normal,-0.0919,9.87
96,normal,-0.593,10.52
98,normal,-0.131,9.98
99,normal,-0.0844,9.92
113,normal,0.514,10.35
114,normal,0.25,10.27
115,normal,0.795,9.83
116,normal,0.627,10.64
119,normal,-0.237,9.62
120,normal,-0.729,10.45


In [9]:
# Pearson correlation matrix for the normal samples. The numeric columns are
# selected explicitly because df2 also holds the string 'Type' column, which
# DataFrame.corr rejects on pandas >= 2.0 (numeric_only now defaults to False).
df2[['Proteomics', 'Transcriptomics']].corr(method="pearson")

Unnamed: 0,Proteomics,Transcriptomics
Proteomics,1.0,-0.045068
Transcriptomics,-0.045068,1.0


In [10]:
# Proteomics-vs-Transcriptomics Pearson correlation as a single number.
# Computed column-to-column so it neither depends on DataFrame.corr skipping
# the string 'Type' column nor on positional `[0]` lookup into a
# string-labelled Series (deprecated in recent pandas).
df2['Proteomics'].corr(df2['Transcriptomics'], method="pearson")

-0.04506791545565061

In [11]:
# Drop the mutated tumor rows, keeping the 'wt' and 'normal' samples.
# NOTE(review): df3 is not referenced by any later cell visible in this notebook.
df3 = df[df['Type'] != 'mutation']

In [12]:
def get_cancer_string(cancer):
    """Map a loaded dataset object to its short lowercase label.

    ``cancer`` is compared, in order, against the module-level dataset
    variables (``brca``, ``ccrcc``, ``colon``, ``en``, ``gbm``, ``luad``,
    ``ovarian``, ``hnscc``, ``lscc``), which must therefore be defined before
    the function is called. The label of the first match is returned; ``None``
    is returned when nothing matches.
    """
    label = None
    # The chain short-circuits, so names after the first match are never read.
    if cancer == brca:
        label = "brca"
    elif cancer == ccrcc:
        label = "ccrcc"
    elif cancer == colon:
        label = 'colon'
    elif cancer == en:
        label = 'en'
    elif cancer == gbm:
        label = 'gbm'
    elif cancer == luad:
        label = 'luad'
    elif cancer == ovarian:
        label = 'ovarian'
    elif cancer == hnscc:
        label = 'hnscc'
    elif cancer == lscc:
        label = 'lscc'
    return label

In [13]:
# Instantiate one cptac dataset object per cancer type. get_cancer_string
# compares its argument against exactly these module-level names, so this cell
# must run before that function is called.
# NOTE: `en` was already created in an earlier cell; this rebinds it to a new
# Endometrial instance.
brca = cptac.Brca()
ccrcc = cptac.Ccrcc()
colon = cptac.Colon()
en = cptac.Endometrial()
gbm = cptac.Gbm()
luad = cptac.Luad()
ovarian = cptac.Ovarian()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...   



                                         



In [14]:
# Genes to analyse in every dataset.
cancer_genes = ['ABL1','BRAF','FAT1','IDH1','TP53','BRCA2','KMT2D','KRAS','PIK3CA','PTEN','SETD2','SPTA1']
# Dataset objects to iterate over (all nine instantiated above).
cancer_list = [brca,ccrcc,colon,en,gbm,luad,ovarian,hnscc,lscc]

In [None]:
# For every (dataset, gene) pair, build the per-sample table and record the
# Proteomics-vs-Transcriptomics Pearson correlation separately for normal
# samples and for non-mutated ('wt') tumor samples.
cancer_type = []
genes = []
gene_dfs = []
norm_corr = []
wt_corr = []
for cancer in cancer_list:
    for gene in cancer_genes:
        # KRAS in the ccrcc dataset is skipped explicitly (the notebook does
        # not record why).
        if gene == 'KRAS' and cancer == ccrcc:
            continue
        df = find_mut_tumor(cancer, gene)
        # find_mut_tumor returns NaN rather than a DataFrame when the table
        # could not be built or has too few samples; skip those pairs.
        if not isinstance(df, pd.DataFrame):
            continue
        genes.append(gene)
        cancer_type.append(get_cancer_string(cancer))
        gene_dfs.append(df)
        norm_df = get_df_with_type(df,"normal")
        wt_df = get_df_with_type(df, 'wt')
        # Correlate the two numeric columns directly: DataFrame.corr on a frame
        # that still contains the string 'Type' column raises on pandas >= 2.0,
        # and positional `[0]` on a string-labelled Series is deprecated.
        norm_cor = norm_df['Proteomics'].corr(norm_df['Transcriptomics'], method="pearson")
        wt_cor = wt_df['Proteomics'].corr(wt_df['Transcriptomics'], method="pearson")
        norm_corr.append(norm_cor)
        wt_corr.append(wt_cor)
norm_vs_wt_tumor = pd.DataFrame({'cancer': cancer_type, 'gene': genes, 'gene_df': gene_dfs,'normal_correlation': norm_corr, 'tumor_non-mutated_correlation': wt_corr})
norm_vs_wt_tumor.head()






In [None]:
# Add `corr_dif`: the absolute gap between the normal-tissue correlation and the
# non-mutated-tumor correlation for each (cancer, gene) row.
norm_vs_wt_tumor = norm_vs_wt_tumor.assign(
    corr_dif=lambda table: (
        table['normal_correlation'] - table['tumor_non-mutated_correlation']
    ).abs()
)
norm_vs_wt_tumor.head()

In [None]:
# Rank the (cancer, gene) pairs by correlation gap, largest first.
# ignore_index=True renumbers the rows 0..n-1 after sorting.
norm_vs_wt_tumor = norm_vs_wt_tumor.sort_values(by = 'corr_dif', ascending=False, ignore_index=True)
norm_vs_wt_tumor.head()

In [None]:
# One figure per (cancer, gene) row: Transcriptomics against Proteomics with a
# separate regression fit for normal, non-mutated tumor and mutated tumor
# samples.
for index, df in enumerate(norm_vs_wt_tumor['gene_df']):

    is_norm = df['Type']=='normal'
    is_tum = df['Type'] == 'wt'
    is_mut = df['Type'] == 'mutation'

    # Draw on a fresh, explicit Axes so successive genes never pile up on one
    # figure when the backend's plt.show() does not close it.
    fig, ax = plt.subplots()
    sns.regplot(x = 'Proteomics', y = 'Transcriptomics', data = df[is_norm], label = "Normal", ax = ax)
    sns.regplot(x = 'Proteomics', y = 'Transcriptomics', data = df[is_tum], label = 'Non-mutated Tumor', ax = ax)
    sns.regplot(x = 'Proteomics', y = 'Transcriptomics', data = df[is_mut], label = 'Mutated Tumor', ax = ax)
    ax.legend(loc = 'best')

    # `index` from enumerate is positional, so look the labels up with .iloc;
    # label-based `[index]` is only correct when the frame's index is 0..n-1.
    ax.set_title(str(norm_vs_wt_tumor['cancer'].iloc[index] + ' | ' + norm_vs_wt_tumor['gene'].iloc[index]))
    plt.show()
    # Release the figure once shown; many figures are created in this loop.
    plt.close(fig)