In [1]:
import cptac
import pandas as pd
from scipy import stats
import numpy as np
import statsmodels.stats.multitest as ssm

In [None]:
# Instantiate one cptac dataset object per cancer type. Later cells refer to
# these objects by name (cancer_list, get_cancer_string).
# NOTE(review): the cell output shows a "Loading ..." message, so construction
# presumably loads the data and may be slow — confirm.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

Loading ccrcc v0.1.1..                    

In [None]:
def find_norm_tumor_corr_effect(cancer_type, gene):
    """Correlate proteomics with transcriptomics for one gene, in tumor and in normal tissue.

    Parameters
    ----------
    cancer_type : object exposing ``multi_join(joining_dict, tissue_type=..., flatten=...)``
        Dataset to pull the gene's proteomics and transcriptomics values from.
        Assumes each call returns exactly two columns, proteomics first and
        transcriptomics second (the columns are renamed positionally) — TODO confirm.
    gene : str
        Gene name passed to ``multi_join`` for both data types.

    Returns
    -------
    tuple
        ``(tum_cor, tum_pval, norm_cor, norm_pval, gene_df)``.
        The first four are the Pearson r and p-value for tumor and for normal rows.
        ``gene_df`` has columns ``Type`` ('tumor'/'normal'), ``Proteomics`` and
        ``Transcriptomics``, tumor rows first.
        If either tissue has fewer than two complete rows, the result is
        ``(nan, nan, nan, nan, None)`` so that callers can always unpack five values.
    """
    tumor = cancer_type.multi_join(
        {'proteomics': gene, 'transcriptomics': gene}, tissue_type='tumor', flatten=True
    ).dropna()
    normal = cancer_type.multi_join(
        {'proteomics': gene, 'transcriptomics': gene}, tissue_type='normal', flatten=True
    ).dropna()

    # A correlation needs at least two points per tissue. Return the same
    # five-element shape as the success path so the caller's unpacking works.
    if len(normal) < 2 or len(tumor) < 2:
        nan = float("NaN")
        return nan, nan, nan, nan, None

    tumor.columns = ['proteomics', 'transcriptomics']
    normal.columns = ['proteomics', 'transcriptomics']

    # Stack tumor rows on top of normal rows with a fresh 0..n-1 index.
    gene_df = pd.DataFrame({
        'Type': ['tumor'] * len(tumor) + ['normal'] * len(normal),
        'Proteomics': list(tumor['proteomics']) + list(normal['proteomics']),
        'Transcriptomics': list(tumor['transcriptomics']) + list(normal['transcriptomics']),
    })

    tum_cor, tum_pval = stats.pearsonr(tumor['proteomics'], tumor['transcriptomics'])
    norm_cor, norm_pval = stats.pearsonr(normal['proteomics'], normal['transcriptomics'])
    return tum_cor, tum_pval, norm_cor, norm_pval, gene_df

In [None]:
def get_cancer_string(cancer):
    """Return the short text label for one of the module-level dataset objects.

    ``cancer`` is compared with ``==`` against ``ccrcc``, ``en``, ``luad``,
    ``hnscc`` and ``lscc`` in that order; the label of the first match is
    returned. ``None`` is returned when nothing matches.
    """
    if cancer == ccrcc:
        label = "ccrcc"
    elif cancer == en:
        label = 'endometrial'
    elif cancer == luad:
        label = 'luad'
    elif cancer == hnscc:
        label = 'hnscc'
    elif cancer == lscc:
        label = 'lscc'
    else:
        label = None
    return label

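The linear model below fits protein abundance against transcript abundance with a tissue interaction term, so that the tumor and normal regression lines can be compared. For background on comparing regression lines, see https://statisticsbyjim.com/regression/comparing-regression-lines/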

In [None]:


from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
import rpy2.robjects as robj
import pandas as pd
def linear_model(data, Input, Output, Condition):
    """Fit ``Output ~ Input * Condition`` with R's ``lm`` and return its coefficient table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame converted to an R data frame; expected to contain the columns
        named by ``Input``, ``Output`` and ``Condition``.
    Input, Output, Condition : str
        Column names substituted into the R formula ``{Output}~{Input}*{Condition}``.

    Returns
    -------
    pandas.DataFrame or None
        The ``coefficients`` matrix of R's ``summary(lm)``, indexed by R's row
        names with R's column names as columns. ``None`` is returned if any
        step fails; the error and ``data`` are printed in that case.
    """
    try:
        # Local names avoid shadowing the notebook-level scipy `stats` import.
        r_stats = importr('stats')
        r_base = importr('base')

        pandas2ri.activate()
        try:
            r_df = pandas2ri.py2rpy(data)
        finally:
            # Switch the global pandas conversion back off even if conversion fails.
            pandas2ri.deactivate()

        formula = '{y}~{x}*{condition}'.format(y = Output, x = Input, condition = Condition)
        fit = r_stats.lm(formula, r_df)
        summary = r_base.summary(fit)
        results = summary.rx2('coefficients')
        results_df = r_base.as_data_frame_matrix(results)

        # The R frame arrives column-major, so transpose to get one row per term.
        py_results_df = pd.DataFrame(results_df).transpose()
        py_results_df.columns = results_df.colnames
        py_results_df.index = results_df.rownames
        return py_results_df
    except Exception as err:
        # Best-effort: report what went wrong and which data triggered it, then
        # signal failure with None. KeyboardInterrupt/SystemExit are not caught.
        print('linear_model failed: {}: {}'.format(type(err).__name__, err))
        print(data)
        return None

In [None]:
# Genes examined in every dataset, and the dataset objects to iterate over.
cancer_genes = [
    'BRAF',
    'FAT1',
    'IDH1',
    'TP53',
    'KMT2D',
    'KRAS',
    'PIK3CA',
    'PTEN',
    'SPTA1',
]
cancer_list = [
    ccrcc,
    en,
    luad,
    hnscc,
    lscc,
]

In [None]:
# Make df for grid plot
# dfs collects one long-format frame per (cancer, gene); rows collects the
# matching correlation summary records.
dfs = []
rows = []
for cancer in cancer_list:
    for gene in cancer_genes:
        # This one gene/dataset combination is deliberately left out.
        if gene == "KRAS" and cancer == ccrcc:
            continue
        tum_cor, tum_pval, norm_cor, norm_pval, gene_df = find_norm_tumor_corr_effect(cancer, gene)
        # Skip combinations for which no tumor correlation could be computed.
        if np.isnan(tum_cor):
            continue
        cancer_name = get_cancer_string(cancer)
        gene_df['Cancer'] = cancer_name
        gene_df['Gene'] = gene
        dfs.append(gene_df)
        rows.append({
            'cancer': cancer_name,
            'gene': gene,
            'tum_pval': tum_pval,
            'tum_corr': tum_cor,
            'norm_pval': norm_pval,
            'norm_corr': norm_cor,
        })
full_df = pd.concat(dfs).rename(columns={'Type': 'Tissue'})
corr_df = pd.DataFrame(rows)

In [None]:
# Show the long-format table built above (columns: Tissue, Proteomics,
# Transcriptomics, Cancer, Gene).
full_df

In [None]:
# Show the per-(cancer, gene) summary of tumor and normal Pearson correlations
# and their p-values.
corr_df

In [None]:
# Fit Proteomics ~ Transcriptomics * Tissue for every (cancer, gene) pair and
# collect the coefficient estimates and p-values. P-values are FDR-corrected
# within each cancer, separately for each p-value column.
cancer_dfs = []
for cancer in pd.unique(full_df.Cancer):
    print(cancer)
    rows = []
    for gene in list(pd.unique(full_df.Gene)):
        print(gene)
        df = full_df[(full_df.Gene == gene) & (full_df.Cancer == cancer)]
        if len(df) < 4:
            continue
        df = df[['Tissue', 'Proteomics', 'Transcriptomics']]
        coef_df = linear_model(df, 'Transcriptomics', 'Proteomics', 'Tissue')
        # linear_model returns None when the fit fails, and R's coefficient
        # table omits terms it could not estimate; either way there is no
        # complete set of four terms to record, so skip this pair.
        if coef_df is None or len(coef_df) < 4:
            continue
        estimates = coef_df['Estimate']
        pvalues = coef_df['Pr(>|t|)']
        # Rows are addressed by position: 0 intercept, 1 transcript slope,
        # 2 condition (tissue) offset, 3 transcript-by-condition interaction.
        rows.append({
            'gene': gene,
            'cancer': cancer,
            'interaction_coeff': estimates.iloc[3],
            'condition_coeff': estimates.iloc[2],
            'transcript_coeff': estimates.iloc[1],
            'intercept': estimates.iloc[0],
            'interaction_pval': pvalues.iloc[3],
            'condition_pval': pvalues.iloc[2],
            'transcript_pval': pvalues.iloc[1],
            'intercept_pval': pvalues.iloc[0],
        })
    cancer_df = pd.DataFrame(rows)
    for column in cancer_df:
        if 'pval' in column:
            old_pvals = list(cancer_df[column])
            # fdrcorrection returns (rejected, corrected p-values); keep the latter.
            cancer_df[column] = list(ssm.fdrcorrection(old_pvals)[1])
    cancer_dfs.append(cancer_df)

lm_df = pd.concat(cancer_dfs)
lm_df

In [None]:
import seaborn as sns
import matplotlib as plt

In [None]:
# get delta correlation p-values
# NOTE(review): normal_df and tumor_df are not defined in the cells visible
# here — confirm they exist before this cell runs on a fresh kernel.
delta_corr_pvals_df = pd.read_csv('data/p_val.csv')
combined_df = pd.merge(normal_df, tumor_df, how = 'outer')
combined_df['Significant Correlation'] = combined_df['p-val'] <= 0.05

# For each row, take the value in that row's Gene column from the first
# p-value-table row whose Cancer matches.
delta_corr_pvals = [
    delta_corr_pvals_df[delta_corr_pvals_df.Cancer == row['Cancer']].reset_index()[row['Gene']][0]
    for _, row in combined_df.iterrows()
]
combined_df['delta_corr_pval'] = delta_corr_pvals
combined_df['Significant Delta Correlation'] = combined_df['delta_corr_pval'] <= 0.05
combined_df