## Getting the genes whose delta correlation is significant (FDR < 0.05) in each cancer
### Do we want to use all of the significant genes (approx. 5000 genes for each cancer) or just the most significant (i.e. lowest 50 p-values)?

In [1]:
import pandas as pd
import pcprutils as ut
import scipy.odr
import altair as alt
import numpy as np

In [2]:
# Read the delta-correlation table from disk and display it.
delta_correlation_df = pd.read_csv(
    filepath_or_buffer='../data/delta_correlation_df.csv',
)
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.198013,2.451044e-01,4.045115e-01,HNSCC
1,A2M,-0.118384,4.480278e-01,6.091130e-01,HNSCC
2,A2ML1,-0.023469,2.918125e-01,4.561968e-01,HNSCC
3,AAAS,0.275905,1.051756e-01,2.209072e-01,HNSCC
4,AACS,-0.136836,1.800586e-01,3.266475e-01,HNSCC
...,...,...,...,...,...
50684,ZWINT,1.219024,2.267627e-09,1.049863e-07,Endometrial
50685,ZXDC,-0.346532,2.983295e-01,5.386144e-01,Endometrial
50686,ZYG11B,0.768196,5.463938e-04,5.319699e-03,Endometrial
50687,ZYX,0.253630,2.456049e-01,4.795301e-01,Endometrial


In [3]:
# Keep only the rows whose 'FDR' value is below the 0.05 cutoff, then
# display the filtered table.
delta_corr_sig = delta_correlation_df[delta_correlation_df['FDR'].lt(0.05)]
delta_corr_sig

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
5,AADAC,-0.538428,2.101571e-06,3.348778e-05,HNSCC
14,AASDHPPT,0.833326,3.804959e-13,2.248846e-11,HNSCC
15,AASS,0.273200,7.531252e-05,7.264567e-04,HNSCC
17,ABAT,0.134938,2.447446e-03,1.250262e-02,HNSCC
20,ABCA12,0.354247,8.862597e-06,1.172701e-04,HNSCC
...,...,...,...,...,...
50666,ZNFX1,0.778662,5.556585e-04,5.391808e-03,Endometrial
50673,ZNRF2,0.737945,2.571580e-03,1.850902e-02,Endometrial
50683,ZWILCH,0.734288,5.623849e-05,7.920710e-04,Endometrial
50684,ZWINT,1.219024,2.267627e-09,1.049863e-07,Endometrial


In [4]:
# Split the significant rows into one table per cancer, matching on the
# exact label stored in the 'Cancer' column.
ccrcc_delta, hnscc_delta, endo_delta, lscc_delta, luad_delta = (
    delta_corr_sig.loc[delta_corr_sig['Cancer'] == cancer_label]
    for cancer_label in ('CCRCC', 'HNSCC', 'Endometrial', 'LSCC', 'LUAD')
)

## Calculate a regression line across all patients for each gene within each tissue type

In [5]:
# Load the proteomics/transcriptomics tables for the five cancers.
# NOTE(review): the result is indexed by these same lowercase names later
# in the notebook (e.g. prot_trans["hnscc"]) -- presumably a dict of
# DataFrames; confirm against pcprutils.
prot_trans = ut.load_prot_trans(["ccrcc", "endometrial", "hnscc", "lscc", "luad"])

                                          



                                                



                                         



In [6]:
def get_regression(corr_df, prot_trans_df):
    """Fit a straight line per (Tissue, Gene) by orthogonal distance regression.

    Rows of ``prot_trans_df`` are first restricted to the genes listed in
    ``corr_df["Gene"]``.  For every remaining (Tissue, Gene) group the model
    ``Proteomics = m * Transcriptomics + b`` is fitted with ``scipy.odr``,
    starting from ``m = 1, b = 1``.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Must contain a "Gene" column; only these genes are fitted.
    prot_trans_df : pandas.DataFrame
        Must contain "Gene", "Tissue", "Transcriptomics" and "Proteomics"
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per fitted group with columns "Tissue", "Gene", "m" (slope)
        and "b" (intercept), ordered by the sorted group keys.  The frame is
        empty, but still has these four columns, when no gene matches.
    """

    def linear(beta, x):
        # beta[0] is the slope, beta[1] the intercept.
        return beta[0] * x + beta[1]

    # The model does not depend on the data, so build it once rather than
    # once per group.
    model = scipy.odr.Model(linear)

    sig_prot_trans = prot_trans_df[prot_trans_df["Gene"].isin(corr_df["Gene"])]

    # Iterate over the groups explicitly instead of using groupby().apply():
    # apply() returns a differently shaped object when there are no groups
    # (which made the later "beta" lookup fail), and its handling of the
    # grouping columns is deprecated in recent pandas releases.
    # NOTE(review): missing values are passed to the fit as-is -- confirm the
    # upstream tables contain no NaNs in the two measurement columns.
    rows = []
    for (tissue, gene), group in sig_prot_trans.groupby(["Tissue", "Gene"]):
        data = scipy.odr.Data(
            group["Transcriptomics"].to_numpy(dtype=float),
            group["Proteomics"].to_numpy(dtype=float),
        )
        fit = scipy.odr.ODR(data, model, beta0=[1, 1]).run()
        slope, intercept = fit.beta
        rows.append((tissue, gene, slope, intercept))

    return pd.DataFrame(rows, columns=["Tissue", "Gene", "m", "b"])

# Fit the per-tissue, per-gene regression lines for each cancer, pairing
# each cancer's significant-gene table with its proteomics/transcriptomics
# table.
hnscc_results = get_regression(corr_df=hnscc_delta, prot_trans_df=prot_trans["hnscc"])
luad_results = get_regression(corr_df=luad_delta, prot_trans_df=prot_trans["luad"])
lscc_results = get_regression(corr_df=lscc_delta, prot_trans_df=prot_trans["lscc"])
ccrcc_results = get_regression(corr_df=ccrcc_delta, prot_trans_df=prot_trans["ccrcc"])
endo_results = get_regression(corr_df=endo_delta, prot_trans_df=prot_trans["endometrial"])

In [7]:
def make_chart(df, m, b):
    """Return a scatter of ``df`` layered with the line ``y = m * x + b``.

    ``df`` must provide the "Transcriptomics", "Proteomics", "Tissue" and
    "Patient_ID" columns used by the scatter encoding.  ``m`` and ``b`` are
    the slope and intercept of the line to overlay.
    """
    # The line is sampled at the integer x positions 0 through 13.
    line_x = np.arange(14)
    line_points = pd.DataFrame({"x": line_x, "y": m * line_x + b})

    points_layer = (
        alt.Chart(df)
        .mark_circle()
        .encode(
            x="Transcriptomics",
            y="Proteomics",
            color="Tissue",
            tooltip=["Patient_ID"],
        )
    )
    line_layer = alt.Chart(line_points).mark_line().encode(x='x', y='y')

    return points_layer + line_layer

In [8]:
# Plot a single example gene; iterating over hnscc_results['Gene'] instead
# would chart every fitted gene.
gene = 'AADAC'
tissue_type = ['Tumor', 'Normal']

# The HNSCC table is the same for both tissues, so look it up once.
pr_tr_cancer = prot_trans['hnscc']
chart = []

for tissue in tissue_type:
    # Measurements for this gene in the current tissue.
    gene_df = pr_tr_cancer.loc[
        pr_tr_cancer['Gene'].eq(gene) & pr_tr_cancer['Tissue'].eq(tissue)
    ]
    # Matching fitted slope/intercept row; the first match is used.
    gene_slope = hnscc_results.loc[
        hnscc_results['Gene'].eq(gene) & hnscc_results['Tissue'].eq(tissue)
    ]
    m, b = gene_slope[['m', 'b']].to_numpy()[0]

    chart.append(make_chart(gene_df, m, b))

# Stack the per-tissue charts vertically.
a = alt.vconcat(*chart)

In [9]:
# Write each cancer's regression table to a tab-separated file in the
# current working directory, without the row index.
hnscc_results.to_csv(path_or_buf='hnscc_regression.tsv', sep='\t', index=False)
ccrcc_results.to_csv(path_or_buf='ccrcc_regression.tsv', sep='\t', index=False)
lscc_results.to_csv(path_or_buf='lscc_regression.tsv', sep='\t', index=False)
luad_results.to_csv(path_or_buf='luad_regression.tsv', sep='\t', index=False)
endo_results.to_csv(path_or_buf='endometrial_regression.tsv', sep='\t', index=False)