## Getting the genes whose delta correlation is significant (FDR-adjusted p-value < 0.05) for each cancer
### Do we want to use all of the significant genes (approx. 5000 genes for each cancer) or just the most significant (i.e. lowest 50 p-values)?

In [1]:
import pandas as pd
import pcprutils as ut
import scipy.odr
import altair as alt
import numpy as np

In [2]:
# Load the precomputed delta-correlation table; the trailing bare expression
# makes the notebook render it.
delta_correlation_path = '../data/delta_correlation_df.csv'
delta_correlation_df = pd.read_csv(delta_correlation_path)
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,CCRCC
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
2,A2M,-0.191619,1.277644e-01,2.439276e-01,CCRCC
3,AAAS,0.019654,8.963138e-01,9.409267e-01,CCRCC
4,AACS,-0.169937,6.007042e-02,1.375402e-01,CCRCC
...,...,...,...,...,...
50303,ZXDC,0.073854,6.395306e-01,6.962162e-01,LUAD
50304,ZYG11B,0.734569,1.583743e-10,1.504331e-09,LUAD
50305,ZYX,0.440861,2.253307e-06,9.499585e-06,LUAD
50306,ZZEF1,0.422001,3.731185e-11,3.889196e-10,LUAD


In [23]:
# Keep only the rows whose FDR is below the 0.05 significance threshold.
is_significant = delta_correlation_df['FDR'].lt(0.05)
delta_corr_sig = delta_correlation_df[is_significant]
delta_corr_sig

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,CCRCC
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,CCRCC
8,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC
11,AARS,0.415815,6.031667e-03,2.119475e-02,CCRCC
...,...,...,...,...,...
50302,ZWINT,0.837582,2.770355e-08,1.722312e-07,LUAD
50304,ZYG11B,0.734569,1.583743e-10,1.504331e-09,LUAD
50305,ZYX,0.440861,2.253307e-06,9.499585e-06,LUAD
50306,ZZEF1,0.422001,3.731185e-11,3.889196e-10,LUAD


In [24]:
# Count how many significant rows each gene has.
#
# The axis and value names are set explicitly instead of renaming whatever
# value_counts() produces: pandas < 2.0 names the result after the source
# column with an unnamed index, while pandas >= 2.0 names it "count" with the
# index named "Gene". Renaming {"Gene": "count", "index": "Gene"} therefore
# yields two "count" columns on pandas >= 2.0 and breaks the merge below.
gene_freq = (
    delta_corr_sig["Gene"]
    .value_counts()
    .rename_axis("Gene")
    .reset_index(name="count")
)

# Attach each gene's count to every one of its significant rows.
delta_corr_sig_with_counts = delta_corr_sig.merge(gene_freq, on="Gene", how="inner")

In [22]:
# Show the first 30 rows for genes with more than 3 significant rows,
# ordered by gene and then by delta correlation.
(
    delta_corr_sig_with_counts
    .loc[lambda frame: frame["count"] > 3]
    .sort_values(by=['Gene', 'Delta_Correlation'])
    .head(30)
)


Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer,count
3,AAGAB,0.364999,0.0007633656,0.003646252,CCRCC,4
5,AAGAB,0.53463,9.313146e-27,2.721592e-25,LSCC,4
6,AAGAB,0.541641,9.576553e-08,5.369596e-07,LUAD,4
4,AAGAB,0.626523,0.0001265797,0.001448009,Endometrial,4
7,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC,4
9,AAK1,0.614117,1.000902e-11,3.887029e-10,HNSCC,4
10,AAK1,0.655869,1.061364e-08,5.576651e-08,LSCC,4
8,AAK1,0.711044,0.000473655,0.004325435,Endometrial,4
11,AARS,0.415815,0.006031667,0.02119475,CCRCC,4
13,AARS,0.485189,2.635135e-22,5.364523e-21,LSCC,4


In [4]:
# Split the significant rows into one table per cancer type.
ccrcc_delta, hnscc_delta, endo_delta, lscc_delta, luad_delta = (
    delta_corr_sig.loc[delta_corr_sig['Cancer'] == cancer_label]
    for cancer_label in ('CCRCC', 'HNSCC', 'Endometrial', 'LSCC', 'LUAD')
)

In [5]:
# HNSCC significant genes ordered from lowest to highest FDR.
sort_hnscc = hnscc_delta.sort_values(by='FDR')
hnscc_genes = sort_hnscc['Gene'].tolist()

## Calculate an orthogonal distance regression (ODR) line across all patients, for each gene and each tissue type

In [6]:
# Load the data for each cancer; the result is indexed by these same names
# further down (e.g. prot_trans["hnscc"]).
cancers_to_load = ["ccrcc", "endometrial", "hnscc", "lscc", "luad"]
prot_trans = ut.load_prot_trans(cancers_to_load)

                                          



                                                



                                         



In [7]:
def get_regression(corr_df, prot_trans_df):
    """Fit an orthogonal distance regression line per tissue and gene.

    Only genes listed in ``corr_df["Gene"]`` are fitted. For every
    (Tissue, Gene) group of ``prot_trans_df`` the linear model
    ``Proteomics = m * Transcriptomics + b`` is fitted with ``scipy.odr``,
    starting from the initial guess ``m = b = 1``.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Needs a "Gene" column; it selects which genes are fitted.
    prot_trans_df : pandas.DataFrame
        Needs "Gene", "Tissue", "Transcriptomics" and "Proteomics" columns.

    Returns
    -------
    pandas.DataFrame
        One row per (Tissue, Gene) group, in sorted group order, with the
        columns "Tissue", "Gene", "m" (slope) and "b" (intercept). When no
        gene matches, an empty frame with these same columns is returned
        instead of raising.
    """

    def linear(beta, x):
        # beta[0] is the slope, beta[1] the intercept.
        return beta[0] * x + beta[1]

    # The model does not depend on the data, so build it once and reuse it
    # for every group.
    model = scipy.odr.Model(linear)

    def fit_line(group):
        # NOTE(review): values are handed to ODR as-is; confirm that missing
        # measurements are removed upstream.
        data = scipy.odr.Data(group["Transcriptomics"], group["Proteomics"])
        return scipy.odr.ODR(data, model, beta0=[1, 1]).run().beta

    sig_prot_trans = prot_trans_df[prot_trans_df["Gene"].isin(corr_df["Gene"])]

    # Iterating the groups explicitly (rather than groupby.apply) keeps the
    # output schema fixed even when there are no groups at all.
    rows = []
    for (tissue, gene), group in sig_prot_trans.groupby(["Tissue", "Gene"]):
        m, b = fit_line(group)
        rows.append({"Tissue": tissue, "Gene": gene, "m": m, "b": b})

    return pd.DataFrame(rows, columns=["Tissue", "Gene", "m", "b"])

# Fit the per-tissue, per-gene regression lines for each cancer, using that
# cancer's significant genes and its own data table.
hnscc_results = get_regression(corr_df=hnscc_delta, prot_trans_df=prot_trans["hnscc"])
luad_results = get_regression(corr_df=luad_delta, prot_trans_df=prot_trans["luad"])
lscc_results = get_regression(corr_df=lscc_delta, prot_trans_df=prot_trans["lscc"])
ccrcc_results = get_regression(corr_df=ccrcc_delta, prot_trans_df=prot_trans["ccrcc"])
endo_results = get_regression(corr_df=endo_delta, prot_trans_df=prot_trans["endometrial"])

In [8]:
def make_chart(df, m, b):
    """Layer a scatter plot of ``df`` with the straight line ``y = m*x + b``.

    The scatter plots the "Transcriptomics" column against "Proteomics",
    colours points by "Tissue" and shows "Patient_ID" as the tooltip. The
    line is evaluated at the integer x positions 0 through 13.

    Returns the layered Altair chart (scatter first, line on top).
    """
    points = (
        alt.Chart(df)
        .mark_circle()
        .encode(
            x="Transcriptomics",
            y="Proteomics",
            color="Tissue",
            tooltip=["Patient_ID"],
        )
    )

    # Sample the line at x = 0..13; kept in its own frame so the input
    # data is not confused with the line data.
    xs = np.arange(14)
    line_points = pd.DataFrame({"x": xs, "y": m * xs + b})
    fitted_line = alt.Chart(line_points).mark_line().encode(x='x', y='y')

    return points + fitted_line

In [9]:
# Plot a single HNSCC gene, one chart per tissue type. To cover every fitted
# gene, loop over hnscc_results['Gene'] instead of fixing one gene here.
gene = 'AADAC'
tissue_type = ['Tumor', 'Normal']

# The HNSCC table is the same for both tissues, so look it up once.
pr_tr_cancer = prot_trans['hnscc']

chart = []

for tissue in tissue_type:
    # Data points for this gene in this tissue.
    gene_df = pr_tr_cancer.loc[
        pr_tr_cancer['Gene'].eq(gene) & pr_tr_cancer['Tissue'].eq(tissue)
    ]
    # Matching fitted slope/intercept row.
    gene_slope = hnscc_results.loc[
        hnscc_results['Gene'].eq(gene) & hnscc_results['Tissue'].eq(tissue)
    ]
    m, b = gene_slope[['m', 'b']].values[0]

    chart.append(make_chart(gene_df, m, b))

# Stack the per-tissue charts vertically.
a = alt.vconcat(*chart)

In [10]:
# Write each cancer's regression table as a tab-separated file without the
# row index.
regression_outputs = {
    'hnscc_regression.tsv': hnscc_results,
    'ccrcc_regression.tsv': ccrcc_results,
    'lscc_regression.tsv': lscc_results,
    'luad_regression.tsv': luad_results,
    'endometrial_regression.tsv': endo_results,
}
for out_path, regression_df in regression_outputs.items():
    regression_df.to_csv(out_path, sep='\t', index=False)