## Getting the genes that have a significant FDR-adjusted p-value for each cancer's delta correlation
### Do we want to use all of the significant genes (approx. 5000 genes for each cancer) or just the most significant (e.g., the 50 lowest p-values)?

In [19]:
import pandas as pd
import pcprutils as ut
import scipy.odr

In [2]:
# Load the delta-correlation table from disk and display it.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the CSV
# was written with its index — consider index_col=0 if that column is not needed (confirm).
delta_correlation_df = pd.read_csv('../data/delta_correlation_df.csv')
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,CCRCC
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
2,A2M,-0.191619,1.277644e-01,2.439276e-01,CCRCC
3,AAAS,0.019654,8.963138e-01,9.409267e-01,CCRCC
4,AACS,-0.169937,6.007042e-02,1.375402e-01,CCRCC
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,CCRCC
6,AAED1,0.190506,2.882440e-01,4.385777e-01,CCRCC
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,CCRCC
8,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC
9,AAMP,0.281272,2.536223e-02,6.920503e-02,CCRCC


In [3]:
# Keep only the rows whose FDR falls below the 0.05 significance cutoff

delta_corr_sig = delta_correlation_df[delta_correlation_df['FDR'].lt(0.05)]
delta_corr_sig

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,CCRCC
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,CCRCC
8,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC
11,AARS,0.415815,6.031667e-03,2.119475e-02,CCRCC
13,AARSD1,0.464768,4.388407e-04,2.263637e-03,CCRCC
16,AASS,-0.278696,1.150360e-06,1.030519e-05,CCRCC
18,ABAT,-0.612928,7.461635e-19,2.593633e-17,CCRCC
22,ABCA6,0.370021,7.012739e-04,3.389490e-03,CCRCC
24,ABCB1,0.072682,2.975122e-03,1.171726e-02,CCRCC


In [4]:
# Split the significant rows into one frame per cancer type

ccrcc_delta, hnscc_delta, endo_delta, lscc_delta, luad_delta = (
    delta_corr_sig.loc[delta_corr_sig['Cancer'] == cancer_label]
    for cancer_label in ('CCRCC', 'HNSCC', 'Endometrial', 'LSCC', 'LUAD')
)

In [5]:
# Display the significant HNSCC rows
hnscc_delta

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
19247,AADAC,-0.495044,6.258443e-05,5.740776e-04,HNSCC
19250,AAK1,0.614117,1.000902e-11,3.887029e-10,HNSCC
19255,AARS2,0.565982,3.124882e-05,3.164216e-04,HNSCC
19256,AARSD1,0.450818,7.373637e-03,2.865766e-02,HNSCC
19258,AASDHPPT,0.689065,3.945785e-11,1.390700e-09,HNSCC
19259,AASS,0.526886,9.033256e-24,1.090013e-21,HNSCC
19263,ABCA12,0.252147,1.419386e-03,7.891878e-03,HNSCC
19265,ABCA8,0.204571,6.673680e-04,4.196725e-03,HNSCC
19275,ABCC5,0.428235,5.499244e-04,3.570809e-03,HNSCC
19277,ABCD1,0.438082,7.068001e-05,6.345550e-04,HNSCC


In [6]:
# Order the significant HNSCC rows from lowest to highest FDR
sort_hnscc = hnscc_delta.sort_values('FDR')

In [7]:
# Gene names in the same order as the FDR-sorted frame
hnscc_genes = sort_hnscc['Gene'].tolist()
hnscc_genes

['MYO1B',
 'STAT1',
 'STAT2',
 'SULF1',
 'PTK7',
 'GBP1',
 'LAMC2',
 'NSUN2',
 'TYMP',
 'TPX2',
 'GBP5',
 'KIF2C',
 'DDX60',
 'MYO5A',
 'DDX58',
 'PLOD2',
 'UBE2L6',
 'RTKN',
 'KRT17',
 'ITGB6',
 'AFAP1L2',
 'RBP1',
 'ISG15',
 'NFIA',
 'TRIP13',
 'ANLN',
 'TNFRSF12A',
 'BST2',
 'APOL2',
 'OASL',
 'ABI3BP',
 'HMGA2',
 'ANKLE2',
 'PARP12',
 'PLEK2',
 'HAAO',
 'OAS2',
 'ITGA3',
 'PPFIA1',
 'SP100',
 'INHBA',
 'XAF1',
 'PLAU',
 'DHX36',
 'CDK6',
 'LOXL2',
 'MMP1',
 'SRPK2',
 'IGF2BP3',
 'IFIT3',
 'PLOD1',
 'APPL1',
 'KIF23',
 'MMP14',
 'POLR2H',
 'RAB7A',
 'TOP2A',
 'TFRC',
 'KYNU',
 'UBE2C',
 'IGF2BP2',
 'TSPAN7',
 'FBLIM1',
 'LRRC15',
 'SLC2A1',
 'OCLN',
 'IFI44',
 'PBX1',
 'MAMDC2',
 'FAT1',
 'SP110',
 'USP18',
 'PCM1',
 'KDELC1',
 'RRM2',
 'MFAP4',
 'SLC7A8',
 'DENND4C',
 'LUZP1',
 'AURKB',
 'HMGB3',
 'LPIN1',
 'APOBEC3G',
 'AASS',
 'EPS8',
 'SFRP4',
 'P3H2',
 'FST',
 'P4HA2',
 'GBP4',
 'CMTR1',
 'MISP',
 'PARP1',
 'IDO1',
 'DLGAP5',
 'RNMT',
 'TANC2',
 'XPO5',
 'OAS3',
 'MX2',
 'PRKAR

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
import cptac
# Presumably fetches the HNSCC data files so the dataset object can be built — confirm against cptac docs.
cptac.download('hnscc')
# Dataset object used below to pull the clinical table.
hnscc = cptac.Hnscc()

                                          

In [9]:
# Pull the clinical table from the HNSCC dataset object and display it
# (the rendered output is indexed by Patient_ID).
hnscc_clin = hnscc.get_clinical()
hnscc_clin

Name,Sample_Tumor_Normal,Cored_Sample,P16,age,alcohol_consum,clinic_staging_dist_metas,country,follow_up_days,follow_up_is_contact,follow_up_vital_status,...,smoke_age_start,smoke_age_stop,smoking_history,smoking_inferred_binary,smoking_second_hand,tumor_focality,tumor_necrosis,tumor_site_curated,tumor_site_original,tumor_size_cm
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00977,Tumor,False,Not Evaluated,56.0,Lifelong non-drinker,cM0,Russia,377.0,No,Living,...,,,"Current reformed smoker, years unknown",strong_evidence,Yes,Unifocal,Not identified,Oral cavity,Floor of mouth,1.2
C3L-00987,Tumor,False,Not Evaluated,61.0,"Consumed alcohol in the past, but currently a ...",cM0,Ukraine,429.0,Yes,Living,...,18,,Current smoker: Includes daily and non-daily s...,strong_evidence,Yes,Unifocal,Present,Oral cavity,Tongue,4.0
C3L-00994,Tumor,False,Not Evaluated,50.0,Alcohol consumption more than 2 drinks per day...,cM0,Ukraine,132.0,No,Deceased,...,16,50,Current reformed smoker within past 15 years,strong_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Present,Oral cavity,Tongue,3.0
C3L-00995,Tumor,False,Not Evaluated,56.0,Alcohol consumption more than 2 drinks per day...,cM0,Ukraine,-10.0,No,Deceased,...,25,56,"Current reformed smoker, more than 15 years",weak_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Not identified,Oral cavity,Buccal mucosa,4.0
C3L-00997,Tumor,False,Not Evaluated,47.0,Lifelong non-drinker,cM0,Ukraine,442.0,Yes,Living,...,12,27,"Current reformed smoker, more than 15 years",strong_evidence,Yes,Unifocal,Present,Oropharynx,Oropharynx,4.0
C3L-00999,Tumor,False,Not Evaluated,56.0,Lifelong non-drinker,cM0,Russia,363.0,No,Living,...,20,,Current smoker: Includes daily and non-daily s...,weak_evidence,Yes,Unifocal,Present,Oral cavity,Floor of mouth,2.2
C3L-01138,Tumor,False,Not Evaluated,62.0,Alcohol consumption equal to or less than 2 dr...,cM0,Ukraine,363.0,No,Living,...,8,,Current smoker: Includes daily and non-daily s...,strong_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Not identified,Larynx,Larynx,6.0
C3L-01237,Tumor,False,Not Evaluated,57.0,Alcohol consumption equal to or less than 2 dr...,cM0,Ukraine,20.0,Yes,Living,...,17,20,"Current reformed smoker, more than 15 years",strong_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Not identified,Oral cavity,Floor of mouth,4.0
C3L-02617,Tumor,False,Not Evaluated,64.0,Alcohol consumption more than 2 drinks per day...,Staging Incomplete,Bulgaria,1233.0,No,Living,...,20,,Current smoker: Includes daily and non-daily s...,strong_evidence,Yes,Unifocal,Present,Larynx,Larynx,6.0
C3L-02621,Tumor,False,Unknown,68.0,Alcohol consumption more than 2 drinks per day...,Staging Incomplete,Bulgaria,314.0,Yes,Living,...,38,,Current smoker: Includes daily and non-daily s...,strong_evidence,Yes,Multifocal,Present,Larynx,Larynx,0.6


In [10]:
# Display the significant HNSCC rows again for reference
hnscc_delta

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
19247,AADAC,-0.495044,6.258443e-05,5.740776e-04,HNSCC
19250,AAK1,0.614117,1.000902e-11,3.887029e-10,HNSCC
19255,AARS2,0.565982,3.124882e-05,3.164216e-04,HNSCC
19256,AARSD1,0.450818,7.373637e-03,2.865766e-02,HNSCC
19258,AASDHPPT,0.689065,3.945785e-11,1.390700e-09,HNSCC
19259,AASS,0.526886,9.033256e-24,1.090013e-21,HNSCC
19263,ABCA12,0.252147,1.419386e-03,7.891878e-03,HNSCC
19265,ABCA8,0.204571,6.673680e-04,4.196725e-03,HNSCC
19275,ABCC5,0.428235,5.499244e-04,3.570809e-03,HNSCC
19277,ABCD1,0.438082,7.068001e-05,6.345550e-04,HNSCC


## Calculate regression line for all patients for each tissue type for each gene

In [35]:
# Load the data for the five cancer types through the project helper.
# The result is indexed by cancer name further down (prot_trans["hnscc"]);
# presumably one table per cancer — confirm against pcprutils.
prot_trans = ut.load_prot_trans(["ccrcc", "endometrial", "hnscc", "lscc", "luad"])

                                          



                                                



                                         



In [68]:
def get_regression(corr_df, prot_trans_df):
    """Fit one regression line per (Tissue, Gene) group.

    Rows of ``prot_trans_df`` are restricted to the genes listed in
    ``corr_df["Gene"]`` and grouped by ``Tissue`` and ``Gene``. Each group is
    fitted with orthogonal distance regression (``scipy.odr``) using the
    linear model ``Proteomics = m * Transcriptomics + b``, starting from
    ``m = 1, b = 1``.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Frame with a ``Gene`` column naming the genes to fit.
    prot_trans_df : pandas.DataFrame
        Frame with ``Gene``, ``Tissue``, ``Transcriptomics`` and
        ``Proteomics`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (Tissue, Gene) group, ordered by the sorted group keys,
        with columns ``Tissue``, ``Gene``, ``m`` (slope) and ``b``
        (intercept). The frame is empty (but still has these four columns)
        when no gene matches. ``m`` and ``b`` are NaN for groups that have
        fewer than two complete observations.
    """
    value_cols = ["Transcriptomics", "Proteomics"]
    result_cols = ["Tissue", "Gene", "m", "b"]

    def line(beta, x):
        # beta[0] is the slope, beta[1] the intercept.
        return beta[0] * x + beta[1]

    # The model does not depend on the data, so build it once for all groups.
    line_model = scipy.odr.Model(line)

    def fit_line(group):
        # scipy.odr has no missing-value handling, so fit only the rows
        # where both measurements are present.
        complete = group[value_cols].dropna()
        if len(complete) < 2:
            # A line cannot be determined from fewer than two points.
            return float("nan"), float("nan")
        data = scipy.odr.Data(
            complete["Transcriptomics"].to_numpy(dtype=float),
            complete["Proteomics"].to_numpy(dtype=float),
        )
        fit = scipy.odr.ODR(data, line_model, beta0=[1, 1]).run()
        return fit.beta[0], fit.beta[1]

    sig_prot_trans = prot_trans_df[prot_trans_df["Gene"].isin(corr_df["Gene"])]

    # Iterate over the groups explicitly instead of relying on how
    # groupby.apply wraps array return values; this also handles the
    # no-matching-genes case cleanly.
    records = []
    for (tissue, gene), group in sig_prot_trans.groupby(["Tissue", "Gene"]):
        slope, intercept = fit_line(group)
        records.append({"Tissue": tissue, "Gene": gene, "m": slope, "b": intercept})

    return pd.DataFrame(records, columns=result_cols)

# Fit the per-tissue, per-gene regression lines for the significant HNSCC genes
hnscc_results = get_regression(corr_df=hnscc_delta, prot_trans_df=prot_trans["hnscc"])