## Getting the genes that have a significant delta correlation (FDR < 0.05) for each cancer
### Do we want to use all of the significant genes (approx. 5,000 genes for each cancer) or just the most significant ones (e.g., the 50 lowest p-values)?

In [13]:
import pandas as pd

In [14]:
# Load the delta-correlation table from CSV; the displayed columns include
# Gene, Delta_Correlation, P_Value, FDR and Cancer.
delta_correlation_csv = '~/PayneLab/pancancer/pancancerProteinMRNA/notebook_steps_Spearman/data/delta_correlation_df.csv'
delta_correlation_df = pd.read_csv(delta_correlation_csv)
delta_correlation_df

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
0,A1BG,-0.268533,5.703182e-02,1.320375e-01,CCRCC
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
2,A2M,-0.191619,1.277644e-01,2.439276e-01,CCRCC
3,AAAS,0.019654,8.963138e-01,9.409267e-01,CCRCC
4,AACS,-0.169937,6.007042e-02,1.375402e-01,CCRCC
...,...,...,...,...,...
50303,ZXDC,0.073854,6.395306e-01,6.962162e-01,LUAD
50304,ZYG11B,0.734569,1.583743e-10,1.504331e-09,LUAD
50305,ZYX,0.440861,2.253307e-06,9.499585e-06,LUAD
50306,ZZEF1,0.422001,3.731185e-11,3.889196e-10,LUAD


In [15]:
# Keep only the rows whose FDR value is below 0.05 (the significance cutoff
# used for the delta correlations in this notebook).

fdr_below_cutoff = delta_correlation_df['FDR'].lt(0.05)
delta_corr_sig = delta_correlation_df[fdr_below_cutoff]
delta_corr_sig

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
1,A1CF,0.192038,1.063340e-04,6.401858e-04,CCRCC
5,AADAT,-0.263372,1.531939e-02,4.572896e-02,CCRCC
7,AAGAB,0.364999,7.633656e-04,3.646252e-03,CCRCC
8,AAK1,0.356932,6.347464e-14,1.385608e-12,CCRCC
11,AARS,0.415815,6.031667e-03,2.119475e-02,CCRCC
...,...,...,...,...,...
50302,ZWINT,0.837582,2.770355e-08,1.722312e-07,LUAD
50304,ZYG11B,0.734569,1.583743e-10,1.504331e-09,LUAD
50305,ZYX,0.440861,2.253307e-06,9.499585e-06,LUAD
50306,ZZEF1,0.422001,3.731185e-11,3.889196e-10,LUAD


In [16]:
# Split the significant rows into one table per cancer type.

def _rows_for_cancer(label):
    """Return the rows of ``delta_corr_sig`` whose 'Cancer' value equals ``label``."""
    return delta_corr_sig[delta_corr_sig['Cancer'] == label]

ccrcc_delta = _rows_for_cancer('CCRCC')
hnscc_delta = _rows_for_cancer('HNSCC')
endo_delta = _rows_for_cancer('Endometrial')
lscc_delta = _rows_for_cancer('LSCC')
luad_delta = _rows_for_cancer('LUAD')

In [17]:
# Display the significant (FDR < 0.05) rows for the 'HNSCC' cancer label.
hnscc_delta

Unnamed: 0,Gene,Delta_Correlation,P_Value,FDR,Cancer
19247,AADAC,-0.495044,6.258443e-05,5.740776e-04,HNSCC
19250,AAK1,0.614117,1.000902e-11,3.887029e-10,HNSCC
19255,AARS2,0.565982,3.124882e-05,3.164216e-04,HNSCC
19256,AARSD1,0.450818,7.373637e-03,2.865766e-02,HNSCC
19258,AASDHPPT,0.689065,3.945785e-11,1.390700e-09,HNSCC
...,...,...,...,...,...
29369,ZSWIM8,0.502199,4.125039e-03,1.822641e-02,HNSCC
29370,ZW10,0.514764,7.248112e-04,4.498890e-03,HNSCC
29372,ZWINT,0.925693,2.668810e-05,2.757498e-04,HNSCC
29373,ZXDC,0.503974,1.021347e-02,3.715855e-02,HNSCC


In [18]:
# Order the significant HNSCC rows from the smallest FDR to the largest.
sort_hnscc = hnscc_delta.sort_values('FDR', ascending=True)

In [19]:
# Gene names as a plain Python list, kept in the FDR-sorted row order.
hnscc_genes = sort_hnscc.loc[:, 'Gene'].tolist()
hnscc_genes

['MYO1B',
 'STAT1',
 'STAT2',
 'SULF1',
 'PTK7',
 'GBP1',
 'LAMC2',
 'NSUN2',
 'TYMP',
 'TPX2',
 'GBP5',
 'KIF2C',
 'DDX60',
 'MYO5A',
 'DDX58',
 'PLOD2',
 'UBE2L6',
 'RTKN',
 'KRT17',
 'ITGB6',
 'AFAP1L2',
 'RBP1',
 'ISG15',
 'NFIA',
 'TRIP13',
 'ANLN',
 'TNFRSF12A',
 'BST2',
 'APOL2',
 'OASL',
 'ABI3BP',
 'HMGA2',
 'ANKLE2',
 'PARP12',
 'PLEK2',
 'HAAO',
 'OAS2',
 'ITGA3',
 'PPFIA1',
 'SP100',
 'INHBA',
 'XAF1',
 'PLAU',
 'DHX36',
 'CDK6',
 'LOXL2',
 'MMP1',
 'SRPK2',
 'IGF2BP3',
 'IFIT3',
 'PLOD1',
 'APPL1',
 'KIF23',
 'MMP14',
 'POLR2H',
 'RAB7A',
 'TOP2A',
 'TFRC',
 'KYNU',
 'UBE2C',
 'IGF2BP2',
 'TSPAN7',
 'FBLIM1',
 'LRRC15',
 'SLC2A1',
 'OCLN',
 'IFI44',
 'PBX1',
 'MAMDC2',
 'FAT1',
 'SP110',
 'USP18',
 'PCM1',
 'KDELC1',
 'RRM2',
 'MFAP4',
 'SLC7A8',
 'DENND4C',
 'LUZP1',
 'AURKB',
 'HMGB3',
 'LPIN1',
 'APOBEC3G',
 'AASS',
 'EPS8',
 'SFRP4',
 'P3H2',
 'FST',
 'P4HA2',
 'GBP4',
 'CMTR1',
 'MISP',
 'PARP1',
 'IDO1',
 'DLGAP5',
 'RNMT',
 'TANC2',
 'XPO5',
 'OAS3',
 'MX2',
 'PRKAR

In [20]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible up front.
import cptac
# Presumably fetches (or updates) the local copy of the HNSCC dataset before
# it is loaded -- confirm against the cptac documentation for the installed version.
cptac.download('hnscc')
# Dataset object used by the cells below to pull HNSCC tables.
hnscc = cptac.Hnscc()

                                          

In [21]:
# Pull the clinical table from the HNSCC dataset object and display it.
# The rendered output is indexed by Patient_ID and includes both Tumor and
# Normal samples (see the Sample_Tumor_Normal column).
hnscc_clin = hnscc.get_clinical()
hnscc_clin

Name,Sample_Tumor_Normal,Cored_Sample,P16,age,alcohol_consum,clinic_staging_dist_metas,country,follow_up_days,follow_up_is_contact,follow_up_vital_status,...,smoke_age_start,smoke_age_stop,smoking_history,smoking_inferred_binary,smoking_second_hand,tumor_focality,tumor_necrosis,tumor_site_curated,tumor_site_original,tumor_size_cm
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00977,Tumor,False,Not Evaluated,56.0,Lifelong non-drinker,cM0,Russia,377.0,No,Living,...,,,"Current reformed smoker, years unknown",strong_evidence,Yes,Unifocal,Not identified,Oral cavity,Floor of mouth,1.2
C3L-00987,Tumor,False,Not Evaluated,61.0,"Consumed alcohol in the past, but currently a ...",cM0,Ukraine,429.0,Yes,Living,...,18,,Current smoker: Includes daily and non-daily s...,strong_evidence,Yes,Unifocal,Present,Oral cavity,Tongue,4.0
C3L-00994,Tumor,False,Not Evaluated,50.0,Alcohol consumption more than 2 drinks per day...,cM0,Ukraine,132.0,No,Deceased,...,16,50,Current reformed smoker within past 15 years,strong_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Present,Oral cavity,Tongue,3.0
C3L-00995,Tumor,False,Not Evaluated,56.0,Alcohol consumption more than 2 drinks per day...,cM0,Ukraine,-10.0,No,Deceased,...,25,56,"Current reformed smoker, more than 15 years",weak_evidence,Exposure to secondhand smoke history not avail...,Unifocal,Not identified,Oral cavity,Buccal mucosa,4.0
C3L-00997,Tumor,False,Not Evaluated,47.0,Lifelong non-drinker,cM0,Ukraine,442.0,Yes,Living,...,12,27,"Current reformed smoker, more than 15 years",strong_evidence,Yes,Unifocal,Present,Oropharynx,Oropharynx,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04276.N,Normal,False,,,,,,,,,...,,,,,,,,,,
C3N-04277.N,Normal,False,,,,,,,,,...,,,,,,,,,,
C3N-04278.N,Normal,False,,,,,,,,,...,,,,,,,,,,
C3N-04279.N,Normal,False,,,,,,,,,...,,,,,,,,,,
