# Pan Cancer EGFR 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
#sys.path.append('C:\\Users\\brittany henderson\\GitHub\\GBM_for_CPTAC\\')
#import cis_functions as f

import cptac
import cptac.utils as u

In [131]:
#function takes df with proteomic and CNV data, a list of proteomic genes, and the CNV gene
#Creates a new df to be filled by function. Performs linear regression for each proteomic gene 

def wrap_lin_regression(prot_CNV_df, prot_genes, gene):
   
    newdf = pd.DataFrame(columns=['Interacting_gene', 'R_squared', 'P_value'])
    #loop through genes in proteomic_genes
    for inter_gene in prot_genes:
        interacting_gene = inter_gene +"_proteomics"
        #create subset df with interacting gene/ gene (otherwise drop NaN drops everything)
        prot_CNV_df_subset = prot_CNV_df[[interacting_gene,gene +"_CNV"]]
        #do a linear regression to see if it's a meaningful association
        #dropna will remove rows with nan
        prot_CNV_df_subset = prot_CNV_df_subset.dropna(axis=0, how="any")
        count_row = prot_CNV_df_subset.shape[0]
        if count_row > 0:
            x1 = prot_CNV_df_subset[[gene+"_CNV"]].values
            y1 = prot_CNV_df_subset[[interacting_gene]].values
            x1 = x1[:,0]
            y1 = y1[:,0]

            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x1,y1)

            # only add to df if p value is significant 
            if p_value < 0.05:
                newdf = newdf.append({'Interacting_gene': interacting_gene, 'R_squared': str(r_value**2), 'P_value':  str(p_value)}, ignore_index=True)

    return (newdf)
       

# GBM

In [2]:
#load GBM data 
brain = cptac.Gbm()


                                    

In [167]:
#create list of all proteomic genes 
prot_df = brain.get_proteomics()
prot_genes_list = prot_df.columns.values.tolist()


In [168]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = brain.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")


(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))




Unnamed: 0,Interacting_gene,R_squared,P_value
0,A1BG_proteomics,0.0987682479867272,0.0016243853962636407
1,A2M_proteomics,0.14797661416476227,9.197068139740165e-05
2,AAAS_proteomics,0.06007420273502935,0.014996482868120092
3,AADAT_proteomics,0.1028424615520735,0.0035130656472672495
4,AARS2_proteomics,0.06779346759617069,0.009617086717582228
...,...,...,...
2862,ZSCAN21_proteomics,0.10449565521976194,0.0020022011170153326
2863,ZSCAN31_proteomics,0.1533189736509552,0.016548520635900944
2864,ZSWIM8_proteomics,0.06637478859621035,0.010434063121886682
2865,ZXDC_proteomics,0.0485616004020721,0.03797427361405626


# Kidney 

In [95]:
kidney = cptac.Ccrcc()


                                    

In [137]:
#create list of all proteomic genes 
prot_df = kidney.get_proteomics()
prot_df.columns = prot_df.columns.droplevel(1)
prot_genes_list = prot_df.columns.values.tolist()


In [138]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = kidney.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))





Unnamed: 0,Interacting_gene,R_squared,P_value
0,A4GALT_proteomics,0.886580679526646,0.016799162027927916
1,AASS_proteomics,0.07369657905069867,0.004120746113253676
2,AATF_proteomics,0.05255212297262058,0.015995262745602572
3,ABCA5_proteomics,0.36630168035365773,0.03704613993970029
4,ABCB1_proteomics,0.03757430343295188,0.042449652812760925
...,...,...,...
1135,ZNRD1_proteomics,0.07419587620127432,0.018067486429113825
1136,ZNRF2_proteomics,0.2827009858381019,2.2743944811637643e-09
1137,ZSCAN21_proteomics,0.3294283168320642,1.065933312739768e-05
1138,ZXDC_proteomics,0.06658908751510431,0.027510919099169716


# Endometrial 

In [140]:
en = cptac.Endometrial()

                                    

In [142]:
#create list of all proteomic genes 
prot_df = en.get_proteomics()

prot_genes_list = prot_df.columns.values.tolist()


In [144]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = en.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))



Unnamed: 0,Interacting_gene,R_squared,P_value
0,AAAS_proteomics,0.04375827345739995,0.04190556261692782
1,AAMP_proteomics,0.04600841627131807,0.03685833020154935
2,ABCB8_proteomics,0.053661572591688016,0.02389770292185078
3,ABCF2_proteomics,0.08039906944943098,0.005361337771322349
4,ABHD2_proteomics,0.36341886818427166,0.0005379911493378324
...,...,...,...
866,ZNF691_proteomics,0.07844028544994366,0.00597939786479835
867,ZNF837_proteomics,0.14807979987751937,0.02462635144101073
868,ZNHIT2_proteomics,0.07235123211258064,0.008394673802051833
869,ZNRF2_proteomics,0.15480109873996825,7.999712422282647e-05


# Ovarian 

In [145]:
Ovar = cptac.Ovarian()

                                    

In [149]:
#create list of all proteomic genes 
prot_df = Ovar.get_proteomics()
prot_df.columns = prot_df.columns.droplevel(1)
prot_genes_list = prot_df.columns.values.tolist()

In [151]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = Ovar.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))



Unnamed: 0,Interacting_gene,R_squared,P_value
0,A2M_proteomics,0.04853789822811085,0.048114470473907794
1,A2ML1_proteomics,0.12685702170696989,0.005220926873768543
2,AARS_proteomics,0.056980712989550424,0.03186551639710454
3,ABAT_proteomics,0.08446679522382952,0.00848573047475076
4,ABCA9_proteomics,0.1275934685630436,0.04852768121126485
...,...,...,...
740,ZNF316_proteomics,0.10546508556452973,0.0030966565531918536
741,ZNF592_proteomics,0.04951738017453731,0.045856924215800295
742,ZNF644_proteomics,0.05803982208227418,0.03604481602562296
743,ZNF865_proteomics,0.11263563833077958,0.012248019269928995


# Colon

In [None]:
colon = cptac.Colon()

In [None]:
colon.list_data()

colon does not have cnv data -

# Brca 

In [152]:
brca = cptac.Brca()

                                    

In [154]:
#create list of all proteomic genes 
prot_df = brca.get_proteomics()
prot_df.columns = prot_df.columns.droplevel(1)
prot_genes_list = prot_df.columns.values.tolist()

In [156]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = brca.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))

Unnamed: 0,Interacting_gene,R_squared,P_value
0,ABCA2_proteomics,0.055624024611105806,0.043080339589750785
1,ABCC3_proteomics,0.043321920864427294,0.02626816204114852
2,ABCD3_proteomics,0.051368331145667745,0.01206041825177043
3,ABCE1_proteomics,0.04171208443619836,0.024041201027069692
4,ABCF2_proteomics,0.04390486766274212,0.0205397798926616
...,...,...,...
1196,ZNF740_proteomics,0.07566226783768767,0.002167095280633629
1197,ZNHIT3_proteomics,0.04471291269352962,0.036606223539851046
1198,ZPR1_proteomics,0.05821040949821707,0.007425548068685662
1199,ZXDC_proteomics,0.05711642091211942,0.01839642384855393


# LUAD

In [157]:
luad = cptac.Luad()

                                    

In [160]:
#create list of all proteomic genes 
prot_df = luad.get_proteomics()
prot_df.columns = prot_df.columns.droplevel(1)
prot_genes_list = prot_df.columns.values.tolist()

In [161]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = luad.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))



Unnamed: 0,Interacting_gene,R_squared,P_value
0,AASS_proteomics,0.04533330526765004,0.02553265980063949
1,AATK_proteomics,0.08498519733345838,0.025076804307106505
2,ABCA3_proteomics,0.055893969804587086,0.012896441100305847
3,ABCA7_proteomics,0.061665697882585246,0.03678709988921711
4,ABCC1_proteomics,0.03943312618797753,0.03755598823575713
...,...,...,...
1346,ZNRF2_proteomics,0.1305398743986313,0.00010534688534850004
1347,ZRANB2_proteomics,0.06434327398420953,0.007497557533837104
1348,ZSCAN18_proteomics,0.039525833292387666,0.03732774546894278
1349,ZW10_proteomics,0.04810693262590555,0.02132315489021154


# HNSCC

In [162]:
Hnscc = cptac.Hnscc()

                                    

In [164]:
#create list of all proteomic genes 
prot_df = Hnscc.get_proteomics()
prot_genes_list = prot_df.columns.values.tolist()

In [165]:
#Create df or proteomic and CNV data. 
#Run linear regression function on all proteins. 
prot_and_CNV = Hnscc.join_omics_to_omics(df1_name="proteomics", df2_name="CNV")
(wrap_lin_regression(prot_and_CNV,prot_genes_list,"EGFR"))

  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0,Interacting_gene,R_squared,P_value
0,ABCA12_proteomics,1.0,0.0
1,ABHD14A-ACY1;ACY1_proteomics,0.042654226061398265,0.030407165679960716
2,ABI3_proteomics,0.0713161675659758,0.006661589430560841
3,ABR_proteomics,0.06774916791274525,0.006028467205486407
4,ACACA_proteomics,0.08026420513965155,0.0027075066408910006
...,...,...,...
927,ZFAND5_proteomics,0.04960829851126563,0.01991875732257156
928,ZMYM6_proteomics,0.25469791367426686,0.02754623524945694
929,ZNF106_proteomics,0.1570932438802146,0.024719099954447963
930,ZNF629_proteomics,0.49746512733946496,0.03379592847602158
