# Exploring Combined Data frames

This data frame is looking at the large data frames that were created by recording the FDR significant pearson correlation between EGFR and all proteins. The perpose of this notebook is to find genes that either have different significant correlations (pos/neg) or have the same correlation, but only in some cancers.  

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


In [2]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < -0.3:
            hasNeg = True
        if item > 0.3:
            hasPos = True
            
    if hasPos & hasNeg:
        return True
    return False

def Pvalue_sig(row):
    numSig = 0

    for item in row:
        if pd.isnull(item):
            continue
        if item < 0.05:
            numSig += 1
            
    return numSig

def CountPosNeg(row):
    hasPos = False
    hasNeg= False
    counter = 0
    for item in row:
        if pd.isnull(item):
            continue
        if item < -0.3:
            hasNeg = True
            counter += 1
        if item > 0.3:
            hasPos = True
            counter += 1
    return counter

Read in the wide version of the data frame (FDR all proteins) then set comparison to be the index. Then filter the data frame to only hae the correlation columns  

In [3]:
prot_FDR = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_all_FDR_wide_pvals.csv")
prot_FDR = prot_FDR.drop(['Unnamed: 0'], axis=1)
prot_FDR = prot_FDR.set_index('Comparison')
prot_FDR = prot_FDR.drop(['EGFR_proteomics'], axis=0)
prot_FDR

Unnamed: 0_level_0,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_hnscc,P_value_hnscc,Correlation_colon,P_value_colon,Correlation_Lscc,P_value_Lscc
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
PHLDA1_proteomics,0.816848,3.507071e-21,0.254436,0.060261,,,0.364797,0.002164,0.260110,0.074530,0.664271,8.888640e-12,0.386104,0.122847,0.713420,2.644826e-14
GRB2_proteomics,-0.610889,6.729990e-08,-0.217427,0.120342,-0.190090,0.346111,-0.177379,0.142733,-0.302439,0.020631,-0.532341,3.320092e-06,0.150960,0.347409,-0.198042,2.437176e-01
SOCS2_proteomics,0.562720,3.420388e-06,,,,,,,,,0.020297,9.557300e-01,,,0.472624,1.417921e-02
CDH4_proteomics,0.559180,3.420388e-06,0.148407,0.513490,,,,,,,,,,,,
DAB2_proteomics,-0.556402,3.420388e-06,-0.076173,0.673774,0.076981,0.750510,0.326055,0.003543,-0.086403,0.597546,-0.208437,1.490980e-01,-0.147519,0.360266,-0.072496,7.501117e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF283_proteomics,,,,,,,,,,,,,,,0.002610,9.941900e-01
TRIM39_proteomics,,,,,,,,,,,,,,,-0.002379,9.945224e-01
ADGRA3_proteomics,,,,,,,,,,,,,,,0.002228,9.958381e-01
CEP57L1_proteomics,,,,,,,,,,,,,,,-0.001102,9.977544e-01


# Positive and Negative Correlations 

Filter data frame to only have correlation columns. Then use HasPosNeg function to add column with T/F values to record whether or not a gene has both positive or negavtive correlations. 

In [4]:
col = ["Correlation_Gbm","Correlation_kidney","Correlation_Ovar","Correlation_Brca","Correlation_Luad","Correlation_hnscc","Correlation_colon","Correlation_Lscc"]
FDR_corr = prot_FDR[col]
FDR_corr["Pos_Neg"] = FDR_corr.apply(HasPosNeg, axis = 1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Filter data frame to have only proteins that have both pos and neg correlations. Then save file as csv file to be used as enrichment anaylsis notebook

In [5]:
FDR_corr_True = FDR_corr[FDR_corr['Pos_Neg']==True]
#FDR_corr_True.to_csv("csv_files/pancan_EGFR_all_pos_neg_FDR.csv")
FDR_corr_True

Unnamed: 0_level_0,Correlation_Gbm,Correlation_kidney,Correlation_Ovar,Correlation_Brca,Correlation_Luad,Correlation_hnscc,Correlation_colon,Correlation_Lscc,Pos_Neg
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DAB2_proteomics,-0.556402,-0.076173,0.076981,0.326055,-0.086403,-0.208437,-0.147519,-0.072496,True
CTSC_proteomics,-0.546285,-0.302316,-0.072453,0.266940,0.302760,-0.052297,0.193621,-0.037491,True
SCPEP1_proteomics,-0.531494,-0.386583,-0.021208,0.399187,-0.028282,0.184997,0.108437,0.122729,True
FAM129B_proteomics,-0.514984,0.016684,0.139030,0.344093,0.188913,0.211698,0.146329,0.360092,True
PPP1R18_proteomics,-0.497202,-0.116967,-0.039837,0.359142,-0.083955,-0.171855,0.192111,-0.058249,True
...,...,...,...,...,...,...,...,...,...
LNX2_proteomics,,,,0.420320,0.062128,-0.302918,,0.329286,True
HCN3_proteomics,,,,-0.368144,0.076930,,,0.449777,True
CRYBG2_proteomics,,,,,-0.342857,0.176104,,0.437455,True
NTS_proteomics,,,,,-0.318231,-0.121640,,0.300911,True


# Positive or Negative 

 Filter data frame to include only proteins where Pos_Neg column is false. Use CountPosNeg to count the number of columns that have correlations. 

In [6]:
FDR_corr_False = FDR_corr[FDR_corr['Pos_Neg']==False]
FDR_corr_False["Num_corr"] = FDR_corr_False.apply(CountPosNeg, axis = 1)
FDR_corr_False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Correlation_Gbm,Correlation_kidney,Correlation_Ovar,Correlation_Brca,Correlation_Luad,Correlation_hnscc,Correlation_colon,Correlation_Lscc,Pos_Neg,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PHLDA1_proteomics,0.816848,0.254436,,0.364797,0.260110,0.664271,0.386104,0.713420,False,5
GRB2_proteomics,-0.610889,-0.217427,-0.190090,-0.177379,-0.302439,-0.532341,0.150960,-0.198042,False,3
SOCS2_proteomics,0.562720,,,,,0.020297,,0.472624,False,2
CDH4_proteomics,0.559180,0.148407,,,,,,,False,1
PLA2G15_proteomics,-0.556624,-0.298029,-0.035395,0.274185,-0.182930,-0.089313,,-0.147438,False,1
...,...,...,...,...,...,...,...,...,...,...
ZNF283_proteomics,,,,,,,,0.002610,False,0
TRIM39_proteomics,,,,,,,,-0.002379,False,0
ADGRA3_proteomics,,,,,,,,0.002228,False,0
CEP57L1_proteomics,,,,,,,,-0.001102,False,0


Sort Data frame with the Num_corr values descending. The filter out proteins that only had 0 or 1 correlation. Save file as csv to be used in enrichment anaylsis notebook 

In [7]:
FDR_corr_False.sort_values(by=['Num_corr'],ascending=False)
FDR_corr_False = FDR_corr_False[FDR_corr_False['Num_corr'].isin([2,3,4,5,6])]
FDR_corr_False

Unnamed: 0_level_0,Correlation_Gbm,Correlation_kidney,Correlation_Ovar,Correlation_Brca,Correlation_Luad,Correlation_hnscc,Correlation_colon,Correlation_Lscc,Pos_Neg,Num_corr
Comparison,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PHLDA1_proteomics,0.816848,0.254436,,0.364797,0.260110,0.664271,0.386104,0.713420,False,5
GRB2_proteomics,-0.610889,-0.217427,-0.190090,-0.177379,-0.302439,-0.532341,0.150960,-0.198042,False,3
SOCS2_proteomics,0.562720,,,,,0.020297,,0.472624,False,2
PHLDA3_proteomics,0.525883,0.432530,-0.264481,0.185124,0.231789,0.763784,0.096883,0.682034,False,4
WAS_proteomics,-0.501918,-0.279814,-0.090413,0.153496,-0.337909,-0.503427,0.282401,-0.182781,False,3
...,...,...,...,...,...,...,...,...,...,...
CCDC158_proteomics,,,,,0.321006,-0.251365,,0.463473,False,2
BHLHA15_proteomics,,,,,-0.197812,-0.338481,,-0.366410,False,2
WNT7B_proteomics,,,,,-0.137353,0.331449,,0.372114,False,2
B3GNT8_proteomics,,,,,0.082775,0.303048,,0.360980,False,2


In [8]:
FDR_corr_False.to_csv("csv_files/pancan_EGFR_all_FDR_atleast2_same_corr.csv")  

In [9]:
gbm = FDR_corr[["Correlation_Gbm"]]
len(gbm)

14867

In [10]:
gbm = gbm[gbm > 0].dropna()

In [11]:
gbm.to_csv("csv_files/pancan_EGFR_Gbm_pos.csv")

In [12]:
gbm

Unnamed: 0_level_0,Correlation_Gbm
Comparison,Unnamed: 1_level_1
PHLDA1_proteomics,0.816848
SOCS2_proteomics,0.562720
CDH4_proteomics,0.559180
CKB_proteomics,0.544246
ARNT2_proteomics,0.542079
...,...
BBS2_proteomics,0.000036
ZNF598_proteomics,0.000083
RCBTB1_proteomics,0.000109
PRPF39_proteomics,0.000123
