# Step 3.2: Merge all cancer data frames and create csv files

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as p

In [2]:
# Directory that holds every CSV this notebook reads and writes. Later cells
# build each file path by appending a backslash-prefixed file name to it.
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'

Read in the single-cancer data frames with p-values and correlations (changes in median). Merge all dfs into one pancancer data frame.

In [3]:
# One p-value/median table per cancer, each stored as its own CSV under `root`.
# The file order below is the same as the order of the names assigned to.
pval_median_files = [
    R'\Gbm_pval_medians.csv',
    R'\Hnscc_pval_medians.csv',
    R'\Brca_pval_medians.csv',
    R'\Luad_pval_medians.csv',
    R'\Lscc_pval_medians.csv',
    R'\En_pval_medians.csv',
    R'\Ov_pval_medians.csv',
    R'\Colon_pval_medians.csv',
]
(g_merged, h_merged, b_merged, l_merged,
 ls_merged, e_merged, o_merged, c_merged) = [
    pd.read_csv(root + file_name) for file_name in pval_median_files
]

# Create csv with all data

In [4]:
# Outer-join every per-cancer table on the protein name, starting from Gbm, so
# a protein absent from a cancer is kept with missing values in that cancer's
# columns. A single chain avoids leaving six full-size intermediate frames
# (df1 ... df6) alive in the kernel.
all_df = (
    g_merged
    .merge(h_merged, on='Proteomics', how='outer')
    .merge(l_merged, on='Proteomics', how='outer')
    .merge(ls_merged, on='Proteomics', how='outer')
    .merge(b_merged, on='Proteomics', how='outer')
    .merge(o_merged, on='Proteomics', how='outer')
    .merge(e_merged, on='Proteomics', how='outer')
    .merge(c_merged, on='Proteomics', how='outer')
    # Strip a trailing "_proteomics" from any string value in the frame.
    .replace(to_replace = r'_proteomics$', value = '', regex = True)
)
all_df.head()


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value,Luad_Median,Lscc_P_Value,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
0,ARMH3,5.766739e-07,-0.405134,0.024047,-0.135941,0.222235,-0.37845,0.906857,-0.2797,,,,,0.034811,0.1503,,
1,CUTC,8.514758e-07,-0.553255,0.014672,-0.314307,0.056523,-1.01995,0.568489,-0.44585,0.349469,-0.4808,0.985704,-0.159424,0.330823,-0.2533,0.91753,-0.0821
2,CUL2,2.783477e-06,-0.586396,0.677585,0.017886,0.868772,0.05695,0.999848,0.04745,0.665878,0.17195,0.991659,0.033091,0.95797,0.0232,0.902073,-0.0769
3,PIP4K2A,2.783477e-06,-0.838882,0.038186,-0.224713,0.84773,0.04535,0.999848,0.2475,0.670623,-0.07435,0.985704,0.052955,0.502967,0.169,0.883481,0.10135
4,GDI2,2.783477e-06,-0.610188,0.8271,0.001559,,,0.999848,-0.03735,0.789708,0.14995,0.985704,-0.067389,0.28319,0.1286,0.999502,-0.0445


In [5]:
# Save the wide pancancer table; the row index is not written to the file.
all_proteins_csv = root+R'\all_pval_all_proteins.csv'
all_df.to_csv(all_proteins_csv, index=False)

# create csv with long data frame to use with HeatMap function

In [6]:
# Create long df for heat map

cancer = ['Gbm','Hnscc','Luad','Lscc','Brca','Ov','En','Colon']
merged_dfs = [g_merged,h_merged,l_merged,ls_merged,b_merged,o_merged,e_merged,c_merged]

# The two lists are paired by position; fail loudly if they ever drift apart
# instead of silently mislabelling or dropping a cancer.
assert len(cancer) == len(merged_dfs), "cancer names and data frames must pair up"

# For each cancer: tag every row with the cancer name and rename the
# cancer-specific columns to the shared names 'P_Value' and 'Medians'.
long_frames = [
    cancer_df
    .assign(Cancer = cancer_name)
    .rename(columns={cancer_name+'_P_Value': 'P_Value',
                     cancer_name+'_Median': 'Medians'})
    for cancer_name, cancer_df in zip(cancer, merged_dfs)
]

# Stack all frames in one call. DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop copies it on every iteration. As before, each
# frame keeps its own row labels (no ignore_index).
all_long_df = pd.concat(long_frames)

all_long_df = all_long_df.replace(to_replace = r'_proteomics', value = '', regex = True)
all_long_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
0,ARMH3,5.766739e-07,-0.405134,Gbm
1,CUTC,8.514758e-07,-0.553255,Gbm
2,CUL2,2.783477e-06,-0.586396,Gbm
3,PIP4K2A,2.783477e-06,-0.838882,Gbm
4,GDI2,2.783477e-06,-0.610188,Gbm
...,...,...,...,...
7733,ZNF397,,-0.438400,Colon
7734,ZNF414,,-0.096500,Colon
7735,ZNF48,,-0.019700,Colon
7736,ZNF587,,0.578000,Colon


In [7]:
# Save the long-format table; the row index is not written to the file.
all_heatmap_csv = root+R'\all_heatmap.csv'
all_long_df.to_csv(all_heatmap_csv, index=False)

# Long df with genes sig in at least 1 cancer

Read in list_sig_one_cancer.csv. Convert it to a list of genes significant in at least 1 cancer. Slice out the genes in that list from the long df with all data.

In [8]:
# Gene names are taken from the column labelled '0' of the CSV.
sig_list_csv = root+R'\list_sig_one_cancer.csv'
sig = pd.read_csv(sig_list_csv)
list_sig = sig['0'].tolist()

In [9]:
# Keep only the rows whose gene appears in list_sig
# (genes with at least one sig ttest).
bool_df = all_long_df.Proteomics.isin(list_sig)
sig_df = all_long_df.loc[bool_df]
# Number of distinct genes kept (dropna=False counts exactly like len(unique())).
print(sig_df['Proteomics'].nunique(dropna=False))
sig_df.head()

2627


Unnamed: 0,Proteomics,P_Value,Medians,Cancer
0,ARMH3,5.766739e-07,-0.405134,Gbm
1,CUTC,8.514758e-07,-0.553255,Gbm
2,CUL2,2.783477e-06,-0.586396,Gbm
3,PIP4K2A,2.783477e-06,-0.838882,Gbm
4,GDI2,2.783477e-06,-0.610188,Gbm


In [10]:
# Save the filtered long table; the row index is not written to the file.
sig_heatmap_csv = root+R'\sig_pval_heatmap.csv'
sig_df.to_csv(sig_heatmap_csv, index=False)

#  create csv with significant proteins in more than 1 cancer 

In [11]:
# Gene names are taken from the column labelled '0' of the CSV.
mult_list_csv = root+R'\list_sig_multiple_cancers.csv'
mult = pd.read_csv(mult_list_csv)
list_mult = mult['0'].tolist()
len(list_mult)

332

In [12]:
# Keep only the rows whose gene appears in list_mult
# (genes with more than one sig ttest).
bool_df2 = all_long_df.Proteomics.isin(list_mult)
mult_df = all_long_df.loc[bool_df2]

In [13]:
# Save the multi-cancer subset; the row index is not written to the file.
mult_heatmap_csv = root+R'\mult_sig_pval_heatmap.csv'
mult_df.to_csv(mult_heatmap_csv, index=False)

#  create csv that has proteins with pos and neg changes in median

In [14]:
def HasPosNeg(row, threshold=0.3):
    """Tell whether `row` contains both a value above and a value below a cut-off.

    Parameters
    ----------
    row : iterable of numbers
        Values to scan, e.g. one DataFrame row supplied by ``apply(axis=1)``.
        Entries for which ``pd.isnull`` is true are skipped. Every remaining
        entry is compared with a number, so non-numeric entries (such as
        strings) raise ``TypeError``.
    threshold : float, default 0.3
        Cut-off magnitude. A value counts as positive when it is strictly
        greater than ``threshold`` and as negative when it is strictly less
        than ``-threshold``.

    Returns
    -------
    bool
        True when at least one positive and at least one negative value were
        found, otherwise False (including for an empty or all-null row).
    """
    hasPos = False
    hasNeg = False
    for item in row:
        if pd.isnull(item):
            continue
        if item < -threshold:
            hasNeg = True
        if item > threshold:
            hasPos = True

    # Logical `and` on the two plain bool flags (the original used bitwise `&`).
    return hasPos and hasNeg


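Create a df with only the difference-in-median columns by dropping every p-value column from the merged pancancer data frame (all genes). Filtering to genes significant in at least one cancer happens in a later step.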

In [15]:
# Every per-cancer p-value column; removing them leaves 'Proteomics' and the
# per-cancer median columns.
p_value_cols = [
    'Gbm_P_Value', 'Hnscc_P_Value', 'Luad_P_Value', 'Lscc_P_Value',
    'Brca_P_Value', 'Ov_P_Value', 'En_P_Value', 'Colon_P_Value',
]
only_med = all_df.drop(p_value_cols, axis=1)
only_med

Unnamed: 0,Proteomics,Gbm_Median,Hnscc_Median,Luad_Median,Lscc_Median,Brca_Median,Ov_Median,En_Median,Colon_Median
0,ARMH3,-0.405134,-0.135941,-0.37845,-0.27970,,,0.1503,
1,CUTC,-0.553255,-0.314307,-1.01995,-0.44585,-0.48080,-0.159424,-0.2533,-0.08210
2,CUL2,-0.586396,0.017886,0.05695,0.04745,0.17195,0.033091,0.0232,-0.07690
3,PIP4K2A,-0.838882,-0.224713,0.04535,0.24750,-0.07435,0.052955,0.1690,0.10135
4,GDI2,-0.610188,0.001559,,-0.03735,0.14995,-0.067389,0.1286,-0.04450
...,...,...,...,...,...,...,...,...,...
15092,RETNLB,,,,,,,,3.17860
15093,SYCE3,,,,,,,,-0.18575
15094,TFF2,,,,,,,,-0.33000
15095,TRIM52,,,,,,,,-0.30270


Map the pos and neg differences in median with the HasPosNeg function. Slice out genes meeting the criteria from the long df formatted for the HeatMap function.

In [16]:
# HasPosNeg compares every value in a row with a number, so the gene names are
# moved out of the columns and into the index first (no str left in the rows).
# The result gets its own name instead of overwriting only_med: overwriting made
# this cell fail with a KeyError on a second run, because 'Proteomics' was no
# longer a column.
med_by_gene = only_med.set_index('Proteomics')
has_pos_neg = med_by_gene.apply(HasPosNeg, axis = 1)
pn = med_by_gene.assign(Pos_Neg = has_pos_neg).loc[has_pos_neg]
pn_genes = list(pn.index) # list of genes that have pos and neg

# Slice
get = sig_df.Proteomics.isin(pn_genes)
pos_neg_df = sig_df[get] # Keep genes with pos and neg
pos_neg_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
7,PACS2,0.000015,-0.386279,Gbm
8,DOCK5,0.000033,-0.862819,Gbm
13,PDZD8,0.000177,-0.444260,Gbm
16,HAGH,0.000203,-0.601619,Gbm
18,DCTN4,0.000250,-0.362140,Gbm
...,...,...,...,...
7643,SPPL2B,,0.269000,Colon
7703,UBE2S,,-0.275500,Colon
7704,UBE2T,,-0.368400,Colon
7725,ZC3H8,,0.105200,Colon


In [17]:
# Save the list of genes flagged as having both pos and neg changes,
# one gene per row, without the row index.
pos_neg = pd.Series(pn_genes)
pos_neg_list_csv = root+R'\list_pos_neg.csv'
pos_neg.to_csv(pos_neg_list_csv, index=False)

In [18]:
# Save the pos/neg subset of the long table; the row index is not written.
pos_neg_csv = root+R'\pos_neg_df.csv'
pos_neg_df.to_csv(pos_neg_csv, index=False)

# Sig in at least 1 cancer and pos neg correlation

In [19]:
# Slice: rows of sig_df whose gene is in pn_genes, then count distinct genes.
get = sig_df['Proteomics'].isin(pn_genes)
pos_neg_df = sig_df.loc[get] # Keep genes with pos and neg
pos_neg_df['Proteomics'].nunique(dropna=False)

774

# Sig in multiple cancers and pos neg correlation

In [20]:
# Slice: rows of mult_df whose gene is in pn_genes, then count distinct genes.
# Distinct names are used so that pos_neg_df (the sig_df-based subset written
# to pos_neg_df.csv) is not silently replaced by this different subset if an
# earlier cell is re-run afterwards.
get_mult = mult_df.Proteomics.isin(pn_genes)
mult_pos_neg_df = mult_df[get_mult] # Keep genes with pos and neg
len(mult_pos_neg_df.Proteomics.unique())

113