# Step 3.2: Merge all cancer data frames and create csv files

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu

In [2]:
# Folder holding the CSV files this notebook reads and writes. The path uses
# Windows-style backslashes, and the cells below build file paths by string
# concatenation (root + R'\name.csv'), so `root` has to stay a plain str.
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'

Read in the single-cancer data frames with p-values and correlations (changes in median). Merge all dfs into one pancancer data frame.

In [3]:
# Load the per-cancer tables; each file lives in `root` and is named
# '<cancer>_pval_medians.csv'. The tag order below matches the order of the
# variables being unpacked.
(g_merged, h_merged, b_merged, l_merged,
 ls_merged, e_merged, o_merged, c_merged) = [
    pd.read_csv(root + '\\' + cancer_tag + R'_pval_medians.csv')
    for cancer_tag in ('gbm', 'hnscc', 'brca', 'luad',
                       'lscc', 'endo', 'ov', 'colon')
]

# Create csv with all data

In [4]:
# Outer-merge the per-cancer tables on 'Proteomics' so that a protein present
# in any table gets a row; values for tables lacking that protein become NaN.
# Folding the frames in this fixed order (starting from g_merged) replaces the
# throwaway df1..df6 intermediates and determines the column order of all_df.
all_df = g_merged
for cancer_df in (h_merged, l_merged, ls_merged, b_merged,
                  o_merged, e_merged, c_merged):
    all_df = all_df.merge(cancer_df, on='Proteomics', how='outer')

# Strip a trailing '_proteomics' suffix from string values.
all_df = all_df.replace(to_replace=r'_proteomics$', value='', regex=True)
all_df.head()


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value,Luad_Median,Lscc_P_Value,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
0,ARMH3,5.396032e-11,-0.405134,0.000558,-0.135941,0.017908,-0.37845,0.000252,-0.42085,,,,,0.001113,-0.1503,,
1,CUTC,1.59348e-10,-0.553255,0.000189,-0.314307,0.000744,-1.01995,0.000841,-0.5571,0.008295,-0.4808,0.060164,-0.159424,0.082868,0.2533,0.411506,-0.0821
2,PIP4K2A,1.009419e-09,-0.838882,0.001544,-0.224713,0.560996,0.04535,0.020074,-0.00965,0.203952,-0.07435,0.389187,0.052955,0.201771,-0.169,0.199847,0.10135
3,CUL2,1.122076e-09,-0.586396,0.400554,0.017886,0.612583,0.05695,0.001998,-0.36965,0.198043,0.17195,0.786184,0.033091,0.864381,-0.0232,0.349065,-0.0769
4,GDI2,1.302273e-09,-0.610188,0.630985,0.001559,,,0.013276,-0.4358,0.354712,0.14995,0.569417,-0.067389,0.06166,-0.1286,0.990512,-0.0445


In [5]:
# Save the wide merged table; the row index is not written.
all_csv_path = root + R'\all_pval_all_proteins.csv'
all_df.to_csv(all_csv_path, index=False)

# create csv with long data frame to use with HeatMap function

In [6]:
# Build the long-format table for the heat map: the per-cancer frames are
# stacked on top of each other with shared 'P_Value' / 'Medians' columns and a
# 'Cancer' column recording which frame each row came from.

cancer = ['Gbm','Hnscc','Luad','Lscc','Brca','Ov','En','Colon']
merged_dfs = [g_merged,h_merged,l_merged,ls_merged,b_merged,o_merged,e_merged,c_merged]

# Pair each label with its frame via zip (no manual index counter), tag the
# frame with the label and drop the label prefix from its column names.
long_frames = [
    cancer_df.assign(Cancer=label).rename(
        columns={label + '_P_Value': 'P_Value', label + '_Median': 'Medians'})
    for label, cancer_df in zip(cancer, merged_dfs)
]

# A single pd.concat replaces the repeated DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0). As with append, each frame
# keeps its own row index, so index values repeat across cancers.
all_long_df = pd.concat(long_frames)

# Strip a trailing '_proteomics' suffix from string values.
all_long_df = all_long_df.replace(to_replace=r'_proteomics$', value='', regex=True)
all_long_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
0,ARMH3,5.396032e-11,-0.405134,Gbm
1,CUTC,1.593480e-10,-0.553255,Gbm
2,PIP4K2A,1.009419e-09,-0.838882,Gbm
3,CUL2,1.122076e-09,-0.586396,Gbm
4,GDI2,1.302273e-09,-0.610188,Gbm
...,...,...,...,...
8062,ZNF841,,0.782500,Colon
8063,ZNF888,,,Colon
8064,ZNHIT6,,,Colon
8065,ZNRD1,,-0.068000,Colon


In [7]:
# Save the long-format table; the row index is not written.
heatmap_csv_path = root + R'\all_heatmap.csv'
all_long_df.to_csv(heatmap_csv_path, index=False)

# Long df with genes sig in >= 1 cancer

Read in list_sig_in_at_least_one_cancer.csv. Convert to a list of genes significant in at least 1 cancer. Slice out genes in the list from the df with all data.

In [8]:
# Read the gene list; the names sit in the CSV column labelled '0'.
# No '_proteomics' suffix stripping is applied to this list.
sig = pd.read_csv(root + R'\list_sig_in_at_least_one_cancer.csv')
list_sig = sig['0'].tolist()

In [9]:
# Keep genes with at least one sig ttest
bool_df = all_long_df['Proteomics'].isin(list_sig)
sig_df = all_long_df[bool_df]
sig_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
0,ARMH3,5.396032e-11,-0.405134,Gbm
1,CUTC,1.593480e-10,-0.553255,Gbm
2,PIP4K2A,1.009419e-09,-0.838882,Gbm
3,CUL2,1.122076e-09,-0.586396,Gbm
4,GDI2,1.302273e-09,-0.610188,Gbm
...,...,...,...,...
8042,WWP1,,-0.120300,Colon
8045,ZCCHC9,,0.406000,Colon
8046,ZDHHC2,,,Colon
8052,ZNF260,,0.052700,Colon


In [10]:
# Save the rows kept in sig_df; the row index is not written.
sig_csv_path = root + R'\sig_pval_heatmap.csv'
sig_df.to_csv(sig_csv_path, index=False)

#  create csv with significant proteins in more than 1 cancer 

In [11]:
# Read the gene list; the names sit in the CSV column labelled '0'.
# No '_proteomics' suffix stripping is applied to this list.
mult = pd.read_csv(root + R'\list_sig_in_multiple_cancers.csv')
list_mult = mult['0'].tolist()

In [12]:
# Keep genes with > one sig ttest
bool_df2 = all_long_df['Proteomics'].isin(list_mult)
mult_df = all_long_df[bool_df2]

In [13]:
# Save the rows kept in mult_df; the row index is not written.
mult_csv_path = root + R'\mult_sig_pval_heatmap.csv'
mult_df.to_csv(mult_csv_path, index=False)

#  create csv that has proteins with pos and neg changes in median

In [14]:
def HasPosNeg(row, threshold=0.3):
    """Tell whether `row` holds both a value above and a value below the cutoff.

    Parameters
    ----------
    row : iterable of numbers
        Values to scan, e.g. one DataFrame row when called through
        ``DataFrame.apply(..., axis=1)``. Entries for which ``pd.isnull`` is
        true are skipped. Entries must be comparable with numbers (no strings).
    threshold : float, default 0.3
        Cutoff applied symmetrically: a value counts as positive when it is
        strictly greater than ``threshold`` and as negative when it is strictly
        less than ``-threshold``.

    Returns
    -------
    bool
        True if at least one value is positive and at least one is negative in
        the sense above; False otherwise (including for an empty row).
    """
    hasPos = False
    hasNeg = False

    for item in row:
        if pd.isnull(item):
            continue
        if item < -threshold:
            hasNeg = True
        if item > threshold:
            hasPos = True
        # Both directions seen: the answer cannot change any more.
        if hasPos and hasNeg:
            return True

    return False


Create a df with only the difference-in-median columns by dropping the P-value columns from the merged data frame (all_df).

In [15]:
# Drop every '<cancer>_P_Value' column, leaving 'Proteomics' plus the
# '<cancer>_Median' columns.
only_med = all_df.drop(columns=[
    prefix + '_P_Value'
    for prefix in ('Gbm', 'Hnscc', 'Luad', 'Lscc', 'Brca', 'Ov', 'En', 'Colon')
])
only_med.head()


Unnamed: 0,Proteomics,Gbm_Median,Hnscc_Median,Luad_Median,Lscc_Median,Brca_Median,Ov_Median,En_Median,Colon_Median
0,ARMH3,-0.405134,-0.135941,-0.37845,-0.42085,,,-0.1503,
1,CUTC,-0.553255,-0.314307,-1.01995,-0.5571,-0.4808,-0.159424,0.2533,-0.0821
2,PIP4K2A,-0.838882,-0.224713,0.04535,-0.00965,-0.07435,0.052955,-0.169,0.10135
3,CUL2,-0.586396,0.017886,0.05695,-0.36965,0.17195,0.033091,-0.0232,-0.0769
4,GDI2,-0.610188,0.001559,,-0.4358,0.14995,-0.067389,-0.1286,-0.0445


Map the pos and neg differences in median with the HasPosNeg function. Slice out genes meeting the criteria from the long df formatted for the HeatMap function.

In [16]:
# Move the protein names into the index so HasPosNeg only sees numeric values
# (it cannot compare strings). The guard keeps this cell re-runnable: after the
# first run 'Proteomics' is already the index and set_index would raise KeyError.
if 'Proteomics' in only_med.columns:
    only_med = only_med.set_index('Proteomics')

# Score only the '*_Median' columns, so the boolean 'Pos_Neg' column added
# below is not fed back into HasPosNeg when the cell is executed again.
median_cols = [col for col in only_med.columns if col.endswith('_Median')]
only_med["Pos_Neg"] = only_med[median_cols].apply(HasPosNeg, axis=1)
pn = only_med.loc[only_med['Pos_Neg'] == True]
pn_genes = list(pn.index)  # proteins flagged by HasPosNeg

# Keep the long-table rows for the flagged proteins.
get = all_long_df.Proteomics.isin(pn_genes)
pos_neg_df = all_long_df[get]
pos_neg_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
7,PACS2,1.123526e-08,-0.386279,Gbm
12,PTEN,1.219523e-07,-0.511726,Gbm
17,HAGH,3.232029e-07,-0.601619,Gbm
20,DCTN4,5.256343e-07,-0.362140,Gbm
30,FAM49B,7.767825e-07,-0.568365,Gbm
...,...,...,...,...
8046,ZDHHC2,,,Colon
8048,ZFP36,,-1.167400,Colon
8053,ZNF32,,0.737000,Colon
8055,ZNF397,,-0.438400,Colon


In [17]:
# Write the flagged protein names as a one-column CSV (row index omitted),
# then display the Series.
pos_neg = pd.Series(data=pn_genes)
pos_neg_list_path = root + R'\list_pos_neg_correlation.csv'
pos_neg.to_csv(pos_neg_list_path, index=False)
pos_neg

0         PACS2
1          PTEN
2          HAGH
3         DCTN4
4        FAM49B
         ...   
3194       PNMT
3195      INHBB
3196      CLDN6
3197    GAGE12F
3198      DEFA6
Length: 3199, dtype: object

In [18]:
# Save the current pos_neg_df; the row index is not written.
pos_neg_csv_path = root + R'\pos_neg_df.csv'
pos_neg_df.to_csv(pos_neg_csv_path, index=False)

# Sig in >= 1 cancer and pos neg correlation

In [19]:
# Slice
get = sig_df.Proteomics.isin(pn_genes)
pos_neg_df = sig_df[get] # Keep genes with pos and neg
pos_neg_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
7,PACS2,1.123526e-08,-0.386279,Gbm
12,PTEN,1.219523e-07,-0.511726,Gbm
17,HAGH,3.232029e-07,-0.601619,Gbm
20,DCTN4,5.256343e-07,-0.362140,Gbm
30,FAM49B,7.767825e-07,-0.568365,Gbm
...,...,...,...,...
7968,PRKCQ,,0.069350,Colon
7974,PYHIN1,,-1.118000,Colon
7977,RANGRF,,0.440000,Colon
7991,SIKE1,,0.607300,Colon


# Sig in multiple cancers and pos neg correlation

In [20]:
# Slice
get = mult_df.Proteomics.isin(pn_genes)
pos_neg_df = mult_df[get] # Keep genes with pos and neg
len(pos_neg_df.Proteomics.unique())

82