# Make tables: Merge all cancer dfs and create csv files

This notebook combines dfs with p-values and differential expressions for 8 cancers. 
Description of created csv files: 
* all_proteins - contains all data in wide format (one row per protein, with a p-value column and a median column for each cancer), 
* all_heatmap - all data appended to make a long table for easy use with heatmap function, 
* sig_pval_heatmap - contains only significant genes (long format),
* mult_sig_pval_heatmap - contains only proteins significant in multiple cancers (long format), 
* pos_neg_df - contains only proteins showing a significant opposite effect in different cancers. 

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

Read in the single-cancer dfs with p-values and differential expressions. Merge all dfs into one pancancer data frame.

In [2]:
# One csv per cancer, each read into its own data frame.
# The order of the paths must match the order of the names being unpacked below.
single_cancer_csvs = [
    'csv/Single_Cancer/Gbm_pval_medians.csv',
    'csv/Single_Cancer/Hnscc_pval_medians.csv',
    'csv/Single_Cancer/Brca_pval_medians.csv',
    'csv/Single_Cancer/Luad_pval_medians.csv',
    'csv/Single_Cancer/Lscc_pval_medians.csv',
    'csv/Single_Cancer/En_pval_medians.csv',
    'csv/Single_Cancer/Ov_pval_medians.csv',
    'csv/Single_Cancer/Colon_pval_medians.csv',
]
(g_merged, h_merged, b_merged, l_merged,
 ls_merged, e_merged, o_merged, c_merged) = [pd.read_csv(path) for path in single_cancer_csvs]

# Make csv of all data

In [3]:
# Outer-merge the per-cancer tables on the protein identifier, one table at a time.
# how='outer' keeps a protein even when it is absent from some cancers (NaN there).
all_df = g_merged
for next_cancer_df in (h_merged, l_merged, ls_merged, b_merged, o_merged, e_merged, c_merged):
    all_df = all_df.merge(next_cancer_df, on='Proteomics', how='outer')

n_proteins = len(all_df['Proteomics'].unique())
print('Total proteins:', n_proteins)
all_df.head()

Total proteins: 15411


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value,Luad_Median,Lscc_P_Value,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
0,ARMH3,5.766739e-07,-0.405134,0.024047,-0.135941,0.222235,-0.37845,0.906857,-0.2797,,,,,0.034811,0.1503,,
1,CUTC,8.514758e-07,-0.553255,0.014672,-0.314307,0.056523,-1.01995,0.568489,-0.44585,0.349469,-0.4808,0.985704,-0.159424,0.330823,-0.2533,0.91753,-0.0821
2,CUL2,2.783477e-06,-0.586396,0.677585,0.017886,0.868772,0.05695,0.999848,0.04745,0.665878,0.17195,0.991659,0.033091,0.95797,0.0232,0.902073,-0.0769
3,PIP4K2A,2.783477e-06,-0.838882,0.038186,-0.224713,0.84773,0.04535,0.999848,0.2475,0.670623,-0.07435,0.985704,0.052955,0.502967,0.169,0.883481,0.10135
4,GDI2,2.783477e-06,-0.610188,0.8271,0.001559,,,0.999848,-0.03735,0.789708,0.14995,0.985704,-0.067389,0.28319,0.1286,0.999502,-0.0445


In [4]:
# Write the merged wide table to disk; index=False leaves the row index out of the file.
all_df.to_csv('csv/all_proteins.csv', index=False)

# Make csv of data formatted to use with the HeatMap function 

In [5]:
# Create long df for heatmap: stack the per-cancer tables on top of each other,
# tagging every row with its cancer and renaming the cancer-specific value
# columns to the shared names 'P_Value' and 'Medians'.

cancer = ['Gbm','Hnscc','Luad','Lscc','Brca','Ov','En','Colon']
merged_dfs = [g_merged,h_merged,l_merged,ls_merged,b_merged,o_merged,e_merged,c_merged]

# zip() silently stops at the shorter list, so fail loudly if the two lists drift apart.
assert len(cancer) == len(merged_dfs), 'each cancer name needs exactly one data frame'

long_parts = [
    cancer_df.assign(Cancer=name).rename(columns={name + '_P_Value': 'P_Value',
                                                  name + '_Median': 'Medians'})
    for name, cancer_df in zip(cancer, merged_dfs)
]

# DataFrame.append was removed in pandas 2.0; one concat over the list replaces the
# repeated appends and avoids re-copying the accumulated rows on every iteration.
all_long_df = pd.concat(long_parts)

print('Check total proteins:', len(all_long_df.Proteomics.unique()))

Check total proteins: 15411


In [6]:
# Write the long-format table to disk; index=False leaves the row index out of the file.
all_long_df.to_csv('csv/all_heatmap.csv', index=False)

# Make csv of only significant proteins (formatted for heatmap)

Read in list_sig_one_cancer.csv. Convert it to a list of genes significant in at least 1 cancer. Keep only the rows for genes in that list from the df with all data.

In [7]:
# Load the table listing the genes that are significant in at least one cancer.
sig = pd.read_csv('csv/list_sig_one_cancer.csv')
# The gene names are read from the column whose header is the string '0'.
list_sig = list(sig['0'])

In [8]:
# Restrict the long table to genes with at least one significant t-test.
# bool_df is a row mask: True where the row's protein is in list_sig.
bool_df = all_long_df.Proteomics.isin(list_sig)
sig_df = all_long_df.loc[bool_df]
n_sig_genes = len(sig_df['Proteomics'].unique())
print('Check total sig genes = 2630:', n_sig_genes)
t = list(sig_df['Proteomics'])

Check total sig genes = 2630: 2630


In [9]:
# Write the significant-only long table to disk; index=False leaves the row index out of the file.
sig_df.to_csv('csv/sig_pval_heatmap.csv', index=False)

#  Make csv of proteins significant in multiple cancers (formatted for heatmap)

In [10]:
# Load the table listing the proteins that are significant in multiple cancers.
mult = pd.read_csv('csv/list_sig_multiple_cancers.csv')
# The protein names are read from the column whose header is the string '0'.
list_mult = list(mult['0'])

In [11]:
# Restrict the long table to genes with more than one significant t-test.
# bool_df2 is a row mask: True where the row's protein is in list_mult.
bool_df2 = all_long_df.Proteomics.isin(list_mult)
mult_df = all_long_df.loc[bool_df2]
n_mult_genes = len(mult_df['Proteomics'].unique())
print('Check total sig in multiple cancers = 332:', n_mult_genes)

Check total sig in multiple cancers = 332: 332


In [12]:
# Write the multi-cancer long table to disk; index=False leaves the row index out of the file.
mult_df.to_csv('csv/mult_sig_pval_heatmap.csv', index=False)

#  Make csv of proteins with sig opposite effects 

This csv will contain proteins that have both a significant positive differential expression and a significant negative differential expression in different cancers. (See Make_tables_data_munging for calculation of differential expression of proteomics)

In [13]:
def HasPosNeg(row):
    """Return True if the row holds both a positive and a negative value.

    Missing entries (anything pd.isnull reports as null) are skipped, and
    zeros count as neither positive nor negative.

    Parameters
    ----------
    row : iterable of scalars
        The values to inspect, e.g. one row of differential expressions.

    Returns
    -------
    bool
        True when at least one value is > 0 and at least one is < 0,
        otherwise False.
    """
    present = [value for value in row if not pd.isnull(value)]
    # Evaluate every comparison (no short-circuit) so each non-null value is checked.
    below_zero = [value < 0 for value in present]
    above_zero = [value > 0 for value in present]
    return any(below_zero) and any(above_zero)


In [14]:
# Get df with only pval columns - Keep only sig values
new_df = all_df.set_index('Proteomics')

# Select the p-value columns by name instead of by position ([::2]) so the result
# does not depend on the column order produced by the merges.
pval_cols = [col for col in new_df.columns if col.endswith('_P_Value')]
pvals = new_df[pval_cols]

# Build the mask from the p-values only (not the medians); entries that are not
# below 0.05 are replaced with NaN.
sig = pvals.where(pvals < 0.05, np.nan) # replaces when false
only_sig = sig.dropna(how = 'all', axis = 'columns') # drop cancers with no significant p-value
only_sig = only_sig.dropna(how = 'all', axis = 'rows') # drop proteins with no significant p-value
only_sig

Unnamed: 0_level_0,Gbm_P_Value,Hnscc_P_Value,Luad_P_Value,Lscc_P_Value,Ov_P_Value,En_P_Value
Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ARMH3,5.766739e-07,0.024047,,,,0.034811
CUTC,8.514758e-07,0.014672,,,,
CUL2,2.783477e-06,,,,,
PIP4K2A,2.783477e-06,0.038186,,,,
GDI2,2.783477e-06,,,,,
...,...,...,...,...,...,...
PIK3R1_isoform_1,,,0.046156,,,
RSPH3,,,,,,0.046155
ARMC3,,,,,,0.030930
CCDC57,,,,,,0.041460


In [15]:
# Keep only differential expression (median) columns.
# Selected by the '_Median' suffix rather than by position ([1::2]) so the result
# does not depend on the column order produced by the merges.
median_cols = [col for col in new_df.columns if col.endswith('_Median')]
only_med = new_df[median_cols]
only_med.columns

Index(['Gbm_Median', 'Hnscc_Median', 'Luad_Median', 'Lscc_Median',
       'Brca_Median', 'Ov_Median', 'En_Median', 'Colon_Median'],
      dtype='object')

In [16]:
# Cancers whose p-value column is present in only_sig
sig_cancer = ['Gbm', 'Hnscc', 'Luad', 'Lscc', 'Ov', 'En']

# For each cancer, pair the significant p-values with the matching medians and
# store the resulting two-column frame under the cancer's name.
sig_dfs = {}
for name in sig_cancer:
    pval_col = only_sig[[name + '_P_Value']]
    med_col = only_med[[name + '_Median']]
    # how='left' keeps every row of the p-value frame; dropna then removes rows
    # where either the p-value or the median is missing.
    sig_dfs[name] = pval_col.merge(med_col, on = 'Proteomics', how = 'left').dropna()

# Combine the per-cancer frames side by side; the outer join keeps a protein
# if it appears in any of them.
all_sig = pd.DataFrame()
for sig_pairs in sig_dfs.values():
    all_sig = all_sig.join(sig_pairs, how = 'outer')

print('Total number of sig genes:', len(all_sig.index))

Total number of sig genes: 2630


In [17]:
# Checks

In [18]:
# check: count the rows of all_sig that have a Luad p-value or median
# see make_fig_2_data_munging
c = 'Luad'
luad_cols = [c+'_P_Value', c+'_Median']
l2 = all_sig.loc[:, luad_cols].dropna(how = 'all', axis = 'index')
luad2 = list(l2.index)
len(luad2)

111

In [19]:
# check: show the significant medians recorded for ITGAL
sig_median_cols = ['Gbm_Median','Hnscc_Median','Luad_Median','Lscc_Median','Ov_Median','En_Median']
only_med_of_sig = all_sig.loc[:, sig_median_cols]
test = only_med_of_sig.index.isin(['ITGAL']) # sig luad -> ITGAL_isoform_1 (most consistent?)
only_med_of_sig.loc[test]

Unnamed: 0_level_0,Gbm_Median,Hnscc_Median,Luad_Median,Lscc_Median,Ov_Median,En_Median
Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ITGAL,-0.591693,-0.477094,,,,


In [20]:
# checks

Flag the proteins that have both a positive and a negative differential expression using the HasPosNeg function. Keep only the proteins meeting this criterion from the df formatted for the heatmap function.

In [21]:
# Medians of the significant (protein, cancer) pairs only.
# .copy() makes this an independent frame, so adding the 'Pos_Neg' column below
# no longer raises SettingWithCopyWarning.
only_med_of_sig = all_sig[['Gbm_Median','Hnscc_Median','Luad_Median','Lscc_Median','Ov_Median','En_Median']].copy()

# Flag each protein (row) that has both a positive and a negative median.
only_med_of_sig["Pos_Neg"] = only_med_of_sig.apply(HasPosNeg, axis = 1)
pn = only_med_of_sig.loc[only_med_of_sig['Pos_Neg']]
pn_genes = list(pn.index) # list of genes that have pos and neg

# Slice
get = sig_df.Proteomics.isin(pn_genes)
pos_neg_df = sig_df[get] # Keep genes with pos and neg
len(pos_neg_df.Proteomics.unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


126

In [23]:
# Write the opposite-effect long table to disk; index=False leaves the row index out of the file.
pos_neg_df.to_csv('csv/pos_neg_df.csv', index=False)

# Get number of proteins with significant opposite effects

Count proteins with an opposite effect. An opposite effect occurs when there is a significant increase in the protein abundance in one cancer and there is a significant decrease in the protein abundance in a different cancer.

In [26]:
# Among the proteins significant in multiple cancers, keep the ones flagged
# as having both a positive and a negative effect, then count the distinct names.
get = mult_df['Proteomics'].isin(pn_genes)
mult_sig_pn = mult_df.loc[get] # Keep genes with pos and neg
proteins_opposite_effect = mult_sig_pn['Proteomics'].unique()
n_opposite = len(proteins_opposite_effect)
print('Proteins with significant opposite effects:', n_opposite)

Proteins with significant opposite effects: 126


# Calculate percentage of proteins with significant opposite effects

In [28]:
# Share of the multi-cancer significant proteins that show an opposite effect.
opposite = len(proteins_opposite_effect)
mult_sig_total = len(list_mult)
percent_opposite = opposite / mult_sig_total *100
print('Percent of proteins with significant opposite effects: ', opposite, '/', mult_sig_total, '* 100 = ',
      percent_opposite)

Percent of proteins with significant opposite effects:  126 / 332 * 100 =  37.95180722891566
