# Make tables: Merge all cancer dfs and create csv files

This notebook combines dfs with p-values and differential expressions for 8 cancers. 
Description of created csv files: 
* all_proteins - contains all data in normal format, 
* all_heatmap - all data appended to make a long table for easy use with heatmap function, 
* sig_pval_heatmap - contains only significant genes (long format),
* mult_sig_pval_heatmap - contains only proteins significant in multiple cancers (long format), 
* pos_neg_df - contains only proteins showing a significant opposite effect in different cancers. 

Imports

In [1]:
import pandas as pd
import numpy as np

Read in the signle cancer dfs with p-values and differential expressions. Merge all dfs into one pancancer data frame.

In [31]:
g_merged = pd.read_csv('csv/Single_Cancer/GBM_pval_medians_pancan.csv')
h_merged = pd.read_csv('csv/Single_Cancer/HNSCC_pval_medians_pancan.csv')
b_merged = pd.read_csv('csv/Single_Cancer/BR_pval_medians_pancan.csv')
l_merged = pd.read_csv('csv/Single_Cancer/LUAD_pval_medians_pancan.csv')
ls_merged = pd.read_csv('csv/Single_Cancer/LSCC_pval_medians_pancan.csv')
e_merged = pd.read_csv('csv/Single_Cancer/EC_pval_medians_pancan.csv')
o_merged = pd.read_csv('csv/Single_Cancer/OV_pval_medians_pancan.csv')
c_merged = pd.read_csv('csv/Single_Cancer/CO_pval_medians_pancan.csv')

# Make csv of all data

In [32]:
df1 = g_merged.merge(h_merged, on='Umich_Proteomics',how='outer')
df2 = df1.merge(l_merged, on='Umich_Proteomics',how='outer')
df3 = df2.merge(ls_merged, on='Umich_Proteomics',how='outer')
df4 = df3.merge(b_merged, on='Umich_Proteomics',how='outer')
df5 = df4.merge(e_merged, on='Umich_Proteomics',how='outer')
df6 = df5.merge(o_merged, on='Umich_Proteomics',how='outer')
all_df = df6.merge(c_merged, on='Umich_Proteomics',how='outer')

print('Total proteins:', len(all_df.Umich_Proteomics.unique()))
all_df.head()

Total proteins: 16556


Unnamed: 0,Umich_Proteomics,GBM_P_Value,GBM_Median,HNSCC_P_Value,HNSCC_Median,LUAD_P_Value,LUAD_Median,LSCC_P_Value,LSCC_Median,BR_P_Value,BR_Median,EC_P_Value,EC_Median,OV_P_Value,OV_Median,CO_P_Value,CO_Median
0,CUL2,4.572167e-07,-0.458427,0.492948,-0.012483,0.754037,-0.041117,0.061876,-0.161777,0.999518,0.019744,0.968884,0.006991,0.993388,0.042732,0.947155,0.076786
1,DOCK1_1,4.572167e-07,-0.358361,0.94194,-0.050514,0.823513,-0.061154,0.637861,-0.033188,,0.223971,0.858479,0.106446,,-0.112346,,
2,ATE1_1,2.291619e-06,-0.481942,0.112081,-0.118994,0.011157,-0.315236,0.000959,-0.209974,0.999518,-0.094422,0.859507,0.028872,,,,
3,GDI2,2.291619e-06,-0.486754,0.961669,-0.101908,0.838721,-0.00609,0.302531,-0.181925,0.999518,-0.041954,0.693008,0.052571,0.994951,-0.10911,0.88324,-0.021554
4,CUTC,2.291619e-06,-0.490992,0.043479,-0.38481,0.409869,-0.280224,0.008548,-0.231417,0.999518,0.067701,0.788035,0.020382,0.993388,-0.186534,0.190516,-0.170226


In [33]:
all_df.to_csv('csv/all_proteins_pancan.tsv', sep='\t', index=False)

# Make csv of data formatted to use with the HeatMap function 

In [5]:
# Create long df for heatmap

cancer = ['GBM','HNSCC','LUAD','LSCC','BR','EC','OV','CO']
merged_dfs = [g_merged,h_merged,l_merged,ls_merged,b_merged,e_merged,o_merged,c_merged]

all_long_df = pd.DataFrame()
i = 0
for c in cancer:
    m = merged_dfs[i]
    m2 = m.assign(Cancer = c)
    m2 = m2.rename(columns={c+'_P_Value': 'P_Value'})
    m2 = m2.rename(columns={c+'_Median': 'Medians'})
    all_long_df = all_long_df.append(m2) 
    if i < 5:
        i += 1

print('Check total proteins:', len(all_long_df.Umich_Proteomics.unique()))

Check total proteins: 16305


In [6]:
all_long_df.to_csv('csv/all_heatmap_pancan.csv', index=False)

# Make csv of only significant proteins (formatted for heatmap)

Read in list_sig_in_at_least_one_cancer.csv. Convert to a list of genes significant in at least 1 cancer. Slice out genes in the list from the df with all data.

In [7]:
sig = pd.read_csv('csv/list_sig_one_cancer_pancan.csv')
list_sig = list(sig['0'])

In [8]:
# Keep genes with at least one sig ttest
bool_df = all_long_df['Umich_Proteomics'].isin(list_sig)
sig_df = all_long_df[bool_df]
print('Check total sig genes = 2630:', len(sig_df.Umich_Proteomics.unique()))
t = list(sig_df.Umich_Proteomics)

Check total sig genes = 2630: 2268


In [9]:
sig_df.to_csv('csv/sig_pval_heatmap_pancan.csv', index=False)

#  Make csv of proteins significant in multiple cancers (formatted for heatmap)

In [10]:
mult = pd.read_csv('csv/list_sig_multiple_cancers_pancan.csv')
list_mult = list(mult['0'])

In [11]:
# Keep genes with > one sig ttest
bool_df2 = all_long_df['Umich_Proteomics'].isin(list_mult)
mult_df = all_long_df[bool_df2]
print('Check total sig in multiple cancers = 332:', len(mult_df.Umich_Proteomics.unique()))

Check total sig in multiple cancers = 332: 241


In [12]:
mult_df.to_csv('csv/mult_sig_pval_heatmap_pancan.csv', index=False)

#  Make csv of proteins with sig opposite effects 

This csv will contain proteins that have both a significant positive differential expression and a significant negative differential expression in different cancers. (See Make_tables_data_munging for calculation of differential expression of proteomics)

In [13]:
# Returns true if both a pos and neg differential expression if found for the protein in the row

def HasPosNeg(row):
    hasPos = False
    hasNeg= False
    for item in row: 
        if pd.isnull(item):
            continue
        if item < 0:
            hasNeg = True
        if item > 0:
            hasPos = True
           
    if hasPos & hasNeg:
        return True
    return False


In [14]:
# Get df with only pval columns - Keep only sig values
new_df = all_df.set_index('Umich_Proteomics')
sig = new_df[new_df.columns[::2]].where(new_df < 0.05, np.nan) # replaces when false
only_sig = sig.dropna(how = 'all', axis = 'columns') 
only_sig = only_sig.dropna(how = 'all', axis = 'rows')
only_sig

Unnamed: 0_level_0,GBM_P_Value,HNSCC_P_Value,LUAD_P_Value,LSCC_P_Value,EC_P_Value,CO_P_Value
Umich_Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CUL2,4.572167e-07,,,,,
DOCK1_1,4.572167e-07,,,,,
ATE1_1,2.291619e-06,,0.011157,0.000959,,
GDI2,2.291619e-06,,,,,
CUTC,2.291619e-06,0.043479,,0.008548,,
...,...,...,...,...,...,...
CNBP_3,,,,0.044214,,
RAD51D,,,,0.047914,,
BCL11A_1,,,,0.048775,,
ZC3H14_4,,,,0.049479,,


In [15]:
# Keep only differential expression (median) columns
only_med = new_df[new_df.columns[1::2]]
only_med.columns

Index(['GBM_Median', 'HNSCC_Median', 'LUAD_Median', 'LSCC_Median', 'BR_Median',
       'EC_Median', 'OV_Median', 'CO_Median'],
      dtype='object')

In [17]:
sig_cancer = ['GBM', 'HNSCC', 'LUAD', 'LSCC', 'EC', 'CO'] # none sig in BR or OV
sig_dfs = {}

# Create df with pval and med of sig genes of one cancer - Add to dictionary
for c in sig_cancer:
    med = only_med[[c+'_Median']]
    pval = only_sig[[c+'_P_Value']]
    merged = pval.merge(med, on = 'Umich_Proteomics', how='left') # keep all pvals
    sig_med_df = merged.dropna()
    sig_dfs[c] = sig_med_df

# Combine all dfs with only real values for sig pval median pairs
all_sig = pd.DataFrame()
for c in sig_dfs:
    all_sig = all_sig.join(sig_dfs[c], how = 'outer') # join including all values

print('Total number of sig genes:', len(all_sig.index))

Total number of sig genes: 2268


In [None]:
# Checks

In [18]:
# check # see make_fig_2_data_munging
c = 'LUAD'
l2 = all_sig[[c+'_P_Value', c+'_Median']].dropna(axis = 0, how= 'all')
luad2 = list(l2.index)
len(luad2)

17

In [20]:
# check 
only_med_of_sig = all_sig[['GBM_Median','HNSCC_Median','LUAD_Median','LSCC_Median','EC_Median']]
test = only_med_of_sig.index.isin(['ITGAL']) # sig luad -> ITGAL_isoform_1 (most consistent?)
only_med_of_sig[test]

Unnamed: 0_level_0,GBM_Median,HNSCC_Median,LUAD_Median,LSCC_Median,EC_Median
Umich_Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ITGAL,-0.42913,,,-0.360454,


In [21]:
# checks

Map the proteins with a pos and neg differential expression with the HasPosNeg function. Slice out proteins meeting this criteria from the df formatted for the heatmap function.

In [25]:
only_med_of_sig = all_sig[['GBM_Median','HNSCC_Median','LUAD_Median','LSCC_Median','EC_Median']]
only_med_of_sig["Pos_Neg"] = only_med_of_sig.apply(HasPosNeg, axis = 1)
pn = only_med_of_sig.loc[only_med_of_sig['Pos_Neg'] == True]
pn_genes = list(pn.index) # list of genes that have pos and neg

# Slice
get = sig_df.Umich_Proteomics.isin(pn_genes)
pos_neg_df = sig_df[get] # Keep genes with pos and neg
len(pos_neg_df.Umich_Proteomics.unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_med_of_sig["Pos_Neg"] = only_med_of_sig.apply(HasPosNeg, axis = 1)


138

In [27]:
pos_neg_df.to_csv('csv/pos_neg_df.csv', index=False)

# Get number of proteins with significant opposite effects

Count proteins with an opposite effect. An opposite effect occurs when there is a significant increase in the protein abundance in one cancer and there is a significant decrease in the protein abundance in a different cancer.

In [28]:
get = mult_df.Umich_Proteomics.isin(pn_genes)
mult_sig_pn = mult_df[get] # Keep genes with pos and neg
proteins_opposite_effect = mult_sig_pn.Umich_Proteomics.unique()
print('Proteins with significant opposite effects:', len(proteins_opposite_effect))

Proteins with significant opposite effects: 138


# Calculate percentage of proteins with significant opposite effects

In [29]:
mult_sig_total = len(list_mult)
opposite = len(proteins_opposite_effect)
print('Percent of proteins with significant opposite effects: ', opposite, '/', mult_sig_total, '* 100 = ',
      opposite / mult_sig_total *100)

Percent of proteins with significant opposite effects:  138 / 241 * 100 =  57.26141078838174
