# Make tables: Merge all cancer dfs and create csv files

This notebook combines dfs with p-values and differential expressions for 8 cancers. 
Description of created csv files: 
* all_proteins - contains all data in normal format, 
* all_heatmap - all data appended to make a long table for easy use with heatmap function, 
* sig_pval_heatmap - contains only significant genes (long format),
* mult_sig_pval_heatmap - contains only proteins significant in multiple cancers (long format), 
* pos_neg_df - contains only proteins showing a significant opposite effect in different cancers. 

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

Read in the single-cancer dfs with p-values and differential expressions. Merge all dfs into one pancancer data frame.

In [3]:
# Load one table per cancer. Each table is later merged on its 'Proteomics' column.
# NOTE(review): downstream cells expect columns named '<label>_P_Value' and
# '<label>_Median' (labels 'Luad', 'En', 'Colon') - confirm against the CSV headers.
l_merged = pd.read_csv('csv/Single_Cancer/Luad_pval_medians.csv')
e_merged = pd.read_csv('csv/Single_Cancer/Endo_pval_medians.csv')
c_merged = pd.read_csv('csv/Single_Cancer/Colon_pval_medians.csv')

# Make csv of all data

In [6]:
# Outer-merge the per-cancer tables on 'Proteomics' so that a protein missing
# from some cancers is still kept (its columns for those cancers become NaN).
# Fix: the chain used to start with l_merged.merge(l_merged, ...), which merged
# Luad with itself and left the Luad columns duplicated under '_x'/'_y'
# suffixes, so later 'Luad_P_Value' / 'Luad_Median' lookups could not match.
all_df = (
    l_merged
    .merge(e_merged, on='Proteomics', how='outer')
    .merge(c_merged, on='Proteomics', how='outer')
)


In [7]:
# Save the merged (wide) table; index=False leaves the row index out of the file.
all_df.to_csv('csv/all_proteins.csv', index=False)

# Make csv of data formatted to use with the HeatMap function 

In [8]:
# Create long df for heatmap

cancer = ['Luad','En','Colon']
merged_dfs = [l_merged,e_merged,c_merged]

# Labels and frames are paired by position, so the two lists must line up.
assert len(cancer) == len(merged_dfs), 'one label is needed per merged df'

# Give every per-cancer frame the same column names ('P_Value', 'Medians') and a
# 'Cancer' label column so the frames can be stacked into one long table.
# NOTE(review): rename() silently ignores keys that are not present, so a label
# that does not match the CSV column prefix (e.g. 'En' vs an 'Endo_' prefix)
# would leave that frame's columns un-renamed - confirm the prefixes.
long_frames = []
for c, m in zip(cancer, merged_dfs):
    m2 = m.assign(Cancer = c).rename(
        columns={c+'_P_Value': 'P_Value', c+'_Median': 'Medians'})
    long_frames.append(m2)

# One concat instead of DataFrame.append in a loop: append was removed in
# pandas 2.0 and re-copied the growing frame on every iteration.
all_long_df = pd.concat(long_frames)

print('Check total proteins:', len(all_long_df.Proteomics.unique()))

Check total proteins: 12718


In [9]:
# Save the long-format table (all cancers stacked); the row index is not written.
all_long_df.to_csv('csv/all_heatmap.csv', index=False)

# Make csv of only significant proteins (formatted for heatmap)

Read in list_sig_one_cancer.csv. Convert it to a list of genes significant in at least one cancer. Then slice the rows for the genes in that list out of the df with all data.

In [10]:
# Load the saved gene list; the genes are read from the column whose header is '0'.
# NOTE(review): presumably written by an earlier notebook from an unnamed Series - confirm.
sig = pd.read_csv('csv/list_sig_one_cancer.csv')
list_sig = list(sig['0'])

In [11]:
# Restrict the long table to the proteins named in list_sig
bool_df = all_long_df.Proteomics.isin(list_sig)
sig_df = all_long_df[bool_df]
unique_sig_genes = sig_df['Proteomics'].unique()
print('Check total sig genes = 2630:', len(unique_sig_genes))
t = list(sig_df['Proteomics'])

Check total sig genes = 2630: 29


In [None]:
# Save the long-format rows for proteins found in list_sig; the row index is not written.
sig_df.to_csv('csv/sig_pval_heatmap.csv', index=False)

#  Make csv of proteins significant in multiple cancers (formatted for heatmap)

In [None]:
# Load the saved list of genes; the genes are read from the column whose header is '0'.
# NOTE(review): per the section title these are genes significant in multiple cancers - confirm the file contents.
mult = pd.read_csv('csv/list_sig_multiple_cancers.csv')
list_mult = list(mult['0'])

In [None]:
# Restrict the long table to the proteins named in list_mult
bool_df2 = all_long_df.Proteomics.isin(list_mult)
mult_df = all_long_df[bool_df2]
unique_mult_genes = mult_df['Proteomics'].unique()
print('Check total sig in multiple cancers = 332:', len(unique_mult_genes))

In [None]:
# Save the long-format rows for proteins found in list_mult; the row index is not written.
mult_df.to_csv('csv/mult_sig_pval_heatmap.csv', index=False)

#  Make csv of proteins with sig opposite effects 

This csv will contain proteins that have both a significant positive differential expression and a significant negative differential expression in different cancers. (See Make_tables_data_munging for calculation of differential expression of proteomics)

In [None]:
def HasPosNeg(row):
    """Report whether ``row`` holds both a negative and a positive value.

    Entries that ``pd.isnull`` treats as missing are skipped, and zeros count
    as neither sign.

    Parameters
    ----------
    row : iterable of numbers
        Values to inspect, e.g. one row of a DataFrame passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True only if at least one value is below zero and at least one is
        above zero; False otherwise (including for an empty row).
    """
    saw_negative = False
    saw_positive = False
    for value in row:
        if not pd.isnull(value):
            # Once a sign has been seen there is no need to compare again.
            saw_negative = saw_negative or value < 0
            saw_positive = saw_positive or value > 0
    return bool(saw_negative and saw_positive)


In [None]:
# Index by protein, take every other column starting with the first, and blank
# out (NaN) each value that is not below 0.05.
# NOTE(review): assumes the p-value columns sit at the even positions - confirm.
new_df = all_df.set_index('Proteomics')
pval_columns = new_df.columns[::2]
below_cutoff = new_df < 0.05
sig = new_df[pval_columns].where(below_cutoff, np.nan) # replaces when false
# Drop the columns, then the rows, that have no value left
only_sig = sig.dropna(how = 'all', axis = 'columns').dropna(how = 'all', axis = 'rows')
only_sig

In [None]:
# Take every other column starting with the second.
# NOTE(review): assumes these are the differential expression (median) columns - confirm.
median_columns = new_df.columns[1::2]
only_med = new_df[median_columns]
only_med.columns

In [None]:
# Derive the cancers to process from the p-value columns still present in
# only_sig, instead of a hard-coded list. The old list named cancers whose
# columns are not in the loaded data (a KeyError) and could miss ones that are.
pval_suffix = '_P_Value'
sig_cancer = [col[:-len(pval_suffix)] for col in only_sig.columns
              if col.endswith(pval_suffix)]
sig_dfs = {}

# Create df with pval and med of sig genes of one cancer - Add to dictionary
for c in sig_cancer:
    med = only_med[[c+'_Median']]
    pval = only_sig[[c+'_P_Value']]
    merged = pval.merge(med, on = 'Proteomics', how='left') # keep all pvals
    # Rows missing either the p-value or the median are dropped
    sig_med_df = merged.dropna()
    sig_dfs[c] = sig_med_df

# Combine all dfs with only real values for sig pval median pairs
all_sig = pd.DataFrame()
for c in sig_dfs:
    all_sig = all_sig.join(sig_dfs[c], how = 'outer') # join including all values

print('Total number of sig genes:', len(all_sig.index))

In [None]:
# Checks

In [None]:
# check # see make_fig_2_data_munging
c = 'Luad'
luad_columns = [c+'_P_Value', c+'_Median']
# Drop rows where both Luad columns are null, then count what is left
l2 = all_sig[luad_columns].dropna(axis = 0, how= 'all')
luad2 = list(l2.index)
len(luad2)

In [None]:
# check
# Select the median columns by suffix rather than by a hard-coded list, which
# raised a KeyError for any cancer whose columns are not in all_sig.
only_med_of_sig = all_sig.filter(regex='_Median$')
test = only_med_of_sig.index.isin(['ITGAL']) # sig luad -> ITGAL_isoform_1 (most consistent?)
only_med_of_sig[test]

In [None]:
# checks

Use the HasPosNeg function to flag the proteins that have both a positive and a negative differential expression. Slice out the proteins meeting this criterion from the df formatted for the heatmap function.

In [None]:
# Select the median columns by suffix rather than by a hard-coded list, which
# raised a KeyError for any cancer whose columns are not in all_sig.
only_med_of_sig = all_sig.filter(regex='_Median$')
# Flag each protein from its median columns only. assign() builds a new frame,
# so the flag column is never written into a slice of all_sig
# (avoids SettingWithCopyWarning).
only_med_of_sig = only_med_of_sig.assign(
    Pos_Neg=only_med_of_sig.apply(HasPosNeg, axis = 1))
pn = only_med_of_sig.loc[only_med_of_sig['Pos_Neg']]
pn_genes = list(pn.index) # list of genes that have pos and neg

# Slice
get = sig_df.Proteomics.isin(pn_genes)
pos_neg_df = sig_df[get] # Keep genes with pos and neg
len(pos_neg_df.Proteomics.unique())

In [None]:
# Save the long-format rows for proteins flagged by HasPosNeg; the row index is not written.
pos_neg_df.to_csv('csv/pos_neg_df.csv', index=False)

# Get number of proteins with significant opposite effects

Count proteins with an opposite effect. An opposite effect occurs when there is a significant increase in the protein abundance in one cancer and there is a significant decrease in the protein abundance in a different cancer.

In [None]:
# Of the rows in mult_df, keep the proteins that are also in pn_genes
get = mult_df['Proteomics'].isin(pn_genes)
mult_sig_pn = mult_df[get]
proteins_opposite_effect = mult_sig_pn['Proteomics'].unique()
n_opposite = len(proteins_opposite_effect)
print('Proteins with significant opposite effects:', n_opposite)

# Calculate percentage of proteins with significant opposite effects

In [None]:
# Share of the list_mult proteins that show an opposite effect, as a percentage
mult_sig_total = len(list_mult)
opposite = len(proteins_opposite_effect)
percent_opposite = opposite / mult_sig_total *100
print('Percent of proteins with significant opposite effects: ', opposite, '/', mult_sig_total, '* 100 = ',
      percent_opposite)