# Make tables: Merge all cancer dfs and create csv files

This notebook combines dfs with p-values and differential expressions for 3 cancers (Luad, Endo, and Colon). 
Description of created csv files: 
* all_proteins - contains all data in normal format, 
* all_heatmap - all data appended to make a long table for easy use with heatmap function, 
* sig_pval_heatmap - contains only significant genes (long format),

Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

Read in the single-cancer dfs with p-values and differential expressions. Merge all dfs into one pancancer data frame.

In [2]:
# Load the per-cancer tables (one csv per cancer) in a single pass.
l_merged, e_merged, c_merged = (
    pd.read_csv(csv_path)
    for csv_path in (
        'csv/Single_Cancer/Luad_pval_medians_G12.csv',
        'csv/Single_Cancer/Endo_pval_medians_G12.csv',
        'csv/Single_Cancer/Colon_pval_medians_G12.csv',
    )
)

# Make csv of all data

In [3]:
# Outer-join the per-cancer tables on the 'Proteomics' key so that a row
# present in any one table is kept in the combined frame.
df1 = pd.merge(l_merged, e_merged, how='outer', on='Proteomics')
all_df = pd.merge(df1, c_merged, how='outer', on='Proteomics')

In [4]:
# Write the merged (wide) table; the row index is not saved as a column.
all_df.to_csv(
    'csv/all_proteins_G12.csv',
    index=False,
)

# Make csv of data formatted to use with the HeatMap function 

In [5]:
# Create long df for heatmap

cancer = ['Luad', 'Endo', 'Colon']
merged_dfs = [l_merged, e_merged, c_merged]

# Build one long-format piece per cancer, then concatenate once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies it on every iteration; pd.concat over a list avoids both.
# zip pairs each cancer name with its frame, so no manual index is needed.
long_frames = []
for c, m in zip(cancer, merged_dfs):
    m2 = (
        m.assign(Cancer=c)  # tag every row with the cancer it came from
        # Strip the cancer prefix so all pieces share the same column names
        # (rename leaves the frame unchanged for labels that are not present).
        .rename(columns={c + '_P_Value': 'P_Value', c + '_Median': 'Medians'})
    )
    long_frames.append(m2)

# Original row labels are kept (no ignore_index), matching the previous
# append-based behaviour.
all_long_df = pd.concat(long_frames)

print('Check total proteins:', len(all_long_df.Proteomics.unique()))

Check total proteins: 12718


In [6]:
# Write the long-format table; the row index is not saved as a column.
all_long_df.to_csv(
    'csv/all_heatmap_G12.csv',
    index=False,
)

# Make csv of only significant proteins (formatted for heatmap)

Read in list_sig_one_cancer_G12.csv. Convert it to a list of genes significant in at least 1 cancer. Keep only the rows for genes in that list from the df with all data.

In [7]:
# Load the saved gene list; the values are read from the column labelled '0'.
sig = pd.read_csv('csv/list_sig_one_cancer_G12.csv')
list_sig = sig['0'].tolist()

In [9]:
# Keep genes with at least one sig ttest
# Boolean mask: True for rows whose 'Proteomics' value is in list_sig.
bool_df = all_long_df.Proteomics.isin(list_sig)
sig_df = all_long_df.loc[bool_df]
print(
    'Check total sig genes = 4:',
    len(sig_df['Proteomics'].unique()),
)
t = sig_df['Proteomics'].tolist()

Check total sig genes = 4: 4


In [10]:
# Write the significant-genes subset; the row index is not saved as a column.
sig_df.to_csv(
    'csv/sig_pval_heatmap_G12.csv',
    index=False,
)

#  Make csv of proteins significant in multiple cancers (formatted for heatmap)

In [47]:
# Load the saved gene list; the values are read from the column labelled '0'.
# NOTE(review): unlike the other csv inputs in this notebook, this filename
# has no "_G12" suffix - confirm it is the intended file.
mult = pd.read_csv('csv/list_sig_multiple_cancers.csv')
list_mult = mult['0'].tolist()

In [48]:
# Keep genes with > one sig ttest
# Boolean mask: True for rows whose 'Proteomics' value is in list_mult.
bool_df2 = all_long_df.Proteomics.isin(list_mult)
mult_df = all_long_df.loc[bool_df2]
print(
    'Check total sig in multiple cancers = 0:',
    len(mult_df['Proteomics'].unique()),
)

Check total sig in multiple cancers = 0: 0


In [None]:
# Export left disabled - presumably because mult_df has no rows here (the
# check above reports 0 genes), so there is nothing to write; TODO confirm.
#mult_df.to_csv('csv/mult_sig_pval_heatmap.csv', index=False)

#  Unable to compare opposite effects: 0 genes significant in multiple cancers 