# Flagship vs Pancan Part 3: Trans Effect

Find the trans effects by computing Pearson correlation statistics for EGFR proteomics versus all other proteomics columns in each cancer type. Then compare the trans effects between the flagship-paper pipeline and the harmonized (pancan) pipeline.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import cptac.utils as u
import plot_utils as p
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
import cptac
# Load the flagship-paper cptac dataset object for each cancer type.
# These notebook-level names are passed to make_trans_df in later cells.
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()

                                            

In [None]:
import cptac.pancan as pc
# Load the pancan (harmonized pipeline) dataset object for each cancer type.
# Each pc_* object is passed to make_trans_df in later cells, next to its
# flagship counterpart.
pc_g = pc.PancanGbm()
pc_hn = pc.PancanHnscc()
pc_l = pc.PancanLuad()
pc_o = pc.PancanOv()
pc_c = pc.PancanCcrcc()
pc_col = pc.PancanCoad()
pc_b = pc.PancanBrca()
pc_ls = pc.PancanLscc()

Loading washubrca v1.0......                     

# Step 1: trans effects data munging 

Call make_trans_df for all cancer types. The function calls wrap_pearson_corr, which finds the Pearson correlation between EGFR proteomics and all other proteomic columns.

It records all FDR Benjamini/Hochberg corrected p values. 

In [None]:
def create_prot_list(df):
    '''
    Label duplicated proteomics columns and return every column name.

    @Param df: Data frame of proteomic data (e.g. the frame returned by
        get_proteomics). Its columns are overwritten IN PLACE: when a name
        occurs more than once (multiple isoforms), the first occurrence keeps
        the bare name and the following ones become name_1, name_2, ...

    @Return: list of the column names of df after relabeling.
    '''
    labels = pd.Series(df.columns[:])
    repeated_names = labels[labels.duplicated()].unique()

    for name in repeated_names:
        # Index labels (positions, since `labels` has a default index) of
        # every column currently carrying this name.
        where = labels.index[labels == name]
        labels.loc[where] = [
            name if k == 0 else name + '_' + str(k) for k in range(len(where))
        ]

    # Callers rely on df itself carrying the relabeled columns afterwards.
    df.columns = labels
    return df.columns.values.tolist()

In [None]:
def make_trans_df(cancer):
    '''
    Build the table of EGFR trans effects for one cancer dataset.

    @Param cancer: cptac data for a cancer. Ex: brain = cptac.Gbm()

    Step 1: Get the proteomic data restricted to tumor samples.
    Step 2: If the columns are a MultiIndex, flatten them with
            u.reduce_multiindex (levels_to_drop = 1).
    Step 3: Use create_prot_list to label duplicate columns caused by
            multiple isoforms and to collect the list of protein columns.
    Step 4: Use the plot_utils function wrap_pearson_corr to compare EGFR
            proteomics to every protein in that list, requesting all results
            with 'fdr_bh' corrected p values.

    @Return: the data frame produced by p.wrap_pearson_corr.
    '''
    tumor_prot = cancer.get_proteomics(tissue_type="tumor")

    if isinstance(tumor_prot.columns, pd.MultiIndex):
        tumor_prot = u.reduce_multiindex(tumor_prot, levels_to_drop=1)

    # create_prot_list renames duplicated columns on tumor_prot in place, so
    # the names in comparison_cols match the frame passed on below.
    comparison_cols = create_prot_list(tumor_prot)

    return p.wrap_pearson_corr(
        tumor_prot,
        "EGFR",
        comparison_columns=comparison_cols,
        correction_method='fdr_bh',
        return_all=True,
        return_corrected_pvals=True,
    )
    

# GBM

In [None]:
# Flagship GBM: EGFR-vs-all-proteins correlation table from make_trans_df;
# the bare name on the last line displays the table in the notebook.
gbm_df = make_trans_df(brain)
gbm_df


In [None]:
# Pancan (harmonized pipeline) GBM: same EGFR correlation table, built from pc_g
pancan_gbm_df = make_trans_df(pc_g)

# Other Cancer Types

In [None]:
# HNSCC flagship EGFR trans-effect table
hnscc_df = make_trans_df(hnscc)
# HNSCC pancan EGFR trans-effect table
pancan_hnscc = make_trans_df(pc_hn)

In [None]:
# ccRCC flagship EGFR trans-effect table
ccrcc_df = make_trans_df(kidney)
# ccRCC pancan EGFR trans-effect table
pancan_ccrcc = make_trans_df(pc_c)

In [None]:
# LUAD flagship EGFR trans-effect table
luad_df = make_trans_df(luad)
# LUAD pancan EGFR trans-effect table
pancan_luad = make_trans_df(pc_l)

In [None]:
# LSCC flagship EGFR trans-effect table
lscc_df = make_trans_df(lscc)
# LSCC pancan EGFR trans-effect table
pancan_lscc = make_trans_df(pc_ls)

In [None]:
# BRCA flagship EGFR trans-effect table
brca_df = make_trans_df(brca)
# BRCA pancan EGFR trans-effect table
pancan_brca = make_trans_df(pc_b)

In [None]:
# OV flagship EGFR trans-effect table
ov_df = make_trans_df(ovar)
# OV pancan EGFR trans-effect table
pancan_ov = make_trans_df(pc_o)


In [None]:
# Colon flagship EGFR trans-effect table
colon_df = make_trans_df(colon)
# Colon pancan EGFR trans-effect table (built from the PancanCoad object)
pancan_colon = make_trans_df(pc_col)

# Step 2: Compare significant trans genes

Create venn diagram of significant trans genes from the flagship and pancan pipelines

In [None]:
def _significant_trans_genes(trans_df, alpha=0.05):
    """Return the set of significant gene names with isoform suffixes removed."""
    hits = trans_df.loc[trans_df["P_value"] <= alpha, "Comparison"]
    # Strip only a trailing "_<number>" (the form create_prot_list appends to
    # duplicate isoform columns). regex=True is explicit because newer pandas
    # treats the pattern as a literal string by default.
    return set(hits.str.replace(r"_\d+$", "", regex=True))


def compare_og_pancan_trans_genes(og_df, pancan_df, title, alpha=0.05):
    """Draw a Venn diagram of significant trans genes, flagship vs. pancan.

    A gene counts as significant when its "P_value" entry is <= alpha.
    Isoform suffixes ("_1", "_2", ...) are removed before comparing, so
    isoforms of one gene collapse to a single name.

    Unlike the earlier version, the input frames are NOT modified: their
    "Comparison" columns keep the isoform suffixes, so they remain usable as
    join keys in later merges.

    @Param og_df: trans-effect frame from the flagship pipeline; must have
        "Comparison" and "P_value" columns.
    @Param pancan_df: trans-effect frame from the pancan pipeline, same columns.
    @Param title: title placed on the plot.
    @Param alpha: significance cutoff applied to "P_value" (default 0.05).

    @Return: None. The diagram is drawn on the current matplotlib figure.
    """
    og_genes = _significant_trans_genes(og_df, alpha)
    pancan_genes = _significant_trans_genes(pancan_df, alpha)

    venn2([og_genes, pancan_genes], ("Flagship_genes", "Pancan_genes"))
    plt.title(title)
    
    

In [None]:
# GBM: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(gbm_df, pancan_gbm_df,"GBM")

In [None]:
# HNSCC: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(hnscc_df, pancan_hnscc,"HNSCC")

In [None]:
# LUAD: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(luad_df, pancan_luad,"LUAD")

In [None]:
# LSCC: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(lscc_df, pancan_lscc,"LSCC")

In [None]:
# ccRCC: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(ccrcc_df, pancan_ccrcc,"ccRCC")

In [None]:
# Colon: Venn diagram of significant trans genes, flagship vs. pancan
compare_og_pancan_trans_genes(colon_df, pancan_colon,"Colon")

In [None]:
# OV: Venn diagram of significant trans genes, flagship vs. pancan
# NOTE(review): brca_df / pancan_brca are computed in Step 1, but no BRCA
# comparison appears among the cells visible in this step; confirm whether
# that omission is intentional.
compare_og_pancan_trans_genes(ov_df, pancan_ov,"OV")

# Step 3:  Determine consistency of trans effects across cancer types

First combine all of the trans effects into one data frame

In [None]:
# Combine all flagship trans-effect tables into one frame by outer-joining
# them on "Comparison", starting from GBM.
# NOTE(review): the per-cancer tables share non-key column names (at least
# "P_value"), so the repeated merges depend on pandas' default _x/_y suffixes.
# Confirm the resulting column labels are unambiguous and what later cells expect.
og_combined = gbm_df
for _flagship_df in (ccrcc_df, ov_df, luad_df, lscc_df, brca_df, colon_df, hnscc_df):
    og_combined = pd.merge(og_combined, _flagship_df, on="Comparison", how="outer")

# Drop the first row of the merged frame (presumably the EGFR self-comparison;
# verify against the merged output).
og_combined = og_combined[1:]
og_combined

In [None]:
# Combine the pancan trans-effect tables into one frame, outer-joined on "Comparison".
# NOTE(review): only the GBM, HNSCC, LUAD, LSCC and ccRCC tables are merged in
# the lines shown here; pancan_ov, pancan_brca and pancan_colon are not.
# Confirm whether that is intentional.
pancan = pd.merge(pancan_gbm_df, pancan_hnscc, on="Comparison", how = "outer")
pancan = pd.merge(pancan, pancan_luad, on="Comparison", how = "outer")
pancan = pd.merge(pancan, pancan_lscc, on="Comparison", how = "outer")
pancan = pd.merge(pancan, pancan_ccrcc, on="Comparison", how = "outer")
# Drop the first row of the merged frame (presumably the EGFR self-comparison; verify).
pancan = pancan[1:]
