# Make Figure 3B: EGFR Cis effect CNV

This notebook graphs the Pearson correlation between EGFR copy number and EGFR proteomics. It shows how an EGFR copy number mutation affects the protein levels of EGFR (cis effect).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p
import cptac.pancan as pc

  import pandas.util.testing as tm


In [2]:
# Install a blanket "ignore" filter so no warning category is shown for the
# rest of the session.
import warnings
warnings.simplefilter('ignore')

In [3]:
def print_stats(df1, x_axis, y_axis, sample_threshold=30):
    '''
    Print the Pearson correlation and uncorrected p-value between two columns.

    Rows with a missing value in either column are dropped before the test.

    @Param df1: Dataframe. Contains numeric values (such as proteomics) for the Pearson correlation.
    @Param x_axis: String. Column name of the first variable.
    @Param y_axis: String. Column name of the second variable.
    @Param sample_threshold: Int. The correlation is only computed when strictly more than
        this many complete rows remain. Defaults to 30.

    @Return: None after printing the correlation and p-value, or the string
        "Not Enough Data" when too few complete rows remain.
    '''
    df1_subset = df1[[x_axis, y_axis]].dropna(axis=0, how="any")

    # Guard clause: refuse to report a correlation on too small a sample.
    if df1_subset.shape[0] <= sample_threshold:
        return "Not Enough Data"

    # Double-bracket selection always yields a 2-D array, even if a label is
    # duplicated in the frame; column 0 is the first column with that label.
    x1 = df1_subset[[x_axis]].values[:, 0]
    y1 = df1_subset[[y_axis]].values[:, 0]

    corr, pval = scipy.stats.pearsonr(x1, y1)
    print("Correlation: " + str(corr))
    print("P_value:" + str(pval))
   

In [None]:
# Build one cptac pancan dataset object per cancer type. The cells below call
# join_omics_to_omics on these objects (g, c, b, l, hn, ls, col).
g = pc.PancanGbm()
hn = pc.PancanHnscc()
l = pc.PancanLuad()
# Ovarian loading is disabled; the Ovarian cells further down are commented out as well.
#o = pc.PancanOv()
c = pc.PancanCcrcc()
# NOTE(review): the recorded output further down shows pc.PancanCoad() raising
# KeyError "['Sample_Tumor_Normal'] not found in axis" -- confirm this
# constructor works with the installed cptac version before relying on `col`.
col = pc.PancanCoad()
b = pc.PancanBrca()
ls = pc.PancanLscc()
# NOTE(review): `en` is not referenced by any cell visible in this notebook.
en = pc.PancanUcec()

In [None]:
# Preview only the first rows of the colon clinical table instead of rendering
# the entire frame.
col.get_clinical().head()

In [4]:
# NOTE(review): this repeats the `col` assignment from the dataset-loading
# cell, and the output recorded below it is a KeyError. It looks like a
# leftover debugging run -- confirm and remove once PancanCoad loads cleanly.
col = pc.PancanCoad()

                                                 

KeyError: "['Sample_Tumor_Normal'] not found in axis"

# Step 1 Create Data Frame

Create a df that combines proteomics and CNV data and uses only tumor samples. 

# GBM

In [None]:
# GBM: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = g.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)

# Example of the resulting frame.
prot_CNV_Tumor.head()


#  Step 2  Call Plotting Function

Use the plot_pearson function in plot_utils to graph the Pearson correlation.

In [None]:
# GBM scatter of EGFR CNV (x) against EGFR proteomics (y); the figure is
# shown inline and written to the given file name.
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    title="Glioblastoma",
    ra_stats=False,
    show_plot=True,
    save_file_name="png_files/Figure3B_Gbm_pancan",
)

In [None]:
# Print the GBM correlation and p-value for the same two columns that were plotted.
print_stats(
    df1=prot_CNV_Tumor,
    x_axis="EGFR_washu_CNV",
    y_axis="EGFR_umich_proteomics",
)


# Repeat Steps 1 and 2 for the other 7 cancers

# Kidney 

In [None]:
# ccRCC: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = c.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)


In [None]:
# ccRCC scatter of EGFR CNV (x) against EGFR proteomics (y).
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    ra_stats=False,
    show_plot=True,
    title="ccRCC",
    save_file_name="png_files/Figure3B_Kidney_pancan",
)
# NOTE(review): the disabled call below uses column names (EGFR_CNV,
# EGFR_proteomics) that differ from the ones plotted above -- update them if
# it is re-enabled.
#print_stats(prot_CNV_Tumor,"EGFR_CNV","EGFR_proteomics")

# Ovarian 

In [None]:
#prot_CNV_Tumor = o.join_omics_to_omics(df1_name = "proteomics", df1_source = "umich", df2_name = "CNV", df2_source = "washu", genes1 = "EGFR", genes2 = "EGFR", tissue_type = "tumor")
#drop = ['Database_ID']
#prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)  

In [None]:
#plot_pearson(prot_CNV_Tumor,"EGFR_CNV","EGFR_proteomics", hue = "none", ra_stats = False,title = "OV", show_plot = True, save_file_name = "png_files/Figure3B_Ovarian")
#print_stats(prot_CNV_Tumor,"EGFR_CNV","EGFR_proteomics")

# Colon

In [None]:
#prot_CNV_Tumor = col.join_omics_to_omics(df1_name = "proteomics", df1_source = "umich", df2_name = "CNV", df2_source = "washu", genes1 = "EGFR", genes2 = "EGFR", tissue_type = "tumor")
#drop = ['Database_ID']
#prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop) 

In [None]:
#plot_pearson(prot_CNV,"EGFR_CNV","EGFR_proteomics",hue = "none", ra_stats = False, title = "Colon", show_plot = True, save_file_name = "png_files/Figure3B_Colon")
#print_stats(prot_CNV_Tumor,"EGFR_CNV","EGFR_proteomics")

# Brca 

In [None]:
# BRCA: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = b.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)

In [None]:
# BRCA scatter of EGFR CNV (x) against EGFR proteomics (y).
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    ra_stats=False,
    title="Breast",
    show_plot=True,
    save_file_name="png_files/Figure3B_Brca_pancan",
)
# Print the correlation and p-value for the same two columns.
print_stats(
    df1=prot_CNV_Tumor,
    x_axis="EGFR_washu_CNV",
    y_axis="EGFR_umich_proteomics",
)


# LUAD

In [None]:
# LUAD: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = l.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)

In [None]:
# LUAD scatter of EGFR CNV (x) against EGFR proteomics (y).
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    title="LUAD",
    ra_stats=False,
    show_plot=True,
    save_file_name="png_files/Figure3B_Luad_pancan",
)
# Print the correlation and p-value for the same two columns.
print_stats(
    df1=prot_CNV_Tumor,
    x_axis="EGFR_washu_CNV",
    y_axis="EGFR_umich_proteomics",
)

# HNSCC

In [None]:
# HNSCC: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = hn.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)

In [None]:
# HNSCC scatter of EGFR CNV (x) against EGFR proteomics (y).
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    title="Head and Neck",
    ra_stats=False,
    show_plot=True,
    save_file_name="png_files/Figure3B_Hnscc_pancan",
)
# Print the correlation and p-value for the same two columns.
print_stats(
    df1=prot_CNV_Tumor,
    x_axis="EGFR_washu_CNV",
    y_axis="EGFR_umich_proteomics",
)

# LSCC

In [None]:
# LSCC: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = ls.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)

# Have cptac drop the 'Database_ID' column level from the joined frame.
drop = ["Database_ID"]
prot_CNV_Tumor = u.reduce_multiindex(df=prot_CNV_Tumor, levels_to_drop=drop)

In [None]:
# LSCC scatter of EGFR CNV (x) against EGFR proteomics (y).
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    title="LSCC",
    ra_stats=False,
    show_plot=True,
    save_file_name="png_files/Figure3B_Lscc_pancan",
)
# Print the correlation and p-value for the same two columns.
print_stats(
    df1=prot_CNV_Tumor,
    x_axis="EGFR_washu_CNV",
    y_axis="EGFR_umich_proteomics",
)


# Colon

In [None]:
# Colon: join EGFR proteomics (umich source) to EGFR CNV (washu source),
# requesting tumor samples only.
prot_CNV_Tumor = col.join_omics_to_omics(
    df1_name="proteomics",
    df1_source="umich",
    df2_name="CNV",
    df2_source="washu",
    genes1="EGFR",
    genes2="EGFR",
    tissue_type="tumor",
)
# NOTE(review): every other cancer cell passes the joined frame through
# u.reduce_multiindex(levels_to_drop=['Database_ID']) before plotting; this
# cell does not -- confirm whether that step is needed for the colon data.

# Preview the first rows, as the GBM cell does, instead of rendering the whole frame.
prot_CNV_Tumor.head()

In [None]:
# Colon scatter of EGFR CNV (x) against EGFR proteomics (y). The title and
# output file name are colon-specific so this figure is neither labeled
# "LSCC" nor saved under the LSCC file name.
p.plot_pearson(
    prot_CNV_Tumor,
    "EGFR_washu_CNV",
    "EGFR_umich_proteomics",
    hue="none",
    title="Colon",
    ra_stats=False,
    show_plot=True,
    save_file_name="png_files/Figure3B_Colon_pancan",
)
#print_stats(prot_CNV_Tumor,"EGFR_washu_CNV","EGFR_umich_proteomics")


In [None]:
# NOTE(review): bare display of the colon dataset object; looks like leftover
# debugging -- confirm it can be removed from the final notebook.
col