# Step 1: Import CPTAC Data and Define Correlation Functions

In [1]:
import cptac
import cptac.utils as ut
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import statistics as st
import load_data
import get_correlations

In [2]:
#Make docstrings for what is required
def get_single_gene_correlations(gene, cancer_type_list,type_dict,input_tissue_type = "tumor"):
    """Collect one gene's correlation result from each cancer dataset.

    ``get_correlations.correlation_list`` is called once per entry of
    ``cancer_type_list`` with ``[gene]``. Datasets that return an empty
    correlation result are skipped; for the others the first correlation
    entry, its p-value and the dataset's label are recorded.

    Parameters
    ----------
    gene
        Gene identifier, forwarded as a one-element list.
    cancer_type_list
        Iterable of dataset objects to query.
    type_dict
        Mapping from dataset object to the label stored in ``type_list``;
        only looked up for datasets that produced a result.
    input_tissue_type
        Forwarded as ``tissue_type`` (default ``"tumor"``).

    Returns
    -------
    tuple
        ``(correlation_list, type_list, num_samples, pval_list)``. The three
        lists are parallel. ``num_samples`` is the value reported for the last
        dataset queried (even if that dataset was skipped), or ``None`` when
        ``cancer_type_list`` is empty.
    """
    correlation_list = []
    type_list = []
    pval_list = []
    # Bound up front so an empty cancer_type_list does not raise
    # UnboundLocalError at the return statement.
    num_samples = None
    for cancer in cancer_type_list:
        correlation, pval, num_samples = get_correlations.correlation_list(
            cancer, [gene], tissue_type = input_tissue_type
        )
        if len(correlation) == 0:
            # No result for this gene in this dataset.
            continue
        correlation_list.append(correlation[0])
        # The p-value is the second element of the first p-value entry.
        pval_list.append(pval[0][1])
        type_list.append(type_dict[cancer])
    return correlation_list, type_list, num_samples, pval_list

In [3]:
def make_df_with_genes(gene_list,cancer_list,name_dict,tissue_type):
    """Build a long-format table of per-gene, per-cancer correlations.

    For every gene in ``gene_list`` the correlations are fetched with
    ``get_single_gene_correlations`` and split by ``get_x_y`` into the values
    for the 'Gene' and 'Correlation' columns; the cancer labels and p-values
    returned alongside fill the 'Cancer' and 'p-val' columns.

    Returns a ``pandas.DataFrame`` with columns
    ``['Gene', 'Correlation', 'Cancer', 'p-val']``.
    """
    column_names = ['Gene', 'Correlation','Cancer', 'p-val']
    gene_column, correlation_column, cancer_column, pval_column = [], [], [], []

    for gene in gene_list:
        # The sample count returned by the helper is not used here.
        corr_entries, cancer_labels, _num_samples, pvals = get_single_gene_correlations(
            gene, cancer_list, name_dict, tissue_type
        )
        first_items, second_items = get_x_y(corr_entries)
        gene_column += first_items
        correlation_column += second_items
        cancer_column += cancer_labels
        pval_column += pvals

    rows = list(zip(gene_column, correlation_column, cancer_column, pval_column))
    return pd.DataFrame(rows, columns = column_names)

In [4]:
def get_x_y(correlation_list):
    """Split indexable items into two parallel lists.

    Returns ``(x, y)`` where ``x`` holds element 0 and ``y`` holds element 1
    of every item in ``correlation_list``, in the original order.
    """
    # Read the input in a single pass, then fan the pairs out into two lists.
    pairs = [(entry[0], entry[1]) for entry in correlation_list]
    x = [first for first, _ in pairs]
    y = [second for _, second in pairs]
    return x, y

# Step 2: Define Desired Cancer Types

In [None]:
# Instantiate one cptac dataset object per cancer type used in this analysis.
# These objects are later collected in cancer_list and used as keys of type_dict.
ccrcc = cptac.Ccrcc()
en = cptac.Endometrial()
luad = cptac.Luad()
hnscc  = cptac.Hnscc()
lscc = cptac.Lscc()

Checking that hnscc index is up-to-date...      

In [None]:
# Datasets to query, in the order they are passed to the correlation helpers.
cancer_list = [
    ccrcc,
    en,
    luad,
    hnscc,
    lscc,
]

In [None]:
# Label written to the 'Cancer' column for each dataset object.
type_dict = {
    ccrcc: "ccrcc",
    en: "endometrial",
    luad: "luad",
    hnscc: "hnscc",
    lscc: "lscc",
}

# Step 3: Pick Genes and Make Dataframe

The genes shown here were selected from Bailey et al., which can be found [here](https://pubmed.ncbi.nlm.nih.gov/29625053/).

In [None]:
# Genes to analyse; each becomes one panel of the final figure.
cancer_genes = [
    'BRAF',
    'FAT1',
    'IDH1',
    'TP53',
    'KMT2D',
    'KRAS',
    'PIK3CA',
    'PTEN',
    'SPTA1',
]

In [None]:
# Correlations computed on tumor samples, tagged so they can be told apart
# from the normal-tissue rows once both tables are combined.
tumor_df = make_df_with_genes(
    cancer_genes,
    cancer_list,
    type_dict,
    tissue_type = "tumor",
)
tumor_df['Tissue'] = len(tumor_df) * ['Tumor']

In [None]:
# Display the tumor-tissue correlation table built in the previous cell.
tumor_df

In [None]:
# Correlations computed on normal samples, tagged so they can be told apart
# from the tumor rows once both tables are combined.
normal_df = make_df_with_genes(
    cancer_genes,
    cancer_list,
    type_dict,
    tissue_type = "normal",
)
normal_df['Tissue'] = len(normal_df) * ['Normal']

In [None]:
# Display the normal-tissue correlation table built in the previous cell.
normal_df

In [None]:
# Load the delta-correlation p-values from p_val.csv.
# Later cells select rows through the 'Cancer' column and read one column per gene.
delta_corr_pvals_df = pd.read_csv('p_val.csv')
delta_corr_pvals_df

In [None]:
# Spot check: the BRAF value of the first p_val.csv row whose Cancer is 'luad'.
# This is the same lookup the next cell performs for every (cancer, gene) row.
delta_corr_pvals_df[delta_corr_pvals_df.Cancer == 'luad'].reset_index()['BRAF'][0]

In [None]:
# Cut-off applied both to the per-tissue correlation p-values and to the
# delta-correlation p-values.
SIGNIFICANCE_THRESHOLD = 0.05

# normal_df and tumor_df have the same columns, so the outer merge joins on all
# of them and keeps the rows of both tables.
combined_df = pd.merge(normal_df, tumor_df, how = 'outer')
combined_df['Significant Correlation'] = combined_df['p-val'] <= SIGNIFICANCE_THRESHOLD

# Index the first p_val.csv row of every cancer type once, instead of filtering
# and re-indexing the whole table for each row of combined_df.
first_pval_row_by_cancer = delta_corr_pvals_df.drop_duplicates(subset = 'Cancer').set_index('Cancer')
delta_corr_pvals = [
    first_pval_row_by_cancer.at[cancer, gene]
    for cancer, gene in zip(combined_df['Cancer'], combined_df['Gene'])
]
combined_df['delta_corr_pval'] = delta_corr_pvals
combined_df['Significant Delta Correlation'] = combined_df['delta_corr_pval'] <= SIGNIFICANCE_THRESHOLD

In [None]:
# Display the combined tumor/normal table, including the significance flags.
combined_df

# Step 4: Plot Data

In [None]:
# One panel per gene, wrapped at three panels per row.
g = sns.FacetGrid(combined_df, col ='Gene', height = 3, col_wrap = 3)

# Points: correlation per tissue, coloured by cancer type; marker size encodes
# the 'Significant Correlation' flag.
g.map_dataframe(sns.scatterplot, x = 'Tissue', y = 'Correlation', hue = 'Cancer', palette='tab10', size = 'Significant Correlation', size_order = [True, False])
# Lines: same x/y/hue as the points; line style encodes the
# 'Significant Delta Correlation' flag.
# NOTE(review): label = '_Hidden' is presumably meant to keep the line artists
# out of the legend — confirm against matplotlib's legend label handling.
g.map_dataframe(sns.lineplot, x = 'Tissue', y = 'Correlation', hue = 'Cancer', palette='tab10', style = 'Significant Delta Correlation', style_order = [True, False], label = '_Hidden')
g.set_axis_labels('Tissue', 'Correlation')
# The FacetGrid legend stays disabled; a legend is assembled by hand below.
#g.add_legend()
g.set_titles('{col_name}')
# Take the legend entries from the axes matplotlib currently treats as active.
current_handles, current_labels = plt.gca().get_legend_handles_labels()
# Remove positions 9-14 from both lists so handles and labels stay aligned.
# NOTE(review): the 9:15 slice is hard-coded; presumably it drops redundant
# legend entries for the current gene/cancer selection — re-check the indices
# whenever cancer_genes, cancer_list or the plotted variables change.
del current_labels[9:15]
del current_handles[9:15]
# NOTE(review): the (2.2, 2.2) anchor looks hand-tuned to this grid layout — confirm.
plt.legend(current_handles, current_labels, bbox_to_anchor=(2.2, 2.2))
plt.savefig('Interesting_Cancer_Genes_Correlation_Change.png', bbox_inches='tight')

In [None]:
# Change the lines to represent the statistical significance of the change in correlation.
# Also: are the original correlations statistically meaningful?