In [None]:
import pandas as pd
import plot_utils 
import numpy as np
import math
from itertools import compress

## Get the trans table

In [None]:
# Load the table from CSV; the first CSV column becomes the index.
proteomics_csv = '~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics.csv'
trans = pd.read_csv(proteomics_csv, index_col=0)
trans.head()

In [None]:
# Inspect the Python type of a single cell (third row, second column).
# A single `.iloc[row, col]` lookup replaces the chained `.iloc[2][1]`, whose
# second step relied on integer keys being treated as positions on a
# label-indexed Series (deprecated in recent pandas).
type(trans.iloc[2, 1])

## If the p-value isn't significant (greater than 0.05), replace both the p-value and the median difference with NaN

In [None]:
def significant(row, alpha=.05):
    """Blank out every p-value / median-difference pair that is not significant.

    For each of the three column pairs (suffixes ``_Brca``, ``_Endo`` and the
    unsuffixed pair), if the p-value is missing (NaN) or greater than
    ``alpha``, both the p-value and its median difference are set to NaN.
    A p-value exactly equal to ``alpha`` is kept.

    Parameters
    ----------
    row : pandas.Series (or any mapping with a ``copy`` method)
        Must contain ``P_Value_Brca``, ``Difference_In_Median_Brca``,
        ``P_Value_Endo``, ``Difference_In_Median_Endo``, ``P_Value`` and
        ``Difference_In_Median``.
    alpha : float, default 0.05
        Significance threshold.

    Returns
    -------
    A copy of ``row`` with the non-significant pairs replaced by NaN.
    The input ``row`` is left untouched.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    result = row.copy()

    for suffix in ('_Brca', '_Endo', ''):
        p_col = 'P_Value' + suffix
        diff_col = 'Difference_In_Median' + suffix
        p_value = result[p_col]

        # A missing p-value is treated like a non-significant one for every
        # pair, so a median difference never survives without a usable p-value.
        if pd.isna(p_value) or p_value > alpha:
            result[p_col] = np.nan
            result[diff_col] = np.nan

    return result
    
        

In [None]:
# Run `significant` once per row (axis='columns' is the same as axis=1).
sig = trans.apply(significant, axis='columns')
sig.head()

## Reformat the dataframe by combining like columns

In [None]:
# Reshape `sig` into long format: one frame per cancer type, each with the
# shared column names 'Difference_In_Median', 'P_Value' and 'cancer_type',
# then stack them. Columns are renamed by name rather than by position so the
# result does not depend on the column order of the CSV.
brca = (
    sig.drop(columns=['Difference_In_Median_Endo', 'P_Value_Endo', 'Difference_In_Median', 'P_Value'])
    .rename(columns={'Difference_In_Median_Brca': 'Difference_In_Median', 'P_Value_Brca': 'P_Value'})
    .assign(cancer_type="brca")
)

endo = (
    sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca', 'Difference_In_Median', 'P_Value'])
    .rename(columns={'Difference_In_Median_Endo': 'Difference_In_Median', 'P_Value_Endo': 'P_Value'})
    .assign(cancer_type="endo")
)

# The unsuffixed columns are labelled as the colon values here.
colon = (
    sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca', 'Difference_In_Median_Endo', 'P_Value_Endo'])
    .assign(cancer_type="colon")
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order (colon, endo, brca) and keeps the original index labels.
c_and_e = pd.concat([colon, endo])
df = pd.concat([c_and_e, brca])

df.head()

## Add comparison and size columns for the plot heatmap function
The comparison column is simply the index (which is the list of genes).
The size column is what the heatmap function will use to plot the significance of the p value.

In [None]:
# Expose the index as a regular column for the heat map.
df["comparison"] = df.index

# Circle size is -log(p) / 10, so a smaller p-value gives a bigger circle.
# Rows whose p-value is NaN get a NaN size.
df['size'] = -(np.log(df['P_Value']) / 10)

df.head()

## present_absent looks at the median_difference value in every cancer. 
A gene is selected only if its median difference is present (not NaN) in exactly one of the three cancers, absent (NaN) in the other two, and that one remaining median difference is greater than 2.

In [None]:
def present_absent(row, min_difference=2):
    """Return True when exactly one cancer has a large median difference.

    Looks at ``Difference_In_Median_Brca``, ``Difference_In_Median_Endo`` and
    ``Difference_In_Median`` in ``row``. The row is selected when two of the
    three values are NaN and the single remaining value is strictly greater
    than ``min_difference``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the three median-difference keys listed above.
    min_difference : float, default 2
        Threshold the single non-NaN median difference must exceed.

    Returns
    -------
    bool
        Always a plain ``bool``. The previous version fell through and
        returned ``None`` when exactly one value was present but did not
        exceed the threshold.
    """
    differences = np.array(
        [
            row['Difference_In_Median_Brca'],
            row['Difference_In_Median_Endo'],
            row['Difference_In_Median'],
        ],
        dtype=float,
    )
    present = differences[~np.isnan(differences)]

    # Two NaNs out of three means exactly one value is present.
    return bool(present.size == 1 and present[0] > min_difference)
    

#### We will apply the present_absent function to the rows of a dataframe that only contains median difference values (shown below)

In [None]:
# Keep only the median-difference columns by dropping the three p-value columns.
p_value_columns = ['P_Value_Brca', 'P_Value_Endo', 'P_Value']
medians = sig.drop(columns=p_value_columns)
medians.head()

#### present_absent_genes is a list of the genes flagged by present_absent, i.e. genes whose median difference is present (has a significant p-value) in only one cancer. There are 103 such genes.

In [None]:
# Flag each gene (row) with the result of the present_absent function.
medians["present_absent"] = medians.apply(present_absent, axis=1)

# The filtered frame gets its own name: binding it to `present_absent` would
# overwrite the function and make this cell fail when it is run a second time.
# `== True` (rather than using the column directly as a mask) also treats
# None/NaN results as "not selected".
present_absent_rows = medians.loc[medians['present_absent'] == True]
present_absent_genes = list(present_absent_rows.index)
len(present_absent_genes)

#### Use this list to select the appropriate rows from our dataframe.

In [None]:
# Keep only the rows whose gene is in present_absent_genes.
get = df['comparison'].isin(present_absent_genes)

# Take an explicit copy so that adding a column below modifies an independent
# frame instead of a slice of `df` (which triggers SettingWithCopyWarning).
corr_df = df[get].copy()

# Circle size is -log(p) / 10, so a smaller p-value gives a bigger circle.
corr_df['size'] = -(np.log(corr_df['P_Value']) / 10)
corr_df.head()

In [None]:
# Optional export of the plotted table to CSV (currently disabled).
# NOTE(review): `plot_df2` is not defined anywhere in this notebook; `corr_df`
# is presumably the intended frame. Confirm before re-enabling.
# plot_df2.to_csv('correlations.csv')

In [None]:
# Circle heat map of the selected genes: one column per cancer type, one row
# per gene, coloured by median difference, with circles sized by the 'size' column.
plot_utils.plotCircleHeatMap(
    corr_df,
    circle_var='size',
    color_var='Difference_In_Median',
    x_axis='cancer_type',
    y_axis='comparison',
    graph_height=1000,
)