In [1]:
import pandas as pd
import plot_utils 
import numpy as np
import math
from itertools import compress

  import pandas.util.testing as tm


## Get the trans table

In [2]:
trans = pd.read_csv('~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics.csv', index_col=0)
trans.head()

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
A2ML1,-1.2612,0.165194,1.10585,0.037044,,
AADAT,-1.332,0.027477,-0.18535,0.690613,,
AAGAB,-0.09455,0.909622,0.2435,0.139632,0.139,0.039189
AASDHPPT,0.51445,0.00362,-0.1059,0.428884,-0.0628,0.973515
AATF,-0.16905,0.042271,-0.1817,0.915482,0.025,0.568821


In [3]:
type(trans.iloc[2][1])

numpy.float64

## If the p value isn't significant, replace both pval and median difference with NAN

In [4]:
def significant(row):
    if row['P_Value_Brca'] > .05:
        row['P_Value_Brca'] = np.nan
        row['Difference_In_Median_Brca'] = np.nan

    if row['P_Value_Endo'] > .05:
        row['P_Value_Endo'] = np.nan
        row['Difference_In_Median_Endo'] = np.nan
        

    if row['P_Value'] > .05:
        row['P_Value'] = np.nan
        row['Difference_In_Median'] = np.nan
        
    if math.isnan(row['P_Value']):
        row['P_Value'] = np.nan
        row['Difference_In_Median'] = np.nan
        
    return row
    
        

In [5]:
sig = trans.apply(significant, axis=1)
sig.head()

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
A2ML1,,,1.10585,0.037044,,
AADAT,-1.332,0.027477,,,,
AAGAB,,,,,0.139,0.039189
AASDHPPT,0.51445,0.00362,,,,
AATF,-0.16905,0.042271,,,,


## Reformat the dataframe by combining like columns

In [6]:
brca = sig.drop(columns=['Difference_In_Median_Endo', 'P_Value_Endo','Difference_In_Median','P_Value'])
brca['cancer_type'] = "brca"
brca.columns = ['Difference_In_Median', 'P_Value','cancer_type']
brca.head()

endo = sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca','Difference_In_Median','P_Value'])
endo['cancer_type'] = "endo"
endo.columns = ['Difference_In_Median', 'P_Value','cancer_type']
endo.head()

colon = sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca','Difference_In_Median_Endo','P_Value_Endo'])
colon['cancer_type'] = "colon"
colon.head()

c_and_e = colon.append(endo)
df = c_and_e.append(brca)

df.head()

Unnamed: 0,Difference_In_Median,P_Value,cancer_type
A2ML1,,,colon
AADAT,,,colon
AAGAB,0.139,0.039189,colon
AASDHPPT,,,colon
AATF,,,colon


## Add comparison and size columns for the plot heatmap function
The comparison column is simply the index (which is the list of genes).
The size column is what the heatmap function will use to plot the significance of the p value.

In [7]:
df["comparison"] = df.index
df

# log p-vals for right scale in plot (bigger circle, smaller pval)
df['size'] = df['P_Value'].apply(lambda x: -1*(np.log(x)/10))

df.head()

Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
A2ML1,,,colon,A2ML1,
AADAT,,,colon,AADAT,
AAGAB,0.139,0.039189,colon,AAGAB,0.323937
AASDHPPT,,,colon,AASDHPPT,
AATF,,,colon,AATF,


## present_absent looks at the median_difference value in every cancer. 
A gene is only selected if among there are two cancers that have a median difference present in two cancers and absent in the third.

In [8]:
def present_absent(row):
    absent_in_one = False
    brca = row['Difference_In_Median_Brca']
    endo = row['Difference_In_Median_Endo']
    colon = row['Difference_In_Median']
    
    boolean_isNAN = np.isnan([brca, endo, colon])
    how_many_are_NAN = sum(boolean_isNAN)
    if how_many_are_NAN == 2:
        only_in_one =  True
        the_cancer = list(compress([brca, endo, colon], ~boolean_isNAN))
#         import pdb; pdb.set_trace()
        if (the_cancer[0] > 1):# | (the_cancer[0] < -1):
            return True
#         elif (the_cancer[0] < 5):
#             return True
        
    else: return False
    

#### We will apply the present_absent function to the rows of a dataframe that only contains median difference values (shown below)

In [9]:
medians = sig.drop(columns= ['P_Value_Brca','P_Value_Endo','P_Value'])
medians.head()

Unnamed: 0,Difference_In_Median_Brca,Difference_In_Median_Endo,Difference_In_Median
A2ML1,,1.10585,
AADAT,-1.332,,
AAGAB,,,0.139
AASDHPPT,0.51445,,
AATF,-0.16905,,


#### present_absent_genes is a list that contains the genes that have significant P values in only two cancers. There are 103 such genes.

In [10]:
medians["present_absent"] = medians.apply(present_absent, axis = 1)
present_absent = medians.loc[medians['present_absent'] == True]
present_absent_genes = list(present_absent.index)
len(present_absent_genes)

71

#### Use this list to select the appropriate rows from our dataframe.

In [11]:
get = df['comparison'].isin(present_absent_genes)
corr_df = df[get]
corr_df['size'] = corr_df['P_Value'].apply(lambda x: -1*(np.log(x)/10))
corr_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
A2ML1,,,colon,A2ML1,
ACVR1,,,colon,ACVR1,
ADAMTS12,,,colon,ADAMTS12,
ANXA8,,,colon,ANXA8,
ARL4C,,,colon,ARL4C,


In [18]:
# present_absent.to_csv('hotspot_higher_in_one_cancer.csv')

In [13]:
plot_utils.plotCircleHeatMap(corr_df, circle_var = 'size', color_var='Difference_In_Median', 
                             x_axis= 'cancer_type', y_axis = 'comparison', 
                             graph_height=1500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = np.where(df[circle_var]<0, np.abs(df[circle_var]), df[circle_var])*50
