In [1]:
import pandas as pd
import plot_utils 
import numpy as np
import math

  import pandas.util.testing as tm


## Get the trans table

In [2]:
# trans = pd.read_csv('~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics.csv', index_col=0)
trans = pd.read_csv('~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics_no_hotspots.csv', index_col=0)
trans.head()

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
AAGAB,-0.1854,0.983263,0.097,0.773704,,0.00646
AASDHPPT,0.5462,0.003932,-0.0634,0.405571,,0.687161
ABAT,0.092,0.374607,-0.031,0.477393,,0.012886
ABCA13,0.5804,0.038298,,,,
ABCB7,-0.2837,0.253549,0.1328,0.34907,,0.016269


## If the p value isn't significant, replace both pval and median difference with NAN

In [3]:
def significant(row):
    if row['P_Value_Brca'] > .05:
        row['P_Value_Brca'] = np.nan
        row['Difference_In_Median_Brca'] = np.nan

    if row['P_Value_Endo'] > .05:
        row['P_Value_Endo'] = np.nan
        row['Difference_In_Median_Endo'] = np.nan
        

    if row['P_Value'] > .05:
        row['P_Value'] = np.nan
        row['Difference_In_Median'] = np.nan
        
    if math.isnan(row['P_Value']):
        row['P_Value'] = np.nan
        row['Difference_In_Median'] = np.nan

    return row
    
        

In [4]:
sig = trans.apply(significant, axis=1)
sig.head()

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
AAGAB,,,,,,0.00646
AASDHPPT,0.5462,0.003932,,,,
ABAT,,,,,,0.012886
ABCA13,0.5804,0.038298,,,,
ABCB7,,,,,,0.016269


## Reformat the dataframe by combining like columns

In [5]:
brca = sig.drop(columns=['Difference_In_Median_Endo', 'P_Value_Endo','Difference_In_Median','P_Value'])
brca['cancer_type'] = "brca"
brca.columns = ['Difference_In_Median', 'P_Value','cancer_type']
brca.head()

endo = sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca','Difference_In_Median','P_Value'])
endo['cancer_type'] = "endo"
endo.columns = ['Difference_In_Median', 'P_Value','cancer_type']
endo.head()

colon = sig.drop(columns=['Difference_In_Median_Brca', 'P_Value_Brca','Difference_In_Median_Endo','P_Value_Endo'])
colon['cancer_type'] = "colon"
colon.head()

c_and_e = colon.append(endo)
df = c_and_e.append(brca)

df.head()

Unnamed: 0,Difference_In_Median,P_Value,cancer_type
AAGAB,,0.00646,colon
AASDHPPT,,,colon
ABAT,,0.012886,colon
ABCA13,,,colon
ABCB7,,0.016269,colon


## Add comparison and size columns for the plot heatmap function
The comparison column is simply the index (which is the list of genes).
The size column is what the heatmap function will use to plot the significance of the p value.

In [6]:
df["comparison"] = df.index
df

# log p-vals for right scale in plot (bigger circle, smaller pval)
df['size'] = df['P_Value'].apply(lambda x: -1*(np.log(x)/10))

df.head()

Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
AAGAB,,0.00646,colon,AAGAB,0.504219
AASDHPPT,,,colon,AASDHPPT,
ABAT,,0.012886,colon,ABAT,0.43516
ABCA13,,,colon,ABCA13,
ABCB7,,0.016269,colon,ABCB7,0.411849


## HasPosNeg looks at the median_difference value in every cancer. 
A gene is only selected if among all cancer types there exist a median difference of <= -.3 and a median difference >= .3

In [7]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < -0.3:
            hasNeg = True
        if item > 0.3:
            hasPos = True
           
    if hasPos & hasNeg:
        return True
    return False

#### We will apply the HasPosNeg function to the rows of a dataframe that only contains median difference values (shown below)

In [8]:
medians = sig.drop(columns= ['P_Value_Brca','P_Value_Endo','P_Value'])
medians.head()

Unnamed: 0,Difference_In_Median_Brca,Difference_In_Median_Endo,Difference_In_Median
AAGAB,,,
AASDHPPT,0.5462,,
ABAT,,,
ABCA13,0.5804,,
ABCB7,,,


#### pos_and_neg_genes is a list that contains the genes that have positive and negative median values. There are 39 such genes.

In [9]:
medians["Pos_Neg"] = medians.apply(HasPosNeg, axis = 1)
pos_and_neg = medians.loc[medians['Pos_Neg'] == True]
pos_and_neg_genes = list(pos_and_neg.index)
len(pos_and_neg_genes)

5

#### Use this list to select the appropriate rows from our dataframe.

In [10]:
get = df['comparison'].isin(pos_and_neg_genes)
corr_df = df[get]
corr_df['size'] = corr_df['P_Value'].apply(lambda x: -1*(np.log(x)/10))
corr_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
ACO2,,,colon,ACO2,
CKM,,,colon,CKM,
COL1A1,,,colon,COL1A1,
COL1A2,,,colon,COL1A2,
MRPL12,,,colon,MRPL12,


In [11]:
corr_df.head()


Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
ACO2,,,colon,ACO2,
CKM,,,colon,CKM,
COL1A1,,,colon,COL1A1,
COL1A2,,,colon,COL1A2,
MRPL12,,,colon,MRPL12,


In [12]:
# pos_and_neg.to_csv('correlations_medians.csv')

In [13]:
# plot_utils.plotCircleHeatMap(corr_df, circle_var = 'size', color_var='Difference_In_Median', x_axis= 'cancer_type', y_axis = 'comparison')
corr_df

Unnamed: 0,Difference_In_Median,P_Value,cancer_type,comparison,size
ACO2,,,colon,ACO2,
CKM,,,colon,CKM,
COL1A1,,,colon,COL1A1,
COL1A2,,,colon,COL1A2,
MRPL12,,,colon,MRPL12,
ACO2,0.309,0.035483,endo,ACO2,0.333869
CKM,0.981901,,endo,CKM,
COL1A1,-1.011,0.005557,endo,COL1A1,0.519268
COL1A2,-0.532,0.012314,endo,COL1A2,0.4397
MRPL12,0.363,0.025534,endo,MRPL12,0.366773


In [14]:
plot_utils.plotCircleHeatMap(corr_df, circle_var = 'P_Value', color_var='Difference_In_Median', y_axis= 'comparison', x_axis = 'cancer_type',
                   plot_width=700)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = (df["size2"])*3


In [15]:
# #genes that are significant in every cancer
get = sig.index.isin(pos_and_neg_genes)
in_all = sig[get]
in_all = in_all.dropna()
print("In every cancer:", list(in_all.index))

#look at the ones in colon
get = df['comparison'].isin(pos_and_neg_genes)
colon = df[get]
colon = colon.loc[colon["cancer_type"]=='colon']
colon = colon.dropna()
print("In colon: ", list(colon.index))

In every cancer: []
In colon:  []


In [16]:
in_all = ["IDH3A", 'IDH3G']
get = corr_df['comparison'].isin(in_all)
all_sig = corr_df[get]


# log p-vals for right scale in plot (bigger circle, smaller pval)
all_sig['size'] = all_sig['P_Value'].apply(lambda x: -1*(np.log(x)/10))

# plot_utils.plotCircleHeatMap(all_sig, circle_var = 'size', color_var='Difference_In_Median', x_axis= 'cancer_type', y_axis = 'comparison')
