In [None]:
from kinase_functions import *
from kinase_declarative import *
from sqlalchemy import create_engine, or_, and_
from sqlalchemy.orm import sessionmaker
from pprint import pprint
import csv #loading csv package
import pandas as pd #loading pandas package
import re #loading regex package
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math
import plotly.express as px
from scipy.stats import norm
from scipy.stats import hypergeom 
from bokeh.models import Span
from bokeh.resources import CDN
from bokeh.embed import file_html, components
from bokeh.plotting import figure, ColumnDataSource, output_notebook, show, output_file
from bokeh.models import HoverTool, WheelZoomTool, PanTool, BoxZoomTool, ResetTool, TapTool, SaveTool
from bokeh.palettes import brewer
from bokeh.resources import CDN
from bokeh.embed import file_html

In [36]:
# Point SQLAlchemy at the SQLite file kinase_database.db (relative path, so it is
# resolved against the working directory).
engine = create_engine("sqlite:///kinase_database.db")
# NOTE(review): Base is not defined in this notebook; presumably it comes from the
# star import of kinase_declarative -- confirm.
Base.metadata.bind = engine
# sessionmaker() returns a session factory (bound to the name `session` here);
# calling the factory creates the session object `s`.
session = sessionmaker(bind=engine)
s = session()


def ReadDataInput(userDataInput):
    """Load a tab-separated phosphoproteomics table and split out the phosphosite.

    Parameters
    ----------
    userDataInput : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. Only the first seven columns
        are used. The first one must be named ``Substrate`` and hold labels of
        the form ``SUBSTRATE(SITE)``; columns 1-6 must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        The first seven columns, with columns 1-6 cast to float, ``Substrate``
        reduced to the text before the first parenthesis and a new
        ``Phosphosite`` column holding the text between the first pair of
        parentheses. Rows with a missing value in any column are dropped,
        which includes rows whose label has no parentheses at all.
    """
    df_input_original = pd.read_csv(userDataInput, sep='\t')

    # Only the first 7 columns carry values; the rest of the export is empty.
    # Copy so the assignments below never write into a slice of the raw frame.
    input_original_subset = df_input_original.iloc[:, 0:7].copy()

    # Cast by column label. Assigning the converted values back through .iloc
    # writes into the existing columns and can leave their old dtype in place.
    numeric_columns = input_original_subset.columns[1:7]
    input_original_subset = input_original_subset.astype(
        {column: float for column in numeric_columns}
    )

    # Group 1: text before the first parenthesis. Group 2: text between that
    # parenthesis and the next one. Group 2 is optional, so a file in which no
    # label has parentheses yields NaN phosphosites instead of an IndexError.
    label_parts = input_original_subset['Substrate'].str.extract(
        r'^([^()]*)(?:[()]([^()]*))?', expand=True
    )
    input_original_subset['Substrate'] = label_parts[0]
    input_original_subset['Phosphosite'] = label_parts[1]

    # Remove any row that has NaN in any column.
    return input_original_subset.dropna()

# Load the dataset; 'az20.txt' is a relative path, resolved against the working directory.
input_original_subset=ReadDataInput('az20.txt')

##Carry out -log10 transform on P values
def NegLog10(input_original_subset):
    """Append a -log10 transform of the p-value column to the frame.

    The p-values are read by position (column index 4), cast to float64 and
    transformed. The result is stored under "-Log10 Corrected P-Value"; note
    that no multiple-testing correction is applied in this function.

    The frame is modified in place and the same object is returned, so the
    caller's variable gains the new column as well.
    """
    p_values = input_original_subset.iloc[:, 4].astype(np.float64)
    input_original_subset["-Log10 Corrected P-Value"] = np.negative(np.log10(p_values))
    return input_original_subset

# NegLog10 adds its column in place and returns the object it was given, so
# NegLog10KinaseDF is another name for input_original_subset, not a copy.
NegLog10KinaseDF=NegLog10(input_original_subset)
print (NegLog10KinaseDF)
#print (df_final3)

#Calculate log2FC and add as new column
def log2FC(NegLog10KinaseDF):
    """Append the log2 of the fold-change column as "Log2 Fold Change".

    The fold change is read by position (column index 3). The frame is
    modified in place and the same object is returned.
    """
    fold_change = NegLog10KinaseDF.iloc[:, 3]
    NegLog10KinaseDF["Log2 Fold Change"] = np.log2(fold_change)
    return NegLog10KinaseDF
# log2FC also modifies its argument in place and returns it, so log2FCKinase,
# NegLog10KinaseDF and input_original_subset all refer to the same DataFrame.
log2FCKinase= log2FC(NegLog10KinaseDF)

log2FCKinase

        Substrate  control_mean     AZ20_mean  AZ20_fold_change  AZ20_p-value  \
0      1A24_HUMAN  1.527934e+07  2.643439e+07          1.730074      0.554298   
1      1A24_HUMAN  1.527934e+07  2.643439e+07          1.730074      0.554298   
2      1B13_HUMAN  2.706473e+07  2.779116e+07          1.026841      0.962084   
3      1B39_HUMAN  1.158130e+09  1.173595e+09          1.013353      0.958301   
4      1B39_HUMAN  1.158130e+09  1.173595e+09          1.013353      0.958301   
...           ...           ...           ...               ...           ...   
15517         ZYX  1.794139e+08  3.340900e+08          1.862118      0.146616   
15518         ZYX  4.364455e+07  5.227431e+07          1.197728      0.599593   
15519       ZZEF1  5.725008e+07  5.304806e+07          0.926602      0.927833   
15520        ZZZ3  1.313713e+08  9.847906e+07          0.749624      0.315054   
15523        ZZZ3  2.989399e+08  2.667268e+08          0.892242      0.485882   

       AZ20_ctrlCV  AZ20_tr

Unnamed: 0,Substrate,control_mean,AZ20_mean,AZ20_fold_change,AZ20_p-value,AZ20_ctrlCV,AZ20_treatCV,Phosphosite,-Log10 Corrected P-Value,Log2 Fold Change
0,1A24_HUMAN,1.527934e+07,2.643439e+07,1.730074,0.554298,1.280092,0.902944,S356,0.256257,0.790834
1,1A24_HUMAN,1.527934e+07,2.643439e+07,1.730074,0.554298,1.280092,0.902944,S359,0.256257,0.790834
2,1B13_HUMAN,2.706473e+07,2.779116e+07,1.026841,0.962084,0.724637,0.580207,,0.016787,0.038212
3,1B39_HUMAN,1.158130e+09,1.173595e+09,1.013353,0.958301,0.128036,0.399220,M1,0.018498,0.019137
4,1B39_HUMAN,1.158130e+09,1.173595e+09,1.013353,0.958301,0.128036,0.399220,M4,0.018498,0.019137
...,...,...,...,...,...,...,...,...,...,...
15517,ZYX,1.794139e+08,3.340900e+08,1.862118,0.146616,0.687707,0.308358,T270,0.833817,0.896945
15518,ZYX,4.364455e+07,5.227431e+07,1.197728,0.599593,0.606458,0.099992,T274,0.222144,0.260301
15519,ZZEF1,5.725008e+07,5.304806e+07,0.926602,0.927833,0.913163,1.067354,S1464,0.032530,-0.109978
15520,ZZZ3,1.313713e+08,9.847906e+07,0.749624,0.315054,0.362038,0.212820,S113,0.501616,-0.415761


In [16]:
def Substrate_Phosphosite_List_Dict(log2FCKinase):
    """Return one ``[substrate, phosphosite]`` list per row of the frame.

    Despite the name, the result is a list of two-element lists (not a dict),
    built from the 'Substrate' and 'Phosphosite' columns in row order.
    """
    row_count = len(log2FCKinase)
    paired_columns = zip(log2FCKinase['Substrate'], log2FCKinase['Phosphosite'], range(row_count))
    return [[substrate, phosphosite] for substrate, phosphosite, _ in paired_columns]

# Build the [substrate, phosphosite] pairs that drive the kinase lookup.
Sub_Phospho_list=Substrate_Phosphosite_List_Dict(input_original_subset)


def Fetch_Kinase(Sub_Phospho_List):
    """Look up kinase information for every ``[substrate, phosphosite]`` pair.

    Parameters
    ----------
    Sub_Phospho_List : iterable of two-element sequences
        Each item is ``[substrate, phosphosite]``.

    Returns
    -------
    list
        One lookup result per input pair, in input order. Each result is
        whatever ``get_kinase_substrate_phosphosite`` returns.
        NOTE(review): that helper is not defined in this notebook; presumably
        it comes from the star import of kinase_functions -- confirm.
    """
    # Iterate the argument. The previous version looped over the notebook-level
    # variable Sub_Phospho_list (lower-case "l") and silently ignored its input.
    return [
        get_kinase_substrate_phosphosite(pair[0], pair[1])
        for pair in Sub_Phospho_List
    ]

# One lookup result per [substrate, phosphosite] pair, in the same order as Sub_Phospho_list.
fetched_kinase=Fetch_Kinase(Sub_Phospho_list)

def convert_dict_df(fetched_kinase, substrate_df=None):
    """Attach kinase lookup results to the substrate table, one row per kinase.

    Parameters
    ----------
    fetched_kinase : list
        One lookup result per row of the substrate table, in row order. Each
        result is expanded with ``pd.Series``, and the code below requires the
        expansion to yield 'substrate', 'phosphosite' and 'kinase' columns.
        NOTE(review): presumably each result is a dict with those keys and a
        list under 'kinase' -- confirm against get_kinase_substrate_phosphosite.
    substrate_df : pandas.DataFrame, optional
        Table to annotate. Defaults to the notebook-level
        ``input_original_subset``, which is what this function always used
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The substrate table plus a 'kinase' column, with rows containing any
        missing value removed and list-valued 'kinase' entries exploded into
        one row each.

    Side effects
    ------------
    A 'Kinase' column holding the raw lookup results is written into
    ``substrate_df`` in place (unchanged from the original behaviour).
    """
    if substrate_df is None:
        substrate_df = input_original_subset

    substrate_df['Kinase'] = fetched_kinase

    # Expand each lookup result into its own columns, then drop the raw column.
    lookup_columns = substrate_df['Kinase'].apply(pd.Series)
    df_final = pd.concat([substrate_df, lookup_columns], axis=1).drop('Kinase', axis=1)

    # The lower-case substrate/phosphosite columns from the lookup are not kept.
    df_final1 = df_final.drop(['substrate', 'phosphosite'], axis=1)

    df_final2 = df_final1.dropna()
    df_final3 = df_final2.explode('kinase')
    return df_final3

# Kinase-annotated table: one row per (substrate row, kinase) combination.
df_final3=convert_dict_df(fetched_kinase)

# KSEA "Mean" Method

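mS = mean log2 fold change of the kinase's substrate set

mP = mean log2 fold change of the data set

m = size of the substrate set

delta = standard deviation of the log2 fold change of the complete data set

The z-score of each kinase is $z = \dfrac{(mS - mP)\,\sqrt{m}}{\delta}$.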

In [24]:
def KSEA_Mean(df_final3):
    """Compute the KSEA "mean" statistics for every kinase in the table.

    Parameters
    ----------
    df_final3 : pandas.DataFrame
        Needs 'kinase', 'Phosphosite' and 'Log2 Fold Change' columns.

    Returns
    -------
    pandas.DataFrame
        One row per kinase with the columns
        kinase, mS, mP, m, Delta, Z_Scores, P_value, Enrichment where
        * mS is the mean log2 fold change of the kinase's rows,
        * mP and Delta are the mean and standard deviation of the log2 fold
          change over all rows of ``df_final3`` (same value on every row),
        * m is the number of rows in the kinase's group,
        * Z_Scores is (mS - mP) * sqrt(m) / Delta,
        * P_value is the upper-tail standard-normal probability of |Z|
          (one-sided),
        * Enrichment is mS / mP.
    """
    fold_changes = df_final3['Log2 Fold Change']
    per_kinase = df_final3.groupby('kinase')

    # Background statistics over every row that was passed in.
    mP = fold_changes.mean()
    delta = fold_changes.std()

    # Per-kinase statistics; both Series share the sorted kinase index.
    mS = per_kinase['Log2 Fold Change'].mean()
    m = per_kinase['Phosphosite'].size()

    z_scores = (mS - mP) * np.sqrt(m) / delta
    p_means = norm.sf(np.abs(z_scores.to_numpy()))
    enrichment = mS / mP

    calculations_df = pd.DataFrame({
        'mS': mS,
        'mP': mP,
        'm': m,
        'Delta': delta,
        'Z_Scores': z_scores,
        "P_value": p_means,
        "Enrichment": enrichment,
    })
    return calculations_df.reset_index(level=['kinase'])
# Run the KSEA mean calculation on the kinase-annotated rows and display the result.
calculations_df=KSEA_Mean(df_final3)
calculations_df

Unnamed: 0,kinase,mS,mP,m,Delta,Z_Scores,P_value,Enrichment
0,AAK1,-0.294285,0.079763,1,1.157322,-0.323201,0.373271,-3.689514
1,ABL1,0.169503,0.079763,3,1.157322,0.134306,0.446580,2.125096
2,AKT1,-0.358166,0.079763,34,1.157322,-2.206422,0.013677,-4.490402
3,AKT2,-0.538991,0.079763,4,1.157322,-1.069285,0.142471,-6.757442
4,ARAF,-0.011380,0.079763,3,1.157322,-0.136403,0.445751,-0.142667
...,...,...,...,...,...,...,...,...
134,UHMK1,0.136757,0.079763,3,1.157322,0.085298,0.466012,1.714549
135,ULK1,-0.304190,0.079763,1,1.157322,-0.331760,0.370035,-3.813695
136,VRK1,0.652819,0.079763,2,1.157322,0.700259,0.241883,8.184532
137,WNK1,-1.850103,0.079763,1,1.157322,-1.667527,0.047705,-23.195128


In [53]:
# Display only kinases whose substrate set has more than 3 rows (m > 3).
calculations_df[calculations_df['m']>3]

Unnamed: 0,kinase,mS,mP,m,Delta,Z_Scores,P_value,Enrichment
2,AKT1,-0.358166,0.079763,34,1.157322,-2.206422,0.013677,-4.490402
3,AKT2,-0.538991,0.079763,4,1.157322,-1.069285,0.142471,-6.757442
5,ATM,-0.248681,0.079763,10,1.157322,-0.897441,0.184742,-3.117761
6,ATR,-0.090599,0.079763,4,1.157322,-0.294406,0.384224,-1.135857
7,AURKA,0.575554,0.079763,9,1.157322,1.285185,0.099364,7.215836
8,AURKB,-0.085562,0.079763,19,1.157322,-0.622674,0.26675,-1.07271
15,CAMK2A,0.350294,0.079763,5,1.157322,0.522695,0.300593,4.391709
19,CDC7,0.616326,0.079763,4,1.157322,0.927249,0.176899,7.727003
20,CDK1,0.093789,0.079763,105,1.157322,0.124193,0.450581,1.175856
23,CDK2,0.226249,0.079763,92,1.157322,1.214051,0.112364,2.83653


In [37]:
def VolcanoPlot_Sub(kinaseList):
    """Show an interactive Bokeh volcano plot of substrate-level changes.

    Parameters
    ----------
    kinaseList : pandas.DataFrame
        Needs 'Log2 Fold Change' and '-Log10 Corrected P-Value' columns for
        the axes; 'Substrate' and 'Phosphosite' are referenced by the hover
        tooltips.

    Returns
    -------
    None
        The figure is rendered inline via ``output_notebook()`` / ``show()``.

    Side effects
    ------------
    A 'color' column is written into ``kinaseList`` in place and is fully
    recomputed on every call: "Green" for p < 0.05 with log2 fold change > 1,
    "Red" for p < 0.05 with log2 fold change < -1, "grey" for everything else.
    """
    FC_T = 1                   # log2 fold-change cut-off, up-regulated
    FC_TN = -1                 # log2 fold-change cut-off, down-regulated
    PV_T = -np.log10(0.05)     # p = 0.05 on the -log10 axis

    significant = kinaseList['-Log10 Corrected P-Value'] > PV_T
    upregulated = significant & (kinaseList['Log2 Fold Change'] > FC_T)
    downregulated = significant & (kinaseList['Log2 Fold Change'] < FC_TN)

    # Start from the default and overwrite the two highlighted classes. The
    # earlier df['color'].fillna(..., inplace=True) acted on a column selection
    # (a no-op under pandas Copy-on-Write) and kept stale colours on re-runs.
    kinaseList['color'] = 'grey'
    kinaseList.loc[upregulated, 'color'] = "Green"
    kinaseList.loc[downregulated, 'color'] = "Red"

    output_notebook()

    source = ColumnDataSource(kinaseList)

    hover = HoverTool(tooltips=[
                                ('Substrate', '@Substrate'),
                                ('Phosphosite', '@Phosphosite'),
                                ('Fold_change', '@{Log2 Fold Change}'),
                                ('p_value', '@{-Log10 Corrected P-Value}')])

    tools = [hover, WheelZoomTool(), PanTool(), BoxZoomTool(), ResetTool(), SaveTool()]

    p = figure(tools=tools, title="Volcano Plot", plot_width=700, plot_height=400,
               toolbar_location='right', toolbar_sticky=False)

    p.scatter(x='Log2 Fold Change', y='-Log10 Corrected P-Value', source=source, size=10, color='color')

    # Dashed guide lines: one horizontal at the p-value cut-off and two
    # vertical ones at the fold-change cut-offs.
    for location, dimension in ((PV_T, 'width'), (FC_T, 'height'), (FC_TN, 'height')):
        p.add_layout(Span(location=location, dimension=dimension,
                          line_color='black', line_dash='dashed', line_width=3))

    show(p)
# Volcano plot of every substrate row; the call also writes a 'color' column into log2FCKinase.
VolcanoPlot_Sub(log2FCKinase)

In [35]:
def VolcanoPlot(kinaseList):
    """Show an interactive Bokeh volcano plot of kinase-annotated substrates.

    Parameters
    ----------
    kinaseList : pandas.DataFrame
        Needs 'Log2 Fold Change' and '-Log10 Corrected P-Value' columns for
        the axes; 'kinase', 'Substrate' and 'Phosphosite' are referenced by
        the hover tooltips.

    Returns
    -------
    The Bokeh figure that was shown, so the caller can keep or re-show it.
    (Earlier versions returned None.)

    Side effects
    ------------
    A 'color' column is written into ``kinaseList`` in place and is fully
    recomputed on every call: "Green" for p < 0.05 with log2 fold change > 1,
    "Red" for p < 0.05 with log2 fold change < -1, "grey" for everything else.
    The figure is rendered inline via ``output_notebook()`` / ``show()``.
    """
    FC_T = 1                   # log2 fold-change cut-off, up-regulated
    FC_TN = -1                 # log2 fold-change cut-off, down-regulated
    PV_T = -np.log10(0.05)     # p = 0.05 on the -log10 axis

    significant = kinaseList['-Log10 Corrected P-Value'] > PV_T
    upregulated = significant & (kinaseList['Log2 Fold Change'] > FC_T)
    downregulated = significant & (kinaseList['Log2 Fold Change'] < FC_TN)

    # Start from the default and overwrite the two highlighted classes. The
    # earlier df['color'].fillna(..., inplace=True) acted on a column selection
    # (a no-op under pandas Copy-on-Write) and kept stale colours on re-runs.
    kinaseList['color'] = 'grey'
    kinaseList.loc[upregulated, 'color'] = "Green"
    kinaseList.loc[downregulated, 'color'] = "Red"

    output_notebook()

    source = ColumnDataSource(kinaseList)

    # Tooltip fields are looked up by exact column name. The annotated table
    # carries a lower-case 'kinase' column, so '@Kinase' never resolved.
    hover = HoverTool(tooltips=[('Kinase', '@kinase'),
                                ('Substrate', '@Substrate'),
                                ('Phosphosite', '@Phosphosite'),
                                ('Fold_change', '@{Log2 Fold Change}'),
                                ('p_value', '@{-Log10 Corrected P-Value}')])

    tools = [hover, WheelZoomTool(), PanTool(), BoxZoomTool(), ResetTool(), SaveTool()]

    p = figure(tools=tools, title="Volcano Plot", plot_width=700, plot_height=400,
               toolbar_location='right', toolbar_sticky=False)

    p.scatter(x='Log2 Fold Change', y='-Log10 Corrected P-Value', source=source, size=10, color='color')

    # Dashed guide lines: one horizontal at the p-value cut-off and two
    # vertical ones at the fold-change cut-offs.
    for location, dimension in ((PV_T, 'width'), (FC_T, 'height'), (FC_TN, 'height')):
        p.add_layout(Span(location=location, dimension=dimension,
                          line_color='black', line_dash='dashed', line_width=3))

    show(p)
    return p
# Volcano plot restricted to the kinase-annotated rows in df_final3.
volcano_plot=VolcanoPlot(df_final3)


In [43]:
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

# Horizontal bar chart of KSEA enrichment (mS/mP) per kinase.

MIN_SUBSTRATES = 4      # kinases with fewer substrate rows are left out of the chart
P_VALUE_CUTOFF = 0.05   # bars below this p-value are highlighted

# .copy() gives an independent frame, so adding the colour column does not
# write into a slice of calculations_df (SettingWithCopyWarning).
reduc_calculations_df=calculations_df[calculations_df['m'] >= MIN_SUBSTRATES].copy()

# Default every bar to black, then highlight the significant ones. Setting the
# default first means p == 0.05 (and a missing p-value) still gets a colour;
# the previous "< 0.05" / "> 0.05" pair left such rows without one.
reduc_calculations_df['color'] = "Black"
reduc_calculations_df.loc[reduc_calculations_df['P_value'] < P_VALUE_CUTOFF, 'color'] = "Orange"

kinase=reduc_calculations_df['kinase']

enrichment=reduc_calculations_df['Enrichment']
source = ColumnDataSource(reduc_calculations_df)

# Every tooltip value needs the '@' prefix to be read from the data source;
# without it the literal text is displayed.
hover = HoverTool(tooltips=[('Enrichment','@Enrichment'),
                                ('Number of Substrates', '@m'),
                                ('P-value', '@P_value')])

tools = [hover, WheelZoomTool(), PanTool(), BoxZoomTool(), ResetTool(), SaveTool()]
# The x range is padded by 5 units on both sides of the observed enrichment values.
p = figure(tools=tools, y_range=kinase, x_range=((enrichment.min()-5), (enrichment.max()+5)), plot_width=600, plot_height=800, toolbar_location=None,
           title="Kinase Substrate Enrichment",)
p.hbar(y="kinase", left=0, right='Enrichment', height=0.3, color= 'color', source=source)

p.ygrid.grid_line_color = None
p.xaxis.axis_label = "Enrichment (mS/mP)"
p.yaxis.axis_label = "Kinase"
p.outline_line_color = None

show(p)
#log2FCKinase

In [45]:
# Column 7 holds the phosphosite label; cast to text so it can be searched.
phosphosite_labels = log2FCKinase.iloc[:, 7].astype(str)

# Case-insensitive match on the residue letter anywhere in the label.
is_ser_site = phosphosite_labels.str.contains("S", case=False)  # Ser.
is_thr_site = phosphosite_labels.str.contains("T", case=False)  # Thr.
is_tyr_site = phosphosite_labels.str.contains("Y", case=False)  # Tyr.

# Number of matching rows per residue.
phospho_ser = sum(is_ser_site)
phospho_thr = sum(is_thr_site)
phospho_tyr = sum(is_tyr_site)

# The matching rows themselves.
phos_ser_list=log2FCKinase[is_ser_site]
phos_thr_list=log2FCKinase[is_thr_site]
phos_tyr_list=log2FCKinase[is_tyr_site]

print (phos_ser_list)

        Substrate  control_mean     AZ20_mean  AZ20_fold_change  AZ20_p-value  \
0      1A24_HUMAN  1.527934e+07  2.643439e+07          1.730074      0.554298   
1      1A24_HUMAN  1.527934e+07  2.643439e+07          1.730074      0.554298   
7            AAAS  3.886162e+09  4.023860e+09          1.035433      0.798476   
9           AAGAB  1.237204e+07  5.187831e+06          0.419319      0.532084   
10          AAGAB  1.398521e+07  5.187831e+06          0.370951      0.427256   
...           ...           ...           ...               ...           ...   
15515         ZYX  1.591670e+09  1.325098e+09          0.832521      0.224035   
15516         ZYX  1.858139e+10  2.262601e+10          1.217670      0.059541   
15519       ZZEF1  5.725008e+07  5.304806e+07          0.926602      0.927833   
15520        ZZZ3  1.313713e+08  9.847906e+07          0.749624      0.315054   
15523        ZZZ3  2.989399e+08  2.667268e+08          0.892242      0.485882   

       AZ20_ctrlCV  AZ20_tr

In [50]:
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6

# Residue-level summary: for serine, threonine and tyrosine sites, count how
# many are significantly up-regulated, significantly down-regulated, or not
# significant, and draw the counts as a grouped bar chart.

# '-Log10 Corrected P-Value' stores -log10(p), so p < 0.05 corresponds to
# values ABOVE -log10(0.05) (about 1.30). The earlier comparison against
# np.log(0.05) (natural log, about -3.0) classed every site as significant.
SIG_NEG_LOG10_P = -np.log10(0.05)

phos_ser_sig=phos_ser_list[phos_ser_list['-Log10 Corrected P-Value'] > SIG_NEG_LOG10_P]
phos_ser_nonsig=sum(phos_ser_list['-Log10 Corrected P-Value'] <= SIG_NEG_LOG10_P)

phos_tyr_sig=phos_tyr_list[phos_tyr_list['-Log10 Corrected P-Value'] > SIG_NEG_LOG10_P]
phos_tyr_nonsig=sum(phos_tyr_list['-Log10 Corrected P-Value'] <= SIG_NEG_LOG10_P)

phos_thr_sig=phos_thr_list[phos_thr_list['-Log10 Corrected P-Value'] > SIG_NEG_LOG10_P]
# Non-significant means at or below the cut-off (this line previously used ">").
phos_thr_nonsig=sum(phos_thr_list['-Log10 Corrected P-Value'] <= SIG_NEG_LOG10_P)

# Direction of change among the significant sites only (sign of the log2 fold change).
ser_upreg=sum(phos_ser_sig['Log2 Fold Change']>0)
ser_downreg=sum(phos_ser_sig['Log2 Fold Change']<0)
thr_upreg=sum(phos_thr_sig['Log2 Fold Change']>0)
thr_downreg=sum(phos_thr_sig['Log2 Fold Change']<0)
tyr_upreg=sum(phos_tyr_sig['Log2 Fold Change']>0)
tyr_downreg=sum(phos_tyr_sig['Log2 Fold Change']<0)

residues=["Serine","Threonine", "Tyrosine"]

# Every list follows the order of `residues`. The non-significant counts were
# previously [ser, ser, thr], which mislabelled threonine and tyrosine.
data = {'Residues': ["Serine", "Threonine", "Tyrosine"],
        'Upregulated': [ser_upreg, thr_upreg, tyr_upreg],
        'Downregulated': [ser_downreg, thr_downreg, tyr_downreg],
        'Nonsignificant': [phos_ser_nonsig, phos_thr_nonsig, phos_tyr_nonsig]}


output_file("bars.html")


regulation = ['Upregulated', 'Downregulated', 'Non-Significant']


# One (residue, regulation) factor per bar, residue-major, matching `counts` below.
x = [ (residue, reg) for residue in residues for reg in regulation ]
counts = sum(zip(data['Upregulated'], data['Downregulated'], data['Nonsignificant']), ()) # like an hstack

source = ColumnDataSource(data=dict(x=x, counts=counts))

p = figure(x_range=FactorRange(*x), plot_height=500, title="Residue phosphorylation",
           toolbar_location=None, tools="")

# start=1, end=2 makes the colour mapper look only at the regulation part of each factor.
p.vbar(x='x', top='counts', width=0.9, source=source,line_color="white",
       fill_color=factor_cmap('x', palette=Spectral6, factors=regulation, start=1, end=2))
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
p.yaxis.axis_label = "Number of Residues Phosphorylated"

show(p)