In [None]:
#initialize stuff

!ls
!mkdir data >> /dev/null


# Test scoring

In [1]:
from kinaid.matching import PWM_Matrices,Scoring,PeptideBackground
import os
import pandas as pd
from tqdm.notebook import tqdm_notebook
from kinaid.utility import DefaultConfiguration
from kinaid.utility import Utility
import numpy as np
from kinaid.session import Session
import sys
from kinaid.ortholog import OrthologManager
from dash import Dash, html
import dash_cytoscape as cyto



In [2]:
# Run kinaid's default set-up.
# NOTE(review): judging by the recorded output of this cell, the call loads the ST/Y matrices,
# builds the scoring objects and the per-organism ortholog databases, and queries the UniProt
# ID-mapping service (which answered 404 in the recorded run) — confirm against kinaid.utility.
DefaultConfiguration()


Loading ST matrices
Loading Y matrices (w/ non-canonical)
Creating scoring objects
Dual specificity kinases
{'Q16654', 'Q15118', 'Q13873'}
Ortholog database for mouse already exists
Ortholog database for fly already exists
Ortholog database for worm already exists
Ortholog database for yeast already exists
Ortholog database for zebrafish already exists
orthologs/mouse_10090_orthologs.tsv : 0
orthologs/fly_7227_orthologs.tsv : 11
orthologs/worm_6239_orthologs.tsv : 33
orthologs/yeast_4932_orthologs.tsv : 10
orthologs/zebrafish_7955_orthologs.tsv : 0
Final ortholog database for mouse already exists
Final ortholog database for fly already exists
Final ortholog database for worm already exists
Final ortholog database for yeast already exists
UniProt map server responded: 404
{"url":"http://rest.uniprot.org/idmapping/stream/c060f06feb6ce7e57026abaa25dfa8320c2285b8","messages":["Resource not found"]}
Retry 1
UniProt map server responded: 404
{"url":"http://rest.uniprot.org/idmapping/stream/c

In [None]:
# Input workbooks: the ST and Y kinase matrices plus the ST densitometry table.
data_dir = './data'
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')

# Build the matrix collections (debug output on) and one scorer per kinase class.
ST_matrices = PWM_Matrices(johnson_ST_matrices_file, debug=True)
ST_matrices.add_densitometry(densitometry_file)

Y_matrices = PWM_Matrices(johnson_Y_matrices_file, debug=True)
Y_matrices_ncon = PWM_Matrices(johnson_Y_matrices_file, debug=True)

st_scoring = Scoring(ST_matrices)
y_scoring = Scoring(Y_matrices)

# (scorer, peptide) pairs pushed through clean_sequence, in the order they are printed.
# The peptides vary in length and in whether they carry a '*' marker.
clean_sequence_cases = (
    (y_scoring, 'IRDGGPYGGLMPD'),
    (y_scoring, 'RDGGPYGGLMP'),
    (st_scoring, 'RDGGPSGGLM'),
    (st_scoring, 'GPTSGG'),
    (y_scoring, 'GPSYGG'),
    (y_scoring, 'RDGGPY*GGLMP'),
    (st_scoring, 'RDTGPS*GGLM'),
    (st_scoring, 'GPT*SGG'),
    (y_scoring, 'GPSY*GG'),
    (y_scoring, 'GPSGGY*'),
)
for scorer, test_seq in clean_sequence_cases:
    print(scorer.clean_sequence(test_seq))

# Score one peptide per kinase class exactly as given (mode='as_is'), on the log scale.
print(y_scoring.score_peptide('GPSGGY_____', kinase='BLK', mode='as_is', log_score=True))
print(st_scoring.score_peptide('RDGGPSGGLM', kinase='ERK2', mode='as_is', log_score=True))

In [None]:
# Background score tables, one per kinase class.
ochoa_background_file = os.path.join(data_dir, 'johnson_ochoa_background_wfav.tsv')
tyrosine_background_file = os.path.join(data_dir, 'johnson_tyrosine_background_wfav.tsv')

ST_Background = PeptideBackground(ochoa_background_file)
Y_Background = PeptideBackground(tyrosine_background_file)

# Scores for the same two example peptides (no log_score flag here).
score1 = y_scoring.score_peptide('GPSGGY_____', kinase='BLK', mode='as_is')
score2 = st_scoring.score_peptide('RDGGPSGGLM', kinase='ERK2', mode='as_is')

# Look each score up in the background of its own kinase class: ST first, then Y.
for peptide_background, raw_score, kinase_name in (
    (ST_Background, score2, 'ERK2'),
    (Y_Background, score1, 'BLK'),
):
    print(peptide_background.get_percentile(raw_score, kinase=kinase_name))

# Offline Experiments

In [None]:
# Self-contained set-up for the offline experiments: everything this section needs is
# imported here so it can be run on a fresh kernel without the cells of the section above.
import os
import numpy as np
import pandas as pd
from kinaid.session import Session
from kinaid.matching import MatchWithMapping, Scoring, PeptideBackground, PWM_Matrices
# OrthologManager is used at the bottom of this cell; import it here rather than relying
# on the import cell of the previous section having been executed.
from kinaid.ortholog import OrthologManager
from kinaid.utility import Utility
from Bio import SeqIO as seqio


data_dir = './data'
test_dir = './test'
proteomes_dir = os.path.join(test_dir, 'proteomes')

# exist_ok makes the cell safe to re-run.
os.makedirs(test_dir, exist_ok=True)

johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')

# Matrices and scorers, one per kinase class (ST additionally gets the densitometry table).
ST_matrices = PWM_Matrices(johnson_ST_matrices_file)
ST_matrices.add_densitometry(densitometry_file)

Y_matrices = PWM_Matrices(johnson_Y_matrices_file)

st_scoring = Scoring(ST_matrices)
y_scoring = Scoring(Y_matrices)

# Background score tables used to turn scores into percentiles.
ochoa_background_file = os.path.join(data_dir, 'johnson_ochoa_background_wfav.tsv')

tyrosine_background_file = os.path.join(data_dir, 'johnson_tyrosine_background_wfav.tsv')

ST_Background = PeptideBackground(ochoa_background_file)
Y_Background = PeptideBackground(tyrosine_background_file)

# Ortholog tables are read from the 'orthologs' directory.
ortholog_manager = OrthologManager('orthologs', debug=True)


## Yeast Experiments

### Leutert, M., Barente, A.S., Fukuda, N.K. et al. The regulatory landscape of the yeast phosphoproteome. Nat Struct Mol Biol 30, 1761–1773 (2023).

In [None]:

def _centered_window(sequence, position, flank=5):
    """Return the (2*flank + 1)-residue window centred on the 1-based `position`, padded with '_' past either end of `sequence`."""
    padded = '_'*flank + sequence + '_'*flank
    # In the padded string the site sits at index flank + position - 1,
    # so a window with the site in the middle starts at index position - 1.
    start = position - 1
    return padded[start:start + 2*flank + 1]


def clean_phosphopeptides(phosphorylation_file, fasta_file, output_file):
    """
    Turn a table of phosphosites into 11-residue peptides centred on each site.

    Parameters
    ----------
    phosphorylation_file : path of a CSV with at least the columns
        'systematic_name', 'p_residue', 'p_position' (1-based), 'fc_log2' and 'adj_p_value'.
    fasta_file : path of a protein FASTA file. Records are looked up by the key that
        SeqIO.to_dict assigns them, which must match 'systematic_name'. The identifier
        written to the output is the third space-separated token of the header text before
        the first ', ' (presumably of the form 'SGDID:...' — verify against the FASTA used).
    output_file : path of the CSV to write, with columns 'SGD', 'site', 'peptide',
        'log2fc' and 'adjpvalue' (= -log10 of 'adj_p_value').

    Rows are dropped when the residue is not S/T/Y, when the protein is missing from the
    FASTA file, or when the sequence does not carry 'p_residue' at 'p_position'.
    Windows running past either end of the protein are padded with '_'.
    A header-only file is written when no row survives.
    """
    flank = 5  # residues kept on each side of the phosphosite -> 11-residue window

    exp_df = pd.read_csv(phosphorylation_file)

    #read in the fasta file
    fasta_dict = seqio.to_dict(seqio.parse(fasta_file, 'fasta'))

    #keep S/T/Y sites only; .copy() so the column assignments below act on an independent frame
    exp_df = exp_df[exp_df['p_residue'].isin(['S', 'T', 'Y'])].copy()

    systematic_name_to_SGD_dict = {name:record.description.split(', ')[0].split(' ')[2] for name,record in fasta_dict.items()}
    exp_df['SGDID'] = exp_df['systematic_name'].map(systematic_name_to_SGD_dict)

    #strip the trailing '*' (stop marker) so it is never counted as a residue
    systematic_name_to_seq_dict = {name:str(record.seq).rstrip('*') for name,record in fasta_dict.items()}
    exp_df['sequence'] = exp_df['systematic_name'].map(systematic_name_to_seq_dict)

    #remove rows with missing sequences
    exp_df = exp_df[exp_df['sequence'].notna()].copy()

    #keep only sites that lie inside the protein and carry the reported residue.
    #the lower bound matters: a position < 1 would otherwise wrap around as a negative index.
    #plain zip/list comprehensions are used instead of DataFrame.apply(axis=1), which returns
    #a DataFrame (not a Series) for an empty frame and then breaks the column assignment.
    site_is_valid = np.array(
        [
            1 <= position <= len(sequence) and sequence[int(position) - 1] == residue
            for sequence, position, residue in zip(exp_df['sequence'], exp_df['p_position'], exp_df['p_residue'])
        ],
        dtype=bool,
    )
    exp_df = exp_df.loc[site_is_valid].copy()

    #11-residue window with the phosphosite in the middle, '_'-padded at the protein ends
    exp_df['window'] = [
        _centered_window(sequence, int(position), flank)
        for sequence, position in zip(exp_df['sequence'], exp_df['p_position'])
    ]

    #report significance as -log10(adjusted p-value)
    exp_df['adj_p_value'] = -1*np.log10(exp_df['adj_p_value'])

    exp_df_filtered = exp_df[['SGDID', 'p_position', 'window', 'fc_log2', 'adj_p_value']].rename(
        columns={'SGDID':'SGD', 'p_position':'site', 'window':'peptide', 'fc_log2':'log2fc', 'adj_p_value':'adjpvalue'}
    )

    exp_df_filtered.to_csv(output_file, index=False)


In [None]:
import gzip
import shutil

published_yeast_file = 'yeast-perturbation.xlsx'
published_yeast_path = os.path.join(test_dir, published_yeast_file)

yeast_proteome_path = os.path.join(proteomes_dir, 'yeast_proteome.fasta')

os.makedirs(proteomes_dir, exist_ok=True)

if not os.path.exists(yeast_proteome_path):
    print('Downloading yeast proteome')
    compressed_proteome_path = 'temp.gz'
    # Decompress into a side file and rename it at the end, so an interrupted run never
    # leaves a truncated FASTA that later runs would mistake for a finished download.
    partial_proteome_path = yeast_proteome_path + '.part'
    try:
        Utility.download_file('http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans.fasta.gz', compressed_proteome_path)
        with gzip.open(compressed_proteome_path, 'rb') as f_in, open(partial_proteome_path, 'wb') as f_out:
            # Stream in chunks instead of reading the whole proteome into memory.
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_proteome_path, yeast_proteome_path)
    finally:
        # Remove the scratch files whether or not the download/decompression succeeded.
        for leftover_path in (compressed_proteome_path, partial_proteome_path):
            if os.path.exists(leftover_path):
                os.remove(leftover_path)


if not os.path.exists(published_yeast_path):
    print('Downloading yeast perturbation data')
    Utility.download_file('https://static-content.springer.com/esm/art%3A10.1038%2Fs41594-023-01115-3/MediaObjects/41594_2023_1115_MOESM9_ESM.xlsx',
                  published_yeast_path)


HOG1_experiment_file = os.path.join(test_dir,'HOG1_exp.csv')
SNF1_experiment_file = os.path.join(test_dir,'SNF1_exp.csv')
TOR1_experiment_file = os.path.join(test_dir,'TOR1_exp.csv')

# One entry per experiment: (treatment_id in the published sheet,
#                            intermediate per-treatment CSV, cleaned experiment file).
yeast_experiments = [
    ('KC', 'yeast_HOG1_KC_phosphopeptides.csv', HOG1_experiment_file),
    ('GL', 'yeast_SNF1_GL_phosphopeptides.csv', SNF1_experiment_file),
    ('CS18', 'yeast_TOR1_CS18_phosphopeptides.csv', TOR1_experiment_file),
]
missing_experiments = [experiment for experiment in yeast_experiments if not os.path.exists(experiment[2])]

# Only read the workbook when at least one cleaned experiment file still has to be built.
if missing_experiments:
    df = pd.read_excel(published_yeast_path, sheet_name='p_site_diff_reg')

    for treatment_id, phosphopeptides_file, experiment_file in missing_experiments:
        phosphopeptides_path = os.path.join(test_dir, phosphopeptides_file)
        df[df['treatment_id'] == treatment_id].to_csv(phosphopeptides_path, index=False)
        clean_phosphopeptides(phosphopeptides_path, yeast_proteome_path, experiment_file)






In [None]:


# Maps the roles passed to Session ('id', 'site', ...) to the column names of the experiment CSVs.
column_names_dict = dict(
    id='SGD',
    site='site',
    peptide='peptide',
    log2fc='log2fc',
    dependent='adjpvalue',
)

# Disabled code kept as a bare string (it is never executed).
# NOTE(review): looks like the earlier MatchWithMapping-based set-up — confirm it can be deleted.
"""
yeast_orthologs_file = os.path.join('orthologs', 'yeast_orthologs_final.tsv')

yeast_orthologs_df = pd.read_csv(yeast_orthologs_file, sep='\t')
yeast_orthologs_df = yeast_orthologs_df[yeast_orthologs_df['gene_id_type'] == 'SGD']

yeast_orthologs_st_df = yeast_orthologs_df[yeast_orthologs_df['kinase_type'] == 'ST']
yeast_mapping_st_dict = dict(zip(yeast_orthologs_st_df['symbol'], yeast_orthologs_st_df['kinase_name']))

yeast_orthologs_y_df = yeast_orthologs_df[yeast_orthologs_df['kinase_type'] == 'Y']
yeast_mapping_y_dict = dict(zip(yeast_orthologs_y_df['symbol'], yeast_orthologs_y_df['kinase_name']))

print(yeast_mapping_y_dict.values())
print(y_scoring._kinase_names)


ST_matching = MatchWithMapping(st_scoring, ST_Background, yeast_mapping_st_dict)
Y_matching = MatchWithMapping(y_scoring, Y_Background, yeast_mapping_y_dict)
"""

# Per-kinase-class lookups handed to Session.
background = dict(ST=ST_Background, Y=Y_Background)
scoring = dict(ST=st_scoring, Y=y_scoring)


In [None]:
# Load the cleaned HOG1 experiment and open an analysis session on it.
HOG1_df = pd.read_csv(HOG1_experiment_file)

session = Session(
    0,
    'yeast',
    HOG1_df,
    column_names_dict,
    scoring,
    background,
    ortholog_manager,
    id_type='SGD',
    ambiguous=True,
    debug=True,
)

In [None]:
# Table of kinase matches computed by the session; saved and previewed in the next cell.
kinase_matches_df = session.get_kinase_matches_df()

In [None]:
# Persist the matches as a tab-separated file, then preview the first rows.
HOG1_kinase_matches_path = os.path.join(test_dir, 'HOG1_kinase_matches.tsv')
kinase_matches_df.to_csv(HOG1_kinase_matches_path, sep='\t', index=False)
display(kinase_matches_df.head())

In [None]:
# Percentile tables, one per kinase class.
percentiles_st_df = session.get_percentiles_df('ST')
percentiles_y_df = session.get_percentiles_df('Y')

# Written with the index included, tab-separated.
for percentiles_df, percentiles_file in (
    (percentiles_st_df, 'HOG1_percentiles_ST.tsv'),
    (percentiles_y_df, 'HOG1_percentiles_Y.tsv'),
):
    percentiles_df.to_csv(os.path.join(test_dir, percentiles_file), index=True, sep='\t')

In [None]:
# Counts bar plot, sized at 20 pixels per selected kinase symbol.
barplot = session.get_counts_barplot_fig().update_layout(
    height=20*len(session._all_selected_symbols)
)

In [None]:
# Render the resized counts bar plot.
display(barplot)

In [None]:
# Peptide scatter plot restricted to two kinases.
kinases_of_interest = {'PBS2', 'HOG1'}
peptide_scatter = session.get_peptide_scatter_fig(selected_kinases=kinases_of_interest)
display(peptide_scatter)

In [None]:
# ST heatmap: 10 pixels per ST kinase name.
heatmap_st = session.get_heatmap_fig('ST').update_layout(
    height=10*len(st_scoring._kinase_names)
)
display(heatmap_st)

# Y heatmap: shown at its default size.
heatmap_y = session.get_heatmap_fig('Y')
display(heatmap_y)

# Combined figure: also sized from the number of ST kinase names.
heatmap_all = session.get_all_heatmap_figs().update_layout(
    height=10*len(st_scoring._kinase_names)
)
display(heatmap_all)


In [None]:
# Statistics table from the session, displayed in full.
log2fc_df = session.get_stat_df()
display(log2fc_df)

In [None]:
# Z-score figure with default arguments, 20 pixels per selected kinase symbol.
zscore_fig = session.get_zscore_fig().update_layout(
    height=20*len(session._all_selected_symbols)
)
display(zscore_fig)

# Same figure with combine_populations switched off.
zscore_fig2 = session.get_zscore_fig(combine_populations=False).update_layout(
    height=20*len(session._all_selected_symbols)
)
display(zscore_fig2)

In [None]:
# Kinase scatter plot with default arguments.
kinase_scatter = session.get_kinase_scatter_fig()
display(kinase_scatter)

# Same plot with combine_populations switched off.
kinase_scatter2 = session.get_kinase_scatter_fig(combine_populations=False)
display(kinase_scatter2)

In [None]:
# Export the network table as a tab-separated file.
network_df = session.get_network_df()
HOG1_network_path = os.path.join(test_dir, 'HOG1_network.tsv')
network_df.to_csv(HOG1_network_path, sep='\t', index=False)

# Network figures: the full kinase network, and a hub view around three chosen symbols.
full_network_fig = session.get_full_kinase_network_fig()

hub_symbols = {'PBS2', 'HOG1', 'STE11'}
hub_network_fig = session.get_kinase_hub_fig(selected_symbols=hub_symbols)

In [None]:

# Register the additional dash-cytoscape layouts before building the app.
cyto.load_extra_layouts()

# Minimal Dash app whose page consists of the hub network figure only.
app = Dash(__name__)
app.layout = html.Div(children=[hub_network_fig])

if __name__ == '__main__':
    app.run(debug=True)