In [1]:
# Initialize: list the repository contents and make sure ./data exists.

!ls
# -p makes mkdir succeed quietly when the directory already exists.
# (Redirecting with `>> /dev/null` only hides stdout, so mkdir's
# "File exists" message on stderr was still shown.)
!mkdir -p data


LICENSE       [1m[36mdata[m[m          [1m[36morthologs[m[m     sandbox.ipynb
README.md     [1m[36mkinaid[m[m        [1m[36mproteomes[m[m     [1m[36mtest[m[m
mkdir: data: File exists


# Test scoring

In [2]:
from kinaid.matching import PWM_Matrices,Scoring,PeptideBackground
import os
import pandas as pd
from tqdm.notebook import tqdm_notebook
from kinaid.utility import DefaultConfiguration
from kinaid.utility import Utility
import numpy as np
from kinaid.session import Session


In [3]:
# Run kinaid's default configuration step. In the recorded run this call
# logged loading the ST and Y matrices, creating scoring objects, and
# checking that the per-organism ortholog databases already exist.
DefaultConfiguration()

Loading ST matrices
Loading Y matrices (w/ non-canonical)
Creating scoring objects
Dual specificity kinases
{'Q13873', 'Q16654', 'Q15118'}
Ortholog database for mouse already exists
Ortholog database for fly already exists
Ortholog database for worm already exists
Ortholog database for yeast already exists
Ortholog database for zebrafish already exists
Final ortholog database for mouse already exists
Final ortholog database for fly already exists
Final ortholog database for worm already exists
Final ortholog database for yeast already exists
Final ortholog database for zebrafish already exists


In [4]:
# Input workbooks for the ST and Y kinase matrices and the ST densitometry.
data_dir = './data'
johnson_ST_matrices_file = os.path.join(data_dir, 'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir, 'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir, 'ST-Kinases_densitometry.xlsx')

# Build the matrix collections; only the ST collection gets densitometry data.
ST_matrices = PWM_Matrices(johnson_ST_matrices_file, debug=True)
ST_matrices.add_densitometry(densitometry_file)

Y_matrices = PWM_Matrices(johnson_Y_matrices_file, debug=True)
Y_matrices_ncon = PWM_Matrices(johnson_Y_matrices_file, debug=True)

st_scoring = Scoring(ST_matrices)
y_scoring = Scoring(Y_matrices)

# (scorer, raw peptide) pairs, in the order their cleaned form is printed.
# The list mixes peptides of different lengths, with and without a '*' marker.
clean_sequence_cases = [
    (y_scoring, 'IRDGGPYGGLMPD'),
    (y_scoring, 'RDGGPYGGLMP'),
    (st_scoring, 'RDGGPSGGLM'),
    (st_scoring, 'GPTSGG'),
    (y_scoring, 'GPSYGG'),
    (y_scoring, 'RDGGPY*GGLMP'),
    (st_scoring, 'RDTGPS*GGLM'),
    (st_scoring, 'GPT*SGG'),
    (y_scoring, 'GPSY*GG'),
    (y_scoring, 'GPSGGY*'),
]
for scorer, test_seq in clean_sequence_cases:
    print(scorer.clean_sequence(test_seq))

# Log-scores of two already-formatted peptides against a single kinase each.
blk_log_score = y_scoring.score_peptide('GPSGGY_____', kinase='BLK', mode='as_is', log_score=True)
print(blk_log_score)
erk2_log_score = st_scoring.score_peptide('RDGGPSGGLM', kinase='ERK2', mode='as_is', log_score=True)
print(erk2_log_score)

['AAK1', 'ACVR2A', 'ACVR2B', 'AKT1', 'AKT2', 'AKT3', 'ALK2', 'ALK4', 'ALPHAK3', 'AMPKA1', 'AMPKA2', 'ANKRD3', 'ASK1', 'ATM', 'ATR', 'AURA', 'AURB', 'AURC', 'BCKDK', 'BIKE', 'BMPR1A', 'BMPR1B', 'BMPR2', 'BRAF', 'BRSK1', 'BRSK2', 'BUB1', 'CAMK1A', 'CAMK1B', 'CAMK1D', 'CAMK1G', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CAMKK1', 'CAMKK2', 'CAMLCK', 'CDC7', 'CDK1', 'CDK10', 'CDK12', 'CDK13', 'CDK14', 'CDK16', 'CDK17', 'CDK18', 'CDK19', 'CDK2', 'CDK3', 'CDK4', 'CDK5', 'CDK6', 'CDK7', 'CDK8', 'CDK9', 'CDKL1', 'CDKL5', 'CHAK1', 'CHAK2', 'CHK1', 'CHK2', 'CK1A', 'CK1A2', 'CK1D', 'CK1E', 'CK1G1', 'CK1G2', 'CK1G3', 'CK2A1', 'CK2A2', 'CLK1', 'CLK2', 'CLK3', 'CLK4', 'COT', 'CRIK', 'DAPK1', 'DAPK2', 'DAPK3', 'DCAMKL1', 'DCAMKL2', 'DLK', 'DMPK1', 'DNAPK', 'DRAK1', 'DSTYK', 'DYRK1A', 'DYRK1B', 'DYRK2', 'DYRK3', 'DYRK4', 'EEF2K', 'ERK1', 'ERK2', 'ERK5', 'ERK7', 'FAM20C', 'GAK', 'GCK', 'GCN2', 'GRK1', 'GRK2', 'GRK3', 'GRK4', 'GRK5', 'GRK6', 'GRK7', 'GSK3A', 'GSK3B', 'HASPIN', 'HGK', 'HIPK1', 'HIP

In [5]:
# Background score tables used to turn raw peptide scores into percentiles.
ochoa_background_file = os.path.join(data_dir, 'johnson_ochoa_background_wfav.tsv')
tyrosine_background_file = os.path.join(data_dir, 'johnson_tyrosine_background_wfav.tsv')

ST_Background = PeptideBackground(ochoa_background_file)
Y_Background = PeptideBackground(tyrosine_background_file)

# Score one Y peptide against BLK and one S/T peptide against ERK2.
score1 = y_scoring.score_peptide('GPSGGY_____', kinase='BLK', mode='as_is')
score2 = st_scoring.score_peptide('RDGGPSGGLM', kinase='ERK2', mode='as_is')

# Print the ERK2 percentile first, then the BLK percentile.
for background, peptide_score, kinase_name in (
    (ST_Background, score2, 'ERK2'),
    (Y_Background, score1, 'BLK'),
):
    print(background.get_percentile(peptide_score, kinase=kinase_name))

64.35222311324958
58.76965140123035


# Offline Experiments

In [6]:
import os
import numpy as np
import pandas as pd
from kinaid.session import Session
from kinaid.matching import MatchWithMapping, Scoring, PeptideBackground, PWM_Matrices
from kinaid.utility import Utility
from Bio import SeqIO as seqio


# Directories used by the offline experiments.
data_dir = './data'
test_dir = './test'
proteomes_dir = os.path.join(test_dir, 'proteomes')

# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(test_dir, exist_ok=True)

# Input workbooks for the ST and Y kinase matrices and the ST densitometry.
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')

# Only the ST matrices get densitometry data attached.
ST_matrices = PWM_Matrices(johnson_ST_matrices_file)
ST_matrices.add_densitometry(densitometry_file)

Y_matrices = PWM_Matrices(johnson_Y_matrices_file)

st_scoring = Scoring(ST_matrices)
y_scoring = Scoring(Y_matrices)

# Background score tables used to convert scores into percentiles.
ochoa_background_file = os.path.join(data_dir, 'johnson_ochoa_background_wfav.tsv')

tyrosine_background_file = os.path.join(data_dir, 'johnson_tyrosine_background_wfav.tsv')

ST_Background = PeptideBackground(ochoa_background_file)
Y_Background = PeptideBackground(tyrosine_background_file)

## Yeast Experiments

### Leutert, M., Barente, A.S., Fukuda, N.K. et al. The regulatory landscape of the yeast phosphoproteome. Nat Struct Mol Biol 30, 1761–1773 (2023).

In [7]:

def _phosphosite_window(sequence, p_position, n_upstream=5, n_downstream=4):
    """Return the fixed-width peptide window around a 1-based site position.

    The window contains ``n_upstream`` residues before the site, the site
    itself and ``n_downstream`` residues after it (10 characters with the
    defaults, site at index 5). Positions that fall before the start or past
    the end of ``sequence`` are filled with ``'_'``.
    """
    padded = '_' * n_upstream + sequence + '_' * n_downstream
    # In the padded string the site sits at index (p_position - 1) + n_upstream,
    # so a window starting n_upstream earlier begins at p_position - 1.
    start = int(p_position) - 1
    return padded[start:start + n_upstream + 1 + n_downstream]


def clean_phosphopeptides(phosphorylation_file, fasta_file, output_file) :
    """Convert a phosphosite table into a peptide-window CSV.

    Reads ``phosphorylation_file`` (CSV with at least the columns
    ``systematic_name``, ``p_residue``, ``p_position``, ``fc_log2`` and
    ``adj_p_value``) and the protein FASTA ``fasta_file``, then writes
    ``output_file`` (CSV, no index) with the columns:

    * ``SGD``       - identifier parsed from the FASTA description,
    * ``site``      - the original ``p_position``,
    * ``peptide``   - 10-character window (5 residues before the site, the
                      site, 4 after), padded with ``'_'`` at protein ends,
    * ``log2fc``    - the original ``fc_log2``,
    * ``adjpvalue`` - ``-log10`` of the original ``adj_p_value``.

    Rows are dropped when the residue is not S/T/Y, when the protein is not in
    the FASTA file, or when the position lies outside the protein sequence.
    Rows whose FASTA residue differs from ``p_residue`` are kept, but their
    count is printed as a warning.
    """
    exp_df = pd.read_csv(phosphorylation_file)

    # FASTA records keyed by record id (SeqIO.to_dict's default key)
    fasta_dict = seqio.to_dict(seqio.parse(fasta_file, 'fasta'))

    # .copy() after each filter so the column assignments below operate on an
    # independent frame instead of a slice (avoids chained-assignment warnings)
    exp_df = exp_df[exp_df['p_residue'].isin(['S', 'T', 'Y'])].copy()

    # the identifier is the third space-separated token of the first
    # ', '-separated field of the FASTA description
    systematic_name_to_SGD_dict = {name: record.description.split(', ')[0].split(' ')[2]
                                   for name, record in fasta_dict.items()}
    systematic_name_to_seq_dict = {name: str(record.seq) for name, record in fasta_dict.items()}

    exp_df['SGDID'] = exp_df['systematic_name'].map(systematic_name_to_SGD_dict)
    exp_df['sequence'] = exp_df['systematic_name'].map(systematic_name_to_seq_dict)

    # remove rows whose protein has no sequence in the FASTA file
    exp_df = exp_df[exp_df['sequence'].notna()].copy()

    # strip the trailing '*' from the sequences before measuring their length
    exp_df['sequence'] = exp_df['sequence'].str.rstrip('*')
    exp_df['seq_len'] = exp_df['sequence'].str.len()

    # keep only sites that lie inside the sequence (positions are 1-based)
    in_bounds = (exp_df['p_position'] >= 1) & (exp_df['p_position'] <= exp_df['seq_len'])
    exp_df = exp_df[in_bounds].copy()

    # sanity check: the residue found at p_position should equal p_residue.
    # Mismatching rows are kept (the output is unchanged) but are reported.
    residue_matches = [
        sequence[int(position) - 1] == residue
        for sequence, position, residue in zip(exp_df['sequence'], exp_df['p_position'], exp_df['p_residue'])
    ]
    n_mismatched = len(residue_matches) - sum(residue_matches)
    if n_mismatched:
        print('Warning:', n_mismatched, 'phosphosite(s) do not match the residue at p_position in', fasta_file)

    # built from plain lists (not a row-wise apply) so an empty frame also works
    exp_df['window'] = [
        _phosphosite_window(sequence, position)
        for sequence, position in zip(exp_df['sequence'], exp_df['p_position'])
    ]

    # significance is exported as -log10(adjusted p-value)
    exp_df['adj_p_value'] = -np.log10(exp_df['adj_p_value'])

    exp_df_filtered = exp_df[['SGDID', 'p_position', 'window', 'fc_log2', 'adj_p_value']].rename(
        columns={'SGDID':'SGD', 'p_position':'site', 'window':'peptide', 'fc_log2':'log2fc', 'adj_p_value':'adjpvalue'}
    )

    exp_df_filtered.to_csv(output_file, index=False)


In [8]:
import gzip
import shutil

published_yeast_file = 'yeast-perturbation.xlsx'
published_yeast_path = os.path.join(test_dir, published_yeast_file)

yeast_proteome_path = os.path.join(proteomes_dir, 'yeast_proteome.fasta')

os.makedirs(proteomes_dir, exist_ok=True)

if not os.path.exists(yeast_proteome_path):
    print('Downloading yeast proteome')
    compressed_proteome_path = os.path.join(proteomes_dir, 'orf_trans.fasta.gz')
    partial_proteome_path = yeast_proteome_path + '.part'
    try:
        Utility.download_file('http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans.fasta.gz', compressed_proteome_path)
        # Stream the decompressed data to a temporary name and rename it only
        # once it is complete, so an interrupted run never leaves a truncated
        # FASTA that later runs would mistake for a finished download.
        with gzip.open(compressed_proteome_path, 'rb') as f_in, open(partial_proteome_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_proteome_path, yeast_proteome_path)
    finally:
        # remove the archive (and any partial output) on success and on failure
        for leftover_path in (compressed_proteome_path, partial_proteome_path):
            if os.path.exists(leftover_path):
                os.remove(leftover_path)


if not os.path.exists(published_yeast_path):
    print('Downloading yeast perturbation data')
    Utility.download_file('https://static-content.springer.com/esm/art%3A10.1038%2Fs41594-023-01115-3/MediaObjects/41594_2023_1115_MOESM9_ESM.xlsx',
                  published_yeast_path)


HOG1_experiment_file = os.path.join(test_dir,'HOG1_exp.csv')
SNF1_experiment_file = os.path.join(test_dir,'SNF1_exp.csv')
TOR1_experiment_file = os.path.join(test_dir,'TOR1_exp.csv')

# (treatment_id in the published sheet, intermediate CSV name, cleaned output)
yeast_experiments = [
    ('KC', 'yeast_HOG1_KC_phosphopeptides.csv', HOG1_experiment_file),
    ('GL', 'yeast_SNF1_GL_phosphopeptides.csv', SNF1_experiment_file),
    ('CS18', 'yeast_TOR1_CS18_phosphopeptides.csv', TOR1_experiment_file),
]
missing_experiments = [experiment for experiment in yeast_experiments
                       if not os.path.exists(experiment[2])]

# The workbook is only read when at least one cleaned experiment file is missing.
if missing_experiments:
    df = pd.read_excel(published_yeast_path, sheet_name='p_site_diff_reg')

    for treatment_id, phosphopeptides_file, experiment_file in missing_experiments:
        phosphopeptides_path = os.path.join(test_dir, phosphopeptides_file)
        # write the rows of this treatment, then convert them to peptide windows
        df[df['treatment_id'] == treatment_id].to_csv(phosphopeptides_path, index=False)
        clean_phosphopeptides(phosphopeptides_path, yeast_proteome_path, experiment_file)






In [9]:


# Column-name mapping handed to Session in a later cell; the values are the
# column names written by clean_phosphopeptides.
column_names_dict = {
    'id': 'SGD',
    'site': 'site',
    'peptide': 'peptide',
    'log2fc': 'log2fc',
    'dependent': 'adjpvalue',
}

yeast_orthologs_file = os.path.join('orthologs', 'yeast_orthologs_final.tsv')

# Load the ortholog table and keep only rows whose gene_id_type is 'SGD'.
yeast_orthologs_df = pd.read_csv(yeast_orthologs_file, sep='\t')
yeast_orthologs_df = yeast_orthologs_df.loc[yeast_orthologs_df['gene_id_type'].eq('SGD')]

# Split the table by kinase type.
yeast_orthologs_by_type = {
    kinase_type: yeast_orthologs_df.loc[yeast_orthologs_df['kinase_type'].eq(kinase_type)]
    for kinase_type in ('ST', 'Y')
}
yeast_orthologs_st_df = yeast_orthologs_by_type['ST']
yeast_orthologs_y_df = yeast_orthologs_by_type['Y']

# symbol -> kinase_name lookups (a later duplicate symbol overrides an earlier one).
yeast_mapping_st_dict = {
    symbol: kinase_name
    for symbol, kinase_name in zip(yeast_orthologs_st_df['symbol'], yeast_orthologs_st_df['kinase_name'])
}
yeast_mapping_y_dict = {
    symbol: kinase_name
    for symbol, kinase_name in zip(yeast_orthologs_y_df['symbol'], yeast_orthologs_y_df['kinase_name'])
}

# Show the mapped Y kinase names next to the names known to the Y scorer.
print(yeast_mapping_y_dict.values())
print(y_scoring._kinase_names)

ST_matching = MatchWithMapping(st_scoring, ST_Background, yeast_mapping_st_dict)
Y_matching = MatchWithMapping(y_scoring, Y_Background, yeast_mapping_y_dict)


dict_values(['MYT1', 'PDHK4'])
['ABL', 'ACK', 'ALK', 'ARG', 'AXL', 'BLK', 'BMPR2', 'BRK', 'BTK', 'CSFR', 'CSK', 'CTK', 'DDR1', 'DDR2', 'EGFR', 'EPHA1', 'EPHA2', 'EPHA3', 'EPHA4', 'EPHA5', 'EPHA6', 'EPHA7', 'EPHA8', 'EPHB1', 'EPHB2', 'EPHB3', 'EPHB4', 'ETK', 'FAK', 'FER', 'FES', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FGR', 'FLT3', 'FRK', 'FYN', 'HCK', 'HER2', 'HER4', 'IGF1R', 'INSR', 'IRR', 'ITK', 'JAK1', 'JAK2', 'JAK3', 'KIT', 'LCK', 'LIMK1', 'LIMK2', 'LTK', 'LYN', 'MER', 'MET', 'MKK4', 'MKK6', 'MKK7', 'MST1R', 'MUSK', 'MYT1', 'NEK10', 'PDGFRA', 'PDGFRB', 'PDHK1', 'PDHK3', 'PDHK4', 'PINK1', 'PYK2', 'RET', 'ROS', 'SRC', 'SRMS', 'SYK', 'TEC', 'TESK1', 'TIE2', 'TNK1', 'TNNI3K', 'TRKA', 'TRKB', 'TRKC', 'TXK', 'TYK2', 'TYRO3', 'VEGFR1', 'VEGFR2', 'VEGFR3', 'WEE1', 'YES', 'ZAP70']


In [10]:
# Load the cleaned HOG1 experiment table written by the preprocessing cell.
HOG1_df = pd.read_csv(HOG1_experiment_file)

# Build a Session for the yeast HOG1 data with the ST and Y matchers.
# NOTE(review): debug=True presumably triggers the kinase list, counts and
# timing printed below this cell - confirm against Session.
session = Session(0,'yeast', HOG1_df, column_names_dict, ST_matching, Y_matching, debug=True)

selected kinases: {'HAL5+KKQ8+PTK1+SKS1+VHS1', 'CDC28', 'HRR25', 'IPL1', 'MEK1', 'BUB1', 'STE11', 'TPK2', 'RIM15', 'CDC5', 'IRE1', 'TEL1', 'ENV7', 'CHK1', 'SGV1', 'CBK1', 'ALK2', 'PKC1', 'TDA1', 'STE20', 'DUN1+RAD53', 'SKY1', 'IME2', 'CTK1', 'HSL1', 'MKK1', 'KIN2', 'CMK1', 'SLT2', 'PRR1+PTK2', 'GCN2', 'YCK1', 'PKP1', 'HOG1', 'SCH9', 'CDC7', 'TOS3', 'PBS2', 'KIN3', 'MPS1', 'SSN3', 'PSK1+PSK2', 'KIN28', 'CKA2', 'KCC4', 'TOR2', 'CMK2', 'PHO85', 'STE7', 'NPR1+PRR2', 'SPS1', 'PKH1', 'KNS1', 'TPK2+TPK3', 'SNF1', 'CLA4', 'IKS1', 'SKS1+VHS1', 'MEC1', 'RIM11', 'KIC1', 'YPK3', 'PKP2', 'NNK1', 'YAK1', 'YPL150W', 'ARK1+PRK1', 'CKA1', 'FUS3+KSS1', 'SWE1', 'BUD32'}
154
Number of ST kinases tested : 70
Number of Y kinases tested : 2
Elapsed time for percentiles and matches: 4.17 seconds


In [11]:
# Fetch the kinase-match table from the session so it can be exported.
kinase_matches_df = session.get_kinase_matches_df()

In [12]:
# Write the HOG1 kinase matches as a tab-separated file without the index.
hog1_matches_path = os.path.join(test_dir, 'HOG1_kinase_matches.tsv')
kinase_matches_df.to_csv(hog1_matches_path, sep='\t', index=False)

In [13]:
# Fetch the percentile tables for both kinase types first ...
percentiles_st_df = session.get_percentiles_df('ST')
percentiles_y_df = session.get_percentiles_df('Y')

# ... then write each one as a tab-separated file without the index.
for percentiles_df, percentiles_file in (
    (percentiles_st_df, 'HOG1_percentiles_ST.tsv'),
    (percentiles_y_df, 'HOG1_percentiles_Y.tsv'),
):
    percentiles_df.to_csv(os.path.join(test_dir, percentiles_file), sep='\t', index=False)

In [16]:
barplot = session.get_barplot_fig()

# Scale the figure height with the number of selected kinases: 20 pixels each.
barplot_height = 20 * len(session._selected_kinases)
barplot = barplot.update_layout(height=barplot_height)

  kinase_type     kinase  count
0          ST  ARK1+PRK1    404
1          ST       SCH9    622
2          ST       SNF1    752
3          ST       TEL1    592
4          ST       MEC1    463


In [17]:
# Render the resized bar plot in the notebook output.
display(barplot)