In [1]:
#initialize stuff

!ls
!mkdir data >> /dev/null


data  kinaid  LICENSE  README.md  sandbox.ipynb
mkdir: cannot create directory ‘data’: File exists


# Download matrices

In [7]:
import requests
import pandas as pd
import os
import numpy as np

def download_file(url : str,
                filename : str):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192): 
                if chunk: 
                    file.write(chunk)

def rearrange_matrices (matrices_file : str = './out/johnson_ST_matrices.xlsx',
                        sheet_name : str = 'ser_thr_all_norm_scaled_matrice', 
                    output_file : str = './out/ST-Kinases.xlsx', 
                    pos = ['-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4']):
    ae_df = pd.read_excel(matrices_file, engine='openpyxl', sheet_name=sheet_name)
    #rename first column to Kinase
    ae_df.rename(columns={ae_df.columns[0]: 'Kinase'}, inplace=True)
    ae_df.set_index('Kinase', inplace=True)

    res = ['P','G','A','C','S','T','V','I','L','M','F','Y','W','H','K','R','Q','N','D','E','s','t','y']

    kinase_matrices = {}
    for k,row in ae_df.iterrows() :
        probs = row.to_numpy()
        prob_matrix = np.reshape(probs, (len(pos),len(res)))
        prob_matrix_t = prob_matrix.transpose()
        kdf = pd.DataFrame(prob_matrix_t, columns=pos, index=res)
        kdf.index.name = 'AA'
        kinase_matrices[k] = kdf

    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        any(df.to_excel(writer, sheet_name=k) for k, df in kinase_matrices.items())



data_dir = './data'

johnson_ST_matrices_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05575-3/MediaObjects/41586_2022_5575_MOESM4_ESM.xlsx'
johnson_ST_matrices_original_file = os.path.join(data_dir,'johnson_ST_matrices.xlsx')

if not os.path.exists(johnson_ST_matrices_original_file):
    download_file(johnson_ST_matrices_url, johnson_ST_matrices_original_file)
    
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
rearrange_matrices(johnson_ST_matrices_original_file, sheet_name = 'ser_thr_all_norm_scaled_matrice', output_file=johnson_ST_matrices_file)

densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')
rearrange_matrices(johnson_ST_matrices_original_file, sheet_name = 'ser_thr_all_raw_matrices', output_file=densitometry_file)

johnson_Y_matrices_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07407-y/MediaObjects/41586_2024_7407_MOESM4_ESM.xlsx'
johnson_Y_matrices_original_file = os.path.join(data_dir,'johnson_Y_matrices.xlsx')

if not os.path.exists(johnson_Y_matrices_original_file):
    download_file(johnson_Y_matrices_url, johnson_Y_matrices_original_file)

johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
rearrange_matrices(johnson_Y_matrices_original_file, sheet_name = 'tyrosine_all_norm_scaled_matric', pos = ['-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5'], output_file = johnson_Y_matrices_file)

ST_matrix_to_uniprot_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05575-3/MediaObjects/41586_2022_5575_MOESM3_ESM.xlsx'
ST_matrix_to_uniprot = os.path.join(data_dir,'ST-Kinases_to_Uniprot.xlsx')

Y_matrix_to_uniprot_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07407-y/MediaObjects/41586_2024_7407_MOESM3_ESM.xlsx'

ST_matrix_to_uniprot = os.path.join(data_dir,'ST-Kinases_to_Uniprot.xlsx')
Y_matrix_to_uniprot = os.path.join(data_dir,'Y-Kinases_to_Uniprot.xlsx')

if not os.path.exists(ST_matrix_to_uniprot):
    download_file(ST_matrix_to_uniprot_url, ST_matrix_to_uniprot)

if not os.path.exists(Y_matrix_to_uniprot):
    download_file(Y_matrix_to_uniprot_url, Y_matrix_to_uniprot)

ochoa_background_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05575-3/MediaObjects/41586_2022_5575_MOESM5_ESM.xlsx'


ochoa_background_original_file = os.path.join(data_dir,'ochoa_background.xlsx')


if not os.path.exists(ochoa_background_original_file):
    download_file(ochoa_background_url, ochoa_background_original_file)


tyrosine_background_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07407-y/MediaObjects/41586_2024_7407_MOESM5_ESM.xlsx'

tyrosine_background_original_file = os.path.join(data_dir,'tyrosine_background.xlsx')

if not os.path.exists(tyrosine_background_original_file):
    download_file(tyrosine_background_url, tyrosine_background_original_file)



# Test scoring

In [3]:
from kinaid.matching import PWM_Matrices, Scoring
import os
import pandas as pd
from tqdm.notebook import tqdm_notebook

In [4]:
data_dir = './data'
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')


ST_matrices = PWM_Matrices(johnson_ST_matrices_file, debug=True)
ST_matrices.add_densitometry(densitometry_file)

Y_matrices = PWM_Matrices(johnson_Y_matrices_file, ignore_suffix='_TYR', debug=True)
Y_matrices_ncon = PWM_Matrices(johnson_Y_matrices_file, debug=True)

scoring = Scoring(ST_matrices, Y_matrices)

test_seq = 'IRDGGPYGGLMPD'
print(scoring.clean_sequence(test_seq))

test_seq = 'RDGGPYGGLMP'
print(scoring.clean_sequence(test_seq))

test_seq = 'RDGGPSGGLM'
print(scoring.clean_sequence(test_seq))

test_seq = 'GPTSGG'
print(scoring.clean_sequence(test_seq))

test_seq = 'GPSYGG'
print(scoring.clean_sequence(test_seq))

test_seq = 'RDGGPY*GGLMP'
print(scoring.clean_sequence(test_seq))

test_seq = 'RDTGPS*GGLM'
print(scoring.clean_sequence(test_seq))

test_seq = 'GPT*SGG'
print(scoring.clean_sequence(test_seq))

test_seq = 'GPSY*GG'
print(scoring.clean_sequence(test_seq))

test_seq = 'GPSGGY*'
print(scoring.clean_sequence(test_seq))

print(scoring.score_peptide('GPSGGY_____', kinase='BLK', mode='as_is', log_score=True))
print(scoring.score_peptide('RDGGPSGGLM', kinase='ERK2', mode='as_is', log_score=True))

['AAK1', 'ACVR2A', 'ACVR2B', 'AKT1', 'AKT2', 'AKT3', 'ALK2', 'ALK4', 'ALPHAK3', 'AMPKA1', 'AMPKA2', 'ANKRD3', 'ASK1', 'ATM', 'ATR', 'AURA', 'AURB', 'AURC', 'BCKDK', 'BIKE', 'BMPR1A', 'BMPR1B', 'BMPR2', 'BRAF', 'BRSK1', 'BRSK2', 'BUB1', 'CAMK1A', 'CAMK1B', 'CAMK1D', 'CAMK1G', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CAMKK1', 'CAMKK2', 'CAMLCK', 'CDC7', 'CDK1', 'CDK10', 'CDK12', 'CDK13', 'CDK14', 'CDK16', 'CDK17', 'CDK18', 'CDK19', 'CDK2', 'CDK3', 'CDK4', 'CDK5', 'CDK6', 'CDK7', 'CDK8', 'CDK9', 'CDKL1', 'CDKL5', 'CHAK1', 'CHAK2', 'CHK1', 'CHK2', 'CK1A', 'CK1A2', 'CK1D', 'CK1E', 'CK1G1', 'CK1G2', 'CK1G3', 'CK2A1', 'CK2A2', 'CLK1', 'CLK2', 'CLK3', 'CLK4', 'COT', 'CRIK', 'DAPK1', 'DAPK2', 'DAPK3', 'DCAMKL1', 'DCAMKL2', 'DLK', 'DMPK1', 'DNAPK', 'DRAK1', 'DSTYK', 'DYRK1A', 'DYRK1B', 'DYRK2', 'DYRK3', 'DYRK4', 'EEF2K', 'ERK1', 'ERK2', 'ERK5', 'ERK7', 'FAM20C', 'GAK', 'GCK', 'GCN2', 'GRK1', 'GRK2', 'GRK3', 'GRK4', 'GRK5', 'GRK6', 'GRK7', 'GSK3A', 'GSK3B', 'HASPIN', 'HGK', 'HIPK1', 'HIP

In [10]:
from tqdm.notebook import tqdm_notebook


def build_background(background_file : str,
                     sheet_name : str,
                     scoring : Scoring,
                     kinase_type : str, 
                     output_file : str) :
    

    #read the original background file
    background_df = pd.read_excel(background_file, engine='calamine', sheet_name=sheet_name, usecols=['SITE_+/-7_AA'])

   
    background_df['clean_seq'] = background_df['SITE_+/-7_AA'].apply(scoring.clean_sequence)

    kinases = scoring.get_kinase_names(kinase_type)

    scores_df = pd.concat([
        pd.Series(
            background_df['clean_seq'].apply(
                lambda x: scoring.score_peptide(
                    x,
                    kinase=k,
                    mode='as_is',
                    log_score=False
                )
            ),
            index = background_df.index,
            name = k+'_score'
        ) 
        for k in tqdm_notebook(kinases)
    ], axis =1)
    scores_df = pd.concat([background_df, scores_df], axis=1)
    scores_df.to_csv(output_file, sep='\t', index=True)



In [12]:
ochoa_background_original_file = os.path.join(data_dir,'ochoa_background.xlsx')
ochoa_background_file = os.path.join(data_dir, 'johnson_ochoa_background_wfav.tsv')

if not os.path.exists(ochoa_background_file) :
    build_background(ochoa_background_original_file, 'Supplementary Table 3', scoring, 'ST', ochoa_background_file)


tyrosine_background_original_file = os.path.join(data_dir,'tyrosine_background.xlsx')
tyrosine_background_file = os.path.join(data_dir, 'johnson_tyrosine_background_wfav.tsv')

if not os.path.exists(tyrosine_background_file) :
    build_background(tyrosine_background_original_file, 'Annotation - Canonical only', scoring, 'Y', tyrosine_background_file)

  0%|          | 0/78 [00:00<?, ?it/s]