In [8]:
#initialize stuff

!ls
!mkdir data >> /dev/null


sandbox.ipynb


# Download matrices

In [4]:
import requests
import pandas as pd
import os
import numpy as np

def download_file(url : str,
                filename : str,
                timeout : float = 60.0):
    """Download ``url`` and store the response body at ``filename``.

    The body is streamed in 8 KiB chunks into ``<filename>.part`` and only
    moved to ``filename`` once the whole response has been received.  An
    interrupted download therefore never leaves a truncated file under the
    final name, where a later ``os.path.exists`` check would mistake it for a
    complete download.

    Parameters
    ----------
    url : str
        Address to fetch.
    filename : str
        Destination path; its directory must already exist.
    timeout : float, optional
        Value forwarded to ``requests.get(timeout=...)`` so a stalled server
        cannot block the notebook forever.

    Raises
    ------
    requests.HTTPError
        If ``response.raise_for_status()`` rejects the status code.
    """
    # Browser-like User-Agent; presumably the host rejects the default
    # python-requests agent -- TODO confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    partial_filename = f'{filename}.part'
    try:
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        os.replace(partial_filename, filename)
    except BaseException:
        # BaseException so that a kernel interrupt (KeyboardInterrupt) also
        # cleans up the partial file before propagating.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise

def rearrange_matrices (matrices_file : str = './out/johnson_ST_matrices.xlsx',
                        sheet_name : str = 'ser_thr_all_norm_scaled_matrice',
                    output_file : str = './out/ST-Kinases.xlsx',
                    pos = ('-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4')):
    """Split one wide sheet (one row per kinase) into one sheet per kinase.

    Each row of ``sheet_name`` in ``matrices_file`` is reshaped to
    ``(len(pos), 23)`` and transposed, giving a table with one row per residue
    and one column per position.  All tables are written to ``output_file``,
    using the row label as the sheet name.

    Parameters
    ----------
    matrices_file : str
        Workbook to read (opened with the ``openpyxl`` engine).
    sheet_name : str
        Sheet inside ``matrices_file`` that holds the flattened matrices.
    output_file : str
        Workbook to write (created with the ``xlsxwriter`` engine).
    pos : sequence of str
        Column labels of the per-kinase tables, in the order the positions
        appear in the flattened row.

    Raises
    ------
    ValueError
        If the sheet does not have ``len(pos) * 23`` value columns.

    Notes
    -----
    The reshape assumes the value columns are ordered position-major (all
    residues of the first position, then all residues of the second, ...), with
    residues in the order of ``res`` below; the column headers themselves are
    not inspected -- TODO confirm against the source workbook.
    """
    positions = list(pos)
    res = ['P','G','A','C','S','T','V','I','L','M','F','Y','W','H','K','R','Q','N','D','E','s','t','y']

    ae_df = pd.read_excel(matrices_file, engine='openpyxl', sheet_name=sheet_name)
    # The first column carries the row label; use it as the index under the name 'Kinase'.
    ae_df = ae_df.rename(columns={ae_df.columns[0]: 'Kinase'}).set_index('Kinase')

    # Fail early with a readable message instead of numpy's generic reshape error.
    expected_columns = len(positions) * len(res)
    if ae_df.shape[1] != expected_columns:
        raise ValueError(
            f"sheet '{sheet_name}' has {ae_df.shape[1]} value columns, expected "
            f"{expected_columns} ({len(positions)} positions x {len(res)} residues)"
        )

    # Build every table before opening the writer, so a bad row cannot leave a
    # half-written output workbook behind.
    kinase_matrices = {}
    for k, row in ae_df.iterrows():
        prob_matrix = np.reshape(row.to_numpy(), (len(positions), len(res)))
        kdf = pd.DataFrame(prob_matrix.transpose(), columns=positions, index=res)
        kdf.index.name = 'AA'
        kinase_matrices[k] = kdf

    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for k, kdf in kinase_matrices.items():
            kdf.to_excel(writer, sheet_name=k)

data_dir = './data'
# Create the target directory from Python too, so this cell runs on a fresh
# kernel even when the shell cell that calls `mkdir` has not been executed.
os.makedirs(data_dir, exist_ok=True)

# --- Ser/Thr kinase matrices (supplementary file MOESM4 of doi:10.1038/s41586-022-05575-3) ---
johnson_ST_matrices_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05575-3/MediaObjects/41586_2022_5575_MOESM4_ESM.xlsx'
johnson_ST_matrices_original_file = os.path.join(data_dir,'johnson_ST_matrices.xlsx')

# Download only once; an existing local copy is reused on later runs.
if not os.path.exists(johnson_ST_matrices_original_file):
    download_file(johnson_ST_matrices_url, johnson_ST_matrices_original_file)

# One sheet per kinase, built from the 'ser_thr_all_norm_scaled_matrice' sheet.
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
rearrange_matrices(johnson_ST_matrices_original_file, sheet_name = 'ser_thr_all_norm_scaled_matrice', output_file=johnson_ST_matrices_file)

# Same layout, built from the 'ser_thr_all_raw_matrices' sheet of the same workbook.
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')
rearrange_matrices(johnson_ST_matrices_original_file, sheet_name = 'ser_thr_all_raw_matrices', output_file=densitometry_file)

# --- Tyr kinase matrices (supplementary file MOESM4 of doi:10.1038/s41586-024-07407-y) ---
johnson_Y_matrices_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07407-y/MediaObjects/41586_2024_7407_MOESM4_ESM.xlsx'
johnson_Y_matrices_original_file = os.path.join(data_dir,'johnson_Y_matrices.xlsx')

if not os.path.exists(johnson_Y_matrices_original_file):
    download_file(johnson_Y_matrices_url, johnson_Y_matrices_original_file)

# This sheet is rearranged with ten positions (-5..5) instead of the default nine.
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
rearrange_matrices(johnson_Y_matrices_original_file, sheet_name = 'tyrosine_all_norm_scaled_matric', pos = ['-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5'], output_file = johnson_Y_matrices_file)

# --- Supplementary files MOESM3 of the same two articles, stored as the *_to_Uniprot workbooks ---
ST_matrix_to_uniprot_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05575-3/MediaObjects/41586_2022_5575_MOESM3_ESM.xlsx'
Y_matrix_to_uniprot_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-024-07407-y/MediaObjects/41586_2024_7407_MOESM3_ESM.xlsx'

ST_matrix_to_uniprot = os.path.join(data_dir,'ST-Kinases_to_Uniprot.xlsx')
Y_matrix_to_uniprot = os.path.join(data_dir,'Y-Kinases_to_Uniprot.xlsx')

if not os.path.exists(ST_matrix_to_uniprot):
    download_file(ST_matrix_to_uniprot_url, ST_matrix_to_uniprot)

if not os.path.exists(Y_matrix_to_uniprot):
    download_file(Y_matrix_to_uniprot_url, Y_matrix_to_uniprot)



In [1]:
from kinaid.matching import PWM_Matrices
import os

In [2]:
# Paths of the per-kinase workbooks. They are re-declared here with the same
# values as in the download cell, so this cell only needs the files on disk.
data_dir = './data'
johnson_ST_matrices_file = os.path.join(data_dir,'ST-Kinases.xlsx')
johnson_Y_matrices_file = os.path.join(data_dir,'Y-Kinases.xlsx')
densitometry_file = os.path.join(data_dir,'ST-Kinases_densitometry.xlsx')


# PWM_Matrices comes from kinaid.matching; its behaviour is not visible in this
# notebook. NOTE(review): debug=True presumably produces the list printed
# below this cell -- confirm against the kinaid source.
ST_matrices = PWM_Matrices(johnson_ST_matrices_file, debug=True)
# Attach the workbook built from the raw Ser/Thr sheet to the same object.
ST_matrices.add_densitometry(densitometry_file)

# Two objects are built from the same Tyr workbook; they differ only in the
# ignore_suffix='_TYR' argument. NOTE(review): presumably this skips sheets
# whose name ends in '_TYR' -- confirm against the kinaid source.
y_matrices_con = PWM_Matrices(johnson_Y_matrices_file, ignore_suffix='_TYR', debug=True)
y_matrices = PWM_Matrices(johnson_Y_matrices_file, debug=True)

['AAK1', 'ACVR2A', 'ACVR2B', 'AKT1', 'AKT2', 'AKT3', 'ALK2', 'ALK4', 'ALPHAK3', 'AMPKA1', 'AMPKA2', 'ANKRD3', 'ASK1', 'ATM', 'ATR', 'AURA', 'AURB', 'AURC', 'BCKDK', 'BIKE', 'BMPR1A', 'BMPR1B', 'BMPR2', 'BRAF', 'BRSK1', 'BRSK2', 'BUB1', 'CAMK1A', 'CAMK1B', 'CAMK1D', 'CAMK1G', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CAMKK1', 'CAMKK2', 'CAMLCK', 'CDC7', 'CDK1', 'CDK10', 'CDK12', 'CDK13', 'CDK14', 'CDK16', 'CDK17', 'CDK18', 'CDK19', 'CDK2', 'CDK3', 'CDK4', 'CDK5', 'CDK6', 'CDK7', 'CDK8', 'CDK9', 'CDKL1', 'CDKL5', 'CHAK1', 'CHAK2', 'CHK1', 'CHK2', 'CK1A', 'CK1A2', 'CK1D', 'CK1E', 'CK1G1', 'CK1G2', 'CK1G3', 'CK2A1', 'CK2A2', 'CLK1', 'CLK2', 'CLK3', 'CLK4', 'COT', 'CRIK', 'DAPK1', 'DAPK2', 'DAPK3', 'DCAMKL1', 'DCAMKL2', 'DLK', 'DMPK1', 'DNAPK', 'DRAK1', 'DSTYK', 'DYRK1A', 'DYRK1B', 'DYRK2', 'DYRK3', 'DYRK4', 'EEF2K', 'ERK1', 'ERK2', 'ERK5', 'ERK7', 'FAM20C', 'GAK', 'GCK', 'GCN2', 'GRK1', 'GRK2', 'GRK3', 'GRK4', 'GRK5', 'GRK6', 'GRK7', 'GSK3A', 'GSK3B', 'HASPIN', 'HGK', 'HIPK1', 'HIP