In [1]:
import io
import os
import sqlite3
from pathlib import Path

import pandas as pd
import requests
from bioservices import UniProt

Define the paths to the ChEMBL SQLite database file and the LLM results file. Set the `CHEMBL_DB_PATH` environment variable (or edit the default below) so that it points to your local copy of the database.

In [2]:
# Input locations.
# Set the CHEMBL_DB_PATH environment variable to point at a local ChEMBL SQLite
# file, or edit the default below.
CHEMBL_SQLITE_PATH = os.environ.get("CHEMBL_DB_PATH", "chembl_36.db")
LLM_RESULT_PATH = "kinase_inhibitors_after_2022.csv"

# Report every missing input up front (not only the database) so that problems
# surface here rather than many cells later.
if not Path(CHEMBL_SQLITE_PATH).exists():
    print(f"NOTE: ChEMBL database not found at '{CHEMBL_SQLITE_PATH}'.\n"
          "Please ensure the file exists or update CHEMBL_SQLITE_PATH.")
if not Path(LLM_RESULT_PATH).exists():
    print(f"NOTE: LLM results file not found at '{LLM_RESULT_PATH}'.\n"
          "Please ensure the file exists or update LLM_RESULT_PATH.")

Use [bioservices](https://github.com/cokelaer/bioservices) package to get the UniProt ids for human kinases. 

In [3]:
# Local cache of the UniProt accessions returned by the search below; the
# UniProt API is only queried when this file does not exist yet.
uniprot_kinase_filename = "kinase_uniprot.csv"
uniprot_kinase_path = Path(uniprot_kinase_filename)
if not uniprot_kinase_path.is_file():
    uniprot_client = UniProt()
    # 1. Use 'tsv' format and specify the 'accession' column
    # 2. Note: 'organism' is now 'organism_id' in the modern API
    query = "keyword:KW-0418 AND organism_id:9606"
    result = uniprot_client.search(query, frmt="tsv", columns="accession")

    # Stop with a clear message when the search did not return text to parse,
    # instead of failing inside read_csv with a less obvious error.
    if not isinstance(result, str) or not result.strip():
        raise RuntimeError(f"UniProt search returned no usable data for query '{query}'.")

    # Convert the TSV string to a list of IDs
    uniprot_search_df = pd.read_csv(io.StringIO(result), sep="\t")
    kinase_ids = uniprot_search_df["Entry"].tolist()

    print(f"Found {len(kinase_ids)} human kinases.")
    # Write to the same path that was checked above so the existence check and
    # the cache file can never drift apart.
    uniprot_search_df.to_csv(uniprot_kinase_path, index=False)

The UniProt Ids were written to `kinase_uniprot.csv` above so we don't have to hit the API every time; read them back from that file.

In [4]:
# Load the cached accessions through the path object defined with the cache
# check, rather than repeating the filename literal.
uniprot_df = pd.read_csv(uniprot_kinase_path)
# Later cells filter on the "Entry" column, so make sure it is present.
assert "Entry" in uniprot_df.columns, f"'Entry' column missing from {uniprot_kinase_path}"
uniprot_df.head()

Unnamed: 0,Entry
0,O00329
1,O00444
2,O00506
3,O00746
4,O14757


To search in ChEMBL, we need ChEMBL Ids.  Fortunately the ChEMBL site has a file that maps UniProt Ids to ChEMBL Ids. 

In [5]:
# Local copy of the ChEMBL UniProt-to-ChEMBL mapping file; it is downloaded
# only when the file is not already present.
chembl_mapping_filename = "chembl_uniprot_mapping.txt"
chembl_mapping_path = Path(chembl_mapping_filename)
if not chembl_mapping_path.is_file():
    url = "https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_uniprot_mapping.txt"
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Stop here on a failed download; otherwise the cell would carry on and
    # fail in read_csv below with a less helpful FileNotFoundError.
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download file. (HTTP {response.status_code} from {url})")
    # response.content is bytes, so it has to be written in binary mode.
    # Writing it to a text-mode file raises TypeError and leaves an empty
    # file behind that would pass the is_file() check on the next run.
    chembl_mapping_path.write_bytes(response.content)
    print(f"Downloaded {chembl_mapping_filename}")

# The first line of the file is skipped and explicit column names are supplied.
chembl_mapping_df = pd.read_csv(
    chembl_mapping_path,
    sep="\t",
    skiprows=[0],
    names=["uniprot_id", "chembl_id", "name", "protein_type"],
)

Let's look at the first few lines

In [6]:
# Preview the first rows of the UniProt-to-ChEMBL mapping table.
chembl_mapping_df.head()

Unnamed: 0,uniprot_id,chembl_id,name,protein_type
0,P21266,CHEMBL2242,Glutathione S-transferase Mu 3,SINGLE PROTEIN
1,O00519,CHEMBL2243,Fatty-acid amide hydrolase 1,SINGLE PROTEIN
2,P19217,CHEMBL2244,Sulfotransferase 1E1,SINGLE PROTEIN
3,P97292,CHEMBL2245,Histamine H2 receptor,SINGLE PROTEIN
4,P17342,CHEMBL2247,Atrial natriuretic peptide receptor 3,SINGLE PROTEIN


Get the ChEMBL Ids for the human kinases. 

In [7]:
# Keep only the mapping rows whose UniProt accession appears in the kinase
# list and whose protein type is 'SINGLE PROTEIN'.
chembl_kinase_df = chembl_mapping_df.loc[
    lambda mapping: mapping["uniprot_id"].isin(uniprot_df["Entry"])
    & mapping["protein_type"].eq("SINGLE PROTEIN")
]
chembl_kinase_df.head()

Unnamed: 0,uniprot_id,chembl_id,name,protein_type
7,P51451,CHEMBL2250,Tyrosine-protein kinase Blk,SINGLE PROTEIN
8,Q9BTU6,CHEMBL2251,Phosphatidylinositol 4-kinase type 2-alpha,SINGLE PROTEIN
27,P45983,CHEMBL2276,Mitogen-activated protein kinase 8,SINGLE PROTEIN
34,P11802,CHEMBL331,Cyclin-dependent kinase 4,SINGLE PROTEIN
43,Q13627,CHEMBL2292,Dual specificity tyrosine-phosphorylation-regu...,SINGLE PROTEIN


Connect to the ChEMBL sqlite database and get a cursor. 

In [8]:
# sqlite3.connect() silently creates a new, empty database when the file is
# missing, which would only show up later as "no such table" errors and leave
# a stray empty file behind. Fail early with a clear message instead.
if not Path(CHEMBL_SQLITE_PATH).is_file():
    raise FileNotFoundError(
        f"ChEMBL database not found at '{CHEMBL_SQLITE_PATH}'. "
        "Set CHEMBL_DB_PATH or update CHEMBL_SQLITE_PATH."
    )
con = sqlite3.connect(CHEMBL_SQLITE_PATH)
cursor = con.cursor()

Create a temporary table to hold the ChEMBL Ids for the human kinases

In [9]:
# Drop any table left over from an earlier run of this cell so it can be
# re-executed on the same connection without an "already exists" error.
cursor.execute("DROP TABLE IF EXISTS temp.tmp_ids")
cursor.execute("CREATE TEMP TABLE tmp_ids(chembl_id TEXT)")

<sqlite3.Cursor at 0x11f49c840>

Populate the temporary table

In [10]:
# executemany expects one parameter tuple per row, hence the 1-tuples.
# Ids are de-duplicated so that joining against tmp_ids cannot multiply rows.
chembl_id_list = [(chembl_id,) for chembl_id in chembl_kinase_df.chembl_id.unique().tolist()]
# Empty the table first so that re-running this cell does not insert the same
# ids a second time.
cursor.execute("DELETE FROM tmp_ids")
cursor.executemany("INSERT INTO tmp_ids(chembl_id) VALUES (?)", chembl_id_list)

<sqlite3.Cursor at 0x11f49c840>

Define an SQL query to extract information on kinase inhibitors published after 2022. 
- doi - the Document Object Identifier for the paper
- year - publication year
- pref_name - preferred name for the target
- target_chembl_id - the ChEMBL Id for the target
- activity_id - the unique identifier for each assay value
- molregno - the identifier for each compound
- canonical_smiles - the SMILES for the compound
- standard_type - the activity type; the query keeps only IC50
- standard_value - the measured activity value
- standard_units - the units of standard_value (typically nM; note that the query does not filter on units)

In [11]:
# Activity query restricted to the targets listed in the tmp_ids temp table
# (via the join on td.chembl_id = tmp_ids.chembl_id).
# Filters: d.year > 2022 and act.standard_type = 'IC50'.
# NOTE: there is no filter on standard_units, so units are returned as stored.
sql = """select d.doi, d.year, td.pref_name, td.chembl_id as target_chembl_id, act.activity_id, cs.molregno, cs.canonical_smiles, act.standard_type, act.standard_value, act.standard_units
from compound_structures cs
            join activities act on cs.molregno = act.molregno
            join main.assays a on act.assay_id = a.assay_id
            join target_dictionary td on a.tid = td.tid
            join docs d on a.doc_id = d.doc_id
            join tmp_ids on td.chembl_id = tmp_ids.chembl_id
    where d.year > 2022 and act.standard_type = 'IC50'"""

Run the query and return the results as a Pandas dataframe

In [12]:
# Execute the query on the open connection and load the rows into a DataFrame.
pw_df = pd.read_sql_query(sql,con)

How many results did the query return? 

In [13]:
# Number of rows returned by the SQL query.
len(pw_df)

44992

Read the LLM search result. 

In [14]:
# Load the LLM search result from the CSV file at LLM_RESULT_PATH.
llm_df = pd.read_csv(LLM_RESULT_PATH)

How many results did the LLM search return? 

In [15]:
# Number of rows in the LLM search result.
len(llm_df)

38571

The LLM query returned fewer results; let's understand why. First, let's see how many results were returned by the LLM query but not by my query. 

In [16]:
# Rows of the LLM result whose activity_id does not occur in my query result.
llm_only_df = llm_df.loc[~llm_df["activity_id"].isin(pw_df["activity_id"])]
llm_only_df.shape[0]

35

How many targets account for the 35 results that my query didn't return?

In [17]:
# Distinct target ids among the rows found only by the LLM query.
llm_only_targets = llm_only_df["target_chembl_id"].unique()
llm_only_targets

array(['CHEMBL2906', 'CHEMBL5081', 'CHEMBL3559702', 'CHEMBL2783'],
      dtype=object)

Were these targets in the ChEMBL to UniProt mapping table? **Yes**

In [18]:
# Mapping-table rows for the targets that only the LLM query returned.
llm_only_mapping_df = chembl_mapping_df.loc[
    chembl_mapping_df["chembl_id"].isin(llm_only_targets)
]
llm_only_mapping_df

Unnamed: 0,uniprot_id,chembl_id,name,protein_type
891,Q15078,CHEMBL2783,Cyclin-dependent kinase 5 activator 1,SINGLE PROTEIN
2477,Q9Y616,CHEMBL5081,Interleukin-1 receptor-associated kinase 3,SINGLE PROTEIN
5995,P46019,CHEMBL2906,Phosphorylase b kinase regulatory subunit alph...,SINGLE PROTEIN
10269,Q92569,CHEMBL3559702,Phosphatidylinositol 3-kinase regulatory subun...,SINGLE PROTEIN


Were these Uniprot Ids in the list of human kinases? **No**

In [19]:
# Kinase-list rows whose accession matches one of the LLM-only targets.
uniprot_df.loc[uniprot_df["Entry"].isin(llm_only_mapping_df["uniprot_id"])]

Unnamed: 0,Entry


Let's examine the ChEMBL protein classes for the 4 targets that the LLM query found but mine didn't. Note that 3 are protein kinase regulatory subunits and 1 is a pseudokinase. However, in both of these cases, ChEMBL lists the parent protein classification as `Kinase`. In my opinion, these are false positives. 

In [20]:
# Distinct target ids that appear only in the LLM result.
llm_only_target_list = llm_only_df["target_chembl_id"].drop_duplicates().tolist()

# Bind the ids as query parameters instead of pasting str(tuple(...)) into the
# SQL text: the tuple repr of a single id is "('X',)", whose trailing comma is
# a SQL syntax error, and string-built SQL depends on Python's quoting rules.
placeholders = ", ".join("?" * len(llm_only_target_list))

# Look up the protein classification of each target's components.
sql_in = f"""select td.chembl_id, td.pref_name, pc.pref_name as protein_class
             from target_components tc
             join target_dictionary td on tc.tid = td.tid
             join component_class cc on tc.component_id = cc.component_id
             join protein_classification pc on cc.protein_class_id = pc.protein_class_id
             where td.chembl_id IN ({placeholders})"""

pd.read_sql_query(sql_in, con, params=llm_only_target_list)

Unnamed: 0,chembl_id,pref_name,protein_class
0,CHEMBL2783,Cyclin-dependent kinase 5 activator 1,Protein kinase regulatory subunit
1,CHEMBL2906,Phosphorylase b kinase regulatory subunit alph...,Protein kinase regulatory subunit
2,CHEMBL3559702,Phosphatidylinositol 3-kinase regulatory subun...,Protein kinase regulatory subunit
3,CHEMBL5081,Interleukin-1 receptor-associated kinase 3,TKL protein kinase IRAK family


Now, let's look at the results that my query found but the LLM query didn't. 

In [21]:
# Rows of my query result whose activity_id does not occur in the LLM result.
pw_only_df = pw_df.loc[~pw_df["activity_id"].isin(llm_df["activity_id"])]
pw_only_df.shape[0]

6456

How many targets account for the 6456 results?

In [22]:
# Count the distinct targets; dropna=False keeps a missing id counted as one
# value, as dropping duplicate rows would.
pw_only_df["target_chembl_id"].nunique(dropna=False)

43

In [23]:
# Distinct target ids that appear only in my query result.
pw_only_target_list = pw_only_df["target_chembl_id"].drop_duplicates().tolist()

# Bind the ids as query parameters instead of pasting str(tuple(...)) into the
# SQL text: the tuple repr of a single id is "('X',)", whose trailing comma is
# a SQL syntax error, and string-built SQL depends on Python's quoting rules.
placeholders = ", ".join("?" * len(pw_only_target_list))

# Look up the protein classification of each target's components.
sql_in = f"""select td.chembl_id, td.pref_name, pc.pref_name as protein_class
             from target_components tc
             join target_dictionary td on tc.tid = td.tid
             join component_class cc on tc.component_id = cc.component_id
             join protein_classification pc on cc.protein_class_id = pc.protein_class_id
             where td.chembl_id IN ({placeholders})"""

df_out = pd.read_sql_query(sql_in, con, params=pw_only_target_list)
df_out

Unnamed: 0,chembl_id,pref_name,protein_class
0,CHEMBL1075102,Phosphatidylinositol 4-phosphate 3-kinase C2 d...,Transferase
1,CHEMBL1075126,Pyruvate kinase PKLR,Enzyme
2,CHEMBL1075157,Diacylglycerol kinase gamma,Transferase
3,CHEMBL1075165,Phosphatidylinositol 3-kinase catalytic subuni...,Transferase
4,CHEMBL1075189,Pyruvate kinase PKM,Enzyme
5,CHEMBL1163101,Serine/threonine-protein kinase/endoribonuclea...,Enzyme
6,CHEMBL1163120,Phosphatidylinositol 3-kinase C2 domain-contai...,Transferase
7,CHEMBL1250412,Transient receptor potential cation channel su...,Transient receptor potential channel
8,CHEMBL1275212,Ketohexokinase,Transferase
9,CHEMBL1649059,Serine/threonine-protein kinase VRK2,Enzyme


Let's look at the ChEMBL target classes for the 43 missing proteins. 
- Transferase has Enzyme as the parent
- Enzyme is at the top level of the hierarchy
- Transient receptor potential channel maps to Voltage-gated ion channel, which maps to Ion Channel at the top level

In [24]:
# Tally how many of the returned rows fall into each protein class.
df_out["protein_class"].value_counts().reset_index()

Unnamed: 0,protein_class,count
0,Transferase,30
1,Enzyme,11
2,Transient receptor potential channel,1
3,Unclassified protein,1


Q96QT4 (TRPM7) is a kinase. More specifically, it is a bifunctional protein that uniquely combines an ion channel with an intrinsic serine/threonine protein kinase domain.

In [187]:
# Show the mapping-table entry for target CHEMBL1250412.
chembl_mapping_df.loc[chembl_mapping_df["chembl_id"] == "CHEMBL1250412"]

Unnamed: 0,uniprot_id,chembl_id,name,protein_type
4211,Q96QT4,CHEMBL1250412,Transient receptor potential cation channel su...,SINGLE PROTEIN
