In [6]:
# 📌 ChEMBL 35 Loader Notebook
# --------------------------------------------------------
# Requirements: pandas, biopython, sqlite3
# Install if missing:
#   pip install pandas biopython

import pandas as pd
import sqlite3
from Bio import SeqIO
import gzip

# ========== 1. Load chembl_35_chemreps.txt.gz ==========
print("Loading chemreps (molecule representations)...")
chemreps = pd.read_csv(
    r"DL_ENDSEM__DATASET\chembl_35_chemreps.txt.gz",
    sep="\t",
    compression="gzip"
)
print("chemreps shape:", chemreps.shape)
print(chemreps.head())


Loading chemreps (molecule representations)...
chemreps shape: (2474590, 4)
      chembl_id                                   canonical_smiles  \
0  CHEMBL153534                       Cc1cc(-c2csc(N=C(N)N)n2)cn1C   
1  CHEMBL440060  CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...   
2  CHEMBL440245  CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(...   
3  CHEMBL440249  CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC...   
4  CHEMBL405398             Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1   

                                      standard_inchi  \
0  InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...   
1  InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...   
2  InChI=1S/C160H268N50O41/c1-23-27-41-95-134(228...   
3  InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(17...   
4  InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...   

            standard_inchi_key  
0  MFRNFCWYPYSFQQ-UHFFFAOYSA-N  
1  RSEQNZQKBMRQNM-VRGFNVLHSA-N  
2  FTKBTEIKPOYCEX-OZSLQWTKSA-N  
3  UYSXXKGACMHPIM-KFGDMSGDSA-N  
4

In [7]:


# ========== 2. Load chembl_35_blast.fa.gz ==========
print("\nLoading target protein sequences (FASTA)...")
fasta_file = r"DL_ENDSEM__DATASET\chembl_35_blast.fa.gz"
sequences = []

# Open with gzip in text mode to avoid UnicodeDecodeError
with gzip.open(fasta_file, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        sequences.append({"id": record.id, "seq": str(record.seq)})

print("Number of sequences:", len(sequences))
print("Example sequence:", sequences[0])




Loading target protein sequences (FASTA)...
Number of sequences: 14763
Example sequence: {'id': 'CHEMBL1907607_O09028', 'seq': 'MSYSLYLAFVCLNLLAQRMCIQGNQFNVEVSRSDKLSLPGFENLTAGYNKFLRPNFGGDPVRIALTLDIASISSISESNMDYTATIYLRQRWTDPRLVFEGNKSFTLDARLVEFLWVPDTYIVESKKSFLHEVTVGNRLIRLFSNGTVLYALRITTTVTCNMDLSKYPMDTQTCKLQLESWGYDGNDVEFSWLRGNDSVRGLENLRLAQYTIQQYFTLVTVSQQETGNYTRLVLQFELRRNVLYFILETYVPSTFLVVLSWVSFWISLESVPARTCIGVTTVLSMTTLMIGSRTSLPNTNCFIKAIDVYLGICFSFVFGALLEYAVAHYSSLQQMAVKDRGPAKDSEEVNITNIINSSISSFKRKISFASIEISGDNVNYSDLTMKASDKFKFVFREKIGRIIDYFTIQNPSNVDRYSKLLFPLIFMLANVFYWAYYMYF'}


In [8]:

# ========== 3. Load chembl_35_sqlite.db ==========
# Make sure you extracted chembl_35_sqlite.tar.gz first:
#   tar -xvzf chembl_35_sqlite.tar.gz
# This gives you chembl_35/chembl_35.db

print("\nConnecting to ChEMBL SQLite database...")
conn = sqlite3.connect(
    r"DL_ENDSEM__DATASET\chembl_35\chembl_35_sqlite\chembl_35.db"
)
print("\nChecking available tables in ChEMBL DB...")
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(tables)

# Example query: fetch 5 IC50 activities with SMILES
query = """
SELECT cs.canonical_smiles, a.standard_type, a.standard_value, a.standard_units
FROM activities a
JOIN compound_structures cs ON a.molregno = cs.molregno
WHERE a.standard_type = 'IC50'
LIMIT 5;
"""
example_activities = pd.read_sql_query(query, conn)
print(example_activities)




Connecting to ChEMBL SQLite database...

Checking available tables in ChEMBL DB...
                         name
0                 action_type
1                  assay_type
2            chembl_id_lookup
3     confidence_score_lookup
4             curation_lookup
..                        ...
75             mechanism_refs
76            metabolism_refs
77  predicted_binding_domains
79               sqlite_stat1

[80 rows x 1 columns]
                                    canonical_smiles standard_type  \
0           c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1          IC50   
1  Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...          IC50   
2  Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...          IC50   
3  COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...          IC50   
4  COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...          IC50   

   standard_value standard_units  
0        100000.0             nM  
1          2500.0             nM  
2         50000.0             nM  
3 

In [9]:

# ========== 4. Load chembl_uniprot_mapping.txt ==========
print("\nLoading UniProt mapping...")
uniprot_map = pd.read_csv(
    r"DL_ENDSEM__DATASET\chembl_uniprot_mapping.txt",
    sep="\t"
)
print("uniprot mapping shape:", uniprot_map.shape)
print(uniprot_map.head())

# Close DB connection
conn.close()



Loading UniProt mapping...
uniprot mapping shape: (14809, 1)
                                                        # chembl_35 target list, 01/12/2024
P21266 CHEMBL2242 Glutathione S-transferase Mu 3                             SINGLE PROTEIN
O00519 CHEMBL2243 Anandamide amidohydrolase                                  SINGLE PROTEIN
P19217 CHEMBL2244 Estrogen sulfotransferase                                  SINGLE PROTEIN
P97292 CHEMBL2245 Histamine H2 receptor                                      SINGLE PROTEIN
P17342 CHEMBL2247 Atrial natriuretic peptide receptor C                      SINGLE PROTEIN
