In [9]:
# 📌 ChEMBL 35 Loader Notebook
# --------------------------------------------------------
# Requirements: pandas, biopython (sqlite3 and gzip ship with the Python standard library)
# Install if missing:
#   pip install pandas biopython

import gzip
import sqlite3
from pathlib import Path

import pandas as pd
from Bio import SeqIO

# ========== 1. Load chembl_35_chemreps.txt.gz ==========
# The file is read as a gzip-compressed, tab-separated table into a DataFrame.
print("Loading chemreps (molecule representations)...")
chemreps_path = r"C:\Users\nikhi\Desktop\DL_ENDSEM__DATASET\chembl_35_chemreps.txt.gz"
chemreps = pd.read_csv(chemreps_path, sep="\t", compression="gzip")

# Quick sanity check: overall dimensions plus the first few rows.
print("chemreps shape:", chemreps.shape)
print(chemreps.head())


# ========== 2. Load chembl_35_blast.fa.gz ==========
print("\nLoading target protein sequences (FASTA)...")
fasta_file = r"C:\Users\nikhi\Desktop\DL_ENDSEM__DATASET\chembl_35_blast.fa.gz"

# Open the gzip archive in text mode ("rt") so the FASTA parser receives
# decoded str lines instead of raw bytes.
with gzip.open(fasta_file, "rt") as handle:
    # One dict per FASTA record: its identifier and the sequence as a plain string.
    sequences = [
        {"id": record.id, "seq": str(record.seq)}
        for record in SeqIO.parse(handle, "fasta")
    ]

print("Number of sequences:", len(sequences))
# Guard the preview: indexing an empty list would raise IndexError and abort
# the rest of the cell if the file contained no FASTA records.
if sequences:
    print("Example sequence:", sequences[0])
else:
    print("Example sequence: none (no FASTA records were parsed)")


# ========== 3. Load chembl_35_sqlite.db ==========
# Make sure you extracted chembl_35_sqlite.tar.gz first:
#   tar -xvzf chembl_35_sqlite.tar.gz
# This gives you chembl_35/chembl_35.db

db_path = Path(r"D:\chembl_35.db")
# sqlite3.connect() silently creates a brand-new, empty database when the file
# does not exist, which then shows up as an empty table list and
# "no such table" errors. Fail fast with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"ChEMBL SQLite database not found: {db_path}")

print("\nConnecting to ChEMBL SQLite database...")
conn = sqlite3.connect(db_path)
try:
    print("\nChecking available tables in ChEMBL DB...")
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print(tables)

    # Example query: fetch 5 IC50 activities with SMILES
    query = """
SELECT cs.canonical_smiles, a.standard_type, a.standard_value, a.standard_units
FROM activities a
JOIN compound_structures cs ON a.molregno = cs.molregno
WHERE a.standard_type = 'IC50'
LIMIT 5;
"""
    example_activities = pd.read_sql_query(query, conn)
    print(example_activities)
finally:
    # Always release the file handle, even if a query above raises.
    # Connection.close() is safe to call again on an already-closed connection.
    conn.close()


# ========== 4. Load chembl_uniprot_mapping.txt ==========
print("\nLoading UniProt mapping...")
uniprot_map_path = r"C:\Users\nikhi\Desktop\DL_ENDSEM__DATASET\chembl_uniprot_mapping.txt"
# Plain (uncompressed) tab-separated text, read with pandas defaults.
# NOTE(review): the first line of the file is used as the column header —
# confirm that the file actually starts with a header row.
uniprot_map = pd.read_csv(uniprot_map_path, sep="\t")

# Same sanity check as the other tables: dimensions and a short preview.
print("uniprot mapping shape:", uniprot_map.shape)
print(uniprot_map.head())

# Release the SQLite connection opened earlier in this cell.
conn.close()


Loading chemreps (molecule representations)...
chemreps shape: (2474590, 4)
      chembl_id                                   canonical_smiles  \
0  CHEMBL153534                       Cc1cc(-c2csc(N=C(N)N)n2)cn1C   
1  CHEMBL440060  CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...   
2  CHEMBL440245  CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(...   
3  CHEMBL440249  CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC...   
4  CHEMBL405398             Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1   

                                      standard_inchi  \
0  InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...   
1  InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...   
2  InChI=1S/C160H268N50O41/c1-23-27-41-95-134(228...   
3  InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(17...   
4  InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...   

            standard_inchi_key  
0  MFRNFCWYPYSFQQ-UHFFFAOYSA-N  
1  RSEQNZQKBMRQNM-VRGFNVLHSA-N  
2  FTKBTEIKPOYCEX-OZSLQWTKSA-N  
3  UYSXXKGACMHPIM-KFGDMSGDSA-N  
4

In [3]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB 640.0 kB/s eta 0:00:05
   ----- ---------------------------------- 0.4/2.8 MB 4.8 MB/s eta 0:00:01
   -------------- ------------------------- 1.0/2.8 MB 8.0 MB/s eta 0:00:01
   ------------------------- -------------- 1.8/2.8 MB 10.3 MB/s eta 0:00:01
   ------------------------------------- -- 2.6/2.8 MB 11.9 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 12.0 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.85



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# List the tables in the ChEMBL SQLite database.
# This cell opens (and closes) its own connection so it does not depend on a
# `conn` left over from another cell, which may already be closed.
db_path = Path(r"D:\chembl_35.db")
# sqlite3.connect() would silently create an empty database for a wrong path,
# producing an empty table list; check that the file exists first.
if not db_path.is_file():
    raise FileNotFoundError(f"ChEMBL SQLite database not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
finally:
    conn.close()
print(tables)


Empty DataFrame
Columns: [name]
Index: []
