In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available input files can be read off the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-pic50/data_pic50.csv


In [5]:
! pip install rdkit pubchempy
! pip install deepchem

Collecting rdkit
  Downloading rdkit-2025.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading rdkit-2025.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (36.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25ldone
[?25h  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=c38bf7c30c4c5fa2d06000e296d303d8a7a3f0a9451fc445f973b1d51e82f1c8
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy, rdkit
Successfully installed pubchempy-1.0.4 rdkit-2025.3.5
Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-a

In [6]:
import deepchem as dc



In [7]:
# Load the pIC50 dataset from the Kaggle input directory and display it.
# NOTE(review): the rendered output shows "Unnamed: 0*" columns, which look like
# row indices written by earlier to_csv calls — confirm before relying on or dropping them.
data_df = pd.read_csv("/kaggle/input/data-pic50/data_pic50.csv")
data_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,canonical_smiles,standard_relation,standard_type,standard_units,pIC50
0,13,27,CC(C)(C)NCc1cc(Nc2ccnc3cc(Cl)ccc23)cc(-c2ccc(C...,=,IC50,nM,7.236572
1,14,28,CC(C)(C)NCc1cc(Nc2ccnc3cc(Cl)ccc23)cc(-c2ccc(C...,=,IC50,nM,7.130768
2,20,38,C[C@H]1[C@@H](OCCCONC(=O)c2ccccc2O)O[C@@H]2O[C...,=,IC50,nM,7.280255
3,33,66,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,=,IC50,nM,6.602060
4,45,87,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,=,IC50,nM,6.602060
...,...,...,...,...,...,...,...
22630,45226,969209,CC1(N2CCC2)CCC(NCC2CCN(Cc3ccccc3)C2)CC1,=,IC50,nM,5.823909
22631,45227,969210,O=C(NC1CCN(CCOc2ccc(Cl)cc2)CC1)c1c[nH]c2ccncc2...,=,IC50,nM,5.698970
22632,45228,969211,COc1ccc(-c2cncc(C(=O)NCC3CCN(CCN(C)C)CC3)c2)cc1,=,IC50,nM,5.568636
22633,45229,969212,Cc1c(CN2CCC(O)(Cn3ccnc3)CC2)[nH]c2c(Cl)cccc12,=,IC50,nM,5.443697


Reference implementation of DeepChem's `PubChemFingerprint` featurizer: https://github.com/deepchem/deepchem/blob/master/deepchem/feat/molecule_featurizers/pubchem_fingerprint.py

In [8]:
from deepchem.utils.typing import RDKitMol
from deepchem.feat.base_classes import MolecularFeaturizer


class PubChemFingerprint(MolecularFeaturizer):
    """PubChem Fingerprint.

    The PubChem fingerprint is an 881-bit structural key,
    which is used by PubChem for similarity searching.
    Please confirm the details in [1]_.

    References
    ----------
    .. [1] ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.pdf

    Note
    -----
    This class requires RDKit and PubChemPy to be installed.
    PubChemPy uses a REST API to get the fingerprint, so you need internet access.

    Examples
    --------
    >>> import deepchem as dc
    >>> smiles = ['CCC']
    >>> featurizer = dc.feat.PubChemFingerprint()
    >>> features = featurizer.featurize(smiles)
    >>> type(features[0])
    <class 'numpy.ndarray'>
    >>> features[0].shape
    (881,)

    """

    def __init__(self):
        """Initialize this featurizer.

        Raises
        ------
        ImportError
            If RDKit or PubChemPy cannot be imported.

        """
        try:
            from rdkit import Chem  # noqa
            import pubchempy as pcp  # noqa
        except ModuleNotFoundError as err:
            # Either package may be the missing one, so name both.
            raise ImportError(
                "This class requires RDKit and PubChemPy to be installed."
            ) from err

        # Lookup callable used by `_featurize`; kept on the instance so the
        # PubChem query goes through a single, replaceable entry point.
        self.get_pubchem_compounds = pcp.get_compounds

    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """
        Calculate PubChem fingerprint.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object
        **kwargs
            ``mol`` is accepted as a deprecated alias for ``datapoint``.

        Returns
        -------
        np.ndarray
            1D integer array holding the bits of the PubChem (CACTVS)
            fingerprint reported for the molecule.

        Raises
        ------
        ImportError
            If RDKit cannot be imported.
        ValueError
            If PubChem returns no compound, or no fingerprint, for the molecule.

        """
        try:
            from rdkit import Chem
        except ModuleNotFoundError as err:
            raise ImportError(
                "This class requires RDKit and PubChemPy to be installed."
            ) from err
        if 'mol' in kwargs:
            import warnings

            datapoint = kwargs.get("mol")
            # Warn instead of raising: raising here made the deprecated
            # keyword unusable even though its value had just been accepted.
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2,
            )

        smiles = Chem.MolToSmiles(datapoint)
        compounds = self.get_pubchem_compounds(smiles, 'smiles')
        if not compounds:
            # Give a descriptive error rather than a bare IndexError from `[0]`.
            raise ValueError(
                f"PubChem returned no compound for SMILES {smiles!r}.")

        fingerprint_bits = compounds[0].cactvs_fingerprint
        if not fingerprint_bits:
            raise ValueError(
                f"PubChem returned no fingerprint for SMILES {smiles!r}.")

        # The fingerprint is iterated character by character; each is one bit.
        return np.asarray([int(bit) for bit in fingerprint_bits])

In [9]:

# Module-level featurizer used by get_pubchem_fingerprints below.
# Note: this is DeepChem's built-in dc.feat.PubChemFingerprint, not the
# PubChemFingerprint class defined earlier in this notebook.
featurizer = dc.feat.PubChemFingerprint()

def get_pubchem_fingerprints(smiles_list):
    """Return one fingerprint per SMILES string, in input order.

    Each SMILES is passed to the module-level ``featurizer`` on its own
    (as a one-element list) and the first row of the result is kept.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to featurize.

    Returns
    -------
    list
        One entry per input SMILES; empty when ``smiles_list`` is empty.
    """
    return [featurizer.featurize([smi])[0] for smi in smiles_list]

#data_df['pubchem_fingerprint'] = get_pubchem_fingerprints(data_df['canonical_smiles'].tolist())

In [10]:
# Herbal compounds to featurize. Each inner pair is (CID as a string, SMILES);
# the outer zip regroups the pairs into one sequence per column.
# NOTE(review): CID is presumably the PubChem compound ID — confirm.
data_herbal = {
    column: list(values)
    for column, values in zip(
        ('CID', 'canonical_smiles'),
        zip(
            ('969516', 'COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(C=C2)O)OC)O'),
            ('5281792', 'C1=CC(=C(C=C1C[C@H](C(=O)O)OC(=O)/C=C/C2=CC(=C(C=C2)O)O)O)O'),
            ('2353', 'COC1=C(C2=C[N+]3=C(C=C2C=C1)C4=CC5=C(C=C4CC3)OCO5)OC'),
            ('445154', 'C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)O)O)O'),
            ('5280804', 'C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C@H]4[C@@H]([C@H]([C@@H]([C@H](O4)CO)O)O)O)O)O'),
            ('45485025', 'C1=CC(=CC=C1[C@@H]2[C@H](C3=C(O2)C=CC(=C3)/C=C/C(=O)NCCCCN=C(N)N)C(=O)NCCCCN=C(N)N)O'),
            ('23915', 'CC1CC2=CC(=C(C(=C2C3=C(C(=C(C=C3CC1(C)O)OC)OC)OC)OC)OC)OC'),
            ('10228', 'CC(=CCC1=C(C=CC2=C1OC(=O)C=C2)OC)C'),
        ),
    )
}

herbal_df = pd.DataFrame(data_herbal)
herbal_df

Unnamed: 0,CID,canonical_smiles
0,969516,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(...
1,5281792,C1=CC(=C(C=C1C[C@H](C(=O)O)OC(=O)/C=C/C2=CC(=C...
2,2353,COC1=C(C2=C[N+]3=C(C=C2C=C1)C4=CC5=C(C=C4CC3)O...
3,445154,C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)O)O)O
4,5280804,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C...
5,45485025,C1=CC(=CC=C1[C@@H]2[C@H](C3=C(O2)C=CC(=C3)/C=C...
6,23915,CC1CC2=CC(=C(C(=C2C3=C(C(=C(C=C3CC1(C)O)OC)OC)...
7,10228,CC(=CCC1=C(C=CC2=C1OC(=O)C=C2)OC)C


In [12]:
# Add a 'pubchem_fingerprint' column to herbal_df in place, then display it.
# get_pubchem_fingerprints calls featurizer.featurize once per SMILES string.
herbal_df['pubchem_fingerprint'] = get_pubchem_fingerprints(herbal_df['canonical_smiles'].tolist())
herbal_df

Unnamed: 0,CID,canonical_smiles,pubchem_fingerprint
0,969516,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
1,5281792,C1=CC(=C(C=C1C[C@H](C(=O)O)OC(=O)/C=C/C2=CC(=C...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
2,2353,COC1=C(C2=C[N+]3=C(C=C2C=C1)C4=CC5=C(C=C4CC3)O...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ..."
3,445154,C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)O)O)O,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
4,5280804,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
5,45485025,C1=CC(=CC=C1[C@@H]2[C@H](C3=C(O2)C=CC(=C3)/C=C...,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ..."
6,23915,CC1CC2=CC(=C(C(=C2C3=C(C(=C(C=C3CC1(C)O)OC)OC)...,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
7,10228,CC(=CCC1=C(C=CC2=C1OC(=O)C=C2)OC)C,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."


In [24]:
# Earlier draft of cleaning_fingerprint, disabled by wrapping it in a string
# literal: nothing inside the quotes runs, the cell only evaluates the string.
# A later cell in this notebook defines cleaning_fingerprint as a real function.
"""
def cleaning_fingerprint(smiles):
    fingerprint_list = []
    
    for data in smiles['pubchem_fingerprint']:
        cleaned_data = data.replace("[", "").replace("\n", "").replace("]", "").split()
        fingerprint_list.append(list(map(int, cleaned_data)))
    
    fingerprint_column = pd.DataFrame(fingerprint_list)
    fingerprint_column.columns = [f'bit{i}' for i in range(fingerprint_column.shape[1])]
    fingerprint_column = fingerprint_column.dropna().astype(int)
    #fingerprint_column["pIC50"] = smiles["pIC50"].reset_index(drop=True)
    
    return fingerprint_column
"""

In [19]:
# Second disabled draft of cleaning_fingerprint, kept as a string literal:
# nothing inside the quotes runs, the cell only evaluates the string.
# This draft handles string entries only; the text inside the quotes is left
# untouched because it is string content, not comments.
"""
def cleaning_fingerprint(smiles):
    fingerprint_list = []
    
    # Iterasi melalui setiap data pada 'pubchem_fingerprint'
    for data in smiles['pubchem_fingerprint']:
        if isinstance(data, str):  # Memastikan data adalah string
            if pd.isna(data):  
                continue
            # Membersihkan data dengan mengganti karakter yang tidak diperlukan
            cleaned_data = data.replace("[", "").replace("\n", "").replace("]", "").split()
            fingerprint_list.append(list(map(int, cleaned_data)))
    
    # Membuat DataFrame dari list yang berisi bit fingerprint
    fingerprint_column = pd.DataFrame(fingerprint_list)
    fingerprint_column.columns = [f'bit{i}' for i in range(fingerprint_column.shape[1])]
    
    # Menangani NaN, jika ada, dan memastikan tipe data integer
    fingerprint_column = fingerprint_column.dropna().astype(int)
    
    return fingerprint_column
"""

In [26]:
def cleaning_fingerprint(smiles):
    """Expand the ``pubchem_fingerprint`` column into one integer column per bit.

    Parameters
    ----------
    smiles : pandas.DataFrame
        Frame with a ``pubchem_fingerprint`` column. Each entry may be a
        NumPy array, a list/tuple of bits, or a string such as ``"[1 0 1]"``
        or ``"[1, 0, 1]"`` (the form produced when such a column is written
        to CSV and read back).

    Returns
    -------
    pandas.DataFrame
        Integer columns ``bit0 .. bit{N-1}``. Rows keep the index labels of
        the input rows they came from, so the result aligns with ``smiles``
        in ``pd.concat(..., axis=1)``. Entries of any other type (``None``,
        NaN, ...) are skipped, and rows shorter than the longest fingerprint
        are dropped.

    Raises
    ------
    ValueError
        If a string entry contains a token that is not an integer.
    """
    row_labels = []
    fingerprint_list = []

    # Iterate over every entry in 'pubchem_fingerprint', keeping its row label.
    for label, data in smiles['pubchem_fingerprint'].items():
        if isinstance(data, np.ndarray):
            # Read the bits directly: str(ndarray) is abbreviated with "..."
            # for large arrays, so a string round trip is not reliable.
            bits = [int(bit) for bit in data.ravel()]
        elif isinstance(data, (list, tuple)):
            # str(list) contains commas, which the whitespace split cannot parse.
            bits = [int(bit) for bit in data]
        elif isinstance(data, str):
            # Strip brackets/newlines and accept comma- or space-separated bits.
            cleaned_data = (
                data.replace("[", "")
                .replace("\n", "")
                .replace("]", "")
                .replace(",", " ")
                .split()
            )
            bits = [int(token) for token in cleaned_data]
        else:
            # Missing or unsupported entry: skip it.
            continue

        row_labels.append(label)
        fingerprint_list.append(bits)

    # Build the per-bit frame, indexed like the rows that were kept.
    fingerprint_column = pd.DataFrame(fingerprint_list, index=row_labels)
    fingerprint_column.columns = [f'bit{i}' for i in range(fingerprint_column.shape[1])]

    # Drop rows with NaN (fingerprints shorter than the longest) and force int dtype.
    fingerprint_column = fingerprint_column.dropna().astype(int)

    return fingerprint_column

In [27]:
# Expand herbal_df's fingerprint column into per-bit columns and display them.
fingerprint_final = cleaning_fingerprint(herbal_df)
fingerprint_final

Unnamed: 0,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,bit9,...,bit871,bit872,bit873,bit874,bit875,bit876,bit877,bit878,bit879,bit880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Put the per-bit columns next to the original herbal_df columns
# (concatenation along the column axis aligns rows on the index).
result = pd.concat([herbal_df, fingerprint_final], axis="columns")
result

Unnamed: 0,CID,canonical_smiles,pubchem_fingerprint,bit0,bit1,bit2,bit3,bit4,bit5,bit6,...,bit871,bit872,bit873,bit874,bit875,bit876,bit877,bit878,bit879,bit880
0,969516,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...",1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5281792,C1=CC(=C(C=C1C[C@H](C(=O)O)OC(=O)/C=C/C2=CC(=C...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...",1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2353,COC1=C(C2=C[N+]3=C(C=C2C=C1)C4=CC5=C(C=C4CC3)O...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...",1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,445154,C1=CC(=CC=C1/C=C/C2=CC(=CC(=C2)O)O)O,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...",1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5280804,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C...,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...",1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,45485025,C1=CC(=CC=C1[C@@H]2[C@H](C3=C(O2)C=CC(=C3)/C=C...,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, ...",1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,23915,CC1CC2=CC(=C(C(=C2C3=C(C(=C(C=C3CC1(C)O)OC)OC)...,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...",1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,10228,CC(=CCC1=C(C=CC2=C1OC(=O)C=C2)OC)C,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...",1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Write herbal_df to CSV in the working directory, without the index column.
# NOTE(review): this saves herbal_df, not `result`, so the per-bit columns
# built above are not written — confirm that is intended.
herbal_df.to_csv("herbal_pubchem_fingerprint_2.csv",index= False)

In [11]:
#data_df.to_csv("pubchem_fingerprint.csv",index= False)