# Calculating PaDEL Descriptors and Fingerprints

## Installing and Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
## Installing PaDEL:
# !pip install PaDEL-pywrapper --user
from PaDEL_pywrapper import PaDEL
from PaDEL_pywrapper.descriptor import ALOGP, Crippen, FMF
from rdkit import Chem

# Importing the Dataset

In [3]:
# Load the bioactivity table, then discard its first column by position
# before previewing the top rows.
df = pd.read_csv("./acetylcholinesterase_04_bioactivity_data_3class_pEC50.csv")
df = df.iloc[:, 1:]
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,Mol_Wt,LogP,H-Bond Acceptor,H-Bond Donor,pEC50
0,CHEMBL174698,Cc1cn(C2C=CC(COC(=O)CN3CCNCC3)O2)c(=O)[nH]c1=O,Active,350.375,-1.25298,8,2,7.196543
1,CHEMBL295054,C[n+]1ccc2c([nH]c3ccccc32)c1-c1ccccc1,Inactive,259.332,3.8126,0,1,4.886057
2,CHEMBL21521,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O...,Intermediate,750.982,4.9304,8,8,5.886057
3,CHEMBL111217,CC(/C=C/C(F)=C(/C)c1cc(C(C)(C)C)cc(C(C)(C)C)c1...,Active,438.53,7.2131,2,1,7.200659
4,CHEMBL415341,CCCCC(=O)N[C@@H]1CC(=O)NCCCC[C@@H](C(N)=O)NC(=...,Active,1023.637,1.4937,9,11,8.045757


In [4]:
# Keep only the identifier and structure columns; a later cell reads the
# SMILES column from this frame.
df_selected = df[["molecule_chembl_id", "canonical_smiles"]]
# A .smi file lists the SMILES string first and the molecule name second, so
# the columns are swapped on write. df_selected itself keeps its layout.
df_selected[["canonical_smiles", "molecule_chembl_id"]].to_csv(
    "molecule.smi", sep="\t", index=False, header=False
)

# Feature Engineering

## Importing Required Libraries

In [8]:
from PaDEL_pywrapper import PaDEL
from typing import Tuple, List
from PaDEL_pywrapper.descriptor import PubchemFP
from rdkit import Chem
from rdkit.Chem import SaltRemover
from rdkit.Chem.MolStandardize import rdMolStandardize

In [9]:
# Pull the SMILES column out of the selected frame as a plain array of values.
smiles = df_selected.loc[:, "canonical_smiles"].values

## PubChem Fingerprint Extractor

In [13]:
class MoleculeProcessor:
    """Clean up molecules and compute their PubChem fingerprints with PaDEL.

    Each SMILES string is parsed with RDKit, stripped of salts, normalized and
    reionized; the cleaned molecules are then passed to PaDEL in one batch.
    """

    def __init__(self):
        # The PubChem fingerprint is the only descriptor requested from PaDEL.
        self.fp = PubchemFP()
        # Default-configured RDKit salt remover.
        self.salt_remover = SaltRemover.SaltRemover()
        self.padel = PaDEL(descriptors=[self.fp], ignore_3D=True)

    def preprocess_molecule(self, mol):
        """Return a salt-stripped, normalized and reionized version of ``mol``.

        Args:
            mol: An RDKit molecule, or ``None``.

        Returns:
            The cleaned molecule, or ``None`` when ``mol`` is ``None``.
        """
        if mol is None:
            return None
        # StripMol returns the stripped molecule rather than editing its
        # argument, so the result has to be rebound or the salts stay in.
        mol = self.salt_remover.StripMol(mol, dontRemoveEverything=True)
        # Normalize functional-group representations (e.g. nitro groups).
        mol = rdMolStandardize.Normalize(mol)
        # Reionize the molecule.
        mol = rdMolStandardize.Reionize(mol)
        return mol

    def process_smile(self, smiles: List[str]):
        """Compute PaDEL descriptors for every parseable SMILES string.

        SMILES that RDKit cannot parse are skipped, so the result can contain
        fewer rows than ``smiles`` has entries; a warning reports how many
        were skipped.

        Args:
            smiles: Sequence of SMILES strings.

        Returns:
            Whatever ``self.padel.calculate`` returns for the cleaned
            molecules, in input order.
        """
        import warnings

        # Parse each SMILES exactly once, then filter on the parsed result.
        parsed = [Chem.MolFromSmiles(smile) for smile in smiles]
        mols = [self.preprocess_molecule(mol) for mol in parsed if mol is not None]

        n_dropped = len(parsed) - len(mols)
        if n_dropped:
            # Skipped entries shift the remaining rows, so joining the result
            # to other data by position would pair the wrong rows.
            warnings.warn(
                f"{n_dropped} SMILES could not be parsed and were skipped; "
                "output rows no longer line up one-to-one with the input.",
                stacklevel=2,
            )
        return self.padel.calculate(mols)

In [14]:
# Build the processor once, then compute fingerprints for every SMILES string.
processor = MoleculeProcessor()
fingerprint = processor.process_smile(smiles=smiles)

[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing Normalizer
[12:09:21] Running Normalizer
[12:09:21] Initializing N

PaDEL-Descriptor is a software for calculating molecular
descriptors and fingerprints. The software calculates
1875 descriptors (1444 1D and 2D descriptors, and 431
3D descriptors) and 12 types of fingerprints.

###################################

Should you publish results based on the PaDEL descriptors,
please cite:

Yap, C.W. (2011), PaDEL-descriptor: An open source software
to calculate molecular descriptors and fingerprints.
J. Comput. Chem., 32: 1466-1474. https://doi.org/10.1002/jcc.21707

###################################



In [17]:
# Place the pEC50 target next to the fingerprint bits; rows are matched on
# their index labels.
df_final = pd.concat(objs=[fingerprint, df["pEC50"]], axis="columns")
df_final.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pEC50
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.196543
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.886057
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.886057
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.200659
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.045757


In [23]:
# List only the columns that contain at least one missing value, together
# with their missing-value counts.
df_final.isnull().sum().loc[lambda null_counts: null_counts != 0]

Series([], dtype: int64)

In [24]:
# Persist the fingerprint table with its target column.
# NOTE(review): the file name says pIC50, but the target column written here
# is pEC50 - confirm which is intended before renaming.
df_final.to_csv(
    path_or_buf="acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv"
)

In [1]:
# !pip install streamlit

Collecting streamlit
  Downloading streamlit-1.35.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.8.2-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting protobuf<5,>=3.20 (from streamlit)
  Downloading protobuf-4.25.3-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-16.1.0-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Using cached tenacity-8.3.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10