### **Molecular descriptors converter**

In [None]:
!pip install pubchempy

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=e5cee8475b89321e66bf3b7a2caa65495b2cc1696044721aada2b2e41413961a
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.1


Change from SMILES string to Pubchem CID

In [None]:
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import MolSurf, rdMolDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit import RDLogger

In [None]:
# SMILES -> descriptor wrappers around RDKit.
#
# Every public function in this cell takes one SMILES string and returns the
# value of a single RDKit descriptor, or None when no molecule can be built
# from the input. The parse-and-guard logic that every wrapper needs lives in
# _descriptor_or_none so it is written (and fixed) in exactly one place.


def _descriptor_or_none(smiles, descriptor):
    """Parse ``smiles`` with RDKit and apply ``descriptor`` to the molecule.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.
    descriptor : callable
        Function that takes an RDKit molecule and returns the descriptor value.

    Returns
    -------
    The value returned by ``descriptor``, or ``None`` when ``smiles`` is not a
    string (e.g. a missing value in a DataFrame column) or when
    ``Chem.MolFromSmiles`` returns ``None`` for it.
    """
    if not isinstance(smiles, str):
        # Chem.MolFromSmiles raises on non-string input such as NaN; treat it
        # like any other unparsable entry instead of aborting a whole .apply().
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return descriptor(mol)


def _mqn_component(smiles, index):
    """Return element ``index`` (0-based) of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None."""
    return _descriptor_or_none(smiles, lambda mol: rdMolDescriptors.MQNs_(mol)[index])


# --- logP / molar refractivity ---------------------------------------------

def calculate_slogp(smiles):
    """Return ``Descriptors.MolLogP`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.MolLogP)


def calculate_smr(smiles):
    """Return ``Descriptors.MolMR`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.MolMR)


# --- first group of MolSurf VSA descriptors ---------------------------------

def get_SLogP_VSA8(smiles):
    """Return ``MolSurf.SlogP_VSA8`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA8)


def get_SMR_VSA3(smiles):
    """Return ``MolSurf.SMR_VSA3`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA3)


def get_PEOE_VSA2(smiles):
    """Return ``MolSurf.PEOE_VSA2`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA2)


def get_PEOE_VSA3(smiles):
    """Return ``MolSurf.PEOE_VSA3`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA3)


def get_PEOE_VSA6(smiles):
    """Return ``MolSurf.PEOE_VSA6`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA6)


def get_PEOE_VSA8(smiles):
    """Return ``MolSurf.PEOE_VSA8`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA8)


def get_PEOE_VSA9(smiles):
    """Return ``MolSurf.PEOE_VSA9`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA9)


def get_PEOE_VSA10(smiles):
    """Return ``MolSurf.PEOE_VSA10`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA10)


def get_PEOE_VSA11(smiles):
    """Return ``MolSurf.PEOE_VSA11`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA11)


# --- surface area, weight and simple counts ---------------------------------

def calculate_labute_asa(smiles):
    """Return ``rdMolDescriptors.CalcLabuteASA`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcLabuteASA)


def calculate_tpsa(smiles):
    """Return ``Descriptors.TPSA`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.TPSA)


def calculate_amw(smiles):
    """Return ``Descriptors.MolWt`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.MolWt)


def calculate_exact_mw(smiles):
    """Return ``Descriptors.ExactMolWt`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.ExactMolWt)


def calculate_num_lipinski_hba(smiles):
    """Return ``rdMolDescriptors.CalcNumLipinskiHBA`` for ``smiles``, or None if unparsable."""
    # This used to call Descriptors.NumHAcceptors, which made it an exact copy
    # of calculate_num_hba; RDKit exposes the Lipinski count as its own function.
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcNumLipinskiHBA)


def calculate_num_lipinski_hbd(smiles):
    """Return ``rdMolDescriptors.CalcNumLipinskiHBD`` for ``smiles``, or None if unparsable."""
    # This used to call Descriptors.NumHDonors, which made it an exact copy
    # of calculate_num_hbd; RDKit exposes the Lipinski count as its own function.
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcNumLipinskiHBD)


def calculate_num_rotatable_bonds(smiles):
    """Return ``Descriptors.NumRotatableBonds`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumRotatableBonds)


def calculate_num_hbd(smiles):
    """Return ``Descriptors.NumHDonors`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumHDonors)


def calculate_num_hba(smiles):
    """Return ``Descriptors.NumHAcceptors`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumHAcceptors)


def calculate_num_amide_bonds(smiles):
    """Return ``rdMolDescriptors.CalcNumAmideBonds`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcNumAmideBonds)


def calculate_num_hetero_atoms(smiles):
    """Return ``Descriptors.NumHeteroatoms`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumHeteroatoms)


def calculate_num_heavy_atoms(smiles):
    """Return ``Descriptors.HeavyAtomCount`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.HeavyAtomCount)


def calculate_num_atoms(smiles):
    """Return ``mol.GetNumAtoms()`` for ``smiles``, or None if unparsable."""
    # NOTE(review): GetNumAtoms() with default arguments does not count implicit
    # hydrogens, so this presumably matches calculate_num_heavy_atoms for most
    # inputs - confirm whether hydrogens were meant to be included.
    return _descriptor_or_none(smiles, lambda mol: mol.GetNumAtoms())


def calculate_num_stereocenters(smiles):
    """Return ``rdMolDescriptors.CalcNumAtomStereoCenters`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcNumAtomStereoCenters)


def calculate_num_unspecified_stereocenters(smiles):
    """Return ``rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters`` for ``smiles``, or None if unparsable."""
    # The previous target, Descriptors.NumUnspecifiedStereocenters, is not an
    # RDKit descriptor name, so every parsable SMILES raised AttributeError.
    return _descriptor_or_none(smiles, rdMolDescriptors.CalcNumUnspecifiedAtomStereoCenters)


# --- ring counts -------------------------------------------------------------

def calculate_num_rings(smiles):
    """Return ``Descriptors.RingCount`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.RingCount)


def calculate_num_aromatic_rings(smiles):
    """Return ``Descriptors.NumAromaticRings`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAromaticRings)


def calculate_num_saturated_rings(smiles):
    """Return ``Descriptors.NumSaturatedRings`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumSaturatedRings)


def calculate_num_aliphatic_rings(smiles):
    """Return ``Descriptors.NumAliphaticRings`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAliphaticRings)


def calculate_num_aromatic_heterocycles(smiles):
    """Return ``Descriptors.NumAromaticHeterocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAromaticHeterocycles)


def calculate_num_saturated_heterocycles(smiles):
    """Return ``Descriptors.NumSaturatedHeterocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumSaturatedHeterocycles)


def calculate_num_aliphatic_heterocycles(smiles):
    """Return ``Descriptors.NumAliphaticHeterocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAliphaticHeterocycles)


def calculate_num_aromatic_carbocycles(smiles):
    """Return ``Descriptors.NumAromaticCarbocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAromaticCarbocycles)


def calculate_num_saturated_carbocycles(smiles):
    """Return ``Descriptors.NumSaturatedCarbocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumSaturatedCarbocycles)


def calculate_num_aliphatic_carbocycles(smiles):
    """Return ``Descriptors.NumAliphaticCarbocycles`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.NumAliphaticCarbocycles)


def calculate_fraction_csp3(smiles):
    """Return ``Descriptors.FractionCSP3`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.FractionCSP3)


# --- Chi / Hall-Kier / kappa indices -----------------------------------------

def calculate_chi0v(smiles):
    """Return ``Descriptors.Chi0v`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi0v)


def calculate_chi1v(smiles):
    """Return ``Descriptors.Chi1v`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi1v)


def calculate_chi2v(smiles):
    """Return ``Descriptors.Chi2v`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi2v)


def calculate_chi3v(smiles):
    """Return ``Descriptors.Chi3v`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi3v)


def calculate_chi4v(smiles):
    """Return ``Descriptors.Chi4v`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi4v)


def calculate_chi1n(smiles):
    """Return ``Descriptors.Chi1n`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi1n)


def calculate_chi2n(smiles):
    """Return ``Descriptors.Chi2n`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi2n)


def calculate_chi3n(smiles):
    """Return ``Descriptors.Chi3n`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi3n)


def calculate_chi4n(smiles):
    """Return ``Descriptors.Chi4n`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Chi4n)


def calculate_hall_kier_alpha(smiles):
    """Return ``Descriptors.HallKierAlpha`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.HallKierAlpha)


def calculate_kappa1(smiles):
    """Return ``Descriptors.Kappa1`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Kappa1)


def calculate_kappa2(smiles):
    """Return ``Descriptors.Kappa2`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Kappa2)


def calculate_kappa3(smiles):
    """Return ``Descriptors.Kappa3`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, Descriptors.Kappa3)


# --- SlogP_VSA bins (bin 8 is provided by get_SLogP_VSA8) --------------------

def calculate_slogp_vsa1(smiles):
    """Return ``MolSurf.SlogP_VSA1`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA1)


def calculate_slogp_vsa2(smiles):
    """Return ``MolSurf.SlogP_VSA2`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA2)


def calculate_slogp_vsa3(smiles):
    """Return ``MolSurf.SlogP_VSA3`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA3)


def calculate_slogp_vsa4(smiles):
    """Return ``MolSurf.SlogP_VSA4`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA4)


def calculate_slogp_vsa5(smiles):
    """Return ``MolSurf.SlogP_VSA5`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA5)


def calculate_slogp_vsa6(smiles):
    """Return ``MolSurf.SlogP_VSA6`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA6)


def calculate_slogp_vsa7(smiles):
    """Return ``MolSurf.SlogP_VSA7`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA7)


def calculate_slogp_vsa9(smiles):
    """Return ``MolSurf.SlogP_VSA9`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA9)


def calculate_slogp_vsa10(smiles):
    """Return ``MolSurf.SlogP_VSA10`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA10)


def calculate_slogp_vsa11(smiles):
    """Return ``MolSurf.SlogP_VSA11`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA11)


def calculate_slogp_vsa12(smiles):
    """Return ``MolSurf.SlogP_VSA12`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SlogP_VSA12)


# --- SMR_VSA bins (bin 3 is provided by get_SMR_VSA3) ------------------------

def calculate_smr_vsa1(smiles):
    """Return ``MolSurf.SMR_VSA1`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA1)


def calculate_smr_vsa2(smiles):
    """Return ``MolSurf.SMR_VSA2`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA2)


def calculate_smr_vsa4(smiles):
    """Return ``MolSurf.SMR_VSA4`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA4)


def calculate_smr_vsa5(smiles):
    """Return ``MolSurf.SMR_VSA5`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA5)


def calculate_smr_vsa6(smiles):
    """Return ``MolSurf.SMR_VSA6`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA6)


def calculate_smr_vsa7(smiles):
    """Return ``MolSurf.SMR_VSA7`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA7)


def calculate_smr_vsa8(smiles):
    """Return ``MolSurf.SMR_VSA8`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA8)


def calculate_smr_vsa9(smiles):
    """Return ``MolSurf.SMR_VSA9`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA9)


def calculate_smr_vsa10(smiles):
    """Return ``MolSurf.SMR_VSA10`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.SMR_VSA10)


# --- PEOE_VSA bins (bins 2, 3, 6, 8-11 are provided by the get_PEOE_* group) -

def calculate_peoe_vsa1(smiles):
    """Return ``MolSurf.PEOE_VSA1`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA1)


def calculate_peoe_vsa4(smiles):
    """Return ``MolSurf.PEOE_VSA4`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA4)


def calculate_peoe_vsa5(smiles):
    """Return ``MolSurf.PEOE_VSA5`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA5)


def calculate_peoe_vsa7(smiles):
    """Return ``MolSurf.PEOE_VSA7`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA7)


def calculate_peoe_vsa12(smiles):
    """Return ``MolSurf.PEOE_VSA12`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA12)


def calculate_peoe_vsa13(smiles):
    """Return ``MolSurf.PEOE_VSA13`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA13)


def calculate_peoe_vsa14(smiles):
    """Return ``MolSurf.PEOE_VSA14`` for ``smiles``, or None if unparsable."""
    return _descriptor_or_none(smiles, MolSurf.PEOE_VSA14)


# --- MQN components: calculate_mqnK returns element K-1 of MQNs_(mol) --------

def calculate_mqn1(smiles):
    """Return element 0 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 0)


def calculate_mqn2(smiles):
    """Return element 1 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 1)


def calculate_mqn3(smiles):
    """Return element 2 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 2)


def calculate_mqn4(smiles):
    """Return element 3 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 3)


def calculate_mqn5(smiles):
    """Return element 4 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 4)


def calculate_mqn6(smiles):
    """Return element 5 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 5)


def calculate_mqn7(smiles):
    """Return element 6 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 6)


def calculate_mqn8(smiles):
    """Return element 7 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 7)


def calculate_mqn9(smiles):
    """Return element 8 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 8)


def calculate_mqn10(smiles):
    """Return element 9 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 9)


def calculate_mqn11(smiles):
    """Return element 10 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 10)


def calculate_mqn12(smiles):
    """Return element 11 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 11)


def calculate_mqn13(smiles):
    """Return element 12 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 12)


def calculate_mqn14(smiles):
    """Return element 13 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 13)


def calculate_mqn15(smiles):
    """Return element 14 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 14)


def calculate_mqn16(smiles):
    """Return element 15 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 15)


def calculate_mqn17(smiles):
    """Return element 16 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 16)


def calculate_mqn18(smiles):
    """Return element 17 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 17)


def calculate_mqn19(smiles):
    """Return element 18 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 18)


def calculate_mqn20(smiles):
    """Return element 19 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 19)


def calculate_mqn21(smiles):
    """Return element 20 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 20)


def calculate_mqn22(smiles):
    """Return element 21 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 21)


def calculate_mqn23(smiles):
    """Return element 22 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 22)


def calculate_mqn24(smiles):
    """Return element 23 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 23)


def calculate_mqn25(smiles):
    """Return element 24 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 24)


def calculate_mqn26(smiles):
    """Return element 25 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 25)


def calculate_mqn27(smiles):
    """Return element 26 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 26)


def calculate_mqn28(smiles):
    """Return element 27 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 27)


def calculate_mqn29(smiles):
    """Return element 28 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 28)


def calculate_mqn30(smiles):
    """Return element 29 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 29)


def calculate_mqn31(smiles):
    """Return element 30 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 30)


def calculate_mqn32(smiles):
    """Return element 31 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 31)


def calculate_mqn33(smiles):
    """Return element 32 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 32)


def calculate_mqn34(smiles):
    """Return element 33 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 33)


def calculate_mqn35(smiles):
    """Return element 34 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 34)


def calculate_mqn36(smiles):
    """Return element 35 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 35)


def calculate_mqn37(smiles):
    """Return element 36 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 36)


def calculate_mqn38(smiles):
    """Return element 37 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 37)


def calculate_mqn39(smiles):
    """Return element 38 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 38)


def calculate_mqn40(smiles):
    """Return element 39 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 39)


def calculate_mqn41(smiles):
    """Return element 40 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 40)


def calculate_mqn42(smiles):
    """Return element 41 of ``rdMolDescriptors.MQNs_`` for ``smiles``, or None if unparsable."""
    return _mqn_component(smiles, 41)

In [None]:
def molecular_features(df, smiles):
    """Append one column per molecular descriptor to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    smiles : str
        Name of the column in ``df`` that holds the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        A new frame made of all original columns followed by the descriptor
        columns, in the order listed in ``descriptor_table`` below. A cell is
        missing wherever the descriptor function returned None for that row.
    """
    # (output column name, SMILES -> value function), in output order.
    # 'NumUnspecifiedStereocenters' is deliberately absent: it was disabled in
    # the original column list as well.
    descriptor_table = [
        ('SlogP', calculate_slogp),
        ('SMR', calculate_smr),
        ('peoe_VSA2', get_PEOE_VSA2),
        ('peoe_VSA3', get_PEOE_VSA3),
        ('peoe_VSA6', get_PEOE_VSA6),
        ('peoe_VSA8', get_PEOE_VSA8),
        ('peoe_VSA9', get_PEOE_VSA9),
        ('peoe_VSA10', get_PEOE_VSA10),
        ('peoe_VSA11', get_PEOE_VSA11),
        ('smr_VSA3', get_SMR_VSA3),
        ('slogp_VSA8', get_SLogP_VSA8),
        ('LabuteASA', calculate_labute_asa),
        ('TPSA', calculate_tpsa),
        ('AMW', calculate_amw),
        ('ExactMW', calculate_exact_mw),
        ('NumLipinskiHBA', calculate_num_lipinski_hba),
        ('NumLipinskiHBD', calculate_num_lipinski_hbd),
        ('NumRotatableBonds', calculate_num_rotatable_bonds),
        ('NumHBD', calculate_num_hbd),
        ('NumHBA', calculate_num_hba),
        ('NumAmideBonds', calculate_num_amide_bonds),
        ('NumHeteroAtoms', calculate_num_hetero_atoms),
        ('NumHeavyAtoms', calculate_num_heavy_atoms),
        ('NumAtoms', calculate_num_atoms),
        ('NumStereocenters', calculate_num_stereocenters),
        ('NumRings', calculate_num_rings),
        ('NumAromaticRings', calculate_num_aromatic_rings),
        ('NumSaturatedRings', calculate_num_saturated_rings),
        ('NumAliphaticRings', calculate_num_aliphatic_rings),
        ('NumAromaticHeterocycles', calculate_num_aromatic_heterocycles),
        ('NumSaturatedHeterocycles', calculate_num_saturated_heterocycles),
        ('NumAliphaticHeterocycles', calculate_num_aliphatic_heterocycles),
        ('NumAromaticCarbocycles', calculate_num_aromatic_carbocycles),
        ('NumSaturatedCarbocycles', calculate_num_saturated_carbocycles),
        ('NumAliphaticCarbocycles', calculate_num_aliphatic_carbocycles),
        ('FractionCSP3', calculate_fraction_csp3),
        ('Chi0v', calculate_chi0v),
        ('Chi1v', calculate_chi1v),
        ('Chi2v', calculate_chi2v),
        ('Chi3v', calculate_chi3v),
        ('Chi4v', calculate_chi4v),
        ('Chi1n', calculate_chi1n),
        ('Chi2n', calculate_chi2n),
        ('Chi3n', calculate_chi3n),
        ('Chi4n', calculate_chi4n),
        ('HallKierAlpha', calculate_hall_kier_alpha),
        ('kappa1', calculate_kappa1),
        ('kappa2', calculate_kappa2),
        ('kappa3', calculate_kappa3),
        ('slogp_VSA1', calculate_slogp_vsa1),
        ('slogp_VSA2', calculate_slogp_vsa2),
        ('slogp_VSA3', calculate_slogp_vsa3),
        ('slogp_VSA4', calculate_slogp_vsa4),
        ('slogp_VSA5', calculate_slogp_vsa5),
        ('slogp_VSA6', calculate_slogp_vsa6),
        ('slogp_VSA7', calculate_slogp_vsa7),
        ('slogp_VSA9', calculate_slogp_vsa9),
        ('slogp_VSA10', calculate_slogp_vsa10),
        ('slogp_VSA11', calculate_slogp_vsa11),
        ('slogp_VSA12', calculate_slogp_vsa12),
        ('smr_VSA1', calculate_smr_vsa1),
        ('smr_VSA2', calculate_smr_vsa2),
        ('smr_VSA4', calculate_smr_vsa4),
        ('smr_VSA5', calculate_smr_vsa5),
        ('smr_VSA6', calculate_smr_vsa6),
        ('smr_VSA7', calculate_smr_vsa7),
        ('smr_VSA8', calculate_smr_vsa8),
        ('smr_VSA9', calculate_smr_vsa9),
        ('smr_VSA10', calculate_smr_vsa10),
        ('peoe_VSA1', calculate_peoe_vsa1),
        ('peoe_VSA4', calculate_peoe_vsa4),
        ('peoe_VSA5', calculate_peoe_vsa5),
        ('peoe_VSA7', calculate_peoe_vsa7),
        ('peoe_VSA12', calculate_peoe_vsa12),
        ('peoe_VSA13', calculate_peoe_vsa13),
        ('peoe_VSA14', calculate_peoe_vsa14),
        ('MQN1', calculate_mqn1),
        ('MQN2', calculate_mqn2),
        ('MQN3', calculate_mqn3),
        ('MQN4', calculate_mqn4),
        ('MQN5', calculate_mqn5),
        ('MQN6', calculate_mqn6),
        ('MQN7', calculate_mqn7),
        ('MQN8', calculate_mqn8),
        ('MQN9', calculate_mqn9),
        ('MQN10', calculate_mqn10),
        ('MQN11', calculate_mqn11),
        ('MQN12', calculate_mqn12),
        ('MQN13', calculate_mqn13),
        ('MQN14', calculate_mqn14),
        ('MQN15', calculate_mqn15),
        ('MQN16', calculate_mqn16),
        ('MQN17', calculate_mqn17),
        ('MQN18', calculate_mqn18),
        ('MQN19', calculate_mqn19),
        ('MQN20', calculate_mqn20),
        ('MQN21', calculate_mqn21),
        ('MQN22', calculate_mqn22),
        ('MQN23', calculate_mqn23),
        ('MQN24', calculate_mqn24),
        ('MQN25', calculate_mqn25),
        ('MQN26', calculate_mqn26),
        ('MQN27', calculate_mqn27),
        ('MQN28', calculate_mqn28),
        ('MQN29', calculate_mqn29),
        ('MQN30', calculate_mqn30),
        ('MQN31', calculate_mqn31),
        ('MQN32', calculate_mqn32),
        ('MQN33', calculate_mqn33),
        ('MQN34', calculate_mqn34),
        ('MQN35', calculate_mqn35),
        ('MQN36', calculate_mqn36),
        ('MQN37', calculate_mqn37),
        ('MQN38', calculate_mqn38),
        ('MQN39', calculate_mqn39),
        ('MQN40', calculate_mqn40),
        ('MQN41', calculate_mqn41),
        ('MQN42', calculate_mqn42),
    ]

    smiles_column = df[smiles]
    names = [name for name, _ in descriptor_table]
    computed = [smiles_column.apply(func) for _, func in descriptor_table]

    # Assemble every descriptor column in one concat. Inserting 118 columns one
    # at a time fragments the frame's internal blocks and makes pandas emit a
    # PerformanceWarning once more than 100 have been added.
    features = pd.concat(computed, axis=1, keys=names)

    return pd.concat([df, features], axis=1)

In [None]:
# Descriptor functions applied by proto(), one output column per entry.
# The order is significant: proto() assigns column_names to the results by
# position, so entry i here must correspond to column_names[i].
# calculate_num_unspecified_stereocenters is not part of this list.
functions = [
        calculate_slogp, calculate_smr, get_PEOE_VSA2, get_PEOE_VSA3, get_PEOE_VSA6,
        get_PEOE_VSA8, get_PEOE_VSA9, get_PEOE_VSA10, get_PEOE_VSA11, get_SMR_VSA3,
        get_SLogP_VSA8, calculate_labute_asa, calculate_tpsa, calculate_amw,
        calculate_exact_mw, calculate_num_lipinski_hba, calculate_num_lipinski_hbd,
        calculate_num_rotatable_bonds, calculate_num_hbd, calculate_num_hba,
        calculate_num_amide_bonds, calculate_num_hetero_atoms, calculate_num_heavy_atoms,
        calculate_num_atoms, calculate_num_stereocenters, calculate_num_rings,
        calculate_num_aromatic_rings, calculate_num_saturated_rings,
        calculate_num_aliphatic_rings, calculate_num_aromatic_heterocycles,
        calculate_num_saturated_heterocycles, calculate_num_aliphatic_heterocycles,
        calculate_num_aromatic_carbocycles, calculate_num_saturated_carbocycles,
        calculate_num_aliphatic_carbocycles, calculate_fraction_csp3, calculate_chi0v,
        calculate_chi1v, calculate_chi2v, calculate_chi3v, calculate_chi4v,
        calculate_chi1n, calculate_chi2n, calculate_chi3n, calculate_chi4n,
        calculate_hall_kier_alpha, calculate_kappa1, calculate_kappa2,
        calculate_kappa3, calculate_slogp_vsa1, calculate_slogp_vsa2,
        calculate_slogp_vsa3, calculate_slogp_vsa4, calculate_slogp_vsa5,
        calculate_slogp_vsa6, calculate_slogp_vsa7, calculate_slogp_vsa9,
        calculate_slogp_vsa10, calculate_slogp_vsa11, calculate_slogp_vsa12,
        calculate_smr_vsa1, calculate_smr_vsa2, calculate_smr_vsa4,
        calculate_smr_vsa5, calculate_smr_vsa6, calculate_smr_vsa7,
        calculate_smr_vsa8, calculate_smr_vsa9, calculate_smr_vsa10,
        calculate_peoe_vsa1, calculate_peoe_vsa4, calculate_peoe_vsa5,
        calculate_peoe_vsa7, calculate_peoe_vsa12, calculate_peoe_vsa13,
        calculate_peoe_vsa14, calculate_mqn1, calculate_mqn2, calculate_mqn3,
        calculate_mqn4, calculate_mqn5, calculate_mqn6, calculate_mqn7,
        calculate_mqn8, calculate_mqn9, calculate_mqn10, calculate_mqn11,
        calculate_mqn12, calculate_mqn13, calculate_mqn14, calculate_mqn15,
        calculate_mqn16, calculate_mqn17, calculate_mqn18, calculate_mqn19,
        calculate_mqn20, calculate_mqn21, calculate_mqn22, calculate_mqn23,
        calculate_mqn24, calculate_mqn25, calculate_mqn26, calculate_mqn27,
        calculate_mqn28, calculate_mqn29, calculate_mqn30, calculate_mqn31,
        calculate_mqn32, calculate_mqn33, calculate_mqn34, calculate_mqn35,
        calculate_mqn36, calculate_mqn37, calculate_mqn38, calculate_mqn39,
        calculate_mqn40, calculate_mqn41, calculate_mqn42
    ]


In [None]:
# Sanity check: this count has to equal len(column_names), because proto()
# assigns column_names to the per-function results by position.
len(functions)

118

In [None]:
# Output column names used by proto(), paired by position with `functions`
# (column_names[i] labels the result of functions[i]); keep both lists in sync.
column_names = ['SlogP', 'SMR', 'peoe_VSA2', 'peoe_VSA3', 'peoe_VSA6', 'peoe_VSA8', 'peoe_VSA9', 'peoe_VSA10', 'peoe_VSA11', 'smr_VSA3', 'slogp_VSA8', 'LabuteASA', 'TPSA', 'AMW', 'ExactMW', 'NumLipinskiHBA', 'NumLipinskiHBD', 'NumRotatableBonds', 'NumHBD', 'NumHBA', 'NumAmideBonds', 'NumHeteroAtoms', 'NumHeavyAtoms', 'NumAtoms', 'NumStereocenters', 'NumRings', 'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings', 'NumAromaticHeterocycles', 'NumSaturatedHeterocycles', 'NumAliphaticHeterocycles', 'NumAromaticCarbocycles', 'NumSaturatedCarbocycles', 'NumAliphaticCarbocycles', 'FractionCSP3', 'Chi0v', 'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v', 'Chi1n', 'Chi2n', 'Chi3n', 'Chi4n', 'HallKierAlpha', 'kappa1', 'kappa2', 'kappa3', 'slogp_VSA1', 'slogp_VSA2', 'slogp_VSA3', 'slogp_VSA4', 'slogp_VSA5', 'slogp_VSA6', 'slogp_VSA7', 'slogp_VSA9', 'slogp_VSA10', 'slogp_VSA11', 'slogp_VSA12', 'smr_VSA1', 'smr_VSA2', 'smr_VSA4', 'smr_VSA5', 'smr_VSA6', 'smr_VSA7', 'smr_VSA8', 'smr_VSA9', 'smr_VSA10', 'peoe_VSA1', 'peoe_VSA4', 'peoe_VSA5', 'peoe_VSA7', 'peoe_VSA12', 'peoe_VSA13', 'peoe_VSA14', 'MQN1', 'MQN2', 'MQN3', 'MQN4', 'MQN5', 'MQN6', 'MQN7', 'MQN8', 'MQN9', 'MQN10', 'MQN11', 'MQN12', 'MQN13', 'MQN14', 'MQN15', 'MQN16', 'MQN17', 'MQN18', 'MQN19', 'MQN20', 'MQN21', 'MQN22', 'MQN23', 'MQN24', 'MQN25', 'MQN26', 'MQN27', 'MQN28', 'MQN29', 'MQN30', 'MQN31', 'MQN32', 'MQN33', 'MQN34', 'MQN35', 'MQN36', 'MQN37', 'MQN38', 'MQN39', 'MQN40', 'MQN41', 'MQN42']
# Displayed so it can be compared with len(functions).
len(column_names)

118

In [None]:
def proto(temp_df, smiles, labels):
    """Build a descriptor table (plus label column) from a frame of SMILES.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Source frame holding the SMILES and the label column(s).
    smiles : str
        Name of the SMILES column in ``temp_df``.
    labels : column label(s)
        Column(s) of ``temp_df`` copied onto the end of the result.

    Returns
    -------
    pandas.DataFrame
        One column per entry of the module-level ``functions`` list, named by
        the module-level ``column_names`` list (paired by position), followed
        by ``labels``. The original SMILES column is not included.
    """
    label_values = temp_df[labels]

    # One Series per descriptor function, in the order given by `functions`.
    per_descriptor = []
    for descriptor_func in functions:
        per_descriptor.append(temp_df[smiles].apply(descriptor_func))

    descriptor_frame = pd.concat(per_descriptor, axis=1)
    descriptor_frame.columns = column_names
    descriptor_frame[labels] = label_values
    return descriptor_frame

### **Feature selection methods:**

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn import tree
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
import numpy as np
from sklearn.feature_selection import VarianceThreshold

Step 1: variance filter

In [None]:
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. The filter functions below add the label column to a frame obtained
# by selecting columns from `df`, which is the pattern this warning reports.
# NOTE(review): this is a global setting and also hides the warning elsewhere.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
def varaince_filter(df, v_value, label_col):
    """Keep only the feature columns whose variance exceeds ``v_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column; not modified.
    v_value : float
        Threshold passed to ``sklearn.feature_selection.VarianceThreshold``.
    label_col : column label
        Label column; excluded from the variance test and always kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained feature columns followed by ``label_col``.

    Raises
    ------
    ValueError
        Propagated from ``VarianceThreshold`` when no feature passes the
        threshold.
    """
    X = df.drop(label_col, axis=1)

    # Only the fitted support mask is needed, so fit() is enough; the
    # transformed array that fit_transform() also builds was never used.
    selector = VarianceThreshold(threshold=v_value)
    selector.fit(X)
    selected_features = X.columns[selector.get_support()]

    # Explicit copy: the label column is added to an independent frame rather
    # than to a selection of `df` (chained assignment).
    df_filtered = df[selected_features].copy()
    df_filtered[label_col] = df[label_col]

    return df_filtered


# Correctly spelled alias; the original (misspelled) name stays available so
# existing callers keep working.
variance_filter = varaince_filter

Step 2: filter using permutation importance

In [None]:
def permutation_filter(df, label_col, n_repeats=5):
    """Keep features whose tree importance exceeds their permutation importance.

    A ``DecisionTreeClassifier(random_state=0)`` is fitted on all feature
    columns. A feature is kept when its ``feature_importances_`` value is
    strictly greater than its mean permutation importance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column; not modified.
    label_col : column label
        Target column; excluded from the features and always kept.
    n_repeats : int, default 5
        Number of shuffles per feature passed to ``permutation_importance``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained feature columns followed by ``label_col``.
    """
    X = df.drop(label_col, axis=1)
    y = df[label_col]

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y)
    import_score = clf.feature_importances_

    # NOTE(review): the permutation importance is evaluated on the same rows
    # the tree was fitted on; confirm that no held-out split is intended.
    result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=42)
    mean_results = result.importances_mean

    # Boolean mask over the feature columns, aligned by position with X.columns.
    selected_features = X.columns[import_score > mean_results]

    # Explicit copy: the label column is added to an independent frame rather
    # than to a selection of `df` (chained assignment).
    df_filtered = df[selected_features].copy()
    df_filtered[label_col] = y

    return df_filtered

Step 3: filter out highly correlated features

In [None]:
def correlation(df, threshold,label_col):
    """Drop feature columns that are highly correlated with an earlier column.

    For every pair of feature columns whose absolute correlation (as computed
    by ``DataFrame.corr``) is strictly greater than ``threshold``, the column
    that comes later in column order is removed; the earlier one is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column; not modified.
    threshold : float
        Absolute-correlation cut-off.
    label_col : column label
        Label column; excluded from the correlation matrix and always kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained feature columns followed by ``label_col``.
    """
    X = df.drop(label_col, axis =1)

    corr_matrix = X.corr()
    above = corr_matrix.abs().to_numpy() > threshold  # NaN compares as False

    # Entry (i, j) of the strictly-lower triangle pairs column i with an
    # earlier column j; a row containing any True marks column i as redundant.
    # This replaces a Python double loop of scalar .iloc look-ups.
    redundant_mask = np.tril(above, k=-1).any(axis=1)
    redundant_cols = list(corr_matrix.columns[redundant_mask])

    df_filtered = X.drop(columns=redundant_cols)
    df_filtered[label_col] = df[label_col]
    return df_filtered