In [1]:
from padelpy import padeldescriptor
import pandas as pd
import os
import numpy as np 

In [2]:
# Compute molecular descriptors for the molecules in the dataset.
# Descriptor calculation is delegated to the PaDEL-Descriptor software.

# Load the dataset and preview its first rows.
DATASET_CSV = 'SweetpredDB.csv'
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,Name,Test_ID,Smiles,Sweetness,logSw
0,"1',4',6'-3Cl-sucrose",test_1,ClC1C(O)[C@@](OC1CCl)(OC1OC(CO)C(O)C(O)C1O)CCl,10000.0,4.0
1,"1',4'-2Cl-sucrose",test_2,ClC1C(O)[C@@](OC1CO)(OC1OC(CO)C(O)C(O)C1O)CCl,3000.0,3.48
2,"1',6'-2Cl-sucrose",test_3,ClCC1O[C@](OC2OC(CO)C(O)C(O)C2O)(CCl)C(O)C1O,7800.0,3.89
3,1'-Cl-sucrose,test_4,ClC[C@]1(OC(CO)C(O)C1O)OC1OC(CO)C(O)C(O)C1O,2000.0,3.3
4,"4'-Br-4,1',6'-3Cl-sucrose",test_5,BrC1C(O)[C@@](OC1CCl)(OC1OC(CO)C(Cl)C(O)C1O)CCl,325000.0,5.51


In [3]:
import glob

# Collect every XML file in the working directory, sorted by file name.
xml_files = sorted(glob.glob('*.xml'))
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [4]:
# Short labels for the fingerprint types. The order matters: these labels are
# paired by position with the name-sorted XML files.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

In [5]:
# Build the fingerprint-label -> XML-file mapping by pairing the two lists
# position by position.
# zip() silently stops at the shorter input, so a missing or extra XML file
# would otherwise yield an incomplete mapping without any error. Fail loudly.
if len(FP_list) != len(xml_files):
    raise ValueError(
        'Expected {} fingerprint XML files but found {}: {}'.format(
            len(FP_list), len(xml_files), xml_files
        )
    )
mapping = dict(zip(FP_list, xml_files))
mapping


{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [6]:
# Make a separate table of molecule IDs with their SMILES strings.
df_smiles = df[['Test_ID', 'Smiles']]

# Export the molecules as a tab-separated SMILES file without header or index.
# A .smi line is "<SMILES> <name>": the structure comes first and the
# identifier second, so the columns are swapped for the export only
# (df_smiles itself keeps the ID-first layout shown below).
df_smiles[['Smiles', 'Test_ID']].to_csv('molecule.smi', sep='\t', index=False, header=False)
df_smiles.head()

Unnamed: 0,Test_ID,Smiles
0,test_1,ClC1C(O)[C@@](OC1CCl)(OC1OC(CO)C(O)C(O)C1O)CCl
1,test_2,ClC1C(O)[C@@](OC1CO)(OC1OC(CO)C(O)C(O)C1O)CCl
2,test_3,ClCC1O[C@](OC2OC(CO)C(O)C(O)C2O)(CCl)C(O)C1O
3,test_4,ClC[C@]1(OC(CO)C(O)C1O)OC1OC(CO)C(O)C(O)C1O
4,test_5,BrC1C(O)[C@@](OC1CCl)(OC1OC(CO)C(Cl)C(O)C1O)CCl


In [26]:
from padelpy import from_smiles

# Sanity check: compute descriptors for a single example molecule.
# NOTE: this is the row labelled 500, not the first row of df_smiles.
EXAMPLE_ROW = 500

# Single .loc lookup (row label, column) instead of chained indexing.
smile = df_smiles.loc[EXAMPLE_ROW, 'Smiles']
descriptor = from_smiles(smile, threads=2, descriptors=True)

In [27]:
# Report how many entries the computed `descriptor` result holds.
n_descriptors = len(descriptor)
print(n_descriptors)

1875
