In [4]:
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors

In [8]:
import mordred
import rdkit

# Print the installed RDKit and Mordred versions alongside the results.
print(f"RDKit version: {rdkit.__version__}")
print(f"Mordred version: {mordred.__version__}")

RDKit version: 2022.09.5
Mordred version: 1.2.0


In [10]:
# Read the tab-separated B3DB classification table into a DataFrame.
df = pd.read_csv(
    'B3DB_classification.tsv',
    delimiter='\t',
)

In [12]:
# Peek at the first two rows to check the columns that were loaded.
df.head(n=2)

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,


In [14]:
# Convert SMILES to RDKit Molecule objects
def _smiles_to_mol(smiles):
    """Parse a single SMILES value into an RDKit Mol.

    Returns None when the value is not a string (e.g. NaN from an empty
    field in the input table) or when RDKit cannot parse the SMILES.
    """
    # Chem.MolFromSmiles raises on non-string input instead of returning
    # None, so guard here; such rows then show up as null in the 'mol' column.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)


df['mol'] = df['SMILES'].apply(_smiles_to_mol)

In [16]:
# Keep only rows whose SMILES produced a molecule (MolFromSmiles returns None
# for unparsable input), then renumber the rows from 0.
df = df.dropna(subset=['mol']).reset_index(drop=True)

In [18]:
# Display the molecule column to confirm the remaining rows hold Mol objects.
df.loc[:, 'mol']

0       <rdkit.Chem.rdchem.Mol object at 0x000001BA5C9...
1       <rdkit.Chem.rdchem.Mol object at 0x000001BA5C9...
2       <rdkit.Chem.rdchem.Mol object at 0x000001BA5C9...
3       <rdkit.Chem.rdchem.Mol object at 0x000001BA5C9...
4       <rdkit.Chem.rdchem.Mol object at 0x000001BA5C9...
                              ...                        
7802    <rdkit.Chem.rdchem.Mol object at 0x000001BA5CA...
7803    <rdkit.Chem.rdchem.Mol object at 0x000001BA5CA...
7804    <rdkit.Chem.rdchem.Mol object at 0x000001BA5CA...
7805    <rdkit.Chem.rdchem.Mol object at 0x000001BA5CA...
7806    <rdkit.Chem.rdchem.Mol object at 0x000001BA5CA...
Name: mol, Length: 7807, dtype: object

In [20]:
# Build a Mordred calculator over the full `descriptors` module,
# with 3D descriptors switched off (ignore_3D=True).
calc = Calculator(
    descs=descriptors,
    ignore_3D=True,
)

In [22]:
# Compute descriptors for every molecule; the result is a DataFrame.
# The recorded run took about 15 minutes for 7807 molecules, so avoid
# re-running this cell unnecessarily.
# NOTE(review): descriptors that fail for a molecule presumably come back as
# Mordred error/missing objects rather than NaN — confirm before modelling.
desc_df = calc.pandas(mols=df['mol'])

  1%|▍                                                                             | 48/7807 [00:37<1:14:55,  1.73it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  2%|█▉                                                                             | 189/7807 [00:53<34:02,  3.73it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|██▎                                                                            | 234/7807 [00:55<10:09, 12.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|██████████▊                                                                   | 1084/7807 [02:05<18:59,  5.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|████████████▍                                                                 | 1251/7807 [02:25<20:31,  5.32it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|████████████▌                                                                 | 1254/7807 [02:26<19:22,  5.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 27%|█████████████████████▏                                                        | 2117/7807 [03:59<17:42,  5.36it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 28%|█████████████████████▋                                                        | 2174/7807 [04:06<13:51,  6.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 58%|████████████████████████████████████████████▉                                 | 4499/7807 [08:25<13:28,  4.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 60%|██████████████████████████████████████████████▊                               | 4683/7807 [08:44<07:14,  7.19it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████████████████████████████████████████████████████████████████████████| 7807/7807 [15:16<00:00,  8.52it/s]


In [24]:
# Swap the RDKit Mol column for the descriptor columns, placed side by side
# with the original fields. Rows are paired by index label.
# NOTE(review): assumes desc_df carries the same 0..n-1 index as df — confirm.
df = pd.concat(
    [df.drop(columns=['mol']), desc_df],
    axis='columns',
)

In [26]:
# Check the first three rows of the combined table.
df.head(n=3)

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,...,10.171643,63.201012,398.068491,9.477821,2428,42,144.0,165.0,9.590278,6.097222
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,...,10.89301,87.033695,520.101247,9.287522,4114,62,194.0,237.0,13.756944,7.916667
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,BBB-,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,...,9.618402,44.825548,304.910439,16.939469,218,21,68.0,81.0,4.805556,2.861111


In [28]:
# Write the combined table (original fields + descriptors) to CSV without the
# row index.
# NOTE(review): the file name ends in a doubled ".csv.csv" extension — this
# looks like a typo, but it is kept as-is in case other code reads this path.
df.to_csv(path_or_buf="mordred_desc_B3DB.csv.csv", index=False)