# **Computational Drug Discovery [Part 3] Descriptor Calculation and Dataset Preparation**

Nickolas Winters

**Part 3:** The molecular descriptors that are essentially quantitative description of the compounds in the dataset were calculated.

## **Download PaDEL-Descriptor**

## **Load bioactivity data**

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('amyloid_04_bioactivity_data_3class_pIC50.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL311039,CC12CCC(C1)C(C)(C)C2NS(=O)(=O)c1ccc(F)cc1,intermediate,311.422,3.31880,1.0,2.0,5.301030
1,1,CHEMBL450926,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1cccs1,intermediate,299.461,3.24120,1.0,3.0,5.568636
2,2,CHEMBL310242,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,intermediate,311.422,3.31880,1.0,2.0,5.744727
3,3,CHEMBL74874,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,inactive,327.877,3.83310,1.0,2.0,4.958607
4,4,CHEMBL75183,CC12CC[C@@H](C1)C(C)(C)[C@@H]2NS(=O)(=O)c1ccc(...,inactive,372.328,3.94220,1.0,2.0,5.000000
...,...,...,...,...,...,...,...,...,...
1183,1183,CHEMBL5274298,COC(=O)c1cc(O)cc(OC)c1C(=O)c1c(O)cc(CO[C@H]2O[...,intermediate,510.448,-0.85390,7.0,13.0,5.987163
1184,1184,CHEMBL5283067,COC(=O)c1c(Sc2c(O)cc(OC)c(C(=O)c3c(O)cc(C)cc3O...,active,680.640,4.43222,6.0,15.0,6.004365
1185,1185,CHEMBL5273520,COC(=O)c1c(O)cc(C)c(Sc2c(O)cc(OC)c(Oc3c(O)cc(C...,active,624.620,5.52076,7.0,13.0,7.000000
1186,1186,CHEMBL5282081,COC(=O)c1c(O)cc(C)cc1C(=O)c1cc(O)cc2oc3cc4c(c(...,intermediate,552.491,4.81824,4.0,10.0,5.806875


In [4]:
selection = ['canonical_smiles','molecule_chembl_id']
df_selection = df[selection]
df_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

## **Calculate fingerprint descriptors**


### **Calculate PaDEL descriptors**

In [14]:
from padelpy import padeldescriptor

In [23]:
padeldescriptor(
    mol_dir = "molecule.smi",
    d_file = "descriptors_output.csv",
    descriptortypes = "PubchemFingerprinter.xml",
    fingerprints = True,
    detectaromaticity = True, 
    standardizenitro = True,
    standardizetautomers = True,
    threads = 2,
    removesalt = True,
    log = True
)

## **Preparing the X and Y Data Matrices**

### **X data matrix**

In [24]:
df_X = pd.read_csv('descriptors_output.csv')

In [25]:
df_X

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL311039,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL450926,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL310242,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL74874,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL75183,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,CHEMBL5274298,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1184,CHEMBL5283067,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1185,CHEMBL5273520,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1186,CHEMBL5282081,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df_X = df_X.drop(columns=['Name'])
df_X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1184,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1185,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1186,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## **Y variable**

### **Convert IC50 to pIC50**

In [27]:
df_Y = df['pIC50']
df_Y

0       5.301030
1       5.568636
2       5.744727
3       4.958607
4       5.000000
          ...   
1183    5.987163
1184    6.004365
1185    7.000000
1186    5.806875
1187    6.744727
Name: pIC50, Length: 1188, dtype: float64

## **Combining X and Y variable**

In [28]:
dataset = pd.concat([df_X,df_Y], axis=1)
dataset

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.301030
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.568636
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.744727
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.958607
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.987163
1184,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.004365
1185,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.000000
1186,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.806875


In [30]:
# dataset.to_csv('amyloid_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)