In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from mordred import Calculator, descriptors

In [2]:
# Load the input table. The file name suggests a merge of DrugBank, ZINC,
# ChEMBL and KEGG records; the path is relative.
dataset = pd.read_csv(
    filepath_or_buffer='../Data/3203_merged_DrugBank_ZINC_chEMBL_KEGG.csv',
)

In [3]:
# Keep only the compounds that have both an ATC code and a neutralized
# SMILES string, then renumber the rows from 0 so that the index is a clean
# 0..n-1 range.
dataset = (
    dataset
    .dropna(subset=['ATC Codes', 'Neutralized SMILES'])
    .reset_index(drop=True)
)

In [4]:
# Selection of descriptors types to compute: 0-1-2D
def calculate_descriptors(smiles):
    """Compute every non-3D Mordred descriptor for a collection of SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    The data frame produced by ``Calculator.pandas`` for the parsed
    molecules (one row per molecule, one column per descriptor).

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings.
    """
    # Materialize the input so it can be walked twice (parsing + error report),
    # even when the caller passes a one-shot iterator.
    smiles = list(smiles)

    # Create descriptor calculator with all descriptors except 3D ones
    calc = Calculator(descriptors, ignore_3D=True)

    # Obtain an array of molecules from their SMILES strings
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # MolFromSmiles reports a parse failure by returning None instead of
    # raising. Fail early with the offending entries rather than handing None
    # to the calculator; silently skipping them would shift the descriptor rows
    # relative to the caller's table.
    invalid = [(pos, smi) for pos, (smi, mol) in enumerate(zip(smiles, mols)) if mol is None]
    if invalid:
        preview = ", ".join(f"{pos}: {smi!r}" for pos, smi in invalid[:5])
        raise ValueError(
            f"{len(invalid)} SMILES string(s) could not be parsed by RDKit "
            f"(position: value, first entries shown): {preview}"
        )

    # Calculate descriptors for each molecule
    return calc.pandas(mols)

In [5]:
# Compute the descriptor table, one row per neutralized SMILES string.
smiles_descriptors = calculate_descriptors(smiles=dataset['Neutralized SMILES'])

100%|██████████████████████████████████████████████████████████████████████████████| 3203/3203 [01:27<00:00, 36.54it/s]


In [6]:
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(786), int64(309), object(516)
memory usage: 39.4+ MB
None


In [7]:
# Force every descriptor column to a numeric dtype. Values that cannot be
# parsed as numbers become NaN (presumably Mordred's error/missing markers
# held in the object-dtype columns - TODO confirm).
smiles_descriptors = smiles_descriptors.apply(pd.to_numeric, errors='coerce')

# info() prints its report itself and returns None; do not wrap it in print().
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(1302), int64(309)
memory usage: 39.4 MB
None


In [8]:
# Encode the boolean descriptors as 0/1 integers so that every column is numeric.
boolean_cols = smiles_descriptors.select_dtypes(include=['bool']).columns
smiles_descriptors[boolean_cols] = smiles_descriptors[boolean_cols].astype(int)

# No second pd.to_numeric pass is needed here: all columns are already numeric
# after the previous coercion step, and the cast above only changes bool -> int.
# info() prints its report itself and returns None; do not wrap it in print().
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1613 entries, ABC to mZagreb2
dtypes: float64(1302), int32(2), int64(309)
memory usage: 39.4 MB
None


In [9]:
# Delete descriptors whose values are all NaN.
# dropna() returns a new frame unless inplace=True is passed, so the result
# must be assigned back; otherwise the columns are never actually removed.
smiles_descriptors = smiles_descriptors.dropna(axis=1, how='all')
smiles_descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,116.622994,86.472639,6,4,190.059247,2.427362,4.802289,190.059247,1.226189,5.922447,...,11.639214,214.910121,2178.985813,7.436812,286334,230,752.0,846.0,62.416667,34.777778
1,69.692912,55.004022,0,3,113.155286,2.422176,4.782750,113.155286,1.243465,5.408858,...,11.198119,149.369536,1268.641439,7.249380,52357,134,458.0,522.0,34.284722,20.069444
2,99.960817,76.897005,0,0,162.225249,2.429697,4.800177,162.225249,1.238361,5.771282,...,11.619598,191.673779,1810.033419,6.804637,148212,208,660.0,762.0,50.972222,29.111111
3,55.942888,41.040734,0,3,91.364160,2.414758,4.775270,91.364160,1.234651,5.189253,...,10.885098,126.972448,1068.426955,7.742224,28498,110,360.0,404.0,28.055556,16.583333
4,77.541280,58.264383,0,3,126.895388,2.411531,4.787916,126.895388,1.244072,5.515081,...,11.281813,156.769635,1429.669818,7.369432,72418,154,504.0,571.0,38.166667,22.777778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3198,20.387860,16.599062,1,0,32.777540,2.533516,5.006933,32.777540,1.213983,4.199341,...,10.251571,76.398181,390.221781,6.613928,2140,42,136.0,158.0,10.930556,6.020833
3199,32.979560,25.880956,0,0,53.101049,2.734492,5.408105,53.101049,1.295148,4.689653,...,11.219936,99.590604,570.299282,6.789277,5128,91,240.0,307.0,14.722222,8.583333
3200,32.239600,25.363418,2,0,52.383156,2.515304,4.914363,52.383156,1.277638,4.647839,...,10.658459,95.480452,558.222683,7.862291,6025,65,220.0,261.0,13.506944,8.888889
3201,20.392476,18.274345,2,0,31.761241,2.495140,4.977702,31.761241,1.221586,4.188228,...,10.203111,76.220112,386.070396,9.899241,1475,36,138.0,159.0,9.180556,5.527778


In [10]:
# Delete descriptors with >1% of NaN
nan_fraction = smiles_descriptors.isna().mean()
smiles_descriptors.drop(columns=nan_fraction[nan_fraction > 0.01].index, inplace=True)

In [11]:
# Delete constant descriptors
# A descriptor is treated as constant when it has at most one distinct
# non-missing value. Counting distinct values, rather than comparing every row
# with row 0, also catches columns that are constant apart from a few NaNs
# (including a NaN in the first row) and does not fail on an empty frame.
list_constantes = smiles_descriptors.columns[smiles_descriptors.nunique(dropna=True) <= 1].tolist()
smiles_descriptors.drop(list_constantes, inplace=True, axis=1)

In [12]:
# Spot-check the descriptor values of the row at position 228.
smiles_descriptors.iloc[228, :]

ABC          8.025464
ABCGG        7.680380
nAcid        0.000000
nBase        1.000000
SpAbs_A     13.619695
              ...    
WPol        13.000000
Zagreb1     50.000000
Zagreb2     55.000000
mZagreb1     4.583333
mZagreb2     2.555556
Name: 228, Length: 1181, dtype: float64

In [13]:
# Find redundant descriptors: with a threshold of 1, only pairs whose absolute
# correlation reaches 1 (perfectly correlated) are selected later on.
threshold = 1
corr_matrix = smiles_descriptors.corr().abs()

# Keep only the strict upper triangle so that each pair appears once and the
# diagonal (self-correlation) is ignored; everything else becomes NaN.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_matrix = corr_matrix.where(upper_triangle_mask)

In [14]:
# Collect every column that reaches the threshold against at least one
# earlier column (column order is preserved).
to_drop = upper_matrix.columns[(upper_matrix >= threshold).any()].tolist()

In [15]:
# List each "column - partner" pair that reaches the threshold, column by
# column and, within a column, in row order.
for column, correlations in upper_matrix.items():
    reaches_threshold = (correlations >= threshold).to_numpy()
    for partner in correlations.index[reaches_threshold]:
        print(f"{column} - {partner}")

SpAD_A - SpAbs_A
nBondsA - nAromBond
SpAD_Dt - SpAbs_Dt
SpAD_D - SpAbs_D
NsF - nF
NdsssP - nP
NsBr - nBr
NsI - nI
SssssB - NssssB
SddC - NddC
ETA_eta_RL - Xp-1d
apol - Sp
n9HRing - n9Ring
n10HRing - n10Ring
n12HRing - n12Ring
nG12HRing - nG12Ring
nG12aHRing - nG12aRing
n3ARing - n3Ring
n4ARing - n4Ring
n8ARing - n8Ring
n9ARing - n9Ring
n9ARing - n9HRing
n10ARing - n10Ring
n10ARing - n10HRing
n12ARing - n12Ring
n12ARing - n12HRing
n3AHRing - n3HRing
n4AHRing - n4HRing
n7AHRing - n7HRing
n8AHRing - n8HRing
n9AHRing - n9Ring
n9AHRing - n9HRing
n9AHRing - n9ARing
n10AHRing - n10Ring
n10AHRing - n10HRing
n10AHRing - n10ARing
n12AHRing - n12Ring
n12AHRing - n12HRing
n12AHRing - n12ARing
nG12AHRing - nG12ARing
n6FHRing - n6FRing
n12FHRing - n12FRing
n9FaHRing - n9FaRing
n12FaHRing - n12FaRing
n6FARing - n6FRing
n6FARing - n6FHRing
n7FARing - n7FRing
n8FARing - n8FRing
n11FARing - n11FRing
n6FAHRing - n6FRing
n6FAHRing - n6FHRing
n6FAHRing - n6FARing
n7FAHRing - n7FHRing
n8FAHRing - n8FHRing
n

In [16]:
# Show the descriptor names collected for removal.
to_drop

['SpAD_A',
 'nBondsA',
 'SpAD_Dt',
 'SpAD_D',
 'NsF',
 'NdsssP',
 'NsBr',
 'NsI',
 'SssssB',
 'SddC',
 'ETA_eta_RL',
 'apol',
 'n9HRing',
 'n10HRing',
 'n12HRing',
 'nG12HRing',
 'nG12aHRing',
 'n3ARing',
 'n4ARing',
 'n8ARing',
 'n9ARing',
 'n10ARing',
 'n12ARing',
 'n3AHRing',
 'n4AHRing',
 'n7AHRing',
 'n8AHRing',
 'n9AHRing',
 'n10AHRing',
 'n12AHRing',
 'nG12AHRing',
 'n6FHRing',
 'n12FHRing',
 'n9FaHRing',
 'n12FaHRing',
 'n6FARing',
 'n7FARing',
 'n8FARing',
 'n11FARing',
 'n6FAHRing',
 'n7FAHRing',
 'n8FAHRing',
 'n11FAHRing',
 'n12FAHRing',
 'MWC01']

In [17]:
# Remove the descriptors collected in `to_drop`, then summarize what is left.
smiles_descriptors.drop(columns=to_drop, inplace=True)
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1136 entries, ABC to mZagreb2
dtypes: float64(969), int32(2), int64(165)
memory usage: 27.7 MB


In [18]:
# Delete descriptors with >1% of NaN (second pass, after the correlation filter)
sparse_mask = smiles_descriptors.isna().mean().gt(0.01)
smiles_descriptors.drop(columns=smiles_descriptors.columns[sparse_mask], inplace=True)

In [19]:
# Delete constant descriptors (second pass, after the correlation filter)
# A descriptor is treated as constant when it has at most one distinct
# non-missing value. Counting distinct values, rather than comparing every row
# with row 0, also catches columns that are constant apart from a few NaNs
# (including a NaN in the first row) and does not fail on an empty frame.
list_constantes = smiles_descriptors.columns[smiles_descriptors.nunique(dropna=True) <= 1].tolist()
smiles_descriptors.drop(list_constantes, inplace=True, axis=1)

In [20]:
# Summarize the frame (row/column counts and dtypes) after the preceding
# NaN and constant-column filters.
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1136 entries, ABC to mZagreb2
dtypes: float64(969), int32(2), int64(165)
memory usage: 27.7 MB


In [21]:
# Final guard before export: make sure every descriptor column is numeric,
# turning anything unparsable into NaN.
smiles_descriptors = smiles_descriptors.apply(pd.to_numeric, errors='coerce')

# info() prints its report itself and returns None; wrapping it in print()
# only appends a stray "None" line.
smiles_descriptors.info()

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 3203 entries, 0 to 3202
Columns: 1136 entries, ABC to mZagreb2
dtypes: float64(969), int32(2), int64(165)
memory usage: 27.7 MB
None


In [22]:
# Put the SMILES string and the ATC code in front of the descriptor columns.
# The pieces are joined side by side on their row index.
identifier_columns = [dataset['Neutralized SMILES'], dataset['ATC Codes']]
dataset_descriptors = pd.concat([*identifier_columns, smiles_descriptors], axis='columns')
dataset_descriptors

Unnamed: 0,Neutralized SMILES,ATC Codes,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,B01AE,116.622994,86.472639,6,4,190.059247,2.427362,4.802289,1.226189,...,11.639214,214.910121,2178.985813,7.436812,286334,230,752.0,846.0,62.416667,34.777778
1,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,L02AE,69.692912,55.004022,0,3,113.155286,2.422176,4.782750,1.243465,...,11.198119,149.369536,1268.641439,7.249380,52357,134,458.0,522.0,34.284722,20.069444
2,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,R02AB,99.960817,76.897005,0,0,162.225249,2.429697,4.800177,1.238361,...,11.619598,191.673779,1810.033419,6.804637,148212,208,660.0,762.0,50.972222,29.111111
3,N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]...,H01BA,55.942888,41.040734,0,3,91.364160,2.414758,4.775270,1.234651,...,10.885098,126.972448,1068.426955,7.742224,28498,110,360.0,404.0,28.055556,16.583333
4,CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1cc...,H01CC,77.541280,58.264383,0,3,126.895388,2.411531,4.787916,1.244072,...,11.281813,156.769635,1429.669818,7.369432,72418,154,504.0,571.0,38.166667,22.777778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3198,CCCCC(F)(F)[C@@]1(O)CC[C@@H]2[C@H](CCCCCCC(=O)...,A06AX,20.387860,16.599062,1,0,32.777540,2.533516,5.006933,1.213983,...,10.251571,76.398181,390.221781,6.613928,2140,42,136.0,158.0,10.930556,6.020833
3199,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,R01AD,32.979560,25.880956,0,0,53.101049,2.734492,5.408105,1.295148,...,11.219936,99.590604,570.299282,6.789277,5128,91,240.0,307.0,14.722222,8.583333
3200,CCCc1nc(C(C)(C)O)c(C(=O)OCc2oc(=O)oc2C)n1Cc1cc...,C09CA,32.239600,25.363418,2,0,52.383156,2.515304,4.914363,1.277638,...,10.658459,95.480452,558.222683,7.862291,6025,65,220.0,261.0,13.506944,8.888889
3201,O=P(O)(O)OC(Cn1cncn1)(Cn1cncn1)c1ccc(F)cc1F,J02AC,20.392476,18.274345,2,0,31.761241,2.495140,4.977702,1.221586,...,10.203111,76.220112,386.070396,9.899241,1475,36,138.0,159.0,9.180556,5.527778


In [23]:
# Write the combined table to a CSV file in the working directory, without
# the row index.
dataset_descriptors.to_csv(path_or_buf='3203_descriptors.csv', index=False)