### Goal:
Compute Mordred molecular descriptors for a set of compounds obtained from the Lazar web service (stored in `compounds.csv`) and save the combined table to `compounds_descriptors.csv`.

In [14]:
import importlib
import subprocess
import sys


def pip(args):
    """Run ``pip`` with the given argument list and return its exit status.

    pip is invoked as a subprocess of the interpreter running this kernel
    (``sys.executable -m pip``), so packages are installed into the same
    environment the notebook imports from. This replaces the private
    ``pip._internal.main`` entry point, which pip does not support for
    in-process use.

    Parameters
    ----------
    args : list of str
        Command-line arguments for pip, e.g. ``['install', 'pandas']``.

    Returns
    -------
    int
        Exit status of the pip process (0 on success).
    """
    status = subprocess.call([sys.executable, '-m', 'pip', *args])
    # Make packages installed during this session visible to the import system.
    importlib.invalidate_caches()
    return status


try:
    import requests
except ImportError:
    # The PyPI distribution is named 'requests'; 'request' is an unrelated package.
    pip(['install', 'requests'])
    import requests

try:
    import pandas as pd
except ImportError:
    pip(['install', 'pandas'])
    import pandas as pd

try:
    from rdkit import Chem
except ImportError:
    # The notebook only prints instructions for rdkit instead of installing it;
    # the cells below that use `Chem` will fail until it is installed.
    print("Run the following from command line:\n\tconda install -c conda-forge rdkit")

try:
    from mordred import Calculator, descriptors
except ImportError:
    pip(['install', 'mordred'])
    from mordred import Calculator, descriptors

#### Load the set of compounds

In [26]:
# Read the compound table; the preview below shows a 'SMILES' column and a
# 'Blood-Brain-Barrier Penetration' label column.
df = pd.read_csv('compounds.csv')
# Display the first rows as a quick sanity check of the parsed file.
df.head()

Unnamed: 0,SMILES,Blood-Brain-Barrier Penetration
0,OC[C@](c1onc(n1)c1ncn2-c3cccc(c3C(=O)N(Cc12)C)...,non-penetrating
1,NCCc1nc2n(c1)cccc2,non-penetrating
2,NCCc1nc2n(c1)cccc2,non-penetrating
3,CCCN(CCC)CCc1ccc(c2c1CC(=C)N2)O,penetrating
4,Fc1ccc2c(c1)onc2C1CCN(CC1)CCc1c(C)nc2n(c1=O)CC...,penetrating


#### Convert SMILES to RDKit Mol representation

In [27]:
# Parse every SMILES string into an RDKit Mol object. Strings RDKit cannot
# parse come back as None, which pandas treats as a missing value.
mols = [Chem.MolFromSmiles(smile) for smile in df['SMILES']]

df['Mol'] = mols

# Remove compounds with non-processible SMILES. Note that dropna() discards a
# row when ANY of its columns is missing, not only 'Mol'.
# The index is reset afterwards so the remaining rows are numbered 0..n-1:
# the descriptor table is later joined with pd.concat(axis=1), which aligns
# rows by index label, and gaps left by dropped rows would misalign the join.
df = df.dropna().reset_index(drop=True)

#### Calculation of Mordred descriptors

In [37]:
# Build a Mordred calculator from the descriptor collection imported from mordred.
calc = Calculator(descriptors)

# Compute the descriptors for every parsed molecule. The result is a DataFrame
# with one column per descriptor (see the preview below).
# NOTE(review): presumably the returned frame has a fresh 0..n-1 index rather
# than the index of df['Mol'] - confirm before joining it back to df.
dfMord = calc.pandas(df['Mol'])

# Preview the first rows of the descriptor table.
dfMord.head()

 53%|█████▎    | 207/388 [00:30<01:05,  2.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 60%|██████    | 234/388 [00:31<00:28,  5.35it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 71%|███████▏  | 277/388 [00:40<00:43,  2.57it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|███████▊  | 305/388 [00:45<00:48,  1.71it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 388/388 [00:59<00:00,  6.50it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,21.47408,17.978542,0,0,34.5534,2.54198,4.93359,34.5534,1.27976,4.25118,...,10.428837,78.871649,389.089082,9.048583,1727,52,152.0,187.0,9.67361,5.763889
1,9.151948,8.206878,0,1,15.659,2.37835,4.57188,15.659,1.30491,3.42249,...,9.190852,56.587917,161.095297,7.004143,197,14,60.0,69.0,3.33333,2.777778
2,9.151948,8.206878,0,1,15.659,2.37835,4.57188,15.659,1.30491,3.42249,...,9.190852,56.587917,161.095297,7.004143,197,14,60.0,69.0,3.33333,2.777778
3,14.946702,13.14067,0,1,25.0359,2.45245,4.79766,25.0359,1.2518,3.90305,...,9.742908,67.137495,274.204513,5.960968,862,28,98.0,113.0,7.16667,4.666667
4,24.862776,17.808737,0,1,40.9336,2.46674,4.9288,40.9336,1.32044,4.38836,...,10.513824,81.350168,426.206719,7.348392,3047,54,172.0,208.0,9.08333,6.638889


#### Curate Mordred descriptors:
- convert from int64, float64 to int32, float32
- drop columns that are of object type, because they contain text

In [39]:
# Downcast 64-bit numeric columns to their 32-bit counterparts.
# Each (wide, narrow) pair is handled in its own pass over the columns;
# columns of any other dtype (e.g. object) are left untouched.
for wide, narrow in (('float64', 'float32'), ('int64', 'int32')):
    matching = [name for name in dfMord.columns if dfMord[name].dtype == wide]
    for name in matching:
        dfMord[name] = dfMord[name].astype(narrow)

# Show the resulting dtype of every descriptor column.
dfMord.dtypes

ABC            float32
ABCGG          float32
nAcid            int32
nBase            int32
SpAbs_A         object
SpMax_A         object
SpDiam_A        object
SpAD_A          object
SpMAD_A         object
LogEE_A         object
VE1_A           object
VE2_A           object
VE3_A           object
VR1_A           object
VR2_A           object
VR3_A           object
nAromAtom        int32
nAromBond        int32
nAtom            int32
nHeavyAtom       int32
nSpiro           int32
nBridgehead      int32
nHetero          int32
nH               int32
nB               int32
nC               int32
nN               int32
nO               int32
nS               int32
nP               int32
                ...   
VAdjMat        float32
MWC01          float32
MWC02          float32
MWC03          float32
MWC04          float32
MWC05          float32
MWC06          float32
MWC07          float32
MWC08          float32
MWC09          float32
MWC10          float32
TMWC10         float32
SRW02      

In [43]:
# Remove every descriptor column whose dtype is 'object', reporting the frame
# shape before and after so the number of discarded columns is visible.

print('Dataframe shape before dropping:', dfMord.shape)

# names of the columns to be dropped
toDrop = [name for name in dfMord.columns if dfMord[name].dtype == 'object']

dfMord = dfMord.drop(columns=toDrop)

print('Dataframe shape after dropping:', dfMord.shape)

Dataframe shape before dropping: (388, 888)
Dataframe shape after dropping: (388, 888)


#### Join the compounds with descriptors

In [58]:
# Join the compound table with its descriptors row by row.
# pd.concat(axis=1) aligns rows on index labels. df may have gaps in its index
# (rows removed by dropna) while dfMord was computed from the surviving rows,
# so both indexes are reset to 0..n-1 to make the join strictly positional.
if len(df) != len(dfMord):
    raise ValueError(
        'Row count mismatch between compounds (%d) and descriptors (%d); '
        're-run the notebook from the top.' % (len(df), len(dfMord))
    )

df = pd.concat(
    [df.reset_index(drop=True), dfMord.reset_index(drop=True)],
    axis=1,
)
# Rename the label column to 'True' and drop the RDKit Mol objects before
# writing the table to CSV.
df = df.rename(columns = {'Blood-Brain-Barrier Penetration': 'True'})
df = df.drop('Mol', axis=1)

df.to_csv('compounds_descriptors.csv', index=False)