In [1]:
import pandas as pd

In [2]:
from rdkit import Chem
import rdkit
from rdkit.Chem import Descriptors

In [3]:
combined_data = pd.read_csv("..//Data//combined_data.csv")

In [4]:
from joblib import Parallel, delayed

In [5]:
## function from link: https://greglandrum.github.io/rdkit-blog/posts/2022-12-23-descriptor-tutorial.html
def getMolDescriptors(smiles, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
    
        missingVal is used if the descriptor cannot be calculated
    '''
    mol = Chem.MolFromSmiles(smiles)
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [6]:
# getMolDescriptors(combined_data["Smiles"].values[22])

In [7]:
from tqdm.notebook import tqdm

In [8]:
catch = Parallel(n_jobs=15, verbose = 1)(delayed(getMolDescriptors)(i) for i in combined_data["Smiles"].values)

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.4s
[Parallel(n_jobs=15)]: Done 800 tasks      | elapsed:    1.1s
[Parallel(n_jobs=15)]: Done 4800 tasks      | elapsed:    3.8s
[Parallel(n_jobs=15)]: Done 10400 tasks      | elapsed:    7.6s
[Parallel(n_jobs=15)]: Done 17600 tasks      | elapsed:   12.3s
[Parallel(n_jobs=15)]: Done 26400 tasks      | elapsed:   18.2s
[Parallel(n_jobs=15)]: Done 36800 tasks      | elapsed:   25.0s
[Parallel(n_jobs=15)]: Done 48800 tasks      | elapsed:   33.1s
[Parallel(n_jobs=15)]: Done 62400 tasks      | elapsed:   42.3s
[Parallel(n_jobs=15)]: Done 77600 tasks      | elapsed:   52.6s
[Parallel(n_jobs=15)]: Done 94400 tasks      | elapsed:  1.1min
[Parallel(n_jobs=15)]: Done 112800 tasks      | elapsed:  1.3min
[Parallel(n_jobs=15)]: Done 132800 tasks      | elapsed:  1.5min
[Parallel(n_jobs=15)]: Done 154400 tasks      | elapsed:  1.7min
[Parallel(n_jobs=15)]: Done 1

In [9]:
rdkit_features = pd.DataFrame(catch)

In [10]:
combined_data.shape

(2270960, 4)

In [12]:
combined_data.head()

Unnamed: 0,ChEMBL ID,Smiles,AlogP,Polar Surface Area
0,CHEMBL248825,O=C(OCc1ccco1)C1=C(c2ccc(Cl)c(Cl)c2)CCC1,5.27,39.44
1,CHEMBL149936,CCOC(=O)c1csc2nc(-c3ccccc3)c(-c3ccc(S(C)(=O)=O...,4.31,77.74
2,CHEMBL4169504,Cc1cccnc1CN(C)C[C@H]1Cc2c(cccc2N2CCNCC2)CN1,1.95,43.43
3,CHEMBL3915507,N#CCC1(n2cc(-c3ncnc4[nH]ccc34)cn2)CN(C2CCN(Cc3...,3.69,89.66
4,CHEMBL1306661,COC(CNC(=O)c1cc(C(=O)c2c(-c3c(Cl)cccc3Cl)noc2C...,3.86,106.45


In [11]:
rdkit_features.shape

(2270960, 209)

In [13]:
rdkit_features.head()

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.295307,12.295307,0.142845,-0.296322,0.71388,337.202,323.09,336.032,114.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.491187,12.491187,0.235375,-3.305778,0.444525,426.519,408.375,426.070799,148.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4.547914,4.547914,0.482022,0.482022,0.847856,365.525,334.277,365.257946,144.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14.04558,14.04558,0.355489,-0.548433,0.445375,488.546,462.338,488.224849,184.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13.161794,13.161794,0.14687,-0.576299,0.397212,452.294,433.142,451.070176,158.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
rdkit_features["ChEMBL ID"] = combined_data["ChEMBL ID"]

In [19]:
rdkit_features = rdkit_features.drop(['TPSA', 'MolLogP'], axis = 1)

In [20]:
rdkit_features.shape

(2270960, 208)

In [21]:
combined_with_rdkit = combined_data.merge(rdkit_features, on = ["ChEMBL ID"])

In [22]:
combined_with_rdkit.shape

(2272938, 211)

In [23]:
combined_with_rdkit.head()

Unnamed: 0,ChEMBL ID,Smiles,AlogP,Polar Surface Area,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL248825,O=C(OCc1ccco1)C1=C(c2ccc(Cl)c(Cl)c2)CCC1,5.27,39.44,12.295307,12.295307,0.142845,-0.296322,0.71388,337.202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CHEMBL149936,CCOC(=O)c1csc2nc(-c3ccccc3)c(-c3ccc(S(C)(=O)=O...,4.31,77.74,12.491187,12.491187,0.235375,-3.305778,0.444525,426.519,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,CHEMBL4169504,Cc1cccnc1CN(C)C[C@H]1Cc2c(cccc2N2CCNCC2)CN1,1.95,43.43,4.547914,4.547914,0.482022,0.482022,0.847856,365.525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CHEMBL3915507,N#CCC1(n2cc(-c3ncnc4[nH]ccc34)cn2)CN(C2CCN(Cc3...,3.69,89.66,14.04558,14.04558,0.355489,-0.548433,0.445375,488.546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CHEMBL1306661,COC(CNC(=O)c1cc(C(=O)c2c(-c3c(Cl)cccc3Cl)noc2C...,3.86,106.45,13.161794,13.161794,0.14687,-0.576299,0.397212,452.294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
combined_with_rdkit.dtypes.value_counts()

float64    209
object       2
Name: count, dtype: int64

In [26]:
combined_with_rdkit.to_csv("..//Data//combined_data_with_rdkit.csv", index = False)