In [1]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble
from sklearn.metrics import mean_squared_error, mean_absolute_error

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, PandasTools
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.Draw import IPythonConsole

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# provide path to the dataset
path = 'Data/DRD2_clean_data.csv'

df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6770 entries, 0 to 6769
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   canonical  6770 non-null   object 
 1   pChEMBL    6770 non-null   float64
dtypes: float64(1), object(1)
memory usage: 105.9+ KB


In [3]:
df.head()

Unnamed: 0,canonical,pChEMBL
0,CCCSc1nnc(-c2ccccc2)n1C,3.74
1,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,4.0
2,COc1ccccc1N1CCC2(CCNCC2)CC1,4.0
3,Cl.NCCc1ccc(O)c(O)c1,4.0
4,Cn1c(SCCCN2CCCCC2)nnc1-c1ccccc1,4.01


In [4]:
X = df['canonical']

# note that the target label that we are trying to predic are pChEMBL values in this case
y = df['pChEMBL']

In [5]:
# divide data into train and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
print(X_train.size)
print(X_test.size)

4739
2031


# Circular Fingerprints

SMILEs are in string format. However, we will transform them into numerical encodings known as molecular fingerprints. Each SMILE will be represented as a 1-D vextor with certain number of bits (1024 or 2048). This numerical data will then be used to train a regression model.

In [7]:
# convers a list of SMILEs to a list of mols
def smi_to_mols(smi_list):
    mols_list = [Chem.MolFromSmiles(smile) for smile in smi_list]
    return mols_list


# returns morgan fingerprints as a 2D array for a list of SMILE strings
def get_morgan_fingerprints(smi_list, size, radius, useFeatures, useCounts=True):
        mols = smi_to_mols(smi_list)
        fps = [AllChem.GetMorganFingerprint(mol, radius, useCounts=useCounts, useFeatures=useFeatures) for mol in mols]
        fps_arr = np.zeros((len(fps), size), np.int32)
        for i, fp in enumerate(fps):
            for index, value in fp.GetNonzeroElements().items():
                n_index = index % size
                fps_arr[i, n_index] += int(value)
        return fps_arr 


In [8]:
# conver series to a list
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

In [9]:
# get fingerprints for training data
X_train_arr = get_morgan_fingerprints(X_train_list, size=2048, radius=3, useFeatures=True, useCounts=True)

In [10]:
X_train_arr.shape

(4739, 2048)

In [10]:
# get fingerprints for test data
X_test_arr = get_morgan_fingerprints(X_test_list, size=2048, radius=3, useFeatures=True, useCounts=True)

# Model Training and Cross Validation
Two types of model will be trained to predict the pChEMBL value: **Random Forest Regressor** and **Support Vector Regressor**. Data will not be scaled before training for the RFR model as tree based algorithms are invariant to scale. RandomizedSearchCV will be used to find the best set of parameters for both models. 

In [12]:
# create a grid of parameters

n_estimators = [int(x) for x in np.linspace(start = 50, stop = 5000, num = 200)]
max_features = [None, 'sqrt','log2']
max_depth = [int(x) for x in np.linspace(3, 60, num = 35)]
max_depth.append(None)
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [50, 74, 99, 124, 149, 174, 199, 224, 248, 273, 298, 323, 348, 373, 398, 423, 447, 472, 497, 522, 547, 572, 597, 622, 646, 671, 696, 721, 746, 771, 796, 821, 845, 870, 895, 920, 945, 970, 995, 1020, 1044, 1069, 1094, 1119, 1144, 1169, 1194, 1219, 1243, 1268, 1293, 1318, 1343, 1368, 1393, 1418, 1442, 1467, 1492, 1517, 1542, 1567, 1592, 1617, 1641, 1666, 1691, 1716, 1741, 1766, 1791, 1816, 1840, 1865, 1890, 1915, 1940, 1965, 1990, 2015, 2039, 2064, 2089, 2114, 2139, 2164, 2189, 2214, 2238, 2263, 2288, 2313, 2338, 2363, 2388, 2413, 2437, 2462, 2487, 2512, 2537, 2562, 2587, 2612, 2636, 2661, 2686, 2711, 2736, 2761, 2786, 2811, 2835, 2860, 2885, 2910, 2935, 2960, 2985, 3010, 3034, 3059, 3084, 3109, 3134, 3159, 3184, 3209, 3233, 3258, 3283, 3308, 3333, 3358, 3383, 3408, 3432, 3457, 3482, 3507, 3532, 3557, 3582, 3607, 3631, 3656, 3681, 3706, 3731, 3756, 3781, 3806, 3830, 3855, 3880, 3905, 3930, 3955, 3980, 4005, 4029, 4054, 4079, 4104, 4129, 4154, 4179, 4204, 4228, 4253, 4278

In [13]:
# initialize a RFR model. n_jobs = 2 means the total no. of CPUs used = n -2
rf = sklearn.ensemble.RandomForestRegressor(n_jobs = -2)

In [14]:
# begin cross validation
# we will only try 50 fits in this case
from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100,
                               cv = 5, verbose = 2)

rf_random.fit(X_train_arr, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False, total=   4.2s
[CV] n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV]  n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False, total=   2.0s
[CV] n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False 
[CV]  n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False, total=   1.9s
[CV] n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False 
[CV]  n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False, total=   1.9s
[CV] n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False 
[CV]  n_estimators=721, max_features=sqrt, max_depth=26, bootstrap=False, total=   1.9s
[CV] n_estimators=4801, max_features=log2, max_depth=24, bootstrap=False 
[CV]  n_estimators=4801, max_features=log2, max_depth=24, bootstrap=False, total=   8.0s
[CV] n_estimators=4801, max_features=log2, max_depth=24, bootstrap=False 
[CV]  n_estimators=4801, max_features=log2, max_depth=24, bootstrap=False, total=   7.7s
[CV] n_estimators=4801, max_features=log2, max_depth=24, bootstrap=False 
[CV]  n_estimators=4801, max_

[CV]  n_estimators=4253, max_features=log2, max_depth=3, bootstrap=True, total=   4.0s
[CV] n_estimators=4253, max_features=log2, max_depth=3, bootstrap=True 
[CV]  n_estimators=4253, max_features=log2, max_depth=3, bootstrap=True, total=   4.0s
[CV] n_estimators=4253, max_features=log2, max_depth=3, bootstrap=True 
[CV]  n_estimators=4253, max_features=log2, max_depth=3, bootstrap=True, total=   4.0s
[CV] n_estimators=2786, max_features=None, max_depth=34, bootstrap=False 
[CV]  n_estimators=2786, max_features=None, max_depth=34, bootstrap=False, total= 2.1min
[CV] n_estimators=2786, max_features=None, max_depth=34, bootstrap=False 
[CV]  n_estimators=2786, max_features=None, max_depth=34, bootstrap=False, total= 2.2min
[CV] n_estimators=2786, max_features=None, max_depth=34, bootstrap=False 
[CV]  n_estimators=2786, max_features=None, max_depth=34, bootstrap=False, total= 2.0min
[CV] n_estimators=2786, max_features=None, max_depth=34, bootstrap=False 
[CV]  n_estimators=2786, max_fea

[CV]  n_estimators=1641, max_features=log2, max_depth=31, bootstrap=False, total=   3.1s
[CV] n_estimators=1641, max_features=log2, max_depth=31, bootstrap=False 
[CV]  n_estimators=1641, max_features=log2, max_depth=31, bootstrap=False, total=   3.0s
[CV] n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True 
[CV]  n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True, total=   4.4s
[CV] n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True 
[CV]  n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True, total=   4.5s
[CV] n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True 
[CV]  n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True, total=   4.2s
[CV] n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True 
[CV]  n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True, total=   4.5s
[CV] n_estimators=4626, max_features=log2, max_depth=9, bootstrap=True 
[CV]  n_estimators=4626, max_features=lo

[CV]  n_estimators=4751, max_features=log2, max_depth=60, bootstrap=True, total=   8.0s
[CV] n_estimators=3408, max_features=None, max_depth=46, bootstrap=False 
[CV]  n_estimators=3408, max_features=None, max_depth=46, bootstrap=False, total= 2.7min
[CV] n_estimators=3408, max_features=None, max_depth=46, bootstrap=False 
[CV]  n_estimators=3408, max_features=None, max_depth=46, bootstrap=False, total= 2.8min
[CV] n_estimators=3408, max_features=None, max_depth=46, bootstrap=False 
[CV]  n_estimators=3408, max_features=None, max_depth=46, bootstrap=False, total= 2.6min
[CV] n_estimators=3408, max_features=None, max_depth=46, bootstrap=False 
[CV]  n_estimators=3408, max_features=None, max_depth=46, bootstrap=False, total= 2.6min
[CV] n_estimators=3408, max_features=None, max_depth=46, bootstrap=False 
[CV]  n_estimators=3408, max_features=None, max_depth=46, bootstrap=False, total= 2.8min
[CV] n_estimators=1467, max_features=log2, max_depth=60, bootstrap=True 
[CV]  n_estimators=1467,

[CV]  n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True, total=   1.9s
[CV] n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True 
[CV]  n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True, total=   1.8s
[CV] n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True 
[CV]  n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True, total=   1.6s
[CV] n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True 
[CV]  n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True, total=   1.6s
[CV] n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True 
[CV]  n_estimators=1169, max_features=log2, max_depth=13, bootstrap=True, total=   1.5s
[CV] n_estimators=3034, max_features=sqrt, max_depth=18, bootstrap=False 
[CV]  n_estimators=3034, max_features=sqrt, max_depth=18, bootstrap=False, total=   5.6s
[CV] n_estimators=3034, max_features=sqrt, max_depth=18, bootstrap=False 
[CV]  n_estimators=3034, max_fe

[CV]  n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True, total=   6.3s
[CV] n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True, total=   6.4s
[CV] n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True, total=   6.3s
[CV] n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=2935, max_features=sqrt, max_depth=None, bootstrap=True, total=   6.3s
[CV] n_estimators=4825, max_features=log2, max_depth=None, bootstrap=False 
[CV]  n_estimators=4825, max_features=log2, max_depth=None, bootstrap=False, total=  11.3s
[CV] n_estimators=4825, max_features=log2, max_depth=None, bootstrap=False 
[CV]  n_estimators=4825, max_features=log2, max_depth=None, bootstrap=False, total=  10.9s
[CV] n_estimators=4825, max_features=log2, max_depth=None, bootstrap=False 
[CV] 

[CV]  n_estimators=4029, max_features=None, max_depth=26, bootstrap=True, total= 1.7min
[CV] n_estimators=4029, max_features=None, max_depth=26, bootstrap=True 
[CV]  n_estimators=4029, max_features=None, max_depth=26, bootstrap=True, total= 1.7min
[CV] n_estimators=4029, max_features=None, max_depth=26, bootstrap=True 
[CV]  n_estimators=4029, max_features=None, max_depth=26, bootstrap=True, total= 1.6min
[CV] n_estimators=547, max_features=None, max_depth=24, bootstrap=True 
[CV]  n_estimators=547, max_features=None, max_depth=24, bootstrap=True, total=  13.6s
[CV] n_estimators=547, max_features=None, max_depth=24, bootstrap=True 
[CV]  n_estimators=547, max_features=None, max_depth=24, bootstrap=True, total=  13.7s
[CV] n_estimators=547, max_features=None, max_depth=24, bootstrap=True 
[CV]  n_estimators=547, max_features=None, max_depth=24, bootstrap=True, total=  13.6s
[CV] n_estimators=547, max_features=None, max_depth=24, bootstrap=True 
[CV]  n_estimators=547, max_features=None

[CV]  n_estimators=4701, max_features=None, max_depth=24, bootstrap=True, total= 1.9min
[CV] n_estimators=4701, max_features=None, max_depth=24, bootstrap=True 
[CV]  n_estimators=4701, max_features=None, max_depth=24, bootstrap=True, total= 1.8min
[CV] n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True 
[CV]  n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True, total=   6.9s
[CV] n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True 
[CV]  n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True, total=   6.6s
[CV] n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True 
[CV]  n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True, total=   6.5s
[CV] n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True 
[CV]  n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True, total=   6.4s
[CV] n_estimators=4701, max_features=log2, max_depth=21, bootstrap=True 
[CV]  n_estimators=4701, max_featu

[CV]  n_estimators=3582, max_features=sqrt, max_depth=48, bootstrap=True, total=   7.2s
[CV] n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False 
[CV]  n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False, total=   3.0s
[CV] n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False 
[CV]  n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False, total=   3.1s
[CV] n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False 
[CV]  n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False, total=   3.1s
[CV] n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False 
[CV]  n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False, total=   3.0s
[CV] n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False 
[CV]  n_estimators=1094, max_features=sqrt, max_depth=39, bootstrap=False, total=   3.1s
[CV] n_estimators=597, max_features=log2, max_depth=58, bootstrap=False 
[CV]  n_estimators=597, 

[CV]  n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True, total=   1.8s
[CV] n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True 
[CV]  n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True, total=   1.8s
[CV] n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True 
[CV]  n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True, total=   1.7s
[CV] n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True 
[CV]  n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True, total=   1.6s
[CV] n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True 
[CV]  n_estimators=945, max_features=sqrt, max_depth=14, bootstrap=True, total=   1.7s
[CV] n_estimators=3432, max_features=None, max_depth=14, bootstrap=False 
[CV]  n_estimators=3432, max_features=None, max_depth=14, bootstrap=False, total= 1.9min
[CV] n_estimators=3432, max_features=None, max_depth=14, bootstrap=False 
[CV]  n_estimators=3432, max_features=No

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 225.0min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=-2, oob_score=False,
                                                   random_state...


In [15]:
# find the best set of parameters
rf_random.best_params_

{'n_estimators': 497,
 'max_features': 'sqrt',
 'max_depth': 54,
 'bootstrap': False}

In [16]:
params = rf_random.best_params_

# add n_jobs as additional parameter
params['n_jobs'] = -2

In [17]:
# create a final model with the best parameters
rf_final = sklearn.ensemble.RandomForestRegressor(**params)

rf_final.fit(X_train_arr, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=54,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=497, n_jobs=-2,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [18]:
# check results with the test data
y_pred_test = rf_final.predict(X= X_test_arr)
MSE = mean_squared_error(y_test, y_pred_test)
MAE = mean_absolute_error(y_test, y_pred_test)

In [19]:
print("Mean Squared Error: " + str(MSE))
print("Mean Absolute Error: " + str(MAE))

Mean Squared Error: 0.38472023779139575
Mean Absolute Error: 0.45935988118759524


## Model Training Using SVR

In [11]:
# import standard scaler
from sklearn.preprocessing import StandardScaler

In [12]:
# apply scaling to train and test data

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train_arr)
X_test_sc = sc.transform(X_test_arr)

In [17]:
# create a grid for cross validation

kernel = ['poly','rbf','sigmoid']
gamma = ['scale','auto']
C = [float(x) for x in np.linspace(start = 0.02, stop = 60, num = 100)]

sv_grid = {'kernel': kernel,
        'gamma': gamma,
        'C': C}

print(sv_grid)

{'kernel': ['poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto'], 'C': [0.02, 0.6258585858585859, 1.2317171717171718, 1.8375757575757576, 2.4434343434343435, 3.0492929292929296, 3.6551515151515153, 4.2610101010101005, 4.866868686868687, 5.472727272727273, 6.078585858585859, 6.684444444444444, 7.29030303030303, 7.896161616161616, 8.502020202020201, 9.107878787878787, 9.713737373737374, 10.31959595959596, 10.925454545454546, 11.531313131313132, 12.137171717171718, 12.743030303030302, 13.348888888888888, 13.954747474747474, 14.56060606060606, 15.166464646464647, 15.772323232323233, 16.37818181818182, 16.984040404040403, 17.58989898989899, 18.195757575757575, 18.801616161616163, 19.407474747474748, 20.013333333333332, 20.61919191919192, 21.225050505050504, 21.830909090909092, 22.436767676767676, 23.042626262626264, 23.64848484848485, 24.254343434343436, 24.86020202020202, 25.466060606060605, 26.071919191919193, 26.677777777777777, 27.283636363636365, 27.88949494949495, 28.495353535353537,

In [18]:
# create an instance of an SVR

from sklearn import svm

sv = svm.SVR()

In [None]:
# start the cross validation process
from sklearn.model_selection import RandomizedSearchCV

sv_random = RandomizedSearchCV(estimator = sv, param_distributions = sv_grid, n_iter = 100,
                               cv = 5, verbose = 2)

sv_random.fit(X_train_sc, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] kernel=rbf, gamma=scale, C=21.830909090909092 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... kernel=rbf, gamma=scale, C=21.830909090909092, total=  38.5s
[CV] kernel=rbf, gamma=scale, C=21.830909090909092 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.5s remaining:    0.0s


[CV] .... kernel=rbf, gamma=scale, C=21.830909090909092, total=  56.1s
[CV] kernel=rbf, gamma=scale, C=21.830909090909092 ...................
[CV] .... kernel=rbf, gamma=scale, C=21.830909090909092, total= 1.1min
[CV] kernel=rbf, gamma=scale, C=21.830909090909092 ...................
[CV] .... kernel=rbf, gamma=scale, C=21.830909090909092, total= 1.0min
[CV] kernel=rbf, gamma=scale, C=21.830909090909092 ...................
[CV] .... kernel=rbf, gamma=scale, C=21.830909090909092, total= 1.0min
[CV] kernel=poly, gamma=scale, C=36.97737373737374 ...................
[CV] .... kernel=poly, gamma=scale, C=36.97737373737374, total= 1.0min
[CV] kernel=poly, gamma=scale, C=36.97737373737374 ...................
[CV] .... kernel=poly, gamma=scale, C=36.97737373737374, total= 1.0min
[CV] kernel=poly, gamma=scale, C=36.97737373737374 ...................
[CV] .... kernel=poly, gamma=scale, C=36.97737373737374, total= 1.0min
[CV] kernel=poly, gamma=scale, C=36.97737373737374 ...................
[CV] .

[CV] . kernel=sigmoid, gamma=auto, C=34.553939393939395, total=  48.6s
[CV] kernel=sigmoid, gamma=scale, C=11.531313131313132 ...............
[CV]  kernel=sigmoid, gamma=scale, C=11.531313131313132, total=  47.8s
[CV] kernel=sigmoid, gamma=scale, C=11.531313131313132 ...............
[CV]  kernel=sigmoid, gamma=scale, C=11.531313131313132, total=  47.0s
[CV] kernel=sigmoid, gamma=scale, C=11.531313131313132 ...............
[CV]  kernel=sigmoid, gamma=scale, C=11.531313131313132, total=  47.7s
[CV] kernel=sigmoid, gamma=scale, C=11.531313131313132 ...............
[CV]  kernel=sigmoid, gamma=scale, C=11.531313131313132, total=  47.3s
[CV] kernel=sigmoid, gamma=scale, C=11.531313131313132 ...............
[CV]  kernel=sigmoid, gamma=scale, C=11.531313131313132, total=  48.2s
[CV] kernel=rbf, gamma=scale, C=29.707070707070706 ...................
[CV] .... kernel=rbf, gamma=scale, C=29.707070707070706, total=  40.3s
[CV] kernel=rbf, gamma=scale, C=29.707070707070706 ...................
[CV] .

[CV] ...... kernel=rbf, gamma=scale, C=55.7589898989899, total=  38.4s
[CV] kernel=rbf, gamma=scale, C=55.7589898989899 .....................
[CV] ...... kernel=rbf, gamma=scale, C=55.7589898989899, total=  38.8s
[CV] kernel=rbf, gamma=scale, C=55.7589898989899 .....................
[CV] ...... kernel=rbf, gamma=scale, C=55.7589898989899, total=  54.2s
[CV] kernel=sigmoid, gamma=auto, C=58.78828282828283 .................
[CV] .. kernel=sigmoid, gamma=auto, C=58.78828282828283, total= 1.3min
[CV] kernel=sigmoid, gamma=auto, C=58.78828282828283 .................
[CV] .. kernel=sigmoid, gamma=auto, C=58.78828282828283, total= 1.3min
[CV] kernel=sigmoid, gamma=auto, C=58.78828282828283 .................
[CV] .. kernel=sigmoid, gamma=auto, C=58.78828282828283, total= 1.3min
[CV] kernel=sigmoid, gamma=auto, C=58.78828282828283 .................
[CV] .. kernel=sigmoid, gamma=auto, C=58.78828282828283, total= 1.2min
[CV] kernel=sigmoid, gamma=auto, C=58.78828282828283 .................
[CV] .

[CV] .... kernel=rbf, gamma=scale, C=36.371515151515155, total=  41.4s
[CV] kernel=rbf, gamma=scale, C=36.371515151515155 ...................
[CV] .... kernel=rbf, gamma=scale, C=36.371515151515155, total=  40.5s
[CV] kernel=rbf, gamma=scale, C=36.371515151515155 ...................
[CV] .... kernel=rbf, gamma=scale, C=36.371515151515155, total=  40.1s
[CV] kernel=rbf, gamma=scale, C=36.371515151515155 ...................
[CV] .... kernel=rbf, gamma=scale, C=36.371515151515155, total=  40.0s
[CV] kernel=rbf, gamma=scale, C=36.371515151515155 ...................
[CV] .... kernel=rbf, gamma=scale, C=36.371515151515155, total=  40.5s
[CV] kernel=poly, gamma=auto, C=3.0492929292929296 ...................
[CV] .... kernel=poly, gamma=auto, C=3.0492929292929296, total=  41.8s
[CV] kernel=poly, gamma=auto, C=3.0492929292929296 ...................
[CV] .... kernel=poly, gamma=auto, C=3.0492929292929296, total=  39.6s
[CV] kernel=poly, gamma=auto, C=3.0492929292929296 ...................
[CV] .

[CV] ... kernel=poly, gamma=scale, C=19.407474747474748, total=  39.7s
[CV] kernel=poly, gamma=scale, C=19.407474747474748 ..................
[CV] ... kernel=poly, gamma=scale, C=19.407474747474748, total=  39.8s
[CV] kernel=rbf, gamma=auto, C=26.071919191919193 ....................
[CV] ..... kernel=rbf, gamma=auto, C=26.071919191919193, total=  39.7s
[CV] kernel=rbf, gamma=auto, C=26.071919191919193 ....................
[CV] ..... kernel=rbf, gamma=auto, C=26.071919191919193, total=  43.0s
[CV] kernel=rbf, gamma=auto, C=26.071919191919193 ....................
[CV] ..... kernel=rbf, gamma=auto, C=26.071919191919193, total=  40.5s
[CV] kernel=rbf, gamma=auto, C=26.071919191919193 ....................
[CV] ..... kernel=rbf, gamma=auto, C=26.071919191919193, total=  40.6s
[CV] kernel=rbf, gamma=auto, C=26.071919191919193 ....................
[CV] ..... kernel=rbf, gamma=auto, C=26.071919191919193, total=  40.8s
[CV] kernel=sigmoid, gamma=scale, C=27.283636363636365 ...............
[CV]  

In [None]:
# print the best parameters
sv_random.best_params_

In [None]:
params = sv_random.best_params_

In [None]:
# create a final model with the best parameters
sv_final = svm.SVR(**params)

sv_final.fit(X_train_sc, y_train)

In [None]:
# check results with the test data
y_pred_test = sv_final.predict(X= X_test_sc)
MSE = mean_squared_error(y_test, y_pred_test)
MAE = mean_absolute_error(y_test, y_pred_test)
print("Mean Squared Error: " + MSE)
print("Mean Absolute Error: " + MAE)

## Save final models

In [None]:
with open("models/DRD2_pChEMBL_rf_predictor.pkl", "wb") as f:
    pickle.dump(rf_final, f)
    
with open("models/DRD2_pChEMBL_sv_predictor.pkl", "wb") as f:
    pickle.dump(sv_final, f)