# Install Libraries

In [None]:
pip install PyTDC

In [None]:
pip install rdkit-pypi

In [None]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# Creating the global feature extractor function

In [None]:
def molecular_descriptors(table):
  """Compute global RDKit descriptors for every SMILES string in ``table.Drug``.

  Parameters
  ----------
  table : object with a ``Drug`` attribute
      ``table.Drug`` is iterated and each item is parsed with
      ``Chem.MolFromSmiles``.

  Returns
  -------
  pandas.DataFrame
      One row per entry of ``table.Drug`` (input order) and one column per
      descriptor in ``descriptor_functions`` below (listed order).
      A SMILES string that RDKit cannot parse (``MolFromSmiles`` returns
      ``None``) yields a row of NaN instead of aborting the whole extraction,
      so rows stay aligned with the caller's targets.
  """
  # Ordered (output column, RDKit function) pairs; the order defines the
  # column order of the returned frame.
  descriptor_functions = (
      # --- rdkit.Chem.Descriptors ---
      ('Exact_MW', Descriptors.ExactMolWt),                  # exact molecular weight
      ('FpDensityMorgan1', Descriptors.FpDensityMorgan1),
      ('FpDensityMorgan2', Descriptors.FpDensityMorgan2),
      ('FpDensityMorgan3', Descriptors.FpDensityMorgan3),
      ('HeavyAtomMolWt', Descriptors.HeavyAtomMolWt),        # average molecular weight ignoring hydrogens
      ('MaxAbsPartialCharge', Descriptors.MaxAbsPartialCharge),
      ('MaxPartialCharge', Descriptors.MaxPartialCharge),
      ('MinAbsPartialCharge', Descriptors.MinAbsPartialCharge),
      ('MinPartialCharge', Descriptors.MinPartialCharge),
      ('MolWt', Descriptors.MolWt),                          # average molecular weight
      ('NumRadicalElectrons', Descriptors.NumRadicalElectrons),
      ('NumValenceElectrons', Descriptors.NumValenceElectrons),
      ('Partition_Coefficient', Descriptors.MolLogP),        # log of partition coefficient
      # --- rdkit.Chem.Lipinski ---
      ('FractionCSP3', Lipinski.FractionCSP3),               # fraction of C atoms that are SP3 hybridized
      ('Heavy_atoms', Lipinski.HeavyAtomCount),              # number of heavy atoms
      ('NHs/OHs', Lipinski.NHOHCount),                       # number of NHs or OHs
      ('N&O', Lipinski.NOCount),                             # number of nitrogens and oxygens
      # "aliphatic" = containing at least one non-aromatic bond
      ('Aliphatic_carbocycles', Lipinski.NumAliphaticCarbocycles),
      ('Aliphatic_heterocycles', Lipinski.NumAliphaticHeterocycles),
      ('Aliphatic_rings', Lipinski.NumAliphaticRings),
      ('Aromatic_carbocycles', Lipinski.NumAromaticCarbocycles),
      ('Aromatic_heterocycles', Lipinski.NumAromaticHeterocycles),
      ('Aromatic_rings', Lipinski.NumAromaticRings),
      ('HAcceptors', Lipinski.NumHAcceptors),                # hydrogen bond acceptors
      ('HDonors', Lipinski.NumHDonors),                      # hydrogen bond donors
      ('Heteroatoms', Lipinski.NumHeteroatoms),
      ('Rotatable_Bonds', Lipinski.NumRotatableBonds),
      ('Saturated_Carbocycles', Lipinski.NumSaturatedCarbocycles),
      ('Saturated_Heterocycles', Lipinski.NumSaturatedHeterocycles),
      ('Saturated_Rings', Lipinski.NumSaturatedRings),
      ('Rings', Lipinski.RingCount),                         # total number of rings
  )

  # Parse every SMILES once; unparsable entries stay as None placeholders.
  molecules = [Chem.MolFromSmiles(smiles) for smiles in table.Drug]
  missing = float('nan')

  descriptors = pd.DataFrame()
  for column, compute in descriptor_functions:
    descriptors[column] = [
        compute(molecule) if molecule is not None else missing
        for molecule in molecules
    ]

  return descriptors

# Regression Problems

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error as mae

In [None]:
# Names of the regression benchmarks, in processing order.
regression_datasets = [
    # absorption
    "caco2_wang",
    "lipophilicity_astrazeneca",
    "solubility_aqsoldb",
    # distribution
    "ppbr_az",
    "vdss_lombardo",
    # excretion
    "half_life_obach",
    "clearance_microsome_az",
    "clearance_hepatocyte_az",
    # toxicity
    "ld50_zhu",
]

* Search for the best ML models

In [None]:
# Model selection for the regression benchmarks.
#
# For every dataset: fit eight baseline regressors on the train split, rank
# them by validation MAE, tune the `n_candidates` best ones with a randomized
# search on the train split, and keep the (unfitted) configuration whose tuned
# validation MAE is lowest.  Results: `best_model_list` / `best_model_series`.
from sklearn.base import clone
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

# Number of top-ranked baseline models that are tuned per dataset.
n_candidates = 3

# Search spaces by model name.  Models without an entry ('Linear',
# 'Decision_Tree') are not tuned and keep their baseline validation MAE.
n_estimators_space = {'n_estimators': np.arange(100,550,50)}
boosting_space = {'n_estimators': np.arange(100,550,50), 'learning_rate': [0.005, 0.05, 0.08, 0.1, 0.2, 0.3]}
search_spaces = {
    'K_Neighbors': {'n_neighbors': np.arange(2,10,2)},
    'Bagging': n_estimators_space,
    'Random_Forest': n_estimators_space,
    'Extra_Trees': n_estimators_space,
    'Gradient_Boosting': boosting_space,
    'Ada_Boost': boosting_space,
}

best_model_list = []

for reg_data in regression_datasets:
  benchmark = group.get(reg_data)
  name = benchmark['name']

  # split the dataset into train_val & test set
  # (neither is used below: selection relies on the train/valid split only)
  train_val, test = benchmark['train_val'], benchmark['test']
  train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = 1)

  # feature extracting
  x_train = molecular_descriptors(train)
  x_valid = molecular_descriptors(valid)

  # Replace NaN and +inf values with 0
  x_train = np.nan_to_num(x_train, nan=0, posinf=0)
  x_valid = np.nan_to_num(x_valid, nan=0, posinf=0)

  # target data
  y_train = train.Y
  y_valid = valid.Y

  ### USE ONLY TRAINING AND VALIDATION SET TO SELECT MODEL ###

  baselines = {
      'Linear': LinearRegression(),
      'K_Neighbors': KNeighborsRegressor(),
      'Decision_Tree': DecisionTreeRegressor(random_state=0),
      'Bagging': BaggingRegressor(DecisionTreeRegressor(), random_state=0),
      'Random_Forest': RandomForestRegressor(random_state=0),
      'Extra_Trees': ExtraTreesRegressor(random_state=0),
      'Gradient_Boosting': GradientBoostingRegressor(random_state=0),
      'Ada_Boost': AdaBoostRegressor(DecisionTreeRegressor(), random_state=0),
  }

  # Validation MAE of every baseline model.
  baseline_mae = {}
  for model_name, model in baselines.items():
    model.fit(x_train, y_train)
    baseline_mae[model_name] = mae(y_valid, model.predict(x_valid))

  # Lowest MAE first.  A stable sort over plain floats picks distinct models
  # even when two scores tie (the previous value-matching lookup returned the
  # same model repeatedly in that case).
  ranked_names = pd.Series(baseline_mae).sort_values(kind = 'mergesort').index[:n_candidates]

  # Tune the top candidates on the train split and re-score them on the
  # validation split.
  mae_tune = []
  best_model_store = []

  for best_model_name in ranked_names:
    baseline = baselines[best_model_name]

    if best_model_name in search_spaces:
      # random_state makes the sampled parameter combinations reproducible.
      # NOTE(review): the search ranks candidates with the estimator's
      # default scorer rather than MAE - confirm this is intended.
      rs_cv = RandomizedSearchCV(baseline, search_spaces[best_model_name], random_state = 0)
      rs_cv.fit(x_train, y_train)

      # Fresh, unfitted estimator carrying the best parameters found.
      tuned_model = clone(baseline).set_params(**rs_cv.best_params_)
      tuned_mae = mae(y_valid, rs_cv.predict(x_valid))
    else:
      tuned_model = clone(baseline)
      tuned_mae = baseline_mae[best_model_name]

    best_model_store.append(tuned_model)
    mae_tune.append(tuned_mae)

  # First candidate with the lowest tuned validation MAE.
  best_model = best_model_store[int(np.argmin(mae_tune))]
  best_model_list.append(best_model)

best_model_series = pd.Series(best_model_list, index = regression_datasets)

Downloading Benchmark Group...
100%|██████████| 1.47M/1.47M [00:00<00:00, 20.4MiB/s]
Extracting zip file...
Done!
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 831.26it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:03<00:00, 858.79it/s]
generating training, validation splits...
100%|██████████| 7985/7985 [00:01<00:00, 4923.60it/s]
generating training, validation splits...
100%|██████████| 2231/2231 [00:00<00:00, 2491.33it/s]
generating training, validation splits...
100%|██████████| 904/904 [00:00<00:00, 1956.94it/s]
generating training, validation splits...
100%|██████████| 532/532 [00:00<00:00, 2338.02it/s]
generating training, validation splits...
100%|██████████| 881/881 [00:00<00:00, 2443.57it/s]
generating training, validation splits...
100%|██████████| 970/970 [00:00<00:00, 2359.12it/s]
generating training, validation splits...
100%|██████████| 5907/5907 [00:01<00:00, 5660.94it/s]


In [None]:
# Display the estimator selected for each regression dataset.
best_model_series

caco2_wang                   AdaBoostRegressor(base_estimator=DecisionTreeR...
lipophilicity_astrazeneca    AdaBoostRegressor(base_estimator=DecisionTreeR...
solubility_aqsoldb           AdaBoostRegressor(base_estimator=DecisionTreeR...
ppbr_az                      GradientBoostingRegressor(learning_rate=0.005,...
vdss_lombardo                AdaBoostRegressor(base_estimator=DecisionTreeR...
half_life_obach              AdaBoostRegressor(base_estimator=DecisionTreeR...
clearance_microsome_az       AdaBoostRegressor(base_estimator=DecisionTreeR...
clearance_hepatocyte_az      ExtraTreesRegressor(n_estimators=500, random_s...
ld50_zhu                     ExtraTreesRegressor(n_estimators=350, random_s...
dtype: object

In [None]:
# Display the selected regressors with their full parameter settings.
best_model_list

[AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2,
                   n_estimators=500, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.3,
                   n_estimators=500, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2,
                   n_estimators=450, random_state=0),
 GradientBoostingRegressor(learning_rate=0.005, n_estimators=500, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.005,
                   n_estimators=200, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.08,
                   n_estimators=350, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.3,
                   n_estimators=350, random_state=0),
 ExtraTreesRegressor(n_estimators=500, random_state=0),
 ExtraTreesRegressor(n_estimators=350, random_state=0)]

# Classification Problems

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score as acc

In [None]:
# Names of the classification benchmarks, in processing order.
classification_datasets = [
    # absorption
    "hia_hou",
    "pgp_broccatelli",
    "bioavailability_ma",
    # distribution
    "bbb_martins",
    # metabolism
    "cyp2d6_veith",
    "cyp3a4_veith",
    "cyp2c9_veith",
    "cyp2d6_substrate_carbonmangels",
    "cyp3a4_substrate_carbonmangels",
    "cyp2c9_substrate_carbonmangels",
    # toxicity
    "herg",
    "ames",
    "dili",
]

* Search for the best ML models

In [None]:
# Model selection for the classification benchmarks.
#
# For every dataset: fit seven baseline classifiers on the train split, rank
# them by validation accuracy, tune the `n_candidates` best ones with a
# randomized search on the train split, and keep the (unfitted) configuration
# whose tuned validation accuracy is highest.
# Results: `best_Cmodel_list` / `best_Cmodel_series`.
from sklearn.base import clone
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

# Number of top-ranked baseline models that are tuned per dataset.
n_candidates = 2

# Search spaces by model name.  'Decision_Tree' has no entry: it is not tuned
# and keeps its baseline validation accuracy.
n_estimators_space = {'n_estimators': np.arange(100,550,50)}
boosting_space = {'n_estimators': np.arange(100,550,50), 'learning_rate': [0.005, 0.05, 0.08, 0.1, 0.2, 0.3]}
search_spaces = {
    'K_Neighbors': {'n_neighbors': np.arange(2,10,2)},
    'Bagging': n_estimators_space,
    'Random_Forest': n_estimators_space,
    'Extra_Trees': n_estimators_space,
    'Ada_Boost': boosting_space,
    'XG_Boost': boosting_space,
}

best_Cmodel_list = []

for clf_data in classification_datasets:
  benchmark = group.get(clf_data)
  name = benchmark['name']

  # split the dataset into train_val & test set
  # (neither is used below: selection relies on the train/valid split only)
  train_val, test = benchmark['train_val'], benchmark['test']
  train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = 1)

  # feature extracting
  x_train = molecular_descriptors(train)
  x_valid = molecular_descriptors(valid)

  # Replace NaN and +inf values with 0
  x_train = np.nan_to_num(x_train, nan=0, posinf=0)
  x_valid = np.nan_to_num(x_valid, nan=0, posinf=0)

  # target data
  y_train = train.Y
  y_valid = valid.Y

  ### USE ONLY TRAINING AND VALIDATION SET TO SELECT MODEL ###

  baselines = {
      'K_Neighbors': KNeighborsClassifier(),
      'Decision_Tree': DecisionTreeClassifier(random_state=0),
      'Bagging': BaggingClassifier(DecisionTreeClassifier(), random_state=0),
      'Random_Forest': RandomForestClassifier(random_state=0),
      'Extra_Trees': ExtraTreesClassifier(random_state=0),
      'Ada_Boost': AdaBoostClassifier(DecisionTreeClassifier(), random_state=0),
      'XG_Boost': XGBClassifier(random_state=0),
  }

  # Validation accuracy of every baseline model.
  baseline_acc = {}
  for model_name, model in baselines.items():
    model.fit(x_train, y_train)
    baseline_acc[model_name] = acc(y_valid, model.predict(x_valid))

  # Highest accuracy first.  Sorting plain floats picks distinct models even
  # when two scores tie (the previous value-matching lookup returned the same
  # model twice in that case).
  ranked_names = pd.Series(baseline_acc).sort_values(ascending = False, kind = 'mergesort').index[:n_candidates]

  # Tune the top candidates on the train split and re-score them on the
  # validation split.
  acc_tune = []
  best_Cmodel_store = []

  for best_model_name in ranked_names:
    baseline = baselines[best_model_name]

    if best_model_name in search_spaces:
      # random_state makes the sampled parameter combinations reproducible.
      rs_cv = RandomizedSearchCV(baseline, search_spaces[best_model_name], random_state = 0)
      rs_cv.fit(x_train, y_train)

      # Fresh, unfitted estimator carrying the best parameters found.
      tuned_model = clone(baseline).set_params(**rs_cv.best_params_)
      tuned_acc = acc(y_valid, rs_cv.predict(x_valid))
    else:
      tuned_model = clone(baseline)
      tuned_acc = baseline_acc[best_model_name]

    best_Cmodel_store.append(tuned_model)
    acc_tune.append(tuned_acc)

  # First candidate with the highest tuned validation accuracy.  This is the
  # model that must be stored; appending the loop's last candidate instead
  # would discard the comparison made here.
  best_Cmodel = best_Cmodel_store[int(np.argmax(acc_tune))]
  best_Cmodel_list.append(best_Cmodel)

best_Cmodel_series = pd.Series(best_Cmodel_list, index = classification_datasets)

Found local copy...
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 3279.83it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2633.32it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 3114.05it/s]
generating training, validation splits...
100%|██████████| 1624/1624 [00:00<00:00, 2851.14it/s]
generating training, validation splits...
100%|██████████| 10504/10504 [00:03<00:00, 2761.57it/s]
generating training, validation splits...
100%|██████████| 9861/9861 [00:03<00:00, 2961.92it/s]
generating training, validation splits...
100%|██████████| 9673/9673 [00:03<00:00, 2886.68it/s]
generating training, validation splits...
100%|██████████| 532/532 [00:00<00:00, 3088.80it/s]
generating training, validation splits...
100%|██████████| 535/535 [00:00<00:00, 2934.64it/s]
generating training, validation splits...
100%|██████████| 534/534 [00:00<00:00, 2864.14it/s]
generating training, validatio

In [None]:
# Display the estimator stored for each classification dataset.
best_Cmodel_series

hia_hou                           BaggingClassifier(base_estimator=DecisionTreeC...
pgp_broccatelli                   ExtraTreesClassifier(n_estimators=500, random_...
bioavailability_ma                RandomForestClassifier(n_estimators=400, rando...
bbb_martins                       BaggingClassifier(base_estimator=DecisionTreeC...
cyp2d6_veith                      ExtraTreesClassifier(n_estimators=250, random_...
cyp3a4_veith                                        XGBClassifier(n_estimators=400)
cyp2c9_veith                      BaggingClassifier(base_estimator=DecisionTreeC...
cyp2d6_substrate_carbonmangels    BaggingClassifier(base_estimator=DecisionTreeC...
cyp3a4_substrate_carbonmangels    XGBClassifier(learning_rate=0.005, n_estimator...
cyp2c9_substrate_carbonmangels    BaggingClassifier(base_estimator=DecisionTreeC...
herg                              BaggingClassifier(base_estimator=DecisionTreeC...
ames                              ExtraTreesClassifier(n_estimators=500, ran

In [None]:
# Display the stored classifiers with their full parameter settings.
best_Cmodel_list

[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=200,
                   random_state=0),
 ExtraTreesClassifier(n_estimators=500, random_state=0),
 RandomForestClassifier(n_estimators=400, random_state=0),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100,
                   random_state=0),
 ExtraTreesClassifier(n_estimators=250, random_state=0),
 XGBClassifier(n_estimators=400),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=350,
                   random_state=0),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=250,
                   random_state=0),
 XGBClassifier(learning_rate=0.005, n_estimators=150),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                   random_state=0),
 BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150,
                   random_state=0),
 ExtraTreesClassifier(n_estimators=500, random_state=0

# Save selected best models for all datasets

In [None]:
import pickle

In [None]:
# save selected models for regression datasets
# (the context manager closes the file even if pickling fails)
with open('best_model_series.pkl', 'wb') as model_file:
  pickle.dump(best_model_series, model_file)

In [None]:
# save selected models for classification datasets
# (the context manager closes the file even if pickling fails)
with open('best_Cmodel_series.pkl', 'wb') as model_file:
  pickle.dump(best_Cmodel_series, model_file)

| **Dataset**                    | Model                                                                                                            |
| ------------------------------ | ---------------------------------------------------------------------------------------------------------------- |
| *`ABSORPTION`*                                                                                                                                    |
| caco2_wang                     | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2, n_estimators=500, random_state=0)   |
| hia_hou                        | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=200, random_state=0)                     |
| pgp_broccatelli                | ExtraTreesClassifier(n_estimators=500, random_state=0)                                                           |
| bioavailability_ma             | RandomForestClassifier(n_estimators=400, random_state=0)                                                         |
| lipophilicity_astrazeneca      | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.3, n_estimators=500, random_state=0)   |
| solubility_aqsoldb             | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2, n_estimators=450, random_state=0)   |
| *`DISTRIBUTION`*                                                                                                                                  |
| bbb_martins                    | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=0)                     |
| ppbr_az                        | GradientBoostingRegressor(learning_rate=0.005, n_estimators=500, random_state=0)                                 |
| vdss_lombardo                  | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.005, n_estimators=200, random_state=0) |
| *`METABOLISM`*                                                                                                                                    |
| cyp2d6_veith                   | ExtraTreesClassifier(n_estimators=250, random_state=0)                                                           |
| cyp3a4_veith                   | XGBClassifier(n_estimators=400)                                                                                  |
| cyp2c9_veith                   | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=350, random_state=0)                     |
| cyp2d6_substrate_carbonmangels | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=250, random_state=0)                     |
| cyp3a4_substrate_carbonmangels | XGBClassifier(learning_rate=0.005, n_estimators=150)                                                             |
| cyp2c9_substrate_carbonmangels | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500, random_state=0)                     |
| *`EXCRETION`*                                                                                                                                     |
| half_life_obach                | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.08, n_estimators=350, random_state=0)  |
| clearance_microsome_az         | AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.3, n_estimators=350, random_state=0)   |
| clearance_hepatocyte_az        | ExtraTreesRegressor(n_estimators=500, random_state=0)                                                            |
| *`TOXICITY`*                                                                                                                                      |
| herg                           | BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150, random_state=0)                     |
| ames                           | ExtraTreesClassifier(n_estimators=500, random_state=0)                                                           |
| dili                           | KNeighborsClassifier(n_neighbors=4)                                                                              |
| ld50_zhu                       | ExtraTreesRegressor(n_estimators=350, random_state=0)                                                            |