# Install Libraries

In [None]:
pip install PyTDC

In [None]:
pip install rdkit-pypi

In [6]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# Creating feature extractor function

In [7]:
def molecular_descriptors(table):

  descriptors = pd.DataFrame()

  mol = [Chem.MolFromSmiles(drug) for drug in table.Drug]

  # Exact molecular weight of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.ExactMolWt(i) for i in mol])
  descriptors['Exact_MW'] = Nilavo[0]

  # FpDensityMorgan1
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan1(i) for i in mol])
  descriptors['FpDensityMorgan1'] = Nilavo[0]

  # FpDensityMorgan2
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan2(i) for i in mol])
  descriptors['FpDensityMorgan2'] = Nilavo[0]

  # FpDensityMorgan3
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan3(i) for i in mol])
  descriptors['FpDensityMorgan3'] = Nilavo[0]

  # Average molecular weight of the molecule ignoring hydrogens
  Nilavo = []
  Nilavo.append([Descriptors.HeavyAtomMolWt(i) for i in mol])
  descriptors['HeavyAtomMolWt'] = Nilavo[0]

  ###
  ### MaxAbsPartialCharge ###
  Nilavo = []
  Nilavo.append([Descriptors.MaxAbsPartialCharge(i) for i in mol])
  descriptors['MaxAbsPartialCharge'] = Nilavo[0]

  ###
  ### MaxPartialCharge ###
  Nilavo = []
  Nilavo.append([Descriptors.MaxPartialCharge(i) for i in mol])
  descriptors['MaxPartialCharge'] = Nilavo[0]

  ###
  ### MinAbsPartialCharge ###
  Nilavo = []
  Nilavo.append([Descriptors.MinAbsPartialCharge(i) for i in mol])
  descriptors['MinAbsPartialCharge'] = Nilavo[0]

  ###
  ### MinPartialCharge ###
  Nilavo = []
  Nilavo.append([Descriptors.MinPartialCharge(i) for i in mol])
  descriptors['MinPartialCharge'] = Nilavo[0]

  # Average molecular weight of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.MolWt(i) for i in mol])
  descriptors['MolWt'] = Nilavo[0]

  # Number of radical electrons of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.NumRadicalElectrons(i) for i in mol])
  descriptors['NumRadicalElectrons'] = Nilavo[0]

  # Number of valence electrons of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.NumValenceElectrons(i) for i in mol])
  descriptors['NumValenceElectrons'] = Nilavo[0]

  # Log of partition coefficient
  Nilavo = []
  Nilavo.append([Descriptors.MolLogP(i) for i in mol])
  descriptors['Partition_Coefficient'] = Nilavo[0]


  ### Lipinski Descriptors ###
  # Fraction of C atoms that are SP3 hybridized
  Nilavo = []
  Nilavo.append([Lipinski.FractionCSP3(i) for i in mol])
  descriptors['FractionCSP3'] = Nilavo[0]

  # Number of heavy atoms a molecule
  Nilavo = []
  Nilavo.append([Lipinski.HeavyAtomCount(i) for i in mol])
  descriptors['Heavy_atoms'] = Nilavo[0]

  # Number of NHs or OHs
  Nilavo = []
  Nilavo.append([Lipinski.NHOHCount(i) for i in mol])
  descriptors['NHs/OHs'] = Nilavo[0]

  # Number of Nitrogens and Oxygens
  Nilavo = []
  Nilavo.append([Lipinski.NOCount(i) for i in mol])
  descriptors['N&O'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticCarbocycles(i) for i in mol])
  descriptors['Aliphatic_carbocycles'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticHeterocycles(i) for i in mol])
  descriptors['Aliphatic_heterocycles'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticRings(i) for i in mol])
  descriptors['Aliphatic_rings'] = Nilavo[0]

  # Nmber of aromatic carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticCarbocycles(i) for i in mol])
  descriptors['Aromatic_carbocycles'] = Nilavo[0]

  # Number of aromatic heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticHeterocycles(i) for i in mol])
  descriptors['Aromatic_heterocycles'] = Nilavo[0]

  # Number of aromatic rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticRings(i) for i in mol])
  descriptors['Aromatic_rings'] = Nilavo[0]

  # Number of Hydrogen Bond Acceptors
  Nilavo = []
  Nilavo.append([Lipinski.NumHAcceptors(i) for i in mol])
  descriptors['HAcceptors'] = Nilavo[0]

  # Number of Hydrogen Bond Donors
  Nilavo = []
  Nilavo.append([Lipinski.NumHDonors(i) for i in mol])
  descriptors['HDonors'] = Nilavo[0]

  # Number of Heteroatoms
  Nilavo = []
  Nilavo.append([Lipinski.NumHeteroatoms(i) for i in mol])
  descriptors['Heteroatoms'] = Nilavo[0]

  # Number of Rotatable Bonds
  Nilavo = []
  Nilavo.append([Lipinski.NumRotatableBonds(i) for i in mol])
  descriptors['Rotatable_Bonds'] = Nilavo[0]

  # Number of saturated carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedCarbocycles(i) for i in mol])
  descriptors['Saturated_Carbocycles'] = Nilavo[0]

  # Number of saturated heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedHeterocycles(i) for i in mol])
  descriptors['Saturated_Heterocycles'] = Nilavo[0]

  # Number of saturated rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedRings(i) for i in mol])
  descriptors['Saturated_Rings'] = Nilavo[0]

  # Number of rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.RingCount(i) for i in mol])
  descriptors['Rings'] = Nilavo[0]

  return descriptors

# Regression Problems

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error as mae

* Search best ML models

In [131]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')

best_model_list = []

regression_datasets = ['caco2_wang', 
                       'lipophilicity_astrazeneca', 
                       'solubility_aqsoldb', 
                       #'ppbr_az', 
                       #'vdss_lombardo', 
                       #'half_life_obach', 
                       #'clearance_microsome_az',
                       #'clearance_hepatocyte_az', 
                       #'ld50_zhu'
                      ]

for reg_data in regression_datasets:
  LR = []
  KNN = []
  DT = []
  Bag = []
  RF = []
  ET = []
  GB = []
  AB = []

  #for seed in np.arange(1,6):
  benchmark = group.get(reg_data)
  name = benchmark['name']

  # split the dataset into train_val & test set
  train_val, test = benchmark['train_val'], benchmark['test']

  # feature extracting
  x_train_val = molecular_descriptors(train_val)
  x_test = molecular_descriptors(test)

  # Replace NaN values with 0
  x_train_val = np.nan_to_num(x_train_val, nan=0, posinf=0)
  x_test = np.nan_to_num(x_test, nan=0, posinf=0)

  # target data
  y_train_val = train_val.Y
  y_test = test.Y

  #mms = MinMaxScaler()
  #x_train_val = mms.fit_transform(x_train_val)
  #x_test = mms.transform(x_test)

  lin = LinearRegression()
  lin.fit(x_train_val, y_train_val)
  y_pred_test = lin.predict(x_test)
  LR.append(mae(y_test, y_pred_test))

  knn = KNeighborsRegressor()
  knn.fit(x_train_val, y_train_val)
  y_pred_test = knn.predict(x_test)
  KNN.append(mae(y_test, y_pred_test))

  dt = DecisionTreeRegressor(random_state=0)
  dt.fit(x_train_val, y_train_val)
  y_pred_test = dt.predict(x_test)
  DT.append(mae(y_test, y_pred_test))

  bag = BaggingRegressor(DecisionTreeRegressor(), random_state=0)
  bag.fit(x_train_val, y_train_val)
  y_pred_test = bag.predict(x_test)
  Bag.append(mae(y_test, y_pred_test))

  rf = RandomForestRegressor(random_state=0)
  rf.fit(x_train_val, y_train_val)
  y_pred_test = rf.predict(x_test)
  RF.append(mae(y_test, y_pred_test))

  et = ExtraTreesRegressor(random_state=0)
  et.fit(x_train_val, y_train_val)
  y_pred_test = et.predict(x_test)
  ET.append(mae(y_test, y_pred_test))

  grad = GradientBoostingRegressor(random_state=0)
  grad.fit(x_train_val, y_train_val)
  y_pred_test = grad.predict(x_test)
  GB.append(mae(y_test, y_pred_test))

  ada = AdaBoostRegressor(DecisionTreeRegressor(),random_state=0)
  ada.fit(x_train_val, y_train_val)
  y_pred_test = ada.predict(x_test)
  AB.append(mae(y_test, y_pred_test))

  # Find out which model gives lowest MAE
  m = []
  models = ['Linear', 'K_Neighbors', 'Decision_Tree', 'Bagging', 'Random_Forest', 'Extra_Trees', 'Gradient_Boosting', 'Ada_Boost']
  for ml_mae in [LR, KNN, DT, Bag, RF, ET, GB, AB]:
    m.append(ml_mae)
  m = pd.Series(m, index = models)

  # Search best parameters of best_model for full train_val set
  mae_tune = []
  best_model_store = []

  for low_mae in [0, 1, 2]:
      best_model_name = m[m == np.sort(m)[low_mae][0]].index[0]

      ml_model = [lin, knn, dt, bag, rf, et, grad, ada]
      best_model = ml_model[models.index(best_model_name)]

      if best_model_name == 'Linear':
        best_model = LinearRegression()
        best_model_store.append(best_model)
        mae_tune.append(LR[0])

      elif best_model_name == 'Decision_Tree':
        best_model = DecisionTreeRegressor(random_state=0)
        best_model_store.append(best_model)
        mae_tune.append(DT[0])

      elif best_model_name == 'K_Neighbors':
        parameters = {'n_neighbors': np.arange(2,10,2)}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param = rs_cv.best_params_['n_neighbors']

        best_model = KNeighborsRegressor(n_neighbors = best_param)
        best_model_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        mae_tune.append(mae(y_test, y_p))

      elif best_model_name == 'Bagging' or best_model_name == 'Random_Forest' or best_model_name == 'Extra_Trees':
        parameters = {'n_estimators': np.arange(100,550,50)}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param = rs_cv.best_params_['n_estimators']

        if best_model_name == 'Bagging':
          best_model = BaggingRegressor(DecisionTreeRegressor(), n_estimators = best_param, random_state=0)
        elif best_model_name == 'Random_Forest':
          best_model = RandomForestRegressor(n_estimators = best_param, random_state=0)
        else:
          best_model = ExtraTreesRegressor(n_estimators = best_param, random_state=0)

        best_model_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        mae_tune.append(mae(y_test, y_p))

      else:
        parameters = {'n_estimators': np.arange(100,550,50), 'learning_rate': [0.005, 0.05, 0.08, 0.1, 0.2, 0.3]}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param1 = rs_cv.best_params_['n_estimators']
        best_param2 = rs_cv.best_params_['learning_rate']

        if best_model_name == 'Gradient_Boosting':
          best_model = GradientBoostingRegressor(n_estimators = best_param1, learning_rate = best_param2, random_state=0)
        else:
          best_model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = best_param1, learning_rate = best_param2, random_state=0)

        best_model_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        mae_tune.append(mae(y_test, y_p))

  mae_tune_series = pd.Series(mae_tune, index = best_model_store)

  best_model = mae_tune_series[mae_tune_series == min(mae_tune_series)].index[0]
  best_model_list.append(best_model)

best_model_series = pd.Series(best_model_list, index = regression_datasets)

Found local copy...


In [132]:
best_model_series

caco2_wang                   ExtraTreesRegressor(n_estimators=300, random_s...
lipophilicity_astrazeneca    AdaBoostRegressor(base_estimator=DecisionTreeR...
solubility_aqsoldb           ExtraTreesRegressor(n_estimators=400, random_s...
dtype: object

In [None]:
### best_model_series ###


# caco2_wang                   ExtraTreesRegressor(n_estimators=300, random_s...
# lipophilicity_astrazeneca    AdaBoostRegressor(base_estimator=DecisionTreeR...
# solubility_aqsoldb           ExtraTreesRegressor(n_estimators=400, random_s...
# ppbr_az            (ExtraTreeRegressor(random_state=209652396), E...
# vdss_lombardo      (DecisionTreeRegressor(random_state=209652396)...
# half_life_obach    (DecisionTreeRegressor(random_state=209652396)...
# clearance_microsome_az     (DecisionTreeRegressor(random_state=209652396)...
# clearance_hepatocyte_az    (ExtraTreeRegressor(random_state=209652396), E...
# ld50_zhu                   (DecisionTreeRegressor(max_features='auto', ra...

In [133]:
best_model_list

[ExtraTreesRegressor(n_estimators=300, random_state=0),
 AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2,
                   n_estimators=450, random_state=0),
 ExtraTreesRegressor(n_estimators=400, random_state=0)]

In [None]:
### best_model_list ###


#[ExtraTreesRegressor(n_estimators=300, random_state=0),
# AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2,
#                   n_estimators=450, random_state=0),
# ExtraTreesRegressor(n_estimators=400, random_state=0)]
# ExtraTreesRegressor(n_estimators=450, random_state=0),
# AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.2,
#                   n_estimators=450, random_state=0),
# AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.05,
#                   n_estimators=350, random_state=0)
# AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=0.05,
#                   n_estimators=200, random_state=0),
# ExtraTreesRegressor(n_estimators=350, random_state=0),
# RandomForestRegressor(n_estimators=400, random_state=0)]

* Calculate performance of selected regression models

In [135]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
predictions_list = []

regression_datasets = ['caco2_wang', 
                       'lipophilicity_astrazeneca', 
                       'solubility_aqsoldb', 
                       #'ppbr_az', 
                       #'vdss_lombardo', 
                       #'half_life_obach', 
                       #'clearance_microsome_az',
                       #'clearance_hepatocyte_az', 
                       #'ld50_zhu'
                      ]
for seed in [1, 2, 3, 4, 5]:
  predictions = {}
  for reg in regression_datasets:
    dataset = reg
    benchmark = group.get(reg) 
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']

    # split the train_val set into train & validation set
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'scaffold', seed = seed)

    x_train = molecular_descriptors(train)
    x_valid = molecular_descriptors(valid)
    x_test = molecular_descriptors(test)

    # target column
    y_train = train.Y
    y_valid = valid.Y
    y_test = test.Y

    # merging traning and validation set
    x_train_val = pd.concat([x_train, x_valid])
    y_train_val = pd.concat([y_train, y_valid], axis=0)

    # Replace NaN values with 0
    x_train_val = np.nan_to_num(x_train_val, nan=0, posinf=0)
    x_test = np.nan_to_num(x_test, nan=0, posinf=0)

    # model training and prediction
    model =  best_model_series[reg] 

    model.fit(x_train_val, y_train_val)
    y_pred_test = model.predict(x_test)

    predictions[name] = y_pred_test

  predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)

Found local copy...
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2236.05it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2460.90it/s]
generating training, validation splits...
100%|██████████| 7985/7985 [00:01<00:00, 4787.74it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2215.21it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2318.23it/s]
generating training, validation splits...
100%|██████████| 7985/7985 [00:01<00:00, 4696.27it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2283.71it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2449.52it/s]
generating training, validation splits...
100%|██████████| 7985/7985 [00:01<00:00, 4698.00it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2267.59it/s]
generating training, validat

In [136]:
results

{'caco2_wang': [0.323, 0.002],
 'lipophilicity_astrazeneca': [0.615, 0.003],
 'solubility_aqsoldb': [0.832, 0.003]}

In [None]:
#{'caco2_wang': [0.323, 0.002],                 (2)
# 'lipophilicity_astrazeneca': [0.615, 0.003],  (8)
# 'solubility_aqsoldb': [0.832, 0.003]          (3)
# 'half_life_obach': [0.424, 0.013],            (1)
# 'ppbr_az': [8.506, 0.034],                    (2)
# 'vdss_lombardo': [0.545, 0.008]               (5)
# 'clearance_hepatocyte_az': [0.442, 0.004],    (1)
# 'clearance_microsome_az': [0.547, 0.011],     (6)
# 'ld50_zhu': [0.639, 0.001]}                   (6)

# Classification Problems

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score as acc

* Search best ML models

In [72]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')


best_Cmodel_list = []

classification_datasets = ['hia_hou', 
                       'pgp_broccatelli', 
                       'bioavailability_ma', 
                       'bbb_martins', 
                       'cyp2d6_veith', 
                       #'cyp3a4_veith', 
                       #'cyp2c9_veith',
                       #'cyp2d6_substrate_carbonmangels', 
                       #'cyp3a4_substrate_carbonmangels',
                       #'cyp2c9_substrate_carbonmangels',
                       #'herg',
                       #'ames',
                       #'dili'
                       ]

for clf_data in classification_datasets:
  
  KNN = []
  DT = []
  Bag = []
  RF = []
  ET = []
  AB = []
  XG = []

  benchmark = group.get(clf_data)
  name = benchmark['name']

  # split the dataset into train_val & test set
  train_val, test = benchmark['train_val'], benchmark['test']

  # feature extracting
  x_train_val = molecular_descriptors(train_val)
  x_test = molecular_descriptors(test)

  # Replace NaN values with 0
  x_train_val = np.nan_to_num(x_train_val, nan=0, posinf=0)
  x_test = np.nan_to_num(x_test, nan=0, posinf=0)

  # target data
  y_train_val = train_val.Y
  y_test = test.Y

  # neighbors

  knn = KNeighborsClassifier()
  knn.fit(x_train_val, y_train_val)
  y_pred_test = knn.predict(x_test)
  KNN.append(acc(y_test, y_pred_test))

  # tree

  dt = DecisionTreeClassifier(random_state=0)
  dt.fit(x_train_val, y_train_val)
  y_pred_test = dt.predict(x_test)
  DT.append(acc(y_test, y_pred_test))

  # ensemble

  bag = BaggingClassifier(DecisionTreeClassifier(), random_state=0)
  bag.fit(x_train_val, y_train_val)
  y_pred_test = bag.predict(x_test)
  Bag.append(acc(y_test, y_pred_test))

  rf = RandomForestClassifier(random_state=0)
  rf.fit(x_train_val, y_train_val)
  y_pred_test = rf.predict(x_test)
  RF.append(acc(y_test, y_pred_test))

  et = ExtraTreesClassifier(random_state=0)
  et.fit(x_train_val, y_train_val)
  y_pred_test = et.predict(x_test)
  ET.append(acc(y_test, y_pred_test))

  ada = AdaBoostClassifier(DecisionTreeClassifier(),random_state=0)
  ada.fit(x_train_val, y_train_val)
  y_pred_test = ada.predict(x_test)
  AB.append(acc(y_test, y_pred_test))

  xg = XGBClassifier(random_state=0)
  xg.fit(x_train_val, y_train_val)
  y_pred_test = xg.predict(x_test)
  XG.append(acc(y_test, y_pred_test))

  # Find out which model gives highest accuracy
  a = []
  models = ['K_Neighbors', 'Decision_Tree', 'Bagging', 'Random_Forest', 'Extra_Trees', 'Ada_Boost', 'XG_Boost']

  for ml_acc in [KNN, DT, Bag, RF, ET, AB, XG]:
    a.append(ml_acc)
  a = pd.Series(a, index = models)

  # Search best parameters of best_model for full train_val set
  acc_tune = []
  best_Cmodel_store = []

  for high_acc in [-1, -2]:
      best_model_name = a[a == np.sort(a)[high_acc][0]].index[0]

      ml_Cmodel = [knn, dt, bag, rf, et, ada, xg]
      best_model = ml_Cmodel[models.index(best_model_name)]

      if best_model_name == 'Decision_Tree':
        best_model = DecisionTreeClassifier(random_state=0)
        best_Cmodel_store.append(best_model)
        acc_tune.append(DT[0])

      elif best_model_name == 'K_Neighbors':
        parameters = {'n_neighbors': np.arange(2,10,2)}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param = rs_cv.best_params_['n_neighbors']

        best_model = KNeighborsClassifier(n_neighbors = best_param)
        best_Cmodel_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        acc_tune.append(acc(y_test, y_p))

      elif best_model_name == 'Bagging' or best_model_name == 'Random_Forest' or best_model_name == 'Extra_Trees':
        parameters = {'n_estimators': np.arange(100,550,50)}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param = rs_cv.best_params_['n_estimators']

        if best_model_name == 'Bagging':
          best_model = BaggingClassifier(DecisionTreeClassifier(), n_estimators = best_param, random_state=0)
        elif best_model_name == 'Random_Forest':
          best_model = RandomForestClassifier(n_estimators = best_param, random_state=0)
        else:
          best_model = ExtraTreesClassifier(n_estimators = best_param, random_state=0)

        best_Cmodel_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        acc_tune.append(acc(y_test, y_p))

      else:
        parameters = {'n_estimators': np.arange(100,550,50), 'learning_rate': [0.005, 0.05, 0.08, 0.1, 0.2, 0.3]}
        rs_cv = RandomizedSearchCV(best_model, parameters)
        rs_cv.fit(x_train_val, y_train_val)

        best_param1 = rs_cv.best_params_['n_estimators']
        best_param2 = rs_cv.best_params_['learning_rate']

        if best_model_name == 'Ada_Boost':
          best_model = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators = best_param1, learning_rate = best_param2, random_state=0)
        else:
          best_model = XGBClassifier(n_estimators = best_param1, learning_rate = best_param2, random_state=0)
          
        best_Cmodel_store.append(best_model)

        y_p = rs_cv.predict(x_test)
        acc_tune.append(acc(y_test, y_p))

  acc_tune_series = pd.Series(acc_tune, index = best_Cmodel_store)

  best_Cmodel = acc_tune_series[acc_tune_series == max(acc_tune_series)].index[0]
  best_Cmodel_list.append(best_model)

best_Cmodel_series = pd.Series(best_Cmodel_list, index = classification_datasets)

Found local copy...


In [73]:
best_Cmodel_series

hia_hou                          DecisionTreeClassifier(random_state=0)
pgp_broccatelli       ExtraTreesClassifier(n_estimators=250, random_...
bioavailability_ma    ExtraTreesClassifier(n_estimators=450, random_...
bbb_martins                        ExtraTreesClassifier(random_state=0)
cyp2d6_veith          RandomForestClassifier(n_estimators=250, rando...
dtype: object

In [None]:
### best_Cmodel_series ###


# hia_hou                          DecisionTreeClassifier(random_state=0)
# pgp_broccatelli       ExtraTreesClassifier(n_estimators=250, random_...
# bioavailability_ma    ExtraTreesClassifier(n_estimators=450, random_...
# bbb_martins                        ExtraTreesClassifier(random_state=0)
# cyp2d6_veith          RandomForestClassifier(n_estimators=250, rando...
# cyp3a4_veith                      (ExtraTreeClassifier(random_state=209652396), ...
# cyp2d6_substrate_carbonmangels    (DecisionTreeClassifier(random_state=208755735...
# cyp3a4_substrate_carbonmangels    (DecisionTreeClassifier(max_features='auto', r...
# cyp2c9_substrate_carbonmangels    (ExtraTreeClassifier(random_state=209652396), ...
# ames    (DecisionTreeClassifier(max_features='auto', r...
# dili    (DecisionTreeClassifier(max_features='auto', r...
# herg    (ExtraTreeClassifier(random_state=209652396), ...
# cyp2c9_veith    (DecisionTreeClassifier(random_state=209652396...

In [74]:
best_Cmodel_list

[DecisionTreeClassifier(random_state=0),
 ExtraTreesClassifier(n_estimators=250, random_state=0),
 ExtraTreesClassifier(n_estimators=450, random_state=0),
 ExtraTreesClassifier(random_state=0),
 RandomForestClassifier(n_estimators=250, random_state=0)]

In [None]:
### best_Cmodel_list ###


#[DecisionTreeClassifier(random_state=0),
# ExtraTreesClassifier(n_estimators=250, random_state=0),
# ExtraTreesClassifier(n_estimators=450, random_state=0),
# ExtraTreesClassifier(random_state=0),
# RandomForestClassifier(n_estimators=250, random_state=0),
# ExtraTreesClassifier(n_estimators=450, random_state=0),
# BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150,
#                   random_state=0),
# RandomForestClassifier(random_state=0),
# ExtraTreesClassifier(n_estimators=150, random_state=0)
# RandomForestClassifier(n_estimators=500, random_state=0),
# RandomForestClassifier(n_estimators=500, random_state=0)
# ExtraTreesClassifier(n_estimators=350, random_state=0)
# AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.1,
#                    n_estimators=400, random_state=0)]

* Calculate performance of selected classification models

In [75]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
predictions_list = []

classification_datasets = ['hia_hou', 
                       'pgp_broccatelli', 
                       'bioavailability_ma', 
                       'bbb_martins', 
                       'cyp2d6_veith', 
                       #'cyp3a4_veith', 
                       #'cyp2c9_veith',
                       #'cyp2d6_substrate_carbonmangels', 
                       #'cyp3a4_substrate_carbonmangels',
                       #'cyp2c9_substrate_carbonmangels',
                       #'herg',
                       #'ames',
                       #'dili'
                       ]


for seed in [1, 2, 3, 4, 5]:
    predictions = {}
    for clf in classification_datasets:
        dataset = clf
        benchmark = group.get(clf) 
        name = benchmark['name']
        train_val, test = benchmark['train_val'], benchmark['test']
        train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

        ## --- train your model --- ##

        x_train = molecular_descriptors(train)
        y_train = train.Y
        
        x_valid = molecular_descriptors(valid)
        y_valid = valid.Y

        x_test = molecular_descriptors(test)
        y_test = test.Y

        # merging traning and validation set
        x_train_val = pd.concat([x_train, x_valid])
        y_train_val = pd.concat([y_train, y_valid], axis=0)

        # Replace NaN values with 0
        x_train_val = np.nan_to_num(x_train_val, nan=0, posinf=0)
        x_test = np.nan_to_num(x_test, nan=0, posinf=0)

        model = best_Cmodel_series[clf]       

        model.fit(x_train_val, y_train_val)
        y_pred_test = model.predict(x_test)
        

        predictions[name] = y_pred_test
    predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)

Found local copy...
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 3091.73it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2636.69it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 1724.54it/s]
generating training, validation splits...
100%|██████████| 1624/1624 [00:00<00:00, 2806.27it/s]
generating training, validation splits...
100%|██████████| 10504/10504 [00:03<00:00, 2667.76it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 3324.33it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2596.81it/s]
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 3025.94it/s]
generating training, validation splits...
100%|██████████| 1624/1624 [00:00<00:00, 1815.19it/s]
generating training, validation splits...
100%|██████████| 10504/10504 [00:06<00:00, 1602.58it/s]
generating training, validat

In [76]:
results

{'bbb_martins': [0.745, 0.0],
 'bioavailability_ma': [0.583, 0.0],
 'cyp2d6_veith': [0.365, 0.003],
 'hia_hou': [0.856, 0.0],
 'pgp_broccatelli': [0.822, 0.0]}

In [79]:
results

{'cyp2c9_veith': [0.532, 0.018], 'herg': [0.718, 0.0]}

In [None]:
#{'bbb_martins': [0.745, 0.0],                        (x)
# 'bioavailability_ma': [0.583, 0.0],                 (8)
# 'cyp2d6_veith': [0.365, 0.003],                     (x)
# 'hia_hou': [0.856, 0.0],                            (9)
# 'pgp_broccatelli': [0.822, 0.0]                     (x)
# 'cyp2c9_substrate_carbonmangels': [0.302, 0.0],     (x)
# 'cyp2d6_substrate_carbonmangels': [0.476, 0.015],   (x)
# 'cyp3a4_substrate_carbonmangels': [0.607, 0.017],   (7)
# 'cyp3a4_veith': [0.637, 0.0]                        (x)
# 'ames': [0.722, 0.003],                             (x)
# 'dili': [0.824, 0.005]                              (10)
# 'herg': [0.718, 0.0]                                (x)
# 'cyp2c9_veith': [0.532, 0.018]}                     (x)