# Install Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
pip install PyTDC

In [None]:
pip install rdkit-pypi

# Creating feature extractor function

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [None]:
# 27 molecular descriptors

def molecular_descriptors(table):

  descriptors = pd.DataFrame()

  mol = [Chem.MolFromSmiles(drug) for drug in table.Drug]

  # Exact molecular weight of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.ExactMolWt(i) for i in mol])
  descriptors['Exact_MW'] = Nilavo[0]

  # FpDensityMorgan1
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan1(i) for i in mol])
  descriptors['FpDensityMorgan1'] = Nilavo[0]

  # FpDensityMorgan2
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan2(i) for i in mol])
  descriptors['FpDensityMorgan2'] = Nilavo[0]

  # FpDensityMorgan3
  Nilavo = []
  Nilavo.append([Descriptors.FpDensityMorgan3(i) for i in mol])
  descriptors['FpDensityMorgan3'] = Nilavo[0]

  # Average molecular weight of the molecule ignoring hydrogens
  Nilavo = []
  Nilavo.append([Descriptors.HeavyAtomMolWt(i) for i in mol])
  descriptors['HeavyAtomMolWt'] = Nilavo[0]

  # Average molecular weight of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.MolWt(i) for i in mol])
  descriptors['MolWt'] = Nilavo[0]

  # Number of radical electrons of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.NumRadicalElectrons(i) for i in mol])
  descriptors['NumRadicalElectrons'] = Nilavo[0]

  # Number of valence electrons of the molecule
  Nilavo = []
  Nilavo.append([Descriptors.NumValenceElectrons(i) for i in mol])
  descriptors['NumValenceElectrons'] = Nilavo[0]

  # Log of partition coefficient
  Nilavo = []
  Nilavo.append([Descriptors.MolLogP(i) for i in mol])
  descriptors['Partition_Coefficient'] = Nilavo[0]


  ### Lipinski Descriptors ###
  # Fraction of C atoms that are SP3 hybridized
  Nilavo = []
  Nilavo.append([Lipinski.FractionCSP3(i) for i in mol])
  descriptors['FractionCSP3'] = Nilavo[0]

  # Number of heavy atoms a molecule
  Nilavo = []
  Nilavo.append([Lipinski.HeavyAtomCount(i) for i in mol])
  descriptors['Heavy_atoms'] = Nilavo[0]

  # Number of NHs or OHs
  Nilavo = []
  Nilavo.append([Lipinski.NHOHCount(i) for i in mol])
  descriptors['NHs/OHs'] = Nilavo[0]

  # Number of Nitrogens and Oxygens
  Nilavo = []
  Nilavo.append([Lipinski.NOCount(i) for i in mol])
  descriptors['N&O'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticCarbocycles(i) for i in mol])
  descriptors['Aliphatic_carbocycles'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticHeterocycles(i) for i in mol])
  descriptors['Aliphatic_heterocycles'] = Nilavo[0]

  # Number of aliphatic (containing at least one non-aromatic bond) rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAliphaticRings(i) for i in mol])
  descriptors['Aliphatic_rings'] = Nilavo[0]

  # Nmber of aromatic carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticCarbocycles(i) for i in mol])
  descriptors['Aromatic_carbocycles'] = Nilavo[0]

  # Number of aromatic heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticHeterocycles(i) for i in mol])
  descriptors['Aromatic_heterocycles'] = Nilavo[0]

  # Number of aromatic rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumAromaticRings(i) for i in mol])
  descriptors['Aromatic_rings'] = Nilavo[0]

  # Number of Hydrogen Bond Acceptors
  Nilavo = []
  Nilavo.append([Lipinski.NumHAcceptors(i) for i in mol])
  descriptors['HAcceptors'] = Nilavo[0]

  # Number of Hydrogen Bond Donors
  Nilavo = []
  Nilavo.append([Lipinski.NumHDonors(i) for i in mol])
  descriptors['HDonors'] = Nilavo[0]

  # Number of Heteroatoms
  Nilavo = []
  Nilavo.append([Lipinski.NumHeteroatoms(i) for i in mol])
  descriptors['Heteroatoms'] = Nilavo[0]

  # Number of Rotatable Bonds
  Nilavo = []
  Nilavo.append([Lipinski.NumRotatableBonds(i) for i in mol])
  descriptors['Rotatable_Bonds'] = Nilavo[0]

  # Number of saturated carbocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedCarbocycles(i) for i in mol])
  descriptors['Saturated_Carbocycles'] = Nilavo[0]

  # Number of saturated heterocycles for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedHeterocycles(i) for i in mol])
  descriptors['Saturated_Heterocycles'] = Nilavo[0]

  # Number of saturated rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.NumSaturatedRings(i) for i in mol])
  descriptors['Saturated_Rings'] = Nilavo[0]

  # Number of rings for a molecule
  Nilavo = []
  Nilavo.append([Lipinski.RingCount(i) for i in mol])
  descriptors['Rings'] = Nilavo[0]

  return descriptors

# Choosing best model

Regression data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

In [None]:
regression_datasets = ['caco2_wang', 
                       'lipophilicity_astrazeneca', 
                       'solubility_aqsoldb', 
                       'ppbr_az', 
                       'vdss_lombardo', 
                       'half_life_obach', 
                       'clearance_microsome_az',
                       'clearance_hepatocyte_az', 
                       'ld50_zhu']

In [None]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')


best_model_list = []
best_model_store = []

for reg_data in regression_datasets:
  
  LR = []
  KNN = []
  DT = []
  Bag = []
  RF = []
  ET = []
  GB = []
  AB = []

  for seed in np.arange(1,6):
    benchmark = group.get(reg_data)
    name = benchmark['name']

    # split the dataset into train_val & test set
    train_val, test = benchmark['train_val'], benchmark['test']

    # split the train_val set into train & validation set
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'scaffold', seed = seed)

      ### I will only use train & valid set to find the best model.
      ### And test set will be used as indipendent test set to generalise my best model's performance.

    # feature extracting
    x_train = molecular_descriptors(train)
    x_valid = molecular_descriptors(valid)

    # target data
    y_train = train.Y
    y_valid = valid.Y


    lin = LinearRegression()
    lin.fit(x_train, y_train)
    y_pred_valid = lin.predict(x_valid)
    LR.append(mae(y_valid, y_pred_valid))

    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)
    y_pred_valid = knn.predict(x_valid)
    KNN.append(mae(y_valid, y_pred_valid))

    dt = DecisionTreeRegressor(random_state=0)
    dt.fit(x_train, y_train)
    y_pred_valid = dt.predict(x_valid)
    DT.append(mae(y_valid, y_pred_valid))

    bag = BaggingRegressor(DecisionTreeRegressor(), random_state=0)
    bag.fit(x_train, y_train)
    y_pred_valid = bag.predict(x_valid)
    Bag.append(mae(y_valid, y_pred_valid))

    rf = RandomForestRegressor(random_state=0)
    rf.fit(x_train, y_train)
    y_pred_valid = rf.predict(x_valid)
    RF.append(mae(y_valid, y_pred_valid))

    et = ExtraTreesRegressor(random_state=0)
    et.fit(x_train, y_train)
    y_pred_valid = et.predict(x_valid)
    ET.append(mae(y_valid, y_pred_valid))

    grad = GradientBoostingRegressor(random_state=0)
    grad.fit(x_train, y_train)
    y_pred_valid = grad.predict(x_valid)
    GB.append(mae(y_valid, y_pred_valid))

    ada = AdaBoostRegressor(DecisionTreeRegressor(),random_state=0)
    ada.fit(x_train, y_train)
    y_pred_valid = ada.predict(x_valid)
    AB.append(mae(y_valid, y_pred_valid))


  # Find out which model gives lowest average MAE
  avg_mae = []
  models = ['Linear', 'K_Neighbors', 'Decision_Tree', 'Bagging', 'Random_Forest', 'Extra_Trees', 'Gradient_Boosting', 'Ada_Boost']
  for ml_mae in [LR, KNN, DT, Bag, RF, ET, GB, AB]:
    avg_mae.append(np.mean(ml_mae))
  avg_mae = pd.Series(avg_mae, index = models)
  


  best_model_name = avg_mae[avg_mae == np.sort(avg_mae)[0]].index[0]
  best_model_store.append(best_model_name)

ml_model = [lin, knn, dt, bag, rf, et, grad, ada]
for data_set in np.arange(0,9):
  best_model = ml_model[models.index(best_model_store[data_set])]
  best_model_list.append(best_model)

best_model_series = pd.Series(best_model_list, index = regression_datasets)

Downloading Benchmark Group...
100%|██████████| 1.47M/1.47M [00:00<00:00, 12.6MiB/s]
Extracting zip file...
Done!
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 1203.61it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2408.39it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2367.55it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2456.87it/s]
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2358.69it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2546.83it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:02<00:00, 1445.35it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2548.49it/s]
generating training, validation splits...
100%|██████████| 3360/3360 [00:04<00:00, 771.68it/s]
generating training, validation s

In [None]:
# 27 molecular descriptors
best_model_series

caco2_wang                   (ExtraTreeRegressor(random_state=209652396), E...
lipophilicity_astrazeneca    (ExtraTreeRegressor(random_state=209652396), E...
solubility_aqsoldb           (ExtraTreeRegressor(random_state=209652396), E...
ppbr_az                      ([DecisionTreeRegressor(criterion='friedman_ms...
vdss_lombardo                (DecisionTreeRegressor(random_state=209652396)...
half_life_obach              (DecisionTreeRegressor(random_state=209652396)...
clearance_microsome_az       (DecisionTreeRegressor(random_state=209652396)...
clearance_hepatocyte_az      (DecisionTreeRegressor(random_state=209652396)...
ld50_zhu                     (ExtraTreeRegressor(random_state=209652396), E...
dtype: object

In [None]:
# 27 molecular descriptors
best_model_series.values

array([ExtraTreesRegressor(random_state=0),
       ExtraTreesRegressor(random_state=0),
       ExtraTreesRegressor(random_state=0),
       GradientBoostingRegressor(random_state=0),
       AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0),
       AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0),
       AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0),
       AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0),
       ExtraTreesRegressor(random_state=0)], dtype=object)

Classification data

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

from sklearn.metrics import accuracy_score as acc

In [None]:
classification_datasets = ['hia_hou', 
                       'pgp_broccatelli', 
                       'bioavailability_ma', 
                       'bbb_martins', 
                       'cyp2d6_veith', 
                       'cyp3a4_veith', 
                       'cyp2c9_veith',
                       'cyp2d6_substrate_carbonmangels', 
                       'cyp3a4_substrate_carbonmangels',
                       'cyp2c9_substrate_carbonmangels',
                       'herg',
                       'ames',
                       'dili']

In [None]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')


best_Cmodel_list = []
best_Cmodel_store = []

for clf_data in classification_datasets:
  
  LR = []
  SGD = []
  PT = []
  PAC = []
  LDA = []
  Svc = []
  KNN = []
  NC = []
  GNB = []
  DT = []
  Bag = []
  RF = []
  ET = []
  GB = []
  AB = []
  LP = []
  LS = []

  for seed in np.arange(1,6):
    benchmark = group.get(clf_data)
    name = benchmark['name']

    # split the dataset into train_val & test set
    train_val, test = benchmark['train_val'], benchmark['test']

    # split the train_val set into train & validation set
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'scaffold', seed = seed)

      ### I will only use train & valid set to find the best model.
      ### And test set will be used as indipendent test set to generalise my best model's performance.

    # feature extracting
    x_train = molecular_descriptors(train)
    x_valid = molecular_descriptors(valid)

    # target data
    y_train = train.Y
    y_valid = valid.Y

    # linear_model

    log = LogisticRegression(random_state=0)
    log.fit(x_train, y_train)
    y_pred_valid = log.predict(x_valid)
    LR.append(acc(y_valid, y_pred_valid))

    sgd = SGDClassifier(random_state=0)
    sgd.fit(x_train, y_train)
    y_pred_valid = sgd.predict(x_valid)
    SGD.append(acc(y_valid, y_pred_valid))

    pt = Perceptron(random_state=0)
    pt.fit(x_train, y_train)
    y_pred_valid = pt.predict(x_valid)
    PT.append(acc(y_valid, y_pred_valid))

    pac = PassiveAggressiveClassifier(random_state=0)
    pac.fit(x_train, y_train)
    y_pred_valid = pac.predict(x_valid)
    PAC.append(acc(y_valid, y_pred_valid))

    # discriminant_analysis

    lda = LinearDiscriminantAnalysis()
    lda.fit(x_train, y_train)
    y_pred_valid = lda.predict(x_valid)
    LDA.append(acc(y_valid, y_pred_valid))

    # svm

    svc = SVC()
    svc.fit(x_train, y_train)
    y_pred_valid = svc.predict(x_valid)
    Svc.append(acc(y_valid, y_pred_valid))

    # neighbors

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    y_pred_valid = knn.predict(x_valid)
    KNN.append(acc(y_valid, y_pred_valid))

    nc = NearestCentroid()
    nc.fit(x_train, y_train)
    y_pred_valid = nc.predict(x_valid)
    NC.append(acc(y_valid, y_pred_valid))

    # naive_bayes

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred_valid = gnb.predict(x_valid)
    GNB.append(acc(y_valid, y_pred_valid))

    # tree

    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(x_train, y_train)
    y_pred_valid = dt.predict(x_valid)
    DT.append(acc(y_valid, y_pred_valid))

    # ensemble

    bag = BaggingClassifier(DecisionTreeClassifier(), random_state=0)
    bag.fit(x_train, y_train)
    y_pred_valid = bag.predict(x_valid)
    Bag.append(acc(y_valid, y_pred_valid))

    rf = RandomForestClassifier(random_state=0)
    rf.fit(x_train, y_train)
    y_pred_valid = rf.predict(x_valid)
    RF.append(acc(y_valid, y_pred_valid))

    et = ExtraTreesClassifier(random_state=0)
    et.fit(x_train, y_train)
    y_pred_valid = et.predict(x_valid)
    ET.append(acc(y_valid, y_pred_valid))

    grad = GradientBoostingClassifier(random_state=0)
    grad.fit(x_train, y_train)
    y_pred_valid = grad.predict(x_valid)
    GB.append(acc(y_valid, y_pred_valid))

    ada = AdaBoostClassifier(DecisionTreeClassifier(),random_state=0)
    ada.fit(x_train, y_train)
    y_pred_valid = ada.predict(x_valid)
    AB.append(acc(y_valid, y_pred_valid))

    # semi_supervised

    lp = LabelPropagation()
    lp.fit(x_train, y_train)
    y_pred_valid = lp.predict(x_valid)
    LP.append(acc(y_valid, y_pred_valid))

    ls = LabelSpreading()
    ls.fit(x_train, y_train)
    y_pred_valid = ls.predict(x_valid)
    LS.append(acc(y_valid, y_pred_valid))


  # Find out which model gives lowest average MAE
  avg_acc = []
  models = ['Logistic', 'SGD', 'Perceptron', 'Passive_Aggressive', 'Linear_Discriminant_Analysis', 'SVM', 
            'K_Neighbors', 'Nearest_Centroid', 'Gaussian_NB', 'Decision_Tree', 'Bagging', 'Random_Forest', 'Extra_Trees', 'Gradient_Boosting', 'Ada_Boost',
            'Label_Propagation', 'Label_Spreading']
  for ml_acc in [LR, SGD, PT, PAC, LDA, Svc, KNN, NC, GNB, DT, Bag, RF, ET, GB, AB, LP, LS]:
    avg_acc.append(np.mean(ml_acc))
  avg_acc = pd.Series(avg_acc, index = models)
  


  best_Cmodel_name = avg_acc[avg_acc == np.sort(avg_acc)[-1]].index[0]
  best_Cmodel_store.append(best_Cmodel_name)

best_Cmodel_list = []
ml_model = [log, sgd, pt, pac, lda, svc, knn, nc, gnb, dt, bag, rf, et, grad, ada, lp, ls]

for data_set in np.arange(0,13):
  best_model = ml_model[models.index(best_Cmodel_store[data_set])]
  best_Cmodel_list.append(best_model)

best_Cmodel_series = pd.Series(best_Cmodel_list, index = classification_datasets)

Found local copy...
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2958.09it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2535.83it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2545.62it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2535.49it/s]
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2515.19it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2303.29it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2362.11it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2329.64it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2338.57it/s]
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2317.69it/s]
generating training, validation splits..

In [None]:
# 27 molecular descriptors
best_Cmodel_series

hia_hou                                                LinearDiscriminantAnalysis()
pgp_broccatelli                   (DecisionTreeClassifier(max_features='auto', r...
bioavailability_ma                                                     Perceptron()
bbb_martins                       (DecisionTreeClassifier(max_features='auto', r...
cyp2d6_veith                      (DecisionTreeClassifier(max_features='auto', r...
cyp3a4_veith                      (DecisionTreeClassifier(max_features='auto', r...
cyp2c9_veith                      (DecisionTreeClassifier(max_features='auto', r...
cyp2d6_substrate_carbonmangels    (ExtraTreeClassifier(random_state=209652396), ...
cyp3a4_substrate_carbonmangels                         LinearDiscriminantAnalysis()
cyp2c9_substrate_carbonmangels    (ExtraTreeClassifier(random_state=209652396), ...
herg                              (DecisionTreeClassifier(random_state=208755735...
ames                              (ExtraTreeClassifier(random_state=20965239

In [None]:
# 27 molecular descriptors
best_Cmodel_series.values

array([LinearDiscriminantAnalysis(),
       RandomForestClassifier(random_state=0), Perceptron(),
       RandomForestClassifier(random_state=0),
       RandomForestClassifier(random_state=0),
       RandomForestClassifier(random_state=0),
       RandomForestClassifier(random_state=0),
       ExtraTreesClassifier(random_state=0), LinearDiscriminantAnalysis(),
       ExtraTreesClassifier(random_state=0),
       BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0),
       ExtraTreesClassifier(random_state=0),
       LogisticRegression(random_state=0)], dtype=object)

# Train on train data 
# &
# Predict for test data

In [None]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    predictions = {}

    for benchmark in group:
        name = benchmark['name']
        train_val, test = benchmark['train_val'], benchmark['test']
        train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

        ## --- train your model --- ##

        x_train = molecular_descriptors(train)
        x_valid = molecular_descriptors(valid)
        x_test = molecular_descriptors(test)

        # target column
        y_train = train.Y
        y_valid = valid.Y
        y_test = test.Y

        # merging traning and validation set
        #x_train_valid = pd.concat([x_train, x_valid])
        #y_train_valid = pd.concat([y_train, y_valid], axis=0)

        if name in regression_datasets:
          model = best_model_series[name]

        elif name in classification_datasets:
          model = best_Cmodel_series[name]

        # fit and predict
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        

        predictions[name] = y_pred_test
    predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)

Found local copy...
--- caco2_wang ---
generating training, validation splits...
100%|██████████| 728/728 [00:00<00:00, 2294.29it/s]
--- hia_hou ---
generating training, validation splits...
100%|██████████| 461/461 [00:00<00:00, 2636.97it/s]
--- pgp_broccatelli ---
generating training, validation splits...
100%|██████████| 973/973 [00:00<00:00, 2291.20it/s]
--- bioavailability_ma ---
generating training, validation splits...
100%|██████████| 512/512 [00:00<00:00, 3001.41it/s]
--- lipophilicity_astrazeneca ---
generating training, validation splits...
100%|██████████| 3360/3360 [00:01<00:00, 2563.81it/s]
--- solubility_aqsoldb ---
generating training, validation splits...
100%|██████████| 7985/7985 [00:01<00:00, 4777.13it/s]
--- bbb_martins ---
generating training, validation splits...
100%|██████████| 1624/1624 [00:00<00:00, 2659.96it/s]
--- ppbr_az ---
generating training, validation splits...
100%|██████████| 2231/2231 [00:00<00:00, 2264.14it/s]
--- vdss_lombardo ---
generating trai

In [None]:
# 27 molecular descriptors (train on train set)
results

{'ames': [0.687, 0.009],
 'bbb_martins': [0.77, 0.017],
 'bioavailability_ma': [0.504, 0.006],
 'caco2_wang': [0.337, 0.011],
 'clearance_hepatocyte_az': [0.297, 0.058],
 'clearance_microsome_az': [0.325, 0.04],
 'cyp2c9_substrate_carbonmangels': [0.291, 0.014],
 'cyp2c9_veith': [0.519, 0.006],
 'cyp2d6_substrate_carbonmangels': [0.41, 0.018],
 'cyp2d6_veith': [0.292, 0.008],
 'cyp3a4_substrate_carbonmangels': [0.625, 0.014],
 'cyp3a4_veith': [0.618, 0.006],
 'dili': [0.823, 0.026],
 'half_life_obach': [0.294, 0.041],
 'herg': [0.674, 0.018],
 'hia_hou': [0.783, 0.014],
 'ld50_zhu': [0.673, 0.002],
 'lipophilicity_astrazeneca': [0.727, 0.007],
 'pgp_broccatelli': [0.807, 0.019],
 'ppbr_az': [8.816, 0.08],
 'solubility_aqsoldb': [0.897, 0.02],
 'vdss_lombardo': [0.465, 0.039]}

In [None]:
# Rank of models using 27 molecular descriptors (train on train set)

# DataSet                       TDC LeaderBoard Rank

# ames                              XXX
# bbb_martins                       XXX
# bioavailability_ma                XXX
# caco2_wang                        1st
# clearance_hepatocyte_az           6th
# clearance_microsome_az            9th
# cyp2c9_substrate_carbonmangels    XXX
# cyp2c9_veith                      XXX
# cyp2d6_substrate_carbonmangels    XXX
# cyp2d6_veith                      XXX
# cyp3a4_substrate_carbonmangels    5th
# cyp3a4_veith                      XXX
# dili                              9th
# half_life_obach                   3rd
# herg                              XXX
# hia_hou                           XXX
# ld50_zhu                          5th
# lipophilicity_astrazeneca         8th
# pgp_broccatelli                   XXX
# ppbr_az                           1st
# solubility_aqsoldb                3rd
# vdss_lombardo                     6th