In [3]:
"""This example implements RF experiments from https://pubs.acs.org/doi/abs/10.1021/acs.jcim.6b00290"""
import sys
import os
import deepchem
import deepchem as dc
import tempfile, shutil
from bace_datasets import load_bace
from deepchem.hyper import HyperparamOpt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from deepchem.models.sklearn_models import SklearnModel
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.utils.evaluate import Evaluator


def bace_rf_model(mode="classification", split="20-80"):
  """Train random forests on BACE dataset."""
  (bace_tasks, (train, valid, test, crystal), transformers) = load_bace(
      mode=mode, transform=False, split=split)

  if mode == "regression":
    r2_metric = Metric(metrics.r2_score)
    rms_metric = Metric(metrics.rms_score)
    mae_metric = Metric(metrics.mae_score)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    metric = r2_metric
    model_class = RandomForestRegressor

    def rf_model_builder(model_params, model_dir):
      sklearn_model = RandomForestRegressor(**model_params)
      return SklearnModel(sklearn_model, model_dir)
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score)
    accuracy_metric = Metric(metrics.accuracy_score)
    mcc_metric = Metric(metrics.matthews_corrcoef)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score)
    model_class = RandomForestClassifier
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    metric = roc_auc_metric

    def rf_model_builder(model_params, model_dir):
      sklearn_model = RandomForestClassifier(**model_params)
      return SklearnModel(sklearn_model, model_dir)
  else:
    raise ValueError("Invalid mode %s" % mode)

  params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto", "sqrt", "log2", None],
  }

  optimizer = HyperparamOpt(rf_model_builder)
  best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
      params_dict, train, valid, transformers, metric=metric)

  if len(train) > 0:
    rf_train_evaluator = Evaluator(best_rf, train, transformers)
    csv_out = "rf_%s_%s_train.csv" % (mode, split)
    stats_out = "rf_%s_%s_train_stats.txt" % (mode, split)
    rf_train_score = rf_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Train set scores: %s" % (str(rf_train_score)))

  if len(valid) > 0:
    rf_valid_evaluator = Evaluator(best_rf, valid, transformers)
    csv_out = "rf_%s_%s_valid.csv" % (mode, split)
    stats_out = "rf_%s_%s_valid_stats.txt" % (mode, split)
    rf_valid_score = rf_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Valid set scores: %s" % (str(rf_valid_score)))

  if len(test) > 0:
    rf_test_evaluator = Evaluator(best_rf, test, transformers)
    csv_out = "rf_%s_%s_test.csv" % (mode, split)
    stats_out = "rf_%s_%s_test_stats.txt" % (mode, split)
    rf_test_score = rf_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Test set: %s" % (str(rf_test_score)))

  if len(crystal) > 0:
    rf_crystal_evaluator = Evaluator(best_rf, crystal, transformers)
    csv_out = "rf_%s_%s_crystal.csv" % (mode, split)
    stats_out = "rf_%s_%s_crystal_stats.txt" % (mode, split)
    rf_crystal_score = rf_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Crystal set: %s" % (str(rf_crystal_score)))


if __name__ == "__main__":
  print("Classifier RF 20-80:")
  print("--------------------------------")
  bace_rf_model(mode="classification", split="20-80")
  print("Classifier RF 80-20:")
  print("--------------------------------")
  bace_rf_model(mode="classification", split="80-20")

  print("Regressor RF 20-80:")
  print("--------------------------------")
  bace_rf_model(mode="regression", split="20-80")
  print("Regressor RF 80-20:")
  print("--------------------------------")
  bace_rf_model(mode="regression", split="80-20")


ModuleNotFoundError: No module named 'bace_datasets'

In [2]:
##!/usr/bin/env python3
## -*- coding: utf-8 -*-
"""
Created on Tue Oct 18 15:53:27 2016

@author: Michael Wu

Benchmark test:

Giving classification performances of:
    Random forest(rf), MultitaskDNN(tf),
    RobustMultitaskDNN(tf_robust),
    Logistic regression(logreg), IRV(irv)
    Graph convolution(graphconv), xgboost(xgb),
    Directed acyclic graph(dag), Weave(weave)
on datasets: bace_c, bbbp, clintox, hiv, muv, pcba, sider, tox21, toxcast

Giving regression performances of:
    MultitaskDNN(tf_regression),
    Fit Transformer MultitaskDNN(tf_regression_ft),
    Random forest(rf_regression),
    Graph convolution regression(graphconvreg),
    xgboost(xgb_regression), Deep tensor neural net(dtnn),
    Directed acyclic graph(dag_regression),
    Weave(weave_regression)
on datasets: bace_r, chembl, clearance, delaney(ESOL), hopv, kaggle, lipo,
             nci, pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl(FreeSolv)

Hyperparameters and all benchmark scripts for MoleculeNet are available at:
http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/Hyperparameter_MoleculeNetv3.tar.gz

"""
#from __future__ import print_function
#from __future__ import division
#from __future__ import unicode_literals

import os
import numpy as np
import deepchem as dc
import argparse

parser = argparse.ArgumentParser(
    description='Deepchem benchmark: ' +
    'giving performances of different learning models on datasets')
parser.add_argument(
    '-s',
    action='append',
    dest='splitter_args',
    default=[],
    help='Choice of splitting function: index, random, scaffold, stratified')
parser.add_argument(
    '-m',
    action='append',
    dest='model_args',
    default=[],
    help='Choice of model: tf, tf_robust, logreg, rf, irv, graphconv, xgb,' + \
         ' dag, weave, tf_regression, tf_regression_ft, rf_regression, ' + \
         'graphconvreg, xgb_regression, dtnn, dag_regression, weave_regression')
parser.add_argument(
    '-d',
    action='append',
    dest='dataset_args',
    default=[],
    help='Choice of dataset: bace_c, bace_r, bbbp, chembl, clearance, ' +
    'clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba, pcba_146, pcba_2475 '
    + 'pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast')
parser.add_argument(
    '-t',
    action='store_true',
    dest='test',
    default=False,
    help='Evalute performance on test set')
parser.add_argument(
    '--seed',
    action='append',
    dest='seed_args',
    default=[],
    help='Choice of random seed')
args = parser.parse_args()
#Datasets and models used in the benchmark test
splitters = args.splitter_args
models = args.model_args
datasets = args.dataset_args
test = args.test
if len(args.seed_args) > 0:
  seed = int(args.seed_args[0])
else:
  seed = 123

if len(splitters) == 0:
  splitters = ['index', 'random', 'scaffold']
if len(models) == 0:
  models = [
      'tf', 'tf_robust', 'logreg', 'graphconv', 'irv', 'tf_regression',
      'tf_regression_ft', 'graphconvreg', 'weave', 'weave_regression', 'dtnn'
  ]
  #irv, rf, rf_regression should be assigned manually
if len(datasets) == 0:
  datasets = [
      'bace_c', 'bace_r', 'bbbp', 'clearance', 'clintox', 'delaney', 'hiv',
      'hopv', 'lipo', 'muv', 'pdbbind', 'ppb', 'qm7b', 'qm8', 'qm9', 'sampl',
      'sider', 'tox21', 'toxcast'
  ]

for dataset in datasets:
  for split in splitters:
    for model in models:
      np.random.seed(seed)
      dc.molnet.run_benchmark(
          [dataset], str(model), split=split, test=test, seed=seed)


usage: ipykernel_launcher.py [-h] [-s SPLITTER_ARGS] [-m MODEL_ARGS]
                             [-d DATASET_ARGS] [-t] [--seed SEED_ARGS]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/ella/.local/share/jupyter/runtime/kernel-6589eac0-f5d7-4481-bb12-35b567c43963.json


SystemExit: 2