In [1]:
import tensorflow as tf
import deepchem as dc

import numpy as np

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(123)

from deepchem.feat import Featurizer

In [2]:
import os
import sys

# Root of the seq2seq-fingerprint checkout that provides the `unsupervised`
# package. Set the SEQ2SEQ_FINGERPRINT_DIR environment variable to point at a
# different checkout; the default keeps the original hard-coded location.
SEQ2SEQ_FINGERPRINT_DIR = os.environ.get(
    "SEQ2SEQ_FINGERPRINT_DIR", "/home/zhengxu/github/drug/seq2seq-fingerprint/")
# Prepend so the import below resolves against this checkout first.
sys.path.insert(0, SEQ2SEQ_FINGERPRINT_DIR)

from unsupervised.seq2seq_model import FingerprintFetcher

In [3]:
# Define our seq2seq featurizer.

from rdkit import Chem

class Seq2seqFeaturizer(Featurizer):
    """Featurizer that obtains molecule features from a seq2seq fingerprint model."""

    def __init__(self, model_dir, vocab_dir):
        """Create the fingerprint fetcher used for featurization.

        Parameters
        ----------
        model_dir
            Forwarded to ``FingerprintFetcher`` as its first argument.
        vocab_dir
            Forwarded to ``FingerprintFetcher`` as its second argument.
        """
        self.fetcher = FingerprintFetcher(model_dir, vocab_dir)

    def _featurize(self, mol):
        """Compute the seq2seq feature for one molecule.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.

        Returns
        -------
        The first element of the pair returned by ``self.fetcher.decode``.
        """
        # Hacky: this method is handed a Mol, so it is serialized back to a
        # SMILES string for the fetcher instead of reusing the original SMILES.
        smiles_text = Chem.MolToSmiles(mol)
        fingerprint, _ignored = self.fetcher.decode(smiles_text)
        return fingerprint

In [4]:
# Initialize the featurizer once and cache it for the cells below.
import os

# Model directory and vocabulary path handed to the featurizer. Override them
# with the SEQ2SEQ_MODEL_DIR / SEQ2SEQ_VOCAB_PATH environment variables; the
# defaults keep the original hard-coded locations.
SEQ2SEQ_MODEL_DIR = os.environ.get(
    "SEQ2SEQ_MODEL_DIR", "/home/zhengxu/expr/test/gru-4-256")
SEQ2SEQ_VOCAB_PATH = os.environ.get(
    "SEQ2SEQ_VOCAB_PATH", "/home/zhengxu/expr/test/pretrain/pm2.vocab")

# The session is opened before the featurizer is built and stays open until
# the final cell calls sess.close().
sess = tf.InteractiveSession()
featurizer = Seq2seqFeaturizer(SEQ2SEQ_MODEL_DIR, SEQ2SEQ_VOCAB_PATH)

Loading seq2seq model definition from /home/zhengxu/expr/test/gru-4-256/model.json...
Loading model weights from checkpoint_dir: /home/zhengxu/expr/test/gru-4-256/weights/


In [13]:
# Build the model builder that picks a scikit-learn model per dataset and split.

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, NuSVR
# Building scikit random forest model

# Use this class to select different models for different task/dataset/split.
class SKLearnModelSelector(object):
    """Callable that builds the scikit-learn model configured for one dataset/split."""

    # dataset name -> split name -> (estimator class, constructor kwargs)
    DATASET_MAPPING = {
        "delaney": {
            "index": (RandomForestRegressor, {}),
            "random": (RandomForestRegressor, {}),
            "scaffold": (RandomForestRegressor, {}),
        },
        "sampl": {
            "index": (RandomForestRegressor, {}),
            "random": (RandomForestRegressor, {}),
            "scaffold": (RandomForestRegressor, {}),
        },
    }

    def __init__(self, dataset, split):
        """Remember which dataset and split this selector serves."""
        self.dataset = dataset
        self.split = split

    def __call__(self, task):
        """Return a DeepChem ``SklearnModel`` wrapping a newly built estimator.

        The estimator class and its keyword arguments are looked up in
        ``DATASET_MAPPING`` under the stored dataset and split; ``task`` is
        passed through as the second argument of ``SklearnModel``.

        Raises
        ------
        KeyError
            If the stored dataset or split has no entry in ``DATASET_MAPPING``.
        """
        per_split = self.DATASET_MAPPING[self.dataset]
        estimator_cls, estimator_kwargs = per_split[self.split]
        estimator = estimator_cls(**estimator_kwargs)
        return dc.models.sklearn_models.SklearnModel(estimator, task)


In [14]:
from deepchem.molnet.run_benchmark import load_dataset, benchmark_model
from itertools import product

# Single-entry metric list: pearson_r2_score, with np.mean passed to the Metric
# as its second argument.
metric = [dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)]

# Datasets to benchmark. Commented-out entries are kept as a record of what
# was ruled out and why.
datasets = [
    # 'kaggle',  # does not allow featurizer input
    'delaney',  # enabled; comment this entry out to save testing time
    # 'nci',  # could be very large; discard it if we do not use it
    # 'chembl',  # takes too long to run, discarded
    'sampl',
]
splits = ["index", "random", "scaffold"]

banner = "=" * 80

# Nested loops visit (dataset, split) pairs in the same order as
# itertools.product(datasets, splits): the split varies fastest.
for dataset in datasets:
    for split in splits:
        print(banner)
        print("Dataset: %s, split: %s" % (dataset, split))
        tasks, all_datasets, transformers = load_dataset(dataset, featurizer, split)
        # One selector per (dataset, split); it is handed to
        # SingletaskToMultitask as the model builder.
        model_builder = SKLearnModelSelector(dataset, split)
        reg_model = dc.models.multitask.SingletaskToMultitask(tasks, model_builder)
        train_score, valid_score, test_score, elapsed = benchmark_model(
            reg_model, all_datasets, transformers, metric, test=True)
        print(train_score, valid_score, test_score)
        print("t = %.10f" % elapsed)
        print(banner)

Dataset: delaney, split: index
-------------------------------------
Loading dataset: delaney
-------------------------------------
Splitting function: index
About to featurize Delaney dataset.
Loading raw samples now.
shard_size: 8192
About to start loading CSV from /tmp/delaney-processed.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 50.006 s
TIMING: dataset construction took 50.163 s
Loading dataset from disk.
About to transform data
TIMING: dataset construction took 0.174 s
Loading dataset from disk.
TIMING: dataset construction took 0.144 s
Loading dataset from disk.
TIMING: dataset construction took 0.038 s
Loading dataset from disk.
TIMING: dataset construction took 0.040 s
Loading dataset from disk.
About to initialize singletask to multitask model
Initializing directory for task measured log solubility in mols per litre
About to create task-specific datasets
Splitting multitask dataset into singletask datasets
TI

In [None]:
# Close the TensorFlow session that was opened when the featurizer was initialized.
sess.close()