<a href="https://colab.research.google.com/github/PabloCGarcia/bioDeepLearning/blob/main/Clase10_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Clase de introducción a DeepChem: experimentos de deep learning con datasets conocidos.

In [None]:
# Legacy setup, left commented out: it downloaded DeepChem's colab_install.py
# helper, ran its installer, and listed the resulting conda environments.
#!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
#import conda_installer
#conda_installer.install()
#!/root/miniconda/bin/conda info -e
# Current setup: install the condacolab helper quietly (-q) and let it
# provision conda for this runtime.
# NOTE(review): condacolab is documented to restart the kernel after
# install() -- confirm, and continue from the next cell once it is back.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Verify the conda setup performed by the install cell above before
# installing any conda packages.
import condacolab
condacolab.check()

In [None]:
# Install RDKit from the `rdkit` channel, pinned to 2020.09.2 (-y answers the
# confirmation prompt automatically).
!conda install -y -c rdkit rdkit==2020.09.2

In [None]:
# Install OpenMM (conda-forge channel) and PDBFixer (omnia channel).
# NOTE(review): neither package is used by the cells visible below --
# presumably optional DeepChem dependencies; confirm they are still needed.
!conda install -y -c conda-forge openmm
!conda install -y -c omnia pdbfixer

In [None]:
import sys

# Put the conda-managed site-packages directory at the front of the import
# path so packages installed with `conda install` above are importable here.
# The directory name is derived from the running interpreter's version rather
# than hard-coded to python3.7, which stops existing as soon as the Colab
# runtime moves to another Python release.
_conda_site_packages = "/usr/local/lib/python%d.%d/site-packages/" % sys.version_info[:2]
sys.path.insert(0, _conda_site_packages)

In [None]:
# Install DeepChem with pip; --pre allows pip to pick pre-release versions.
!pip install --pre deepchem

In [None]:
# Smoke test: import DeepChem and show the installed version as the cell
# output.
import deepchem as dc
dc.__version__

In [None]:
# Download the three CSV datasets used by the benchmarks below into the
# working directory. `-c` lets wget resume a partially downloaded file
# instead of starting over when the cell is re-run.
!wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/lipo.csv
!wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hppb.csv
!wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clearance.csv

In [None]:
import numpy as np
# Seed NumPy's global RNG so runs are repeatable.
np.random.seed(123)
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm

import tensorflow as tf
# Seed TensorFlow's global RNG as well.
tf.random.set_seed(123)
import deepchem as dc
from deepchem.models.graph_models import GraphConvModel

# Batch size passed to GraphConvModel in experiment().
BATCH_SIZE = 128
# Number of single-epoch fit/evaluate rounds run by benchmark() for GraphConv.
# Set to higher values to get better numbers
MAX_EPOCH = 50
# NOTE(review): LR and LMBDA are not referenced by any function visible in
# this notebook -- presumably a learning rate and a regularisation weight
# kept from an earlier version; confirm before relying on them.
LR = 1e-3
LMBDA = 1e-4

In [None]:
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
  """Featurize a CSV dataset, split it, and normalize the labels.

  Args:
    dataset_file: Path to a CSV file whose ``smile`` column holds the SMILES
      strings and whose ``target`` column holds the regression label.
    featurizer: ``'ECFP'`` for a size-1024 ``CircularFingerprint`` or
      ``'GraphConv'`` for ``ConvMolFeaturizer``. A non-string value is handed
      to the loader unchanged.
    split: Splitter to use: ``'index'``, ``'random'`` or ``'scaffold'``.

  Returns:
    A tuple ``(tasks, (train, valid, test), transformers)``. ``transformers``
    are the label transformers applied to all three splits; pass them to
    ``model.evaluate`` so scores are computed consistently.

  Raises:
    ValueError: If ``featurizer`` is a string other than ``'ECFP'`` or
      ``'GraphConv'``.
    KeyError: If ``split`` is not one of the supported splitter names.
  """
  tasks = ['target']

  # Resolve both options before the (slow) featurization step so that a typo
  # fails immediately instead of after the whole file has been processed.
  featurizer_builders = {
      'ECFP': lambda: dc.feat.CircularFingerprint(size=1024),
      'GraphConv': lambda: dc.feat.ConvMolFeaturizer(),
  }
  if isinstance(featurizer, str):
    if featurizer not in featurizer_builders:
      # Previously an unknown name was forwarded to the loader as a plain
      # string, which cannot featurize anything.
      raise ValueError(
          "Unknown featurizer %r; expected one of %s" %
          (featurizer, sorted(featurizer_builders)))
    featurizer = featurizer_builders[featurizer]()

  splitter_classes = {
      'index': dc.splits.IndexSplitter,
      'random': dc.splits.RandomSplitter,
      'scaffold': dc.splits.ScaffoldSplitter,
  }
  splitter = splitter_classes[split]()

  # NOTE(review): recent DeepChem releases deprecate `smiles_field` in favour
  # of `feature_field` -- confirm against the installed version before
  # switching.
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smile", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  train, valid, test = splitter.train_valid_test_split(dataset)

  # Fit the label normalization on the training split only, then apply the
  # same transformation to every split. Fitting on the full dataset before
  # splitting would leak validation/test label statistics into training.
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=train)
  ]
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  return tasks, (train, valid, test), transformers

In [None]:
def experiment(dataset_file, method='GraphConv', split='scaffold'):
  """Load a dataset and build the untrained model for ``method``.

  Args:
    dataset_file: CSV file forwarded to ``load_dataset``.
    method: ``'GraphConv'`` (graph convolution network), ``'RF'`` (random
      forest with 100 trees) or ``'SVR'`` (linear-kernel support vector
      regression).
    split: Splitter name forwarded to ``load_dataset``.

  Returns:
    A tuple ``(model, train, val, test, transformers)``.

  Raises:
    ValueError: If ``method`` is not one of the supported names. This is
      checked before any data is loaded; previously an unknown method
      returned ``model=None`` and only failed later when the caller tried to
      fit it.
  """
  supported_methods = ('GraphConv', 'RF', 'SVR')
  if method not in supported_methods:
    raise ValueError(
        "Unsupported method %r; expected one of %s" %
        (method, ", ".join(supported_methods)))

  # GraphConv consumes graph features; the scikit-learn models use ECFP
  # fingerprints.
  featurizer = 'GraphConv' if method == 'GraphConv' else 'ECFP'
  tasks, datasets, transformers = load_dataset(
      dataset_file, featurizer=featurizer, split=split)
  train, val, test = datasets

  if method == 'GraphConv':
    # Available options:
    # GraphConvModel(n_tasks: int, graph_conv_layers: List[int] = [64, 64],
    # dense_layer_size: int = 128, dropout: float = 0.0,
    # mode: str = 'classification', number_atom_features: int = 75,
    # n_classes: int = 2, batch_size: int = 100,
    # batch_normalize: bool = True, uncertainty: bool = False)

    # Original
    model = GraphConvModel(len(tasks), batch_size=BATCH_SIZE, mode="regression")

    # Option 1
    #model = GraphConvModel(len(tasks), batch_size=BATCH_SIZE, dropout=0.5, mode="regression")

    # Option 2
    #model = GraphConvModel(len(tasks), batch_size=BATCH_SIZE, dropout=0.5, number_atom_features=64, mode="regression")

    # Option 3
    #model = GraphConvModel(len(tasks), batch_size=BATCH_SIZE, dropout=0.3, dense_layer_size = 128, mode="regression")

    # Option 4
    #model = GraphConvModel(len(tasks), graph_conv_layers=[2048, 1024], batch_size=BATCH_SIZE, dropout=0.3, dense_layer_size = 128, mode="regression")

  elif method == 'RF':

    def model_builder_rf(model_dir):
      # One random forest per task, wrapped for DeepChem.
      sklearn_model = RandomForestRegressor(n_estimators=100)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    model = dc.models.SingletaskToMultitask(tasks, model_builder_rf)

  else:  # method == 'SVR', guaranteed by the validation above

    def model_builder_svr(model_dir):
      # One linear-kernel SVR per task, wrapped for DeepChem.
      sklearn_model = svm.SVR(kernel='linear')
      return dc.models.SklearnModel(sklearn_model, model_dir)

    model = dc.models.SingletaskToMultitask(tasks, model_builder_svr)

  return model, train, val, test, transformers

In [None]:
# Optional Keras helper for drawing the model graph; currently disabled.
#from keras.utils.vis_utils import plot_model
import warnings
# Suppress deprecation, future and generic user warnings to keep the training
# output readable. These filters are process-wide, so they also hide warnings
# raised by every later cell.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


def benchmark(MODEL="GraphConv", SPLIT="scaffold", DATASET="hppb.csv"):
  """Train one model on one dataset and print its train/validation scores.

  The score is the Pearson R^2 averaged over tasks. For ``'GraphConv'`` the
  model is trained one epoch at a time for ``MAX_EPOCH`` epochs and the
  reported pair is the (train, validation) score of the epoch with the best
  validation score; the Keras model summary is printed afterwards. Any other
  model is fitted once and scored once. The test split is never evaluated.

  Args:
    MODEL: Method name forwarded to ``experiment``.
    SPLIT: Splitter name forwarded to ``experiment``.
    DATASET: CSV file forwarded to ``experiment``.

  Returns:
    None. Results are printed as the tuple
    ``(MODEL, SPLIT, DATASET, train_score, validation_score)``.
  """
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
  score_key = 'mean-pearson_r2_score'

  print("About to build model")
  model, train, val, _test, transformers = experiment(
      DATASET, method=MODEL, split=SPLIT)
  #plot_model(model.model,show_shapes=True, show_layer_names=True)

  def score(dataset):
    # Evaluate on `dataset` and pull out the single metric we report.
    return model.evaluate(dataset, [metric], transformers)[score_key]

  if MODEL == 'GraphConv':
    print("running GraphConv search")
    best_val_score = 0.0
    train_score = 0.0
    for epoch in range(MAX_EPOCH):
      # Progress indicator: ten epoch labels per output line.
      print("epoch %d" % epoch, end=" ")
      if (epoch + 1) % 10 == 0:
        print()
      model.fit(train, nb_epoch=1)
      latest_train_score = score(train)
      latest_val_score = score(val)
      # Keep the scores of the best epoch by validation score. The model
      # itself is not checkpointed, so it ends in its last-epoch state.
      if latest_val_score > best_val_score:
        best_val_score = latest_val_score
        train_score = latest_train_score
    print()
    print((MODEL, SPLIT, DATASET, train_score, best_val_score))
    # summary() prints the table itself and returns None; wrapping it in
    # print() would emit a stray "None" line.
    model.model.summary()
  else:
    model.fit(train)
    train_score = score(train)
    val_score = score(val)
    print()
    print((MODEL, SPLIT, DATASET, train_score, val_score))

In [None]:
# Run with the defaults: GraphConv model, scaffold split, hppb.csv.
benchmark()

In [None]:
# GraphConv model on lipo.csv with a scaffold split.
benchmark(  "GraphConv", "scaffold",  "lipo.csv")

In [None]:
# Random forest on lipo.csv, same split, for comparison.
benchmark("RF", "scaffold",  "lipo.csv")

In [None]:
# Linear-kernel SVR on lipo.csv, same split, for comparison.
benchmark(  "SVR", "scaffold",  "lipo.csv")