In [None]:
# %%capture --no-stderr
# ! pip install rdkit deepchem torch_geometric dgllife
# ! pip install -f https://download.pytorch.org/whl/cu118/torch_stable.html torch==2.2.1+cu118
# ! pip install  dgl -f https://data.dgl.ai/wheels/torch-2.2/cu121/repo.html

In [4]:
import deepchem as dc

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-09-13 13:59:02.718331: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-13 13:59:02.803911: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-13 13:59:02.825236: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-13 13:59:02.965247: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow w

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/home/kalki/imported/career/open_source/venv/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [66]:
# imports
from deepchem.feat.molecule_featurizers import MATFeaturizer, MolGraphConvFeaturizer, DMPNNFeaturizer
from deepchem.models.torch_models import GCNModel, MATModel, DMPNNModel
from deepchem.data import NumpyDataset, CSVLoader
import numpy as numpy
from sklearn.model_selection import train_test_split
from deepchem.metrics import mean_squared_error, Metric, accuracy_score
import pandas as pd
from deepchem.models.losses import SparseSoftmaxCrossEntropy
import torch

In [6]:
# Short model name -> featurizer class that builds that model's inputs.
featurizer_mapper = dict(
    GCN=MolGraphConvFeaturizer,
    MAT=MATFeaturizer,
    DMPNN=DMPNNFeaturizer,
)

# Short model name -> DeepChem model class; keys mirror featurizer_mapper.
model_mapper = dict(
    GCN=GCNModel,
    MAT=MATModel,
    DMPNN=DMPNNModel,
)

In [83]:
def crossEntropyLoss(output, labels):
    """Unreduced cross-entropy between class scores and class labels.

    Parameters
    ----------
    output : numpy.ndarray or torch.Tensor
        Class scores shaped ``(batch_size, classes)`` or
        ``(batch_size, tasks, classes)``.
    labels : numpy.ndarray or torch.Tensor
        Class indices. When ``labels`` has as many dimensions as ``output``
        its trailing axis is squeezed, so ``(batch_size, 1)`` and
        ``(batch_size, tasks, 1)`` are accepted. Labels whose shape still
        equals the score shape afterwards are passed through untouched
        (``torch.nn.CrossEntropyLoss`` then treats them as class
        probabilities).

    Returns
    -------
    torch.Tensor
        The per-element loss (``reduction='none'``).
    """
    ce_loss = torch.nn.CrossEntropyLoss(reduction='none')

    # Convert first: NumPy arrays have no ``permute``. ``as_tensor`` also
    # avoids the copy (and warning) ``torch.tensor`` does for tensors.
    scores = torch.as_tensor(output)
    if not scores.is_floating_point():
        scores = scores.float()
    targets = torch.as_tensor(labels)

    # Convert (batch_size, tasks, classes) to (batch_size, classes, tasks)
    # CrossEntropyLoss only supports (batch_size, classes, tasks)
    if scores.dim() == 3:
        scores = scores.permute(0, 2, 1)

    if targets.dim() == scores.dim():
        targets = targets.squeeze(-1)

    # Class-index targets must be int64; labels loaded from a CSV are
    # frequently floats. Same-shape targets are probabilities: leave them.
    if targets.shape != scores.shape:
        targets = targets.long()

    return ce_loss(scores, targets)

In [87]:
class PolymerDiscriminatorPipeline():
  """Split, featurize, train and evaluate one DeepChem model on a SMILES table.

  Calling an instance on a DataFrame that has ``smiles`` and ``value`` columns
  writes a train/test split to CSV, featurizes the SMILES strings for the
  selected model, fits the model and returns a report dictionary.

  Parameters
  ----------
  task : str
    Either ``"regression"`` or ``"classification"``.
  model_name : str
    One of ``"GCN"``, ``"MAT"`` or ``"DMPNN"``; selects the entries used from
    the module-level ``featurizer_mapper`` and ``model_mapper``.
  batch_size : int, default 3
    Batch size forwarded to the model constructor.

  Raises
  ------
  ValueError
    If ``task`` or ``model_name`` is not one of the supported values.
  """

  ALLOWED_TASKS = ["regression", "classification"]
  ALLOWED_MODELS = ["GCN", "MAT", "DMPNN"]

  def __init__(self, task: str, model_name: str, batch_size: int = 3):
    if task not in self.ALLOWED_TASKS:
      raise ValueError("Task must be either 'regression' or 'classification'")

    if model_name not in self.ALLOWED_MODELS:
      raise ValueError(f"Model must be one of {self.ALLOWED_MODELS}")

    self.task = task
    self.model_name = model_name
    self.batch_size = batch_size
    # Populated with the fitted model once the pipeline has been called.
    self.model = None

  def _prepare_data(self, df, train_ratio: float = 0.8):
    """Split ``df`` and write ``train.csv`` / ``test.csv`` to the working directory.

    The split uses a fixed ``random_state`` of 42. Returns the two file paths.
    """
    train_df, test_df = train_test_split(df, test_size=1-train_ratio, random_state=42)
    # index=False keeps the row index out of the files, so reading them back
    # does not add an extra unnamed column.
    train_df.to_csv("train.csv", index=False)
    test_df.to_csv("test.csv", index=False)
    return "train.csv", "test.csv"

  def _featurize(self, train_path, test_path):
    """Read both CSV files and featurize their ``smiles`` column.

    Returns ``(X_train, y_train, X_test, y_test)`` where the targets come from
    the ``value`` column.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    featurizer = featurizer_mapper[self.model_name]()

    X_train = featurizer.featurize(train_df["smiles"].values)
    y_train = train_df["value"].values

    X_test = featurizer.featurize(test_df["smiles"].values)
    y_test = test_df["value"].values

    return X_train, y_train, X_test, y_test

  def _prepare_input(self, X_train, y_train, X_test, y_test):
    """Wrap the featurized arrays in ``NumpyDataset`` objects (train, test)."""
    train_dataset = NumpyDataset(X_train, y_train)
    test_dataset = NumpyDataset(X_test, y_test)
    return train_dataset, test_dataset

  def _train(self, train_dataset, num_epochs):
    """Build the selected model and fit it for ``num_epochs`` epochs.

    For classification the number of classes is taken as the largest label
    in the training set plus one, so labels are assumed to be encoded as
    0..K-1. Returns ``(model, train_loss)`` where ``train_loss`` is whatever
    ``model.fit`` returns.
    """
    model_kwargs = {"mode": self.task, "batch_size": self.batch_size, "n_tasks": 1}
    if self.task != "regression":
      # ``y.max()`` is a NumPy scalar (a float when labels are floats); the
      # class count handed to the model must be a plain Python int.
      model_kwargs["n_classes"] = int(train_dataset.y.max()) + 1
    model = model_mapper[self.model_name](**model_kwargs)
    train_loss = model.fit(train_dataset, nb_epoch=num_epochs)
    return model, train_loss

  def _evaluate(self, model, test_dataset):
    """Score ``model`` on ``test_dataset``.

    Regression returns the dictionary produced by ``model.evaluate`` with a
    mean-squared-error metric; classification returns the mean cross-entropy
    as a float.
    """
    if self.task == "regression":
      metric = Metric(mean_squared_error)
      test_loss = model.evaluate(test_dataset, [metric])
    else:
      # NOTE(review): if ``model.predict`` returns softmax probabilities
      # rather than logits, this applies softmax twice -- confirm against
      # the DeepChem model documentation.
      pred = model.predict(test_dataset)
      test_loss = crossEntropyLoss(pred, test_dataset.y)
      test_loss = test_loss.mean().item()
    return test_loss

  def __call__(self, df, num_epochs, train_ratio: float = 0.8):
    """Run split -> featurize -> train -> evaluate and return a report dict.

    The report has the keys ``model``, ``task``, ``train_loss`` and
    ``test_loss``; the fitted model is stored on ``self.model``.
    """
    train_path, test_path = self._prepare_data(df, train_ratio)
    if self.model_name == "MAT":
      # MAT datasets are built through CSVLoader instead of NumpyDataset.
      data_loader = CSVLoader(tasks=['value'], feature_field='smiles', featurizer=featurizer_mapper[self.model_name]())
      train_dataset = data_loader.create_dataset(train_path)
      test_dataset = data_loader.create_dataset(test_path)
    else:
      X_train, y_train, X_test, y_test = self._featurize(train_path, test_path)
      train_dataset, test_dataset = self._prepare_input(X_train, y_train, X_test, y_test)
    model, train_loss = self._train(train_dataset, num_epochs)
    test_loss = self._evaluate(model, test_dataset)
    report = {
        "model" : self.model_name,
        "task" : self.task,
        "train_loss" : train_loss,
        "test_loss" : test_loss
    }
    self.model = model
    return report

  def _predict(self):
    """Placeholder: prediction is not implemented yet (returns ``None``)."""
    ...

### Regression Application

In [None]:
! wget "https://media.githubusercontent.com/media/ChangwenXu98/TransPolymer/master/data/Xc.csv"

In [32]:
# Load the regression table, report its row count and preview the first rows.
reg_df = pd.read_csv("Xc.csv")
print("Number of data points", len(reg_df))
reg_df.head()

Number of data points 432


Unnamed: 0,smiles,value
0,*C*,47.8
1,*CC(*)C,44.47
2,*CC(*)CC,34.04
3,*CC(*)CCC,20.01
4,*CC(*)CC(C)C,21.64


In [26]:
# GCN regressor on the Xc.csv table, trained for 10 epochs.
gcn_reg_pipeline = PolymerDiscriminatorPipeline("regression", "GCN")
gcn_reg_report = gcn_reg_pipeline(df=reg_df, num_epochs=10)
print("GCN regression report >>", gcn_reg_report)



GCN regression report >> {'model': 'GCN', 'task': 'regression', 'train_loss': 332.1333984375, 'test_loss': {'mean_squared_error': 436.2612318083791}}


In [34]:
# MAT regressor on the Xc.csv table; note this run uses a single epoch.
mat_reg_pipeline = PolymerDiscriminatorPipeline("regression", "MAT")
mat_reg_report = mat_reg_pipeline(df=reg_df, num_epochs=1)

In [35]:
# Printing lives in its own cell, so it can be re-run without repeating the training cell.
print("MAT regression report >>", mat_reg_report)

MAT regression report >> {'model': 'MAT', 'task': 'regression', 'train_loss': 429.86886160714283, 'test_loss': {'mean_squared_error': 608.1135541170149}}


In [36]:
# DMPNN regressor on the Xc.csv table, trained for 10 epochs.
dmpnn_reg_pipeline = PolymerDiscriminatorPipeline("regression", "DMPNN")
dmpnn_reg_report = dmpnn_reg_pipeline(df=reg_df, num_epochs=10)
print("DMPNN regression report >>", dmpnn_reg_report)

DMPNN regression report >> {'model': 'DMPNN', 'task': 'regression', 'train_loss': 489.2893359375, 'test_loss': {'mean_squared_error': 539.4306195095185}}


### Classification Application

In [43]:
# Load the classification table; the pipeline reads its `smiles` and `value` columns.
# NOTE(review): no visible cell downloads OPV_cat_split.csv, so it is presumably
# expected to already be in the working directory -- confirm.
class_df = pd.read_csv("OPV_cat_split.csv")

In [88]:
# GCN classifier on the OPV table, trained for 2 epochs.
gcn_class_pipeline = PolymerDiscriminatorPipeline("classification", "GCN")
gcn_class_report = gcn_class_pipeline(df=class_df, num_epochs=2)
print("GCN classification report >>", gcn_class_report)

In [None]:
# DMPNN classifier on the OPV table, trained for 2 epochs.
dmpnn_class_pipeline = PolymerDiscriminatorPipeline(task="classification", model_name="DMPNN")
dmpnn_class_report = dmpnn_class_pipeline(class_df, num_epochs = 2)
# Print the report dictionary (not the pipeline object), matching the other cells.
print("DMPNN classification report >>", dmpnn_class_report)