In [None]:
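# %%capture --no-stderr
# ! pip install rdkit deepchem torch_geometric dgllife
# ! pip install -f https://download.pytorch.org/whl/cu118/torch_stable.html torch==2.2.1+cu118
# ! pip install  dgl -f https://data.dgl.ai/wheels/torch-2.2/cu121/repo.html
# NOTE(review): the torch wheel above targets CUDA 11.8 (cu118) while the DGL wheel index targets CUDA 12.1 (cu121) - confirm the two CUDA builds are meant to differ.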

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dgllife
  Downloading dgllife-0.3.2-py3-none-any.whl.metadata (667 bytes)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 

In [1]:
# imports
from deepchem.feat import MATFeaturizer, MolGraphConvFeaturizer, DMPNNFeaturizer
from deepchem.models.torch_models import GCNModel, MATModel, DMPNNModel
from deepchem.data import NumpyDataset
import numpy as numpy
from sklearn.model_selection import train_test_split
from deepchem.metrics import mean_squared_error, Metric
import pandas as pd

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-09-12 14:45:05.238797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-12 14:45:05.259580: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-12 14:45:05.265758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-12 14:45:05.281434: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow w

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################

Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'pydantic'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/home/kalki/imported/career/open_source/venv/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
# Registry: model name -> ready-to-use featurizer instance for that model.
featurizer_mapper = dict(
    GCN=MolGraphConvFeaturizer(),
    MAT=MATFeaturizer(),
    DMPNN=DMPNNFeaturizer(),
)

# Registry: model name -> model class (instantiated later, at training time).
model_mapper = dict(
    GCN=GCNModel,
    MAT=MATModel,
    DMPNN=DMPNNModel,
)

In [3]:
class PolymerDiscriminatorPipeline():
  """Split, featurize, train and evaluate a model on a SMILES/value table.

  Calling an instance on a DataFrame runs the whole chain: a train/test split
  is written to ``train.csv`` / ``test.csv``, the ``smiles`` column is
  featurized with the entry of ``featurizer_mapper`` selected by
  ``model_name``, the matching class from ``model_mapper`` is fitted, and a
  report dict with the training loss and the test metric is returned.
  """

  # Columns every input DataFrame must provide.
  SMILES_COLUMN = "smiles"
  TARGET_COLUMN = "value"

  def __init__(self, task: str, model_name: str, batch_size: int = 3):
    """Validate and store the pipeline configuration.

    Args:
      task: either "regression" or "classification".
      model_name: one of "GCN", "MAT" or "DMPNN".
      batch_size: forwarded to the model constructor as ``batch_size``.

    Raises:
      ValueError: if ``task`` or ``model_name`` is not an accepted value.
    """
    ALLOWED_MODELS = ["GCN", "MAT", "DMPNN"]

    if task not in ["regression", "classification"]:
      raise ValueError("Task must be either 'regression' or 'classification'")

    if model_name not in ALLOWED_MODELS:
      raise ValueError(f"Model must be one of {ALLOWED_MODELS}")

    self.task = task
    self.model_name = model_name
    self.batch_size = batch_size
    # Set to the fitted model once the pipeline has been called.
    self.model = None


  def _prepare_data(self, df, train_ratio: float = 0.8):
    """Split ``df`` and persist both parts as CSV files in the working directory.

    Args:
      df: DataFrame holding at least the ``smiles`` and ``value`` columns.
      train_ratio: fraction of rows assigned to the training split.

    Returns:
      The tuple ``("train.csv", "test.csv")`` of written file paths.

    Raises:
      KeyError: if a required column is missing (checked before any file is
        written, so a bad input leaves no partial output behind).
    """
    missing = [
        column for column in (self.SMILES_COLUMN, self.TARGET_COLUMN)
        if column not in df.columns
    ]
    if missing:
      raise KeyError(f"Input DataFrame is missing required column(s): {missing}")

    # Fixed random_state keeps the split reproducible across runs.
    train_df, test_df = train_test_split(df, test_size=1-train_ratio, random_state=42)
    # index=False: do not write the row index, otherwise reading the file back
    # yields an extra "Unnamed: 0"-style column on every round trip.
    train_df.to_csv("train.csv", index=False)
    test_df.to_csv("test.csv", index=False)
    return "train.csv", "test.csv"

  def _featurize(self, train_path, test_path):
    """Read both split files and featurize their SMILES strings.

    Args:
      train_path: path of the training CSV.
      test_path: path of the test CSV.

    Returns:
      ``(X_train, y_train, X_test, y_test)`` where the ``X`` entries are the
      featurizer outputs and the ``y`` entries are the ``value`` columns.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    featurizer = featurizer_mapper[self.model_name]

    X_train = featurizer.featurize(train_df[self.SMILES_COLUMN].values)
    y_train = train_df[self.TARGET_COLUMN].values

    X_test = featurizer.featurize(test_df[self.SMILES_COLUMN].values)
    y_test = test_df[self.TARGET_COLUMN].values

    return X_train, y_train, X_test, y_test

  def _prepare_input(self, X_train, y_train, X_test, y_test):
    """Wrap the feature/target arrays of each split in a ``NumpyDataset``.

    Returns:
      ``(train_dataset, test_dataset)``.
    """
    train_dataset = NumpyDataset(X_train, y_train)
    test_dataset = NumpyDataset(X_test, y_test)
    return train_dataset, test_dataset

  def _train(self, train_dataset, num_epochs):
    """Instantiate the configured model class and fit it.

    Args:
      train_dataset: dataset passed to ``model.fit``.
      num_epochs: forwarded to ``model.fit`` as ``nb_epoch``.

    Returns:
      ``(model, train_loss)`` where ``train_loss`` is whatever ``model.fit``
      returns.
    """
    model = model_mapper[self.model_name](mode=self.task, batch_size=self.batch_size, n_tasks = 1)
    train_loss = model.fit(train_dataset, nb_epoch=num_epochs)
    return model, train_loss

  def _evaluate(self, model, test_dataset):
    """Evaluate ``model`` on ``test_dataset`` with a mean-squared-error metric.

    Returns:
      The result of ``model.evaluate`` for that single metric.
    """
    # NOTE(review): MSE is used even when task == "classification" - confirm
    # that this is the intended metric for that mode.
    metric = Metric(mean_squared_error)
    test_loss = model.evaluate(test_dataset, [metric])
    return test_loss

  def __call__(self, df, num_epochs, train_ratio: float = 0.8):
    """Run the full pipeline on ``df`` and keep the fitted model on ``self.model``.

    Args:
      df: DataFrame with ``smiles`` and ``value`` columns.
      num_epochs: number of training epochs.
      train_ratio: fraction of rows used for training.

    Returns:
      A dict with the keys ``model``, ``task``, ``train_loss`` and ``test_loss``.

    Raises:
      KeyError: if ``df`` lacks a required column.
    """
    train_path, test_path = self._prepare_data(df, train_ratio)
    X_train, y_train, X_test, y_test = self._featurize(train_path, test_path)
    train_dataset, test_dataset = self._prepare_input(X_train, y_train, X_test, y_test)
    model, train_loss = self._train(train_dataset, num_epochs)
    test_loss = self._evaluate(model, test_dataset)
    report = {
        "model" : self.model_name,
        "task" : self.task,
        "train_loss" : train_loss,
        "test_loss" : test_loss
    }
    self.model = model
    return report

  def _predict(self):
    """Placeholder for inference; not implemented yet (currently returns None)."""
    ...

In [4]:
! wget "https://media.githubusercontent.com/media/ChangwenXu98/TransPolymer/master/data/Xc.csv"

--2024-09-12 14:45:08--  https://media.githubusercontent.com/media/ChangwenXu98/TransPolymer/master/data/Xc.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19747 (19K) [text/plain]
Saving to: ‘Xc.csv.1’


2024-09-12 14:45:08 (5.85 MB/s) - ‘Xc.csv.1’ saved [19747/19747]



In [6]:
# Load the downloaded dataset, report its row count and preview the first rows.
df = pd.read_csv("./Xc.csv")
print("Number of data points", len(df))
df.head()

Number of data points 432


Unnamed: 0,smiles,value
0,*C*,47.8
1,*CC(*)C,44.47
2,*CC(*)CC,34.04
3,*CC(*)CCC,20.01
4,*CC(*)CC(C)C,21.64


In [None]:
# model_name must be one of "GCN", "MAT" or "DMPNN"; the constructor raises
# ValueError for anything else (including an empty string).
pipeline = PolymerDiscriminatorPipeline(task="regression", model_name="GCN")
report = pipeline(df, num_epochs = 10)

In [None]:
# Show the summary dict returned by the pipeline run (model, task, train_loss, test_loss).
print(report)

{'model': 'GCN', 'task': 'regression', 'train_loss': 454.73515625, 'test_loss': {'mean_squared_error': 518.6210412223277}}
