In [91]:
import importlib.util
import os
import sys
import warnings
from pathlib import Path
from subprocess import call

from dflow import Step, Workflow, upload_artifact
from dflow.python import OP, OPIO, Artifact, OPIOSign, PythonOPTemplate, upload_packages
import deepchem as dc
import xgboost as xgb
from deepchem.data import NumpyDataset, DiskDataset
from deepchem.models import Model
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [92]:
# Upload the three CSV splits (train, test, validation — in that order) so the
# workflow steps can receive them as input artifacts.
tr_dataset, te_dataset, val_dataset = [
    upload_artifact(csv_name)
    for csv_name in ("trainBBBP.csv", "testBBBP.csv", "validBBBP.csv")
]

In [93]:
# S0
class SetupAndLoad(OP):
    """Featurize the train/test/valid CSV files into DeepChem datasets.

    Each CSV is read with a ``CSVLoader`` that takes the ``SMILES`` column as
    the feature field and ``targets`` as the single task, featurized with a
    256-bit circular fingerprint. The directory backing each resulting
    dataset is returned as an output artifact.
    """

    def __init__(self):
        pass

    @classmethod
    def get_input_sign(cls):
        """Declare one CSV file artifact per data split."""
        csv_artifacts = {
            "train_file": Artifact(Path),
            "test_file": Artifact(Path),
            "valid_file": Artifact(Path),
        }
        return OPIOSign(csv_artifacts)

    @classmethod
    def get_output_sign(cls):
        """Declare one dataset-directory artifact per data split."""
        dataset_artifacts = {
            "train_dataset_dir": Artifact(Path),
            "test_dataset_dir": Artifact(Path),
            "valid_dataset_dir": Artifact(Path),
        }
        return OPIOSign(dataset_artifacts)

    @OP.exec_sign_check
    def execute(self, op_in: OPIO) -> OPIO:
        """Load every CSV split and return the directories of the datasets.

        Args:
            op_in: must provide ``train_file``, ``test_file`` and
                ``valid_file``.

        Returns:
            OPIO with ``train_dataset_dir``, ``test_dataset_dir`` and
            ``valid_dataset_dir``, each the ``data_dir`` of the dataset
            created from the matching CSV.
        """
        fingerprint = dc.feat.CircularFingerprint(size=256)
        csv_loader = dc.data.CSVLoader(
            tasks=['targets'],
            feature_field='SMILES',
            featurizer=fingerprint,
        )

        # (input key, output key) per split; processed in train/test/valid order.
        split_keys = (
            ("train_file", "train_dataset_dir"),
            ("test_file", "test_dataset_dir"),
            ("valid_file", "valid_dataset_dir"),
        )

        dataset_dirs = {}
        for csv_key, dir_key in split_keys:
            dataset = csv_loader.create_dataset(str(op_in[csv_key]))
            # data_dir is the directory where the created dataset is stored.
            dataset_dirs[dir_key] = Path(dataset.data_dir)

        return OPIO(dataset_dirs)

In [94]:
# S1
class TransformData(OP):
    """Apply a ``NormalizationTransformer`` fitted on the training split.

    The transformer is built from the training dataset only and then applied
    to the train, test and validation datasets; the directories of the
    transformed datasets are returned as output artifacts.

    NOTE(review): the transformer is created with ``transform_y=True``, i.e.
    it acts on the label column. The datasets are later used to train an
    ``XGBClassifier`` in classification mode, which normally expects class
    labels rather than rescaled values — confirm this is intended.
    """

    def __init__(self):
        pass

    @classmethod
    def get_input_sign(cls):
        """Declare one dataset-directory artifact per data split."""
        raw_dirs = {
            "train_dataset_dir": Artifact(Path),
            "test_dataset_dir": Artifact(Path),
            "valid_dataset_dir": Artifact(Path),
        }
        return OPIOSign(raw_dirs)

    @classmethod
    def get_output_sign(cls):
        """Declare one transformed-dataset-directory artifact per split."""
        transformed_dirs = {
            "transformed_train_dir": Artifact(Path),
            "transformed_test_dir": Artifact(Path),
            "transformed_valid_dir": Artifact(Path),
        }
        return OPIOSign(transformed_dirs)

    @OP.exec_sign_check
    def execute(self, op_in: OPIO) -> OPIO:
        """Transform all three splits and return their directories.

        Args:
            op_in: must provide ``train_dataset_dir``, ``test_dataset_dir``
                and ``valid_dataset_dir``.

        Returns:
            OPIO with ``transformed_train_dir``, ``transformed_test_dir`` and
            ``transformed_valid_dir``, each the ``data_dir`` of the
            corresponding transformed dataset.
        """
        # (input key, output key) per split; processed in train/test/valid order.
        split_keys = (
            ("train_dataset_dir", "transformed_train_dir"),
            ("test_dataset_dir", "transformed_test_dir"),
            ("valid_dataset_dir", "transformed_valid_dir"),
        )

        # Open every split before the transformer is created.
        loaded = [
            (out_key, DiskDataset(str(op_in[in_key])))
            for in_key, out_key in split_keys
        ]

        # The first entry is the training split: the transformer is built from it alone.
        training_split = loaded[0][1]
        normalizer = dc.trans.NormalizationTransformer(
            transform_y=True, dataset=training_split)

        result = {}
        for out_key, split in loaded:
            result[out_key] = Path(normalizer.transform(split).data_dir)

        return OPIO(result)

In [95]:
# S2
class TrainModel(OP):
    """Fit an XGBoost classifier through DeepChem's ``GBDTModel`` and export it.

    The classifier is trained on the transformed training split with the
    transformed validation split passed to ``fit_with_eval``. The underlying
    XGBoost model is then saved as ``xgb_model.json`` and returned as the
    ``model_path`` artifact.
    """

    def __init__(self):
        pass

    @classmethod
    def get_input_sign(cls):
        """Declare the transformed train and validation dataset artifacts."""
        dataset_inputs = {
            "transformed_train": Artifact(Path),
            "transformed_valid": Artifact(Path),
        }
        return OPIOSign(dataset_inputs)

    @classmethod
    def get_output_sign(cls):
        """Declare the saved-model file artifact."""
        model_output = {
            "model_path": Artifact(Path),
        }
        return OPIOSign(model_output)

    @OP.exec_sign_check
    def execute(self, op_in: OPIO) -> OPIO:
        """Train the classifier and return the path of the saved model file.

        Args:
            op_in: must provide ``transformed_train`` and
                ``transformed_valid`` dataset directories.

        Returns:
            OPIO with ``model_path`` pointing at ``xgb_model.json`` inside
            the DeepChem model's ``model_dir``.
        """
        train_split = DiskDataset(op_in["transformed_train"])
        valid_split = DiskDataset(op_in["transformed_valid"])

        # Pick the GPU histogram tree method only when CUDA is available.
        if torch.cuda.is_available():
            tree_method = 'gpu_hist'
        else:
            tree_method = 'hist'

        classifier = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.05,
            tree_method=tree_method,
        )
        gbdt = dc.models.GBDTModel(classifier, mode="classification")

        print("Fitting model...")
        gbdt.fit_with_eval(train_split, valid_split)

        # Export the wrapped XGBoost model itself, not the DeepChem wrapper.
        export_path = os.path.join(gbdt.model_dir, "xgb_model.json")
        gbdt.model.save_model(export_path)

        return OPIO({
            "model_path": Path(export_path),
        })

In [96]:
# S3
class EvaluateModel(OP):
    """Score a saved XGBoost model on the transformed test split.

    The model file is loaded into an ``XGBClassifier``, wrapped in DeepChem's
    ``GBDTModel`` and evaluated with the ROC-AUC metric. The score is
    returned as the ``evaluation_metrics`` output parameter.
    """

    def __init__(self):
        pass

    @classmethod
    def get_input_sign(cls):
        """Declare the model file and test dataset artifacts."""
        return OPIOSign({
            "model": Artifact(Path),
            "transformed_test": Artifact(Path),
        })

    @classmethod
    def get_output_sign(cls):
        """Declare the single float output parameter."""
        return OPIOSign({
            "evaluation_metrics": float,  # AUC score as a float
        })

    @OP.exec_sign_check
    def execute(self, op_in: OPIO) -> OPIO:
        """Load the model, evaluate it and return the ROC-AUC score.

        Args:
            op_in: must provide ``model`` (path of the saved XGBoost model)
                and ``transformed_test`` (test dataset directory).

        Returns:
            OPIO with ``evaluation_metrics`` set to the ROC-AUC score as a
            plain Python ``float``.
        """
        model_path = str(op_in["model"])
        # Pass the directory as a string, like the other ops that open a DiskDataset.
        transformed_test = DiskDataset(str(op_in["transformed_test"]))

        # NOTE(review): these constructor values differ from the ones used for
        # training (n_estimators=100); presumably load_model() supersedes
        # them — confirm.
        xgb_model = xgb.XGBClassifier(
            n_estimators=10,
            learning_rate=0.05
        )
        xgb_model.load_model(model_path)

        # Initialize DeepChem's GBDTModel using the loaded XGBoost model
        loaded_model = dc.models.GBDTModel(xgb_model, mode="classification")

        metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
        evaluation_metrics = loaded_model.evaluate(transformed_test, [metric])

        # The output is declared as `float`; the evaluator may hand back a
        # NumPy scalar, so convert it to a built-in float before returning.
        auc_score = float(evaluation_metrics["roc_auc_score"])
        print("AUC score: ", auc_score)

        return OPIO({
            "evaluation_metrics": auc_score,
        })

In [97]:
def find_package_dir(package_name):
    """Return where an importable package lives in the current environment.

    Args:
        package_name: top-level import name, e.g. ``"rdkit"``.

    Returns:
        The package directory (or the module file for a single-file module)
        as a string, or ``None`` when the package cannot be located.
    """
    try:
        spec = importlib.util.find_spec(package_name)
    except (ImportError, ValueError):
        # find_spec raises instead of returning None in some failure modes.
        return None
    if spec is None:
        return None
    if spec.submodule_search_locations:
        return str(next(iter(spec.submodule_search_locations)))
    if spec.has_location and spec.origin:
        return str(spec.origin)
    return None


# Import names of the local packages handed to PythonOPTemplate via
# `python_packages`. Add further names here (e.g. "deepchem", "xgboost",
# "pandas", "pytz", "dateutil") if they need to be uploaded as well.
PACKAGES_TO_UPLOAD = ("rdkit",)

# Resolve each package from the running environment instead of hard-coding a
# machine-specific site-packages path, so the notebook works on any machine.
package_list = []
for package_name in PACKAGES_TO_UPLOAD:
    package_dir = find_package_dir(package_name)
    if package_dir is None:
        warnings.warn(
            f"Package '{package_name}' was not found in the local environment "
            "and will not be uploaded with the workflow step."
        )
    else:
        package_list.append(package_dir)

In [98]:
# Container image shared by every step of the workflow.
DEEPCHEM_IMAGE = "starliu714/python:deepchem"

# Step 0: Setup and Load Data
# Consumes the three uploaded CSV artifacts; no extra python packages are uploaded.
load_template = PythonOPTemplate(SetupAndLoad, image=DEEPCHEM_IMAGE)
step0 = Step(
    name="setup-and-load",
    template=load_template,
    artifacts={
        "train_file": tr_dataset,
        "test_file": te_dataset,
        "valid_file": val_dataset,
    },
)

# Step 1: Transform Data
# Consumes the dataset directories produced by step0.
loaded_artifacts = step0.outputs.artifacts
transform_template = PythonOPTemplate(TransformData, image=DEEPCHEM_IMAGE)
step1 = Step(
    name="transform-data",
    template=transform_template,
    artifacts={
        "train_dataset_dir": loaded_artifacts["train_dataset_dir"],
        "test_dataset_dir": loaded_artifacts["test_dataset_dir"],
        "valid_dataset_dir": loaded_artifacts["valid_dataset_dir"],
    },
)

# Step 2: Train Model
# Consumes the transformed train/valid datasets produced by step1.
transformed_artifacts = step1.outputs.artifacts
train_template = PythonOPTemplate(TrainModel, image=DEEPCHEM_IMAGE)
step2 = Step(
    name="train-model",
    template=train_template,
    artifacts={
        "transformed_train": transformed_artifacts["transformed_train_dir"],
        "transformed_valid": transformed_artifacts["transformed_valid_dir"],
    },
)

# Step 3: Evaluate Model
# Consumes the model from step2 and the transformed test dataset from step1;
# this is the only step that passes `package_list` as python_packages.
evaluate_template = PythonOPTemplate(
    EvaluateModel,
    image=DEEPCHEM_IMAGE,
    python_packages=package_list,
)
step3 = Step(
    name="evaluate-model",
    template=evaluate_template,
    artifacts={
        "model": step2.outputs.artifacts["model_path"],
        "transformed_test": transformed_artifacts["transformed_test_dir"],
    },
)


In [99]:
# Build the workflow by adding the four steps one at a time, in pipeline
# order, then submit it.
wf = Workflow(name="xgboost")
for pipeline_step in (step0, step1, step2, step3):
    wf.add(pipeline_step)
wf.submit()

Workflow has been submitted (ID: xgboost-6f6zg, UID: 68c9d900-6405-47de-b0df-6cda841294f4)
Workflow link: https://127.0.0.1:2746/workflows/argo/xgboost-6f6zg


{'metadata': {'name': 'xgboost-6f6zg', 'generateName': 'xgboost-', 'namespace': 'argo', 'uid': '68c9d900-6405-47de-b0df-6cda841294f4', 'resourceVersion': '256093', 'generation': 1, 'creationTimestamp': '2024-01-14T15:05:20Z', 'labels': {'workflows.argoproj.io/creator': 'system-serviceaccount-argo-argo-server'}, 'managedFields': [{'manager': 'argo', 'operation': 'Update', 'apiVersion': 'argoproj.io/v1alpha1', 'time': '2024-01-14T15:05:20Z', 'fieldsType': 'FieldsV1', 'fieldsV1': {'f:metadata': {'f:generateName': {}, 'f:labels': {'.': {}, 'f:workflows.argoproj.io/creator': {}}}, 'f:spec': {}, 'f:status': {}}}]}, 'spec': {'templates': [{'name': 'xgboost-steps', 'inputs': {}, 'outputs': {}, 'metadata': {}, 'steps': [[{'name': 'setup-and-load', 'template': 'setupandload-qwm24', 'arguments': {'artifacts': [{'name': 'train_file', 'path': '/tmp/inputs/artifacts/train_file', 's3': {'key': 'upload/bbc7239c-ba59-4a73-a4c7-1b4fc7a73d87/tmpjaw2f_19.tgz'}}, {'name': 'test_file', 'path': '/tmp/inputs/