In [None]:
import os
import sys
import pandas as pd

import smartcadd
from smartcadd.dataset import IterableDataset
from smartcadd.pipeline import BasicCompoundPipeline

# Directories holding the example inputs and the pretrained model files,
# given relative to the directory the notebook is run from.
DATA_DIR = "./example_data"
MODEL_DIR = "./example_models"

# os.listdir returns entries in arbitrary order; sorting makes the displayed
# listing identical on every run and platform.
sorted(os.listdir(DATA_DIR))

## Building a Pipeline out of Dummy filters


### Create a Dataset Iterator


In [None]:
# Build an IterableDataset rooted at DATA_DIR with batch_size=10. This same
# `dataset` object is passed as the data_loader of every pipeline in this
# notebook.
dataset = IterableDataset(
    root_dir=DATA_DIR,
    batch_size=10,
)
# Bare last expression: the dataset's length is shown as the cell output.
len(dataset)

### Build the pipeline


In [None]:
from smartcadd.filters import DummyFilter

# Chain two DummyFilter instances into a pipeline fed by `dataset`.
dummy_filters = [DummyFilter() for _ in range(2)]
pipeline = BasicCompoundPipeline(data_loader=dataset, filters=dummy_filters)

print(pipeline)

### Loop through dataset iterator and run filters for each batch


In [None]:
# Run the dummy pipeline's filters with steps=1 and keep the returned value
# so it can be inspected.
pipeline_results = pipeline.run_filters(steps=1)
# Bare last expression: rendered as the cell output.
pipeline_results

## Let's build a basic pipeline with real filters


### Wrap a pretrained model for use in ModelFilter


In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the displayed
# listing of MODEL_DIR identical on every run and platform.
sorted(os.listdir(MODEL_DIR))

In [5]:
from smartcadd.model_wrappers import AttentiveFPWrapper

MODEL_PARAMS_PATH = os.path.join(MODEL_DIR, "attentive_fp_model_params.pt")

# Settings forwarded to AttentiveFPWrapper as keyword arguments; they are
# grouped here so they can be reviewed in one place.
attentive_fp_kwargs = dict(
    mode="classification",
    n_tasks=1,
    dropout=0.3,
    num_timesteps=2,  # default = 2
    num_layers=2,  # default = 2
    graph_feat_size=200,  # default = 200
    number_atom_features=30,  # default = 30
    number_bond_features=11,  # default = 11
    batch_size=100,
    learning_rate=0.001,
    regularization_loss=None,
    device="cpu",
)

model_wrapper = AttentiveFPWrapper(
    model_params_path=MODEL_PARAMS_PATH,
    **attentive_fp_kwargs,
)
# NOTE(review): load() presumably restores the parameters stored at
# MODEL_PARAMS_PATH — confirm against AttentiveFPWrapper.
model_wrapper.load()

### Build basic pipeline and add ModelFilter


In [None]:
from smartcadd.filters import ModelFilter, ADMETFilter

# ModelFilter built around the wrapped pretrained model, configured with
# target=1, threshold=0.5, save_results=True and output_dir="./data".
activity_filter = ModelFilter(
    model_wrapper=model_wrapper,
    target=1,
    threshold=0.5,
    output_dir="./data",
    save_results=True,
)

# Single-stage pipeline reading from the shared `dataset`.
deep_learning_pipeline = BasicCompoundPipeline(
    data_loader=dataset,
    filters=[activity_filter],
)
print(deep_learning_pipeline)

### Run the pipeline


In [None]:
# Run the model-filter pipeline with steps=1; the returned value is shown as
# the cell output because it is the cell's last expression.
deep_learning_pipeline.run_filters(steps=1)

### Check results in saved CSV


In [None]:
# Read the CSV at ./data/model_filtered.csv and display only the rows whose
# "Prediction" value is greater than 0.0.
deep_learning_results = pd.read_csv("./data/model_filtered.csv")
has_positive_prediction = deep_learning_results["Prediction"] > 0.0
deep_learning_results[has_positive_prediction]

## Building a custom filter using the Filter Interface


In [9]:
from smartcadd.filters import Filter
from random import random


class RandomFilter(Filter):
    """
    An example filter that randomly selects compounds.

    Each compound in a batch is kept independently when a uniform draw from
    [0.0, 1.0) is strictly greater than ``threshold``; for a threshold inside
    [0, 1] a compound is therefore kept with probability ``1 - threshold``.

    Parameters
    ----------
    threshold : float
        Cut-off compared against each random draw. Larger values keep fewer
        compounds.
    seed : int, optional
        When given, the filter draws from its own ``random.Random(seed)``
        generator, so the sequence of keep/drop decisions is reproducible.
        When ``None`` (the default) the module-level ``random()`` function is
        used, exactly as before, so the global ``random.seed`` still applies.
    """

    def __init__(self, threshold, seed=None):
        # Function-scope import: only the seeded path needs the Random class,
        # and this keeps the cell's existing imports untouched.
        from random import Random

        super().__init__()

        self.threshold = threshold
        # Zero-argument callable returning a float in [0.0, 1.0).
        self._draw = random if seed is None else Random(seed).random

    def run(self, batch):
        """
        Return the randomly selected subset of ``batch``.

        Parameters
        ----------
        batch
            Iterable of compounds that also supports ``len()``.

        Returns
        -------
        list
            The compounds whose random draw exceeded ``threshold``, in their
            original order.
        """
        # One independent keep/drop decision per compound in the batch.
        keep_mask = [
            self._draw() > self.threshold for _ in range(len(batch))
        ]
        return [compound for compound, keep in zip(batch, keep_mask) if keep]

### Append new filter to existing Pipeline


In [None]:
# NOTE: every execution of this cell calls append_filter again, so re-running
# it presumably adds one more RandomFilter to the pipeline each time.
random_filter = RandomFilter(threshold=0.3)
deep_learning_pipeline.append_filter(random_filter)
print(deep_learning_pipeline)

In [None]:
# Run the extended pipeline with steps=5 and report the length of the value
# returned by run_filters.
filtered_compounds = deep_learning_pipeline.run_filters(steps=5)
print(f"Length of results after filtering: {len(filtered_compounds)}")

### ADMET Filtering


In [None]:
from smartcadd.filters import ADMETFilter

ALERT_COLLECTION_PATH = os.path.join(DATA_DIR, "alert_collection.csv")

# ADMETFilter configured with the alert collection CSV from DATA_DIR,
# save_results=True and output_dir="./data".
admet_filter = ADMETFilter(
    alert_collection_path=ALERT_COLLECTION_PATH,
    output_dir="./data",
    save_results=True,
)

# Single-stage pipeline reading from the shared `dataset`.
admet_pipeline = BasicCompoundPipeline(
    data_loader=dataset,
    filters=[admet_filter],
)
print(admet_pipeline)

In [None]:
# Run the ADMET pipeline with steps=5 and report the length of the value
# returned by run_filters.
admet_filtered_compounds = admet_pipeline.run_filters(steps=5)
print(f"Length of batch after filtering: {len(admet_filtered_compounds)}")

### Check results in saved CSV


In [None]:
import pandas as pd

# Read the CSV at ./data/ADMET_filtered.csv and display only the rows whose
# "keep" column equals False.
admet_results = pd.read_csv("./data/ADMET_filtered.csv")
is_rejected = admet_results["keep"].eq(False)
admet_results[is_rejected]

## 2D Pharmacophore Filtering


In [15]:
from smartcadd.data import Compound

# SMILES strings of the template drugs, keyed by drug name.
template_compound_smiles = dict(
    rilpivirine="CC1=CC(=CC(=C1NC2=NC(=NC=C2)NC3=CC=C(C=C3)C#N)C)/C=C/C#N",
    etravirine="CC1=CC(=CC(=C1OC2=NC(=NC(=C2Br)N)NC3=CC=C(C=C3)C#N)C)C#N",
)

# One Compound per template entry, with the drug name used as its id.
template_compounds = [
    Compound(smiles=smiles_string, id=drug_name)
    for drug_name, smiles_string in template_compound_smiles.items()
]

### Check minimum values of template compounds for 2D Pharmacophore Filtering


In [None]:
# Stack the per-compound frames (assumes Compound.to_df() returns a pandas
# object — TODO confirm), then display the minimum of every column as a dict.
template_frames = [compound.to_df() for compound in template_compounds]
min_df = pd.concat(template_frames)
min_df.min(axis=0).to_dict()

In [None]:
from smartcadd.filters import PharmacophoreFilter2D

# 2D pharmacophore filter built from the template compounds, configured with
# save_results=True and output_dir="./data".
pharmacophore_filter = PharmacophoreFilter2D(
    template_compounds=template_compounds,
    output_dir="./data",
    save_results=True,
)

# Single-stage pipeline reading from the shared `dataset`.
pharmacophore_pipeline = BasicCompoundPipeline(
    data_loader=dataset,
    filters=[pharmacophore_filter],
)
print(pharmacophore_pipeline)

In [None]:
# Run the 2D pharmacophore pipeline with steps=20 and report the length of
# the value returned by run_filters.
pharmacophore_filtered_compounds = pharmacophore_pipeline.run_filters(steps=20)
print(
    f"Length of batch after filtering: {len(pharmacophore_filtered_compounds)}"
)

## Data Type Conversion Modules and Geometry Optimization


In [None]:
from smartcadd.modules import SMILETo3D, XTBOptimization

# SMILETo3D stage configured with modify=True, save_results=True and
# output_dir="./data".
smiles_to_3d = SMILETo3D(
    modify=True,
    output_dir="./data",
    save_results=True,
)

geometry_optimization_pipeline = BasicCompoundPipeline(
    data_loader=dataset,
    filters=[smiles_to_3d],
)
print(geometry_optimization_pipeline)
# This message is printed when the pipeline is built; run_filters is not
# called in this cell.
print("Saving 3D coordinates to ./data/3D_coordinates.csv")

In [20]:
# Run the conversion pipeline with steps=1 and keep the returned value.
transformed_compounds = geometry_optimization_pipeline.run_filters(steps=1)

### Load newly created PDB file paths


In [None]:
# Read the CSV at ./data/3D_coordinates.csv into a DataFrame and display it
# as the cell output.
pdb_csv = pd.read_csv("./data/3D_coordinates.csv")
pdb_csv

## Geometry Optimization using XTB


In [None]:
from smartcadd.modules import XTBOptimization

# Request at most 16 worker processes, but never more than the machine
# reports; os.cpu_count() can return None, in which case fall back to 1.
N_XTB_PROCESSES = min(16, os.cpu_count() or 1)

# NOTE: every execution of this cell calls append_filter again, so re-running
# it presumably adds one more XTBOptimization stage each time.
geometry_optimization_pipeline.append_filter(
    XTBOptimization(
        from_file=True,
        pdb_dir="./data/3D_coordinates",
        output_dir="./data",
        n_processes=N_XTB_PROCESSES,
        save_results=True,
    )
)
print(geometry_optimization_pipeline)

In [None]:
# Run geometry_optimization_pipeline (built with SMILETo3D, with
# XTBOptimization appended afterwards) with steps=1 and keep the returned
# value.
optimized_compounds = geometry_optimization_pipeline.run_filters(steps=1)