In [1]:
# Standard-library and third-party imports used throughout the notebook.
import os
import sys
import pandas as pd

# Put the local SmartCADD checkout on the import path for the `smartcadd`
# imports below.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
sys.path.append("/users/ejlaird/Projects/SmartCADD/")

import smartcadd
from smartcadd.dataset import IterableDataset
from smartcadd.pipeline import BasicCompoundPipeline

# Set the working directory to the examples folder so that the relative
# "./data" paths used later in this notebook resolve against it.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
os.chdir("/users/ejlaird/Projects/SmartCADD/examples/")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/lustre/work/client/users/ejlaird/.conda/envs/smartcadd-test/lib/python3.11/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/lustre/work/client/users/ejlaird/.conda/envs/smartcadd-test/lib/python3.11/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax mo

## Building a Pipeline out of Dummy filters

### Create a Dataset Iterator

In [2]:
# Options for the dataset iterator: where the data lives and how many
# items each batch holds.
dataset_options = {
    "root_dir": "/work/users/ejlaird/data/chemistry",
    "batch_size": 10,
}
dataset = IterableDataset(**dataset_options)

# Trailing expression: the notebook displays the value of len(dataset).
len(dataset)

50000

### Build the pipeline

In [3]:
from smartcadd.filters import DummyFilter

# Two DummyFilter stages chained one after the other.
dummy_filters = [DummyFilter() for _ in range(2)]

pipeline = BasicCompoundPipeline(data_loader=dataset, filters=dummy_filters)

# Printing the pipeline shows its stages in order.
print(pipeline)


BasicCompoundPipeline: [ (1) DummyFilter -> (2) DummyFilter -> Done ]


### Loop through dataset iterator and run filters for each batch

In [4]:
# Run the filters batch by batch and show the SMILES string of the first
# compound left in each filtered batch.
for batch_index, batch in enumerate(pipeline.get_data()):
    survivors = pipeline.run_filters(batch)
    first_compound = survivors[0]
    print(first_compound.smiles)
    # The exit test runs after the batch has been processed, so batches
    # 0 through 3 (four in total) are shown.
    if batch_index >= 3:
        break

CC(C)(O)C1CC[NH+](Cc2ccc3nc(-c4c5cc[nH]c5ccc4F)nc(N4CCOCC4)c3n2)CC1
C[n+]1ccc(NC(=O)c2ccc(NC(=O)c3ccc(C(=O)Nc4ccc(C(=O)Nc5cc[n+](C)cc5)cc4)cc3)cc2)cc1
O=C(NCC[NH+]1CCCCC1)Nc1ccc(Cc2ccc(NC(=O)NCC[NH+]3CCCCC3)cc2)cc1
C[NH+]1CCC(NC(=O)c2cccc(-c3c[nH]nc3[C@@H]3CCC[N@H+](CCOCc4ccccc4)C3)c2)CC1


## Let's build a basic pipeline with real filters

### Wrap a pretrained model for use in ModelFilter

In [5]:
from smartcadd.model_wrappers import AttentiveFPWrapper

# Location of the saved model parameters.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
MODEL_PARAMS_PATH = "/work/users/ejlaird/models/chemistry/attentive_fp_model_params.pt"

# Keyword arguments forwarded unchanged to AttentiveFPWrapper.
attentive_fp_options = dict(
    mode="classification",
    n_tasks=1,
    dropout=0.3,
    num_timesteps=2,  # default = 2
    num_layers=2,  # default = 2
    graph_feat_size=200,  # default = 200
    number_atom_features=30,  # default = 30
    number_bond_features=11,  # default = 11
    batch_size=100,
    learning_rate=0.001,
    regularization_loss=None,
    device="cpu",
)

model_wrapper = AttentiveFPWrapper(
    model_params_path=MODEL_PARAMS_PATH, **attentive_fp_options
)
model_wrapper.load()

### Build basic pipeline and add ModelFilter

In [6]:
from smartcadd.filters import ModelFilter, ADMETFilter

# Filter stage driven by the wrapped model; with save_results=True its
# results are written under output_dir.
model_filter = ModelFilter(
    model_wrapper=model_wrapper,
    target=1,
    threshold=0.5,
    output_dir="./data",
    save_results=True,
)

# Single-stage pipeline reading from the same dataset iterator as before.
deep_learning_pipeline = BasicCompoundPipeline(
    data_loader=dataset, filters=[model_filter]
)
print(deep_learning_pipeline)


BasicCompoundPipeline: [ (1) ModelFilter -> Done ]


### Run the pipeline


In [7]:
# Process only the first batch: report its size, run the filters, report
# the size of what is left, then stop.
for first_batch in deep_learning_pipeline.get_data():
    n_before = len(first_batch)
    print(f"Length of batch before filtering: {n_before}")
    kept = deep_learning_pipeline.run_filters(first_batch)
    n_after = len(kept)
    print(f"Length of batch after filtering: {n_after}")
    break

Length of batch before filtering: 10
Length of batch after filtering: 3


### Check the results in the saved CSV

In [8]:
# Load the CSV saved under ./data by the model filter run.
deep_learning_results = pd.read_csv("./data/model_filtered.csv")

# Show only the rows whose "Prediction" value is strictly positive.
has_positive_prediction = deep_learning_results["Prediction"].gt(0.0)
deep_learning_results.loc[has_positive_prediction]

Unnamed: 0,SMILES,ID,Prediction
5,O=C(Nc1cccc(C2=[NH+]CCN2)c1)c1ccc2cc(C(=O)Nc3c...,ZINC000001689092,0.998
6,O=C(Nc1ccc(C2=[NH+]CCN2)c(Cl)c1)c1ccc(C(=O)Nc2...,ZINC000001689095,0.837
7,O=C(Nc1ccc(C2=[NH+]CCN2)cc1)c1cc(Cl)c(C(=O)Nc2...,ZINC000001689504,0.833


## Building a custom filter using the Filter Interface

In [9]:
from smartcadd.filters import Filter
from random import random


class RandomFilter(Filter):
    """
    An example filter that randomly selects compounds.

    Every compound in a batch gets its own uniform draw from [0, 1) and is
    kept when that draw is greater than ``threshold``. On average a
    fraction of about ``1 - threshold`` of the compounds is kept.

    Parameters:
        threshold: value each random draw is compared against; larger
            values keep fewer compounds.
        seed: optional seed. When given, draws come from a private
            ``random.Random(seed)`` so a freshly built filter selects the
            same compounds on every run. When omitted (the default) the
            module-level ``random()`` is used, exactly as before.
    """

    def __init__(self, threshold, seed=None):
        super().__init__()

        # Local import: only ``random`` (the function) is imported at cell level.
        from random import Random

        self.threshold = threshold
        # Private generator, only created when reproducibility is requested.
        self._rng = None if seed is None else Random(seed)

    def run(self, batch):
        """
        Return the compounds of ``batch`` that survive the random draw.

        Parameters:
            batch: sized, iterable collection of compounds.

        Returns:
            A list with the kept compounds in their original order; it may
            be empty.
        """

        # Pick the draw source: the seeded private generator if one exists,
        # otherwise the shared module-level function.
        draw = random if self._rng is None else self._rng.random

        # One draw per compound, in batch order.
        mask = [draw() > self.threshold for _ in range(len(batch))]
        return [compound for compound, keep in zip(batch, mask) if keep]

### Append new filter to existing Pipeline

In [10]:
# Add a RandomFilter stage to the existing pipeline.
# NOTE(review): presumably each run of this cell appends one more stage,
# so re-running it would stack several RandomFilters -- confirm.
random_filter = RandomFilter(threshold=0.3)
deep_learning_pipeline.append_filter(random_filter)
print(deep_learning_pipeline)


BasicCompoundPipeline: [ (1) ModelFilter -> (2) RandomFilter -> Done ]


In [11]:
# Index limit for the loop below. The exit test runs after a batch has
# been processed and uses a strict ">", so batches 0 .. stop + 1 are
# handled (seven batches for stop = 5).
stop = 5
for batch_index, batch in enumerate(deep_learning_pipeline.get_data()):
    n_before = len(batch)
    print(f"Length of batch before filtering: {n_before}")
    survivors = deep_learning_pipeline.run_filters(batch)
    n_after = len(survivors)
    print(f"Length of batch after filtering: {n_after}")

    if batch_index > stop:
        break

Length of batch before filtering: 10
Length of batch after filtering: 2
Length of batch before filtering: 10
Length of batch after filtering: 1
Length of batch before filtering: 10
Length of batch after filtering: 1
Length of batch before filtering: 10
Length of batch after filtering: 0
Length of batch before filtering: 10
Length of batch after filtering: 0
Length of batch before filtering: 10
Length of batch after filtering: 0
Length of batch before filtering: 10
Length of batch after filtering: 0


### ADMET Filtering

In [12]:
from smartcadd.filters import ADMETFilter

# CSV file with the alert collection handed to the ADMET filter.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
ALERT_COLLECTION_PATH = "/work/users/ejlaird/data/chemistry/alert_collection.csv"

# ADMET stage; with save_results=True its results are written under
# output_dir.
admet_filter = ADMETFilter(
    alert_collection_path=ALERT_COLLECTION_PATH,
    output_dir="./data",
    save_results=True,
)

# Single-stage pipeline reading from the same dataset iterator as before.
admet_pipeline = BasicCompoundPipeline(data_loader=dataset, filters=[admet_filter])
print(admet_pipeline)


BasicCompoundPipeline: [ (1) ADMETFilter -> Done ]


In [13]:
# Index limit for the loop below. The exit test runs after a batch has
# been processed and uses a strict ">", so batches 0 .. stop + 1 are
# handled (seven batches for stop = 5).
stop = 5
for batch_index, batch in enumerate(admet_pipeline.get_data()):
    n_before = len(batch)
    print(f"Length of batch before filtering: {n_before}")
    survivors = admet_pipeline.run_filters(batch)
    n_after = len(survivors)
    print(f"Length of batch after filtering: {n_after}")
    if batch_index > stop:
        break

Length of batch before filtering: 10
Length of batch after filtering: 10
Length of batch before filtering: 10
Length of batch after filtering: 10
Length of batch before filtering: 10
Length of batch after filtering: 10
Length of batch before filtering: 10
Length of batch after filtering: 8
Length of batch before filtering: 10
Length of batch after filtering: 0
Length of batch before filtering: 10
Length of batch after filtering: 2
Length of batch before filtering: 10
Length of batch after filtering: 0


### Check the results in the saved CSV

In [14]:
import pandas as pd

# Load the CSV saved under ./data by the ADMET filter run.
admet_results = pd.read_csv("./data/ADMET_filtered.csv")

# Show only the rows whose "keep" flag equals False.
is_rejected = admet_results["keep"].eq(False)
admet_results.loc[is_rejected]

Unnamed: 0,smiles,id,status,MolWt,MolLogP,NumHDonors,NumHAcceptors,TPSA,CalcNumRotatableBonds,NumAromaticRings,total_N_aro_members,total_N_ali_members,total_aro_N_count,total_aro_O_count,total_ali_O_count,total_ali_N_count,keep
38,COc1ccc(N2C(=O)/C(=C\c3cc(C)n(-c4ccc(Cl)c(C(=O...,ZINC000000681904,OK,538.989,2.95954,1,8,112.93,6,3,17,6,1,0,0,2,False
39,Cc1ccc(N2C(=O)/C(=C/c3cc(Br)c([O-])c(Br)c3)C(=...,ZINC000000714676,OK,509.199,3.73334,1,4,72.47,2,2,12,6,0,0,0,2,False
40,CCOc1ccc(N2C(=O)/C(=C/c3ccc(OCc4ccc(C(=O)[O-])...,ZINC000001201230,OK,531.566,2.8677,1,8,117.23,9,3,18,6,0,0,0,2,False
41,COc1cc(Br)c(Br)c([C@H]2Nc3sc4c(c3C(=O)N2)CC[N@...,ZINC000001250478,OK,504.224,2.4122,4,5,75.03,2,2,15,8,0,0,0,3,False
42,CCc1ccc(N2NC(=O)/C(=C/c3ccc(-c4ccc(S(=O)(=O)[N...,ZINC000002126864,OK,519.584,4.821,1,7,123.68,7,4,22,5,1,1,0,2,False
43,COc1cc(/C=C2\C(=O)NC(=S)N(c3ccc(C)cc3)C2=O)ccc...,ZINC000002228145,OK,501.54,2.77742,1,7,108.0,7,3,18,6,0,0,0,2,False
44,COc1ccc(C=C2C(=O)[N-]C(=S)N(C3CCCCC3)C2=O)cc1C...,ZINC000002286664,OK,506.604,4.9761,1,6,99.04,7,2,12,12,0,0,0,2,False
45,CCOc1ccc(N2C(=O)/C(=C/c3ccc(OCc4cccc(C(=O)[O-]...,ZINC000002308579,OK,501.54,2.8591,1,7,108.0,8,3,18,6,0,0,0,2,False
46,COc1ccc(N2C(=O)/C(=C/c3cc(C)n(-c4ccc(Cl)c(C(=O...,ZINC000002783351,OK,508.963,2.95094,1,7,103.7,5,3,17,6,1,0,0,2,False
47,COc1cc(/C=C2\C(=O)NC(=S)N(c3ccc(Cl)cc3)C2=O)cc...,ZINC000002908281,OK,521.958,3.1224,1,7,108.0,7,3,18,6,0,0,0,2,False
