In [1]:
import numpy as np
import os
import torch
from denoising_diffusion_pytorch import Unet1D, GaussianDiffusion1D, Trainer1D
from denoising_diffusion_pytorch.utils import CfgNode as CN
from denoising_diffusion_pytorch.utils import token2smiles
from denoising_diffusion_pytorch.metrics import *
import pandas as pd
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
from rdkit import Chem
from rdkit.Chem import Draw, AllChem, rdchem
import matplotlib.pyplot as plt
from IPython.display import Image, display
import sys
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from diffusion1D.pipeline import * 


  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ec2-user/anaconda3/envs/denoising-diffusion-pytorch/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [10]:

# End-to-end run of the diffusion1D pipeline in "conditional" mode:
# preprocess data -> build model -> train -> generate -> evaluate.

# Seed the global NumPy / PyTorch RNGs up front so that a fresh
# "Restart Kernel -> Run All" draws the same random numbers each time.
# NOTE(review): the pipeline may seed or use its own generators internally --
# confirm that this is sufficient for fully reproducible sampling.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

pipeline = diffusion1D()

# Data preprocessing
data_config = pipeline.get_default_data_config()
data_config.task = "conditional"
data_config.file_path = "./diffusion1D/htp_md.csv"
data_config.block_size = 64

print(data_config)
train_dataset, test_dataset, vocab_size = pipeline.data_preprocessing(data_config)

# Load the model; its vocabulary and sequence length must mirror the values
# used/returned by the preprocessing step above.
model_config = pipeline.get_default_model_config()
model_config.vocab_size = vocab_size
model_config.block_size = data_config.block_size
pipeline.load_model(model_config)

# Train
train_config = pipeline.get_default_train_config()
train_config.ckpts_path = "./diffusion1D/ckpts"
# train_config.pretrain = 100 ## Refer to model-100.pt (step = 10000)
print(train_config.device)
# NOTE(review): zero steps presumably makes train() a no-op so the run relies
# on checkpoints that already exist under ckpts_path -- confirm.
train_config.num_steps = 0
pipeline.train(train_config)


# Generate
generate_config = pipeline.get_default_generate_config()
# NOTE(review): presumably selects the saved checkpoint with index 10, which
# this run does not create itself (num_steps is 0) -- verify it exists.
generate_config.model_index = 10
# The generator must run in the same mode the data was preprocessed for;
# report both values on mismatch so the offending config is obvious.
assert generate_config.task == data_config.task, (
    f"generate_config.task={generate_config.task!r} does not match "
    f"data_config.task={data_config.task!r}"
)
print(generate_config)

results = pipeline.generate(generate_config)
print(results)

# Evaluate
print(pipeline.evaluate())

block_size: 64
train_test_split: (0.8, 0.2)
task: conditional
file_path: ./htp_md.csv

                              mol_smiles  conductivity
0           NC(=O)CSCC(CO[Cu])OC(=O)[Au]             1
1       CCC(F)C(=O)NC(CO[Cu])COC(=O)[Au]             0
2            CCSCCN(CCN[Cu])CCOC(=O)[Au]             1
3          C#CCN(CCOCCO[Cu])CCOC(=O)[Au]             1
4          CCC(COC(=O)[Au])C(=O)NCCO[Cu]             0
...                                  ...           ...
11403  COCC(CNC(COC(=O)[Au])C(=O)O)O[Cu]             0
11404     CC(C)COCCNC(CN[Cu])CNC(=O)[Au]             1
11405         COCC(CNCCCO[Cu])OC(=O)[Au]             1
11406    CCNCCNC(=O)CC(CNC(=O)[Au])O[Cu]             0
11407  CCCC(OC(=O)[Au])C(=O)NOCC(C)N[Cu]             0

[11408 rows x 2 columns]
['[PAD]', '[CLS]', '[SEP]', 'C', '(', ')', 'O', '=', 'N', 'F', 'Cl', 'S', '[O-]', '#', '[N+]', 'P', '[Si]', '[Cu]', '[PH]', '[Ag]', '[Ac]', '[Au]']
[0, 12, 13, 16, 17, 18, 19, 22, 23, 27, 28, 34, 36, 38, 41, 45, 47, 75, 115, 12

O=C([Au]))CCC[N+]CCO[Cu][Ag]
CCCC=O)CC=[Cu])C(=O()CCOC(=O)[Au][Ac]
CC(CN[Cu])C(OO)=CCCCC(OC(=O)[Au][Ac]
CCCC)CC(COC(=O)[Au])O[Cu][Ac]
CCC(OO))CCNCCCO[Cu])COC(=O)[Au][Ac]
CCClCCN(CCN[Cu])CCOC(=O)[Au][Ag]
CCCCC(CN(C)CCOCCN[Cu])OC(=O)[Au][Ag]
CFCC=CCCCOC(=O)[Au])CCCO[Cu][Ag]
CCCCCO[Cu])FCCCCOC(=O)[Au][Ag]
CC(C(=(CCN[Cu])C(=O)(CCCNC(=O)[Au][Ag]
CCCCC=CCCOC(=O)[Au])CCCO[Cu][Ag]
CC=(CCO[Cu])CC=C(=O)[Au][Ag]
CCCC)C(CCCOC(=O)[Au])C(=))NCC(C(O[Cu][Ac]
CNCC)CC(=O(=C(NO)CCO[Cu])OC(=O)[Au][Ac]
CCCC=CCCCO[Cu])OC(=O)[Au][Ag]
COCCCCNCCCO[Cu])CC=C(=O)[Au][Ag]
C)CCFCCCCOC(=O)[Au])O[Cu][Ag]
CC(OC(=O)[Au])CC(=O)NCC(CCCCCCO[Cu][Ac]
==CCC=CCOC(=O)[Au])O[Cu][Ag]
COC(OCCN(CC=[Cu])CCOC(=O)[Au][Ac]
CCCCN[Cu])CFC(=O)C(C(OC(=O)[Au][Ac]
CCCF[Cu])OCC=C(OO)C(C)CCCOC(=O)[Au][Ac]
CNCCCF[Cu])NCCCCCCCOC(=O)[Au][Ag]
O=C([Au])OCC)CCCCO[Cu][Ag]
C=CCCCO[Cu])=CCOC(=O)[Au][Ag]
O=C([Au]))CCOCCCCClO)CCC(CCN[Cu][Ag]
CC=CCCCOC(=O)[Au])O[Cu][Ac]
(C(OO)CC=CCCO[Cu])CCOC(=O)[Au][Ag]
O=C([Au]))CCFCC)CCO[Cu][Ag]
CCCCCO[Cu])=CCCOCCOC(=