In [None]:
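# Install the package once if it is missing (uncomment to run; %pip targets the kernel's environment):
# %pip install -U bayesianflow_for_chem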

Collecting bayesianflow_for_chem
  Using cached bayesianflow_for_chem-1.2.7-py3-none-any.whl.metadata (5.9 kB)
Collecting torch>=2.3.1 (from bayesianflow_for_chem)
  Using cached torch-2.7.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting loralib>=0.1.2 (from bayesianflow_for_chem)
  Using cached loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting lightning>=2.2.0 (from bayesianflow_for_chem)
  Using cached lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting fsspec<2026.0,>=2022.5.0 (from fsspec[http]<2026.0,>=2022.5.0->lightning>=2.2.0->bayesianflow_for_chem)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning>=2.2.0->bayesianflow_for_chem)
  Using cached lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting packaging<25.0,>=20.0 (from lightning>=2.2.0->bayesianflow_for_chem)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting torchmetrics<3.0


[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Dataset Handling

In [6]:
# Download your dataset file (e.g., ESOL from MoleculeNet) and split the file.
# Source: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv
from bayesianflow_for_chem.tool import split_dataset

# Scaffold-based split of the downloaded CSV.
# NOTE(review): the next cell reads "../data/delaney-processed_train.csv", so this call
# presumably writes the split files next to the input file — confirm.
split_dataset("../data/delaney-processed.csv", method="scaffold")


In [7]:
# Load the training split produced by the previous step:
from bayesianflow_for_chem.data import smiles2token, collate, CSVData

dataset = CSVData("../data/delaney-processed_train.csv")
# Display the first raw record; in the output below every column maps to a list of strings.
dataset[0]

{'Compound ID': ['Amigdalin'],
 'ESOL predicted log solubility in mols per litre': ['-0.9740000000000001'],
 'Minimum Degree': ['1'],
 'Molecular Weight': ['457.4320000000001'],
 'Number of H-Bond Donors': ['7'],
 'Number of Rings': ['3'],
 'Number of Rotatable Bonds': ['7'],
 'Polar Surface Area': ['202.31999999999996'],
 'measured log solubility in mols per litre': ['-0.77'],
 'smiles': ['OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ']}

In [8]:
# Create a mapping function to tokenise the dataset and select values:
import torch

def encode(x):
  """Turn one raw dataset record into the inputs used for training.

  Args:
    x: a record as returned by indexing the dataset before mapping, i.e. a
      dict whose values are lists of strings. Only the "smiles" and
      "measured log solubility in mols per litre" entries are read.

  Returns:
    A dict with two keys:
      "token": the result of smiles2token applied to the first SMILES entry.
      "value": a tensor built from the solubility entries parsed as floats.
  """
  first_smiles = x["smiles"][0]
  solubility = list(map(float, x["measured log solubility in mols per litre"]))
  return {"token": smiles2token(first_smiles), "value": torch.tensor(solubility)}

# Register `encode` on the dataset. The return value of map() is not used, and the
# record displayed below is already in encoded form, so the mapping presumably
# takes effect on `dataset` itself — NOTE(review): confirm against CSVData.map.
dataset.map(encode)
dataset[0]  # show the first record after encoding

{'token': tensor([  1,  39,  37,  37,  25,  39,  37,   7,  39,  37,  37,  24,  39,  37,
           7,  39,  37,   7,  37,  11,  38,   8, 151,  23, 151, 151, 151, 151,
         151,  23,   8,  37,   7,  39,   8,  37,   7,  39,   8,  37,  24,  39,
           8,  37,   7,  39,   8,  37,   7,  39,   8,  37,  25,  39,   2]),
 'value': tensor([-0.7700])}

In [9]:
# Wrap the dataset in a torch.utils.data.DataLoader (batches of 32, collated with `collate`):
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate,
)

# Load Pre-Trained Model

In [None]:
# Pre-trained models are available on Hugging Face:
# https://huggingface.co/suenoomozawa/ChemBFN
# The checkpoint was downloaded manually and saved under ../models/.
# NOTE(review): the following cell loads the same checkpoint file with
# ChemBFN.from_checkpoint and rebinds `model`; confirm that `load_model` with these
# arguments exists in the installed package version, or drop this cell.
from bayesianflow_for_chem.model import load_model 
model = load_model("../models/zinc15_190m.pt", "ChemBFN_190m", device="cpu")

In [15]:
# Load the pre-trained model from a checkpoint file:
from bayesianflow_for_chem import ChemBFN

model = ChemBFN.from_checkpoint("../models/zinc15_190m.pt")
# Optional, left disabled: model.enable_lora(r=4, ...)

# Quantise the model. This step is optional, but it can significantly reduce
# memory usage and speed up inference.
from bayesianflow_for_chem.tool import quantise_model

# `m` holds the result of quantise_model; `model` still refers to the object loaded above.
m = quantise_model(model)