In this tutorial you'll learn how to submit a dataset to a QCFractal instance (also called backend or server).

In [1]:
import os
import dotenv
import random

from tqdm.auto import tqdm

import datamol as dm
import pandas as pd

from openff.toolkit import Molecule

import qcelemental as qcel

from qcportal import PortalClient
from qcportal.record_models import PriorityEnum

from qcportal.singlepoint.dataset_models import SinglepointDatasetNewEntry
from qcportal.singlepoint.record_models import QCSpecification
from qcportal.singlepoint.record_models import SinglepointDriver

_ = dotenv.load_dotenv("../../openfractal_test_secrets.env")

Login to the instance by initializing a `PortalClient` object.

## Prepare the dataset

First we pull a random subset of `datamol.data.freesolv()`, create `openff.Molecule` objects from it and generate one conformer per molecule.

In [2]:
def get_toy_molecules(
    n_molecules: int = 10,
    progress: bool = True,
    progress_leave: bool = False,
):
    # Get some data
    data = dm.data.freesolv()
    data = data.sample(n=n_molecules)

    def _process(smiles):
        # Convert to OFF mol
        mol = Molecule.from_smiles(smiles)
        assert mol is not None

        # Generate a conformer
        mol.generate_conformers(n_conformers=1)

        return mol

    # Generate conformers
    data["mol"] = dm.parallelized(
        _process,
        data["smiles"],
        progress=progress,
        tqdm_kwargs=dict(leave=progress_leave),
    )

    data = data.reset_index(drop=True)
    return data


data = get_toy_molecules(n_molecules=10)

data

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,iupac,smiles,expt,calc,mol
0,"methyl 2,2,2-trifluoroacetate",COC(=O)C(F)(F)F,-1.1,-1.353,Molecule with name '' and SMILES '[H][C]([H])(...
1,"1,2-dinitroxypropane",C[C@@H](CO[N+](=O)[O-])O[N+](=O)[O-],-4.95,-5.646,Molecule with name '' and SMILES '[H][C]([H])(...
2,butan-1-amine,CCCCN,-4.24,-2.961,Molecule with name '' and SMILES '[H][N]([H])[...
3,alachlor,CCc1cccc(c1N(COC)C(=O)CCl)CC,-8.21,-6.851,Molecule with name '' and SMILES '[H][c]1[c]([...
4,trimethoxymethylbenzene,COC(c1ccccc1)(OC)OC,-4.04,-5.559,Molecule with name '' and SMILES '[H][c]1[c]([...
5,chlorobenzene,c1ccc(cc1)Cl,-1.12,-0.475,Molecule with name '' and SMILES '[H][c]1[c]([...
6,cyclohexanol,C1CCC(CC1)O,-5.46,-4.178,Molecule with name '' and SMILES '[H][O][C]1([...
7,"2-[(1R)-1-methylpropyl]-4,6-dinitro-phenolate",CC[C@@H](C)c1cc(cc(c1O)[N+](=O)[O-])[N+](=O)[O-],-6.23,-5.378,Molecule with name '' and SMILES '[H][O][c]1[c...
8,ethanamine,CCN,-4.5,-3.156,Molecule with name '' and SMILES '[H][N]([H])[...
9,butan-1-ol,CCCCO,-4.72,-3.232,Molecule with name '' and SMILES '[H][O][C]([H...


## Initialize the `PortalClient`

The client object will allow to interact with any QCfractal instance.

In [3]:
client = PortalClient(
    address="https://openfractal-test-pgzbs3yryq-uc.a.run.app",
    username=os.environ["OPENFRACTAL_USER_1_USERNAME"],
    password=os.environ["OPENFRACTAL_USER_1_PASSWORD"],
)

client

You can display some general informations about this instance:

In [4]:
client.server_info

{'name': 'openfractal-test',
 'manager_heartbeat_frequency': 10,
 'manager_heartbeat_max_missed': 5,
 'version': '0.50b11.post13+gc0062725',
 'api_limits': {'get_records': 1000,
  'add_records': 500,
  'get_dataset_entries': 2000,
  'get_molecules': 1000,
  'add_molecules': 1000,
  'get_managers': 1000,
  'manager_tasks_claim': 200,
  'manager_tasks_return': 10,
  'get_server_stats': 25,
  'get_access_logs': 1000,
  'get_error_logs': 100,
  'get_internal_jobs': 1000},
 'client_version_lower_limit': '0.50b11',
 'client_version_upper_limit': '1',
 'manager_version_lower_limit': '0.50b11',
 'manager_version_upper_limit': '1',
 'motd': ''}

## Create a new dataset on the server

In [5]:
# Generate a random suffix for your dataset
dataset_suffix = "".join(random.choices([str(i) for i in range(10)], k=10))

dataset_name = f"dataset_demo_{dataset_suffix}"

dataset_name

'dataset_demo_4719875601'

In [6]:
kwargs = {}
kwargs["dataset_type"] = "singlepoint"
kwargs["name"] = dataset_name
kwargs["description"] = "my great dataset!"
# the tag allows you to restrict this dataset to only specific managers
kwargs["tags"] = ["demo_local"]  
kwargs["group"] = None
kwargs["provenance"] = {}
kwargs["visibility"] = True
kwargs["default_tag"] = "*"
kwargs["default_priority"] = PriorityEnum.normal
kwargs["metadata"] = {}
kwargs["owner_group"] = None

ds = client.add_dataset(**kwargs)

ds

SinglepointDataset(id=3, dataset_type='singlepoint', name='dataset_demo_4719875601', description='my great dataset!', tagline='', tags=['demo_local'], group='default', visibility=True, provenance={}, default_tag='*', default_priority=<PriorityEnum.normal: 1>, owner_user='admin_default', owner_group=None, metadata={}, extras={}, entry_names_=[], specifications_={}, entries_={}, record_map_={}, contributed_values_=None, auto_fetch_missing=True)

## Build "entries" from the dataset

An entry is a single data point object that hold a 3D atomistic system (also called a molecule). You can associate custom attributes to a given molecule.

In [7]:
chunk_size = 5
progress = True
progress_leave = False


def _create_entry(i, row):
    kwargs = {}
    kwargs["name"] = f"mol_{i}"
    kwargs["molecule"] = row["mol"].to_qcschema()
    kwargs["additional_keywords"] = {}
    kwargs["attributes"] = row.drop("mol").to_dict()
    kwargs["comment"] = None
    return SinglepointDatasetNewEntry(**kwargs)


# We build and send the entry by chunk in case of large dataset
for i in tqdm(range(0, len(data), chunk_size)):
    # Get the rows
    rows = data.iloc[i : i + chunk_size]

    # Build the entries
    entries = dm.parallelized(
        _create_entry,
        rows.iterrows(),
        arg_type="args",
        total=len(rows),
        progress=progress,
        tqdm_kwargs=dict(leave=progress_leave),
    )

    # Send the entries to the server
    insert_md = ds.add_entries(entries)
    assert insert_md.success

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

You can check the newly submitted entries:

In [16]:
list(ds.iterate_entries())

[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='C3F3H3O2', formula='C3F3H3O2', hash='d1d3bf2'), additional_keywords={}, attributes={'calc': -1.353, 'expt': -1.1, 'iupac': 'methyl 2,2,2-trifluoroacetate', 'smiles': 'COC(=O)C(F)(F)F'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C3H6N2O6', formula='C3H6N2O6', hash='0c1ebf9'), additional_keywords={}, attributes={'calc': -5.646, 'expt': -4.95, 'iupac': '1,2-dinitroxypropane', 'smiles': 'C[C@@H](CO[N+](=O)[O-])O[N+](=O)[O-]'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C4H11N', formula='C4H11N', hash='5fff2f0'), additional_keywords={}, attributes={'calc': -2.961, 'expt': -4.24, 'iupac': 'butan-1-amine', 'smiles': 'CCCCN'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C14ClH20NO2', formula='C14ClH20NO2', hash='0355c46'), additional_keywords={}, attributes={'calc': -6.

## Create the QM specification

The QM specification defines a QM protocol that will be executed on a Dataset.

Here we choose a cheap level of theory: `hf/sto-3g`.

In [8]:
## PSI4 SCF properties
## See https://psicode.org/psi4manual/master/oeprop.html#id2
scf_properties = [
    "MBIS_CHARGES",
    "WIBERG_LOWDIN_INDICES",
    "MAYER_INDICES",
    "LOWDIN_CHARGES",
    "DIPOLE",
    "QUADRUPOLE",
]

## Build the protocols
## One of: all, none or orbitals_and_eigenvalues
protocols = {"wavefunction": "all"}

## Build the specification
kwargs = {}
kwargs["program"] = "psi4"
kwargs["driver"] = SinglepointDriver.gradient
# kwargs["method"] = "wb97m-d3bj"
# kwargs["basis"] = "def2-tzvppd"
kwargs["method"] = "hf"
kwargs["basis"] = "sto-3g"
kwargs["keywords"] = {"wcombine": False, "scf_properties": scf_properties}
kwargs["protocols"] = protocols

specification = QCSpecification(**kwargs)

specification

QCSpecification(program='psi4', driver=<SinglepointDriver.gradient: 'gradient'>, method='hf', basis='sto-3g', keywords={'wcombine': False, 'scf_properties': ['MBIS_CHARGES', 'WIBERG_LOWDIN_INDICES', 'MAYER_INDICES', 'LOWDIN_CHARGES', 'DIPOLE', 'QUADRUPOLE']}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.all: 'all'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))

Now we submit this QM specification (protocol) to the dataset we created above to the server.

**Warning: once you have submitted a specification to a dataset, the compatible managers will start picking up jobs and perform the QM calculcations.**

In [10]:
kwargs = {}
kwargs["name"] = "simple_qm_calculation_demo"
kwargs["specification"] = specification
kwargs["description"] = None
insert_md = ds.add_specification(**kwargs)
assert insert_md

In [15]:
list(ds.iterate_entries())

[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='C3F3H3O2', formula='C3F3H3O2', hash='d1d3bf2'), additional_keywords={}, attributes={'calc': -1.353, 'expt': -1.1, 'iupac': 'methyl 2,2,2-trifluoroacetate', 'smiles': 'COC(=O)C(F)(F)F'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C3H6N2O6', formula='C3H6N2O6', hash='0c1ebf9'), additional_keywords={}, attributes={'calc': -5.646, 'expt': -4.95, 'iupac': '1,2-dinitroxypropane', 'smiles': 'C[C@@H](CO[N+](=O)[O-])O[N+](=O)[O-]'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C4H11N', formula='C4H11N', hash='5fff2f0'), additional_keywords={}, attributes={'calc': -2.961, 'expt': -4.24, 'iupac': 'butan-1-amine', 'smiles': 'CCCCN'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C14ClH20NO2', formula='C14ClH20NO2', hash='0355c46'), additional_keywords={}, attributes={'calc': -6.

In [None]:
list(ds.iterate_entries())

## Submit the dataset

In [6]:
client = PortalClient(
    address="https://openfractal-test-pgzbs3yryq-uc.a.run.app",
    username=os.environ["OPENFRACTAL_USER_1_NAME"],
    password=os.environ["OPENFRACTAL_USER_1_PASSWORD"],
)

client.server_info

{'name': 'openfractal-test',
 'manager_heartbeat_frequency': 10,
 'manager_heartbeat_max_missed': 5,
 'version': '0.50b11.post13+gc0062725',
 'api_limits': {'get_records': 1000,
  'add_records': 500,
  'get_dataset_entries': 2000,
  'get_molecules': 1000,
  'add_molecules': 1000,
  'get_managers': 1000,
  'manager_tasks_claim': 200,
  'manager_tasks_return': 10,
  'get_server_stats': 25,
  'get_access_logs': 1000,
  'get_error_logs': 100,
  'get_internal_jobs': 1000},
 'client_version_lower_limit': '0.50b11',
 'client_version_upper_limit': '1',
 'manager_version_lower_limit': '0.50b11',
 'manager_version_upper_limit': '1',
 'motd': ''}

In [3]:
kwargs = {}
kwargs["client"] = client
kwargs["dataset_name"] = "dataset_1"
kwargs["specification_name"] = "qm_hf_sto3g"
kwargs["dataset_type"] = "singlepoint"
kwargs["n_molecules"] = 100
kwargs["chunk_size"] = 1_000
kwargs["progress"] = True
kwargs["progress_leave"] = False

ds = create_submit_toy_dataset(**kwargs)

[32m2023-06-04 18:43:29.761[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m61[0m - [1mCreating dataset dataset_1 of type singlepoint.[0m
[32m2023-06-04 18:43:30.270[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m78[0m - [1mBuilding 100 molecules.[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m2023-06-04 18:43:31.154[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m81[0m - [1mCreating entries for 100 molecules and submitting to the server.[0m


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

[32m2023-06-04 18:43:33.180[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m91[0m - [1mCreating specification qm_hf_sto3g.[0m
[32m2023-06-04 18:43:33.722[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m130[0m - [1mSubmitting the computation for dataset dataset_1.[0m
[32m2023-06-04 18:43:34.122[0m | [1mINFO    [0m | [36mopenfractal_backend.toy_dataset[0m:[36mcreate_submit_toy_dataset[0m:[36m133[0m - [1mDataset dataset_1 submitted.[0m


## Monitoring

Retrieve a dataset by its name.

In [12]:
ds = client.get_dataset("singlepoint", dataset_name)
ds

SinglepointDataset(id=3, dataset_type='singlepoint', name='dataset_demo_4719875601', description='my great dataset!', tagline='', tags=['demo_local'], group='default', visibility=True, provenance={}, default_tag='*', default_priority=<PriorityEnum.normal: 1>, owner_user='admin_default', owner_group=None, metadata={}, extras={}, entry_names_=[], specifications_={}, entries_={}, record_map_={}, contributed_values_=None, auto_fetch_missing=True)

Print a table showing the status for a dataset.

In [13]:
print(ds.status_table())

  specification
---------------


Read the records (some might be completed but some might still be in progress or in failing state.

In [23]:
records_list = []
for r in tqdm(client.query_records(dataset_id=ds.id)):
    # Access this object to fetch the potential errors when any
    r.error
    records_list.append(r.dict())

records = pd.DataFrame(records_list)
records = records.sort_values("id")
records = records.reset_index(drop=True)

records

0it [00:00, ?it/s]

KeyError: 'id'

## Delete a dataset and associated records

Retrieve the ID of a given dataset.

In [28]:
client.list_datasets()

[]

In [27]:
dataset_id = client.get_dataset("singlepoint", dataset_name).id
dataset_id

PortalRequestError: Request failed: Could not find singlepoint dataset with name 'dataset_demo_2090907589' (HTTP status 400)

Delete the dataset and its associated records.

**Warning: this step can't be reversed.**

In [26]:
client.delete_dataset(dataset_id, delete_records=True)

PortalRequestError: Request failed: Could not find dataset with id 2 (HTTP status 400)