In this tutorial you'll learn how to submit a dataset to a QCFractal instance (also called backend or server).

In [1]:
import os
import dotenv
import random

from tqdm.auto import tqdm

import datamol as dm
import pandas as pd

from openff.toolkit import Molecule

import qcelemental as qcel

from qcportal import PortalClient
from qcportal.record_models import PriorityEnum

from qcportal.singlepoint.dataset_models import SinglepointDatasetNewEntry
from qcportal.singlepoint.record_models import QCSpecification
from qcportal.singlepoint.record_models import SinglepointDriver

# Load environment variables from the local secrets file. The credentials read
# from `os.environ` when the client is built are presumably defined there —
# verify against your own copy of the file.
# The return value is bound to `_` so the cell does not display it.
_ = dotenv.load_dotenv("../../openfractal_test_secrets.env")

## Prepare the dataset

First we pull a random subset of `datamol.data.freesolv()`, create `openff.Molecule` objects from it and generate one conformer per molecule.

In [2]:
def get_toy_molecules(
    n_molecules: int = 10,
    progress: bool = True,
    progress_leave: bool = False,
    random_state=None,
):
    """Sample FreeSolv rows and attach an OpenFF molecule to each of them.

    Args:
        n_molecules: Number of rows randomly drawn from `dm.data.freesolv()`.
        progress: Whether to display a progress bar during the conversion.
        progress_leave: Whether the progress bar stays visible once finished.
        random_state: Optional seed forwarded to `DataFrame.sample`. With the
            default `None` every call draws a different subset (the historical
            behavior); pass an integer to make the subset reproducible.

    Returns:
        The sampled dataframe, re-indexed from 0, with an extra `mol` column
        holding one `Molecule` per row for which one conformer was requested.
    """
    # Get some data
    data = dm.data.freesolv()
    data = data.sample(n=n_molecules, random_state=random_state)

    def _process(smiles):
        # Convert to OFF mol
        mol = Molecule.from_smiles(smiles)
        assert mol is not None

        # Generate a conformer
        mol.generate_conformers(n_conformers=1)

        return mol

    # Build the molecules (and their conformer) in parallel, one per SMILES
    data["mol"] = dm.parallelized(
        _process,
        data["smiles"],
        progress=progress,
        tqdm_kwargs=dict(leave=progress_leave),
    )

    # Drop the original FreeSolv row labels left over from the sampling
    data = data.reset_index(drop=True)
    return data


# Build the 10-molecule toy dataset used in the rest of the notebook
data = get_toy_molecules(n_molecules=10)

# Display the resulting dataframe
data

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,iupac,smiles,expt,calc,mol
0,iodomethane,CI,-0.89,-0.641,Molecule with name '' and SMILES '[H][C]([H])(...
1,"1-(2-hydroxyethylamino)-9,10-anthraquinone",c1ccc2c(c1)C(=O)c3cccc(c3C2=O)NCCO,-14.21,-13.599,Molecule with name '' and SMILES '[H][O][C]([H...
2,pyridine-3-carbaldehyde,c1cc(cnc1)C=O,-7.1,-7.425,Molecule with name '' and SMILES '[H][C](=[O])...
3,dimethyl sulfate,COS(=O)(=O)OC,-5.1,-8.411,Molecule with name '' and SMILES '[H][C]([H])(...
4,heptane,CCCCCCC,2.67,2.925,Molecule with name '' and SMILES '[H][C]([H])(...
5,iodoethane,CCI,-0.74,-0.609,Molecule with name '' and SMILES '[H][C]([H])(...
6,2-methylbutan-2-ol,CCC(C)(C)O,-4.43,-2.933,Molecule with name '' and SMILES '[H][O][C]([C...
7,pentylcyclopentane,CCCCCC1CCCC1,2.55,2.381,Molecule with name '' and SMILES '[H][C]([H])(...
8,dibenzo-p-dioxin,c1ccc2c(c1)Oc3ccccc3O2,-3.15,-4.9,Molecule with name '' and SMILES '[H][c]1[c]([...
9,1-ethyl-2-methylbenzene,CCc1ccccc1C,-0.85,-0.761,Molecule with name '' and SMILES '[H][c]1[c]([...


## Initialize the `PortalClient`

The client object allows you to interact with any QCFractal instance.

In [3]:
# Connect to the remote QCFractal instance. The username and password are read
# from environment variables so they never appear in the notebook itself.
client = PortalClient(
    address="https://openfractal-test-pgzbs3yryq-uc.a.run.app",
    username=os.environ["OPENFRACTAL_USER_1_USERNAME"],
    password=os.environ["OPENFRACTAL_USER_1_PASSWORD"],
)

client

You can display some general information about this instance:

In [4]:
# Server name, version, API limits and the accepted client/manager version ranges
client.server_info

{'name': 'openfractal-test',
 'manager_heartbeat_frequency': 10,
 'manager_heartbeat_max_missed': 5,
 'version': '0.50b11.post13+gc0062725',
 'api_limits': {'get_records': 1000,
  'add_records': 500,
  'get_dataset_entries': 2000,
  'get_molecules': 1000,
  'add_molecules': 1000,
  'get_managers': 1000,
  'manager_tasks_claim': 200,
  'manager_tasks_return': 10,
  'get_server_stats': 25,
  'get_access_logs': 1000,
  'get_error_logs': 100,
  'get_internal_jobs': 1000},
 'client_version_lower_limit': '0.50b11',
 'client_version_upper_limit': '1',
 'manager_version_lower_limit': '0.50b11',
 'manager_version_upper_limit': '1',
 'motd': ''}

## Create a new dataset on the server

In [5]:
# Draw 10 random digits so that each run most likely gets a distinct dataset name
dataset_suffix = "".join(random.choices("0123456789", k=10))

dataset_name = "dataset_demo_" + dataset_suffix

dataset_name

'dataset_demo_5077749542'

In [6]:
# Every setting of the new dataset, gathered in a single mapping
kwargs = {
    "dataset_type": "singlepoint",
    "name": dataset_name,
    "description": "my great dataset!",
    # the tag allows you to restrict this dataset to only specific managers
    "tags": ["demo_local"],
    "group": None,
    "provenance": {},
    "visibility": True,
    "default_tag": "demo_local",
    "default_priority": PriorityEnum.normal,
    "metadata": {},
    "owner_group": None,
}

# Create the dataset on the server and keep the returned dataset object
ds = client.add_dataset(**kwargs)

ds

SinglepointDataset(id=4, dataset_type='singlepoint', name='dataset_demo_5077749542', description='my great dataset!', tagline='', tags=['demo_local'], group='default', visibility=True, provenance={}, default_tag='demo_local', default_priority=<PriorityEnum.normal: 1>, owner_user='admin_default', owner_group=None, metadata={}, extras={}, entry_names_=[], specifications_={}, entries_={}, record_map_={}, contributed_values_=None, auto_fetch_missing=True)

## Build "entries" from the dataset

An entry is a single data point object that holds a 3D atomistic system (also called a molecule). You can associate custom attributes with a given molecule.

In [7]:
# Number of rows converted and sent to the server per request
chunk_size = 5
# Whether to show a progress bar while the entries of a chunk are built
progress = True
# Whether that per-chunk progress bar stays on screen once it completes
progress_leave = False


def _create_entry(i, row):
    """Turn one dataframe row into a new singlepoint dataset entry.

    `i` is the row label, used to name the entry, and `row` is the row
    itself: its `mol` value provides the molecule (as a QCSchema object) and
    every remaining column is stored as an entry attribute.
    """
    molecule = row["mol"].to_qcschema()
    # Everything but the molecule object travels along as plain attributes
    attributes = row.drop("mol").to_dict()
    return SinglepointDatasetNewEntry(
        name=f"mol_{i}",
        molecule=molecule,
        additional_keywords={},
        attributes=attributes,
        comment=None,
    )


# Entries are built and uploaded chunk by chunk, which keeps each request
# small even when the dataset is large
for start in tqdm(range(0, len(data), chunk_size)):
    # Rows belonging to the current chunk
    stop = start + chunk_size
    chunk_rows = data.iloc[start:stop]

    # Convert every (label, row) pair of the chunk into an entry
    entries = dm.parallelized(
        _create_entry,
        chunk_rows.iterrows(),
        arg_type="args",
        total=len(chunk_rows),
        progress=progress,
        tqdm_kwargs={"leave": progress_leave},
    )

    # Upload the chunk and stop right away if the server reports a failure
    insert_md = ds.add_entries(entries)
    assert insert_md.success

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

You can check the newly submitted entries:

In [8]:
# Collect the entries iterator into a list so every stored entry is displayed
list(ds.iterate_entries())

[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='CH3I', formula='CH3I', hash='f208959'), additional_keywords={}, attributes={'calc': -0.641, 'expt': -0.89, 'iupac': 'iodomethane', 'smiles': 'CI'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C16H13NO3', formula='C16H13NO3', hash='b52a050'), additional_keywords={}, attributes={'calc': -13.599, 'expt': -14.21, 'iupac': '1-(2-hydroxyethylamino)-9,10-anthraquinone', 'smiles': 'c1ccc2c(c1)C(=O)c3cccc(c3C2=O)NCCO'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C6H5NO', formula='C6H5NO', hash='cf42d3e'), additional_keywords={}, attributes={'calc': -7.425, 'expt': -7.1, 'iupac': 'pyridine-3-carbaldehyde', 'smiles': 'c1cc(cnc1)C=O'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C2H6O4S', formula='C2H6O4S', hash='ceecca4'), additional_keywords={}, attributes={'calc': -8.411, 

## Create the QM specification

The QM specification defines a QM protocol that will be executed on a Dataset.

Here we choose a cheap level of theory: `hf/sto-3g`.

In [9]:
## Properties requested from the PSI4 SCF calculation
## See https://psicode.org/psi4manual/master/oeprop.html#id2
scf_properties = [
    "MBIS_CHARGES",
    "WIBERG_LOWDIN_INDICES",
    "MAYER_INDICES",
    "LOWDIN_CHARGES",
    "DIPOLE",
    "QUADRUPOLE",
]

## Wavefunction protocol
## One of: all, none or orbitals_and_eigenvalues
protocols = dict(wavefunction="all")

## Build the specification
## Alternative level of theory kept for reference (presumably more expensive
## than hf/sto-3g): method="wb97m-d3bj", basis="def2-tzvppd"
kwargs = dict(
    program="psi4",
    driver=SinglepointDriver.gradient,
    method="hf",
    basis="sto-3g",
    keywords=dict(wcombine=False, scf_properties=scf_properties),
    protocols=protocols,
)

specification = QCSpecification(**kwargs)

specification

QCSpecification(program='psi4', driver=<SinglepointDriver.gradient: 'gradient'>, method='hf', basis='sto-3g', keywords={'wcombine': False, 'scf_properties': ['MBIS_CHARGES', 'WIBERG_LOWDIN_INDICES', 'MAYER_INDICES', 'LOWDIN_CHARGES', 'DIPOLE', 'QUADRUPOLE']}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.all: 'all'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))

Now we attach this QM specification (protocol) to the dataset we created above on the server.

In [10]:
# Register the specification on the dataset under a human-readable name
kwargs = {}
kwargs["name"] = "simple_qm_calculation_demo"
kwargs["specification"] = specification
kwargs["description"] = None
insert_md = ds.add_specification(**kwargs)
# Check the outcome reported by the server, as done when adding entries: a
# bare `assert insert_md` would only test the truthiness of the metadata
# object, not whether the insertion succeeded.
assert insert_md.success

In [11]:
# List the dataset entries again now that a specification has been added
list(ds.iterate_entries())

[SinglepointDatasetEntry(name='mol_0', molecule=Molecule(name='CH3I', formula='CH3I', hash='f208959'), additional_keywords={}, attributes={'calc': -0.641, 'expt': -0.89, 'iupac': 'iodomethane', 'smiles': 'CI'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_1', molecule=Molecule(name='C16H13NO3', formula='C16H13NO3', hash='b52a050'), additional_keywords={}, attributes={'calc': -13.599, 'expt': -14.21, 'iupac': '1-(2-hydroxyethylamino)-9,10-anthraquinone', 'smiles': 'c1ccc2c(c1)C(=O)c3cccc(c3C2=O)NCCO'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_2', molecule=Molecule(name='C6H5NO', formula='C6H5NO', hash='cf42d3e'), additional_keywords={}, attributes={'calc': -7.425, 'expt': -7.1, 'iupac': 'pyridine-3-carbaldehyde', 'smiles': 'c1cc(cnc1)C=O'}, comment=None, local_results=None),
 SinglepointDatasetEntry(name='mol_3', molecule=Molecule(name='C2H6O4S', formula='C2H6O4S', hash='ceecca4'), additional_keywords={}, attributes={'calc': -8.411, 

## Submit the computation

**Warning: once you have submitted a specification to a dataset, the compatible managers will start picking up jobs and performing the QM calculations.**

In [12]:
# Submit the computations of this dataset to the server
ds.submit()

Check the submission worked.

In [13]:
# Print, for each specification, how many records are in each status
print(ds.status_table())

             specification    waiting
--------------------------  ---------
simple_qm_calculation_demo         10


## Monitoring

Retrieve a dataset by its name.

In [14]:
# Fetch the dataset back from the server using its type and its name
ds = client.get_dataset("singlepoint", dataset_name)
ds

SinglepointDataset(id=4, dataset_type='singlepoint', name='dataset_demo_5077749542', description='my great dataset!', tagline='', tags=['demo_local'], group='default', visibility=True, provenance={}, default_tag='demo_local', default_priority=<PriorityEnum.normal: 1>, owner_user='admin_default', owner_group=None, metadata={}, extras={}, entry_names_=[], specifications_={}, entries_={}, record_map_={}, contributed_values_=None, auto_fetch_missing=True)

Print a table showing the status for a dataset.

In [15]:
# Print the status table of the dataset that was just retrieved
print(ds.status_table())

             specification    waiting
--------------------------  ---------
simple_qm_calculation_demo         10


Read the records (some might be completed, while others might still be in progress or in a failed state).

In [16]:
# Collect every record attached to the dataset as a plain dict
records_list = []
for r in tqdm(client.query_records(dataset_id=ds.id)):
    # Touch `error` before serializing so that potential error details are
    # fetched and end up in the dict — presumably loaded lazily; verify.
    r.error
    records_list.append(r.dict())

records = pd.DataFrame(records_list)
# Without any record the frame has no columns at all, so sorting on "id"
# would raise a KeyError: only sort when there is something to sort.
if not records.empty:
    records = records.sort_values("id")
records = records.reset_index(drop=True)

records

0it [00:00, ?it/s]

Unnamed: 0,id,record_type,is_service,properties,extras,status,manager_name,created_on,modified_on,owner_user,owner_group,compute_history_,task_,service_,comments_,native_files_,specification,molecule_id,molecule_,wavefunction_
0,11,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161042,2023-06-12 20:22:35.161046,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",28,,
1,12,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161047,2023-06-12 20:22:35.161047,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",21,,
2,13,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161048,2023-06-12 20:22:35.161048,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",24,,
3,14,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161049,2023-06-12 20:22:35.161049,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",20,,
4,15,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161050,2023-06-12 20:22:35.161050,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",27,,
5,16,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161051,2023-06-12 20:22:35.161051,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",23,,
6,17,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161052,2023-06-12 20:22:35.161052,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",26,,
7,18,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161053,2023-06-12 20:22:35.161053,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",29,,
8,19,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161054,2023-06-12 20:22:35.161054,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",25,,
9,20,singlepoint,False,,,RecordStatusEnum.waiting,,2023-06-12 20:22:35.161054,2023-06-12 20:22:35.161055,admin_default,,[],,,,,"{'program': 'psi4', 'driver': 'SinglepointDriv...",22,,


## Delete a dataset and associated records

**Important:** Before deleting the dataset you just created, you should check the other tutorials, where you'll learn how to launch a manager that can perform the QM calculations submitted above!

Retrieve the ID of a given dataset.

In [17]:
# List the datasets on the server with their id, type and name
client.list_datasets()

[{'id': 4,
  'dataset_type': 'singlepoint',
  'dataset_name': 'dataset_demo_5077749542'}]

In [18]:
# Look the dataset up by type and name, keeping only its numeric ID
dataset_id = client.get_dataset("singlepoint", dataset_name).id
dataset_id

4

Delete the dataset and its associated records.

**Warning: this step can't be reversed.**

In [32]:
# Irreversible: deletes the dataset and, with `delete_records=True`, its records too
client.delete_dataset(dataset_id, delete_records=True)