Merge pull request #4224 from ye-luo/batched-driver-samples

Enable samples input tag in batched drivers
QMCPACK · Apr 4, 2024 · 5921824 · 5921824
2 parents 8966880 + 733099a
commit 5921824
Show file tree

Hide file tree

Showing 15 changed files with 263 additions and 109 deletions.
diff --git a/CMake/TestCxx17Library.cmake b/CMake/TestCxx17Library.cmake
@@ -41,7 +41,7 @@ if(NOT CXX17_LIBRARY_OKAY)
   elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
     message(
       "Compiler detected is <Clang> namely clang++ or a vendor variant (icpx, amdclang++, armclang++).\n  If not using libcxx, ensure a GCC toolchain version equal or greater "
-      "than 9.0 gets picked up. Check with '<Clang> -v'. Or use the --gcc-toolchain compiler option "
+      "than 9.0 gets picked up. Check with '<Clang> -v'. Or use the --gcc-install-dir (--gcc-toolchain deprecated) compiler option "
       "(added to both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS) to point to a newer GCC installation."
     )
   elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")

diff --git a/docs/methods.rst b/docs/methods.rst
@@ -306,39 +306,39 @@ Batched ``vmc`` driver (experimental)
 
   parameters:
 
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | **Name**                       | **Datatype** | **Values**              | **Default** | **Description**                                 |
-  +================================+==============+=========================+=============+=================================================+
-  | ``total_walkers``              | integer      | :math:`> 0`             | 1           | Total number of walkers over all MPI ranks      |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``walkers_per_rank``           | integer      | :math:`> 0`             | 1           | Number of walkers per MPI rank                  |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``crowds``                     | integer      | :math:`> 0`             | dep.        | Number of desynchronized dwalker crowds         |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``blocks``                     | integer      | :math:`\geq 0`          | 1           | Number of blocks                                |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``steps``                      | integer      | :math:`\geq 0`          | 1           | Number of steps per block                       |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``warmupsteps``                | integer      | :math:`\geq 0`          | 0           | Number of steps for warming up                  |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``substeps``                   | integer      | :math:`\geq 0`          | 1           | Number of substeps per step                     |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``usedrift``                   | text         | yes,no                  | yes         | Use the algorithm with drift                    |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``timestep``                   | real         | :math:`> 0`             | 0.1         | Time step for each electron move                |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``samples`` (not ready)        | integer      | :math:`\geq 0`          | 0           | Number of walker samples for in this VMC run    |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``blocks_between_recompute``   | integer      | :math:`\geq 0`          | dep.        | Wavefunction recompute frequency                |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``crowd_serialize_walkers``    | integer      | yes, no                 | no          | Force use of single walker APIs (for testing)   |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``debug_checks``               | text         | see additional info     | dep.        | Turn on/off additional recompute and checks     |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``spin_mass``                  | real         | :math:`\geq 0`          | 1.0         | Effective mass for spin sampling                |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
-  | ``measure_imbalance``          | text         | yes,no                  | no          | Measure load imbalance at the end of each block |
-  +--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | **Name**                       | **Datatype** | **Values**              | **Default** | **Description**                                      |
+  +================================+==============+=========================+=============+======================================================+
+  | ``total_walkers``              | integer      | :math:`> 0`             | 1           | Total number of walkers over all MPI ranks           |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``walkers_per_rank``           | integer      | :math:`> 0`             | 1           | Number of walkers per MPI rank                       |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``crowds``                     | integer      | :math:`> 0`             | dep.        | Number of desynchronized walker crowds               |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``blocks``                     | integer      | :math:`\geq 0`          | 1           | Number of blocks                                     |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``steps``                      | integer      | :math:`\geq 0`          | dep.        | Number of steps per block                            |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``warmupsteps``                | integer      | :math:`\geq 0`          | 0           | Number of steps for warming up                       |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``substeps``                   | integer      | :math:`\geq 0`          | 1           | Number of substeps per step                          |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``usedrift``                   | text         | yes,no                  | yes         | Use the algorithm with drift                         |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``timestep``                   | real         | :math:`> 0`             | 0.1         | Time step for each electron move                     |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``samples``                    | integer      | :math:`\geq 0`          | 0           | Total number of walker samples for this VMC run      |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``blocks_between_recompute``   | integer      | :math:`\geq 0`          | dep.        | Wavefunction recompute frequency                     |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``crowd_serialize_walkers``    | text         | yes, no                 | no          | Force use of single walker APIs (for testing)        |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``debug_checks``               | text         | see additional info     | dep.        | Turn on/off additional recompute and checks          |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``spin_mass``                  | real         | :math:`\geq 0`          | 1.0         | Effective mass for spin sampling                     |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
+  | ``measure_imbalance``          | text         | yes,no                  | no          | Measure load imbalance at the end of each block      |
+  +--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
 
 
 Additional information:
@@ -356,22 +356,24 @@ Additional information:
 
   If neither ``total_walkers`` nor ``walkers_per_rank`` is provided and there are no walker configurations carried over, ``walkers_per_rank`` is set equal to ``crowds``.
 
-- ``total_walkers`` Total number of walkers summed over all MPI ranks, or equivalently the total number of walkers in the DMC
+- ``total_walkers`` Total number of walkers summed over all MPI ranks, or equivalently the total number of walkers in the QMC
   calculation. If not provided, it is computed as ``walkers_per_rank`` times the number of MPI ranks. If both ``total_walkers``
   and ``walkers_per_rank`` are provided, which is not recommended, ``total_walkers`` must be consistently set equal to
   ``walkers_per_rank`` times the number of MPI ranks.
 
 - ``blocks`` This parameter is universal for all the QMC methods. The MC processes are divided into a number of
-  ``blocks``, each containing a number of steps. At the end of each block, the statistics accumulated in the block are dumped into files,
-  e.g., ``scalar.dat``. Typically, each block should have a sufficient number of steps that the I/O at the end of each block is negligible
+  ``blocks``, each containing an equal number of steps. At the end of each block, the statistics accumulated in the block are dumped into files,
+  e.g., ``scalar.dat``. Typically, blocks should have a sufficient number of steps that the I/O at the end of each block is negligible
   compared with the computational cost. Each block should not take so long that monitoring its progress is difficult. There should be a
   sufficient number of ``blocks`` to perform statistical analysis.
 
 - ``warmupsteps`` - ``warmupsteps`` are used only for
   initial equilibration and do not count against the requested step or block count.
   Property measurements are not performed during warm-up steps.
 
-- ``steps`` - ``steps`` are the number of energy and other property measurements to perform per block.
+- ``steps`` - ``steps`` are the number of energy and other property measurements to perform per block. If ``samples`` is provided
+  in the input file but ``steps`` is not, the value of ``steps`` is chosen based on ``samples`` (see below). If neither ``samples``
+  nor ``steps`` is provided, ``steps`` is set to one.
 
 - ``substeps``  For each substep, an attempt is made to move each of the electrons once only by either particle-by-particle or an
   all-electron move.  Because the local energy is evaluated only at
@@ -391,13 +393,18 @@ Additional information:
   acceptance ratio should be close to 50% for an efficient
   simulation.
 
-- ``samples`` (not ready)
-
-- ``blocks_between_recompute`` Recompute the accuracy critical determinant part of the wavefunction from scratch: =1 by
-  default when using mixed precision. =10 by default when not using mixed precision. 0 can be set for no recomputation
-  and higher performance, but numerical errors will accumulate over time. Recomputing introduces a performance penalty
-  dependent on system size, but protects against the accumulation of numerical error, particularly in the inverses of
-  the Slater determinants. These have a cubic-scaling cost to recompute.
+- ``samples`` The intended total number of samples that will be collected in the QMC section. This is primarily intended for VMC
+  wavefunction optimization. The implementation always obtains at least the requested number but may obtain slightly more samples
+  than requested so as to map efficiently onto the MPI tasks and OpenMP threads. If ``samples`` and ``steps`` are both
+  provided, ``samples`` must be equal to or smaller than the product of ``total_walkers``, ``steps`` and ``blocks``. If ``samples`` is
+  provided but ``steps`` is not, ``steps`` is automatically set to the smallest integer that makes ``samples`` equal to or smaller
+  than the product of ``total_walkers``, ``steps`` and ``blocks``.
+
+- ``blocks_between_recompute`` Recompute the accuracy critical determinant part of the wavefunction from scratch: =1 by default when
+  using mixed precision. =10 by default when not using mixed precision. 0 can be set for no recomputation and higher performance,
+  but numerical errors will accumulate over time. Recomputing the determinants introduces a performance penalty dependent on system
+  size, but protects against the accumulation of numerical error, particularly in the inverses of the Slater determinants. These
+  have a cubic-scaling cost to recompute.
 
 - ``debug_checks`` valid values are 'no', 'all', 'checkGL_after_load', 'checkGL_after_moves', 'checkGL_after_tmove'. If the build type is `debug`, the default value is 'all'. Otherwise, the default value is 'no'.
 

diff --git a/src/QMCDrivers/DMC/DMCBatched.cpp b/src/QMCDrivers/DMC/DMCBatched.cpp
@@ -354,10 +354,9 @@ void DMCBatched::runDMCStep(int crowd_id,
   auto& rng = context_for_steps[crowd_id]->get_random_gen();
   crowd.setRNGForHamiltonian(rng);
 
-  const int max_steps  = sft.qmcdrv_input.get_max_steps();
   const IndexType step = sft.step;
   // Are we entering the last step of a block to recompute at?
-  const bool recompute_this_step  = (sft.is_recomputing_block && (step + 1) == max_steps);
+  const bool recompute_this_step  = (sft.is_recomputing_block && (step + 1) == sft.steps_per_block);
   const bool accumulate_this_step = true;
   const bool spin_move            = sft.population.get_golden_electrons().isSpinor();
   if (spin_move)
@@ -378,6 +377,10 @@ void DMCBatched::process(xmlNodePtr node)
                                 qmcdriver_input_.get_walkers_per_rank(), dmcdriver_input_.get_reserve(),
                                 qmcdriver_input_.get_num_crowds());
 
+    steps_per_block_ =
+        determineStepsPerBlock(awc.global_walkers, qmcdriver_input_.get_requested_samples(),
+                               qmcdriver_input_.get_requested_steps(), qmcdriver_input_.get_max_blocks());
+
     Base::initializeQMC(awc);
   }
   catch (const UniformCommunicateError& ue)
@@ -413,7 +416,7 @@ void DMCBatched::process(xmlNodePtr node)
 
     o << "  Persistent walkers are killed after " << dmcdriver_input_.get_max_age() << " MC sweeps\n";
     o << "  BranchInterval = " << dmcdriver_input_.get_branch_interval() << "\n";
-    o << "  Steps per block = " << qmcdriver_input_.get_max_steps() << "\n";
+    o << "  Steps per block = " << steps_per_block_ << "\n";
     o << "  Number of blocks = " << qmcdriver_input_.get_max_blocks() << "\n";
     app_log() << o.str() << std::endl;
 
@@ -426,7 +429,7 @@ bool DMCBatched::run()
   IndexType num_blocks = qmcdriver_input_.get_max_blocks();
 
   estimator_manager_->startDriverRun();
-  StateForThread dmc_state(qmcdriver_input_, dmcdriver_input_, *drift_modifier_, *branch_engine_, population_);
+  StateForThread dmc_state(qmcdriver_input_, *drift_modifier_, *branch_engine_, population_, steps_per_block_);
 
   LoopTimer<> dmc_loop;
   RunTimeControl<> runtimeControl(run_time_manager, project_data_.getMaxCPUSeconds(), project_data_.getTitle(),
@@ -458,19 +461,19 @@ bool DMCBatched::run()
   {
     {
       ScopeGuard<LoopTimer<>> dmc_local_timer(dmc_loop);
-      estimator_manager_->startBlock(qmcdriver_input_.get_max_steps());
+      estimator_manager_->startBlock(steps_per_block_);
 
       dmc_state.recalculate_properties_period = (qmc_driver_mode_[QMC_UPDATE_MODE])
           ? qmcdriver_input_.get_recalculate_properties_period()
-          : (qmcdriver_input_.get_max_blocks() + 1) * qmcdriver_input_.get_max_steps();
+          : (qmcdriver_input_.get_max_blocks() + 1) * steps_per_block_;
       dmc_state.is_recomputing_block          = qmcdriver_input_.get_blocks_between_recompute()
                    ? (1 + block) % qmcdriver_input_.get_blocks_between_recompute() == 0
                    : false;
 
       for (UPtr<Crowd>& crowd : crowds_)
-        crowd->startBlock(qmcdriver_input_.get_max_steps());
+        crowd->startBlock(steps_per_block_);
 
-      for (int step = 0; step < qmcdriver_input_.get_max_steps(); ++step)
+      for (int step = 0; step < steps_per_block_; ++step)
       {
         ScopedTimer local_timer(timers_.run_steps_timer);
 
@@ -484,7 +487,7 @@ bool DMCBatched::run()
                    std::ref(crowds_));
 
         {
-          const int iter = block * qmcdriver_input_.get_max_steps() + step;
+          const int iter = block * steps_per_block_ + step;
           walker_controller_->branch(iter, population_, iter == 0);
           branch_engine_->updateParamAfterPopControl(walker_controller_->get_ensemble_property(),
                                                      population_.get_golden_electrons().getTotalNum());

diff --git a/src/QMCDrivers/DMC/DMCBatched.h b/src/QMCDrivers/DMC/DMCBatched.h
@@ -47,19 +47,24 @@ class DMCBatched : public QMCDriverNew
   struct StateForThread
   {
     const QMCDriverInput& qmcdrv_input;
-    const DMCDriverInput& dmcdrv_input;
     const DriftModifierBase& drift_modifier;
     const MCPopulation& population;
     SFNBranch& branch_engine;
     IndexType recalculate_properties_period;
-    IndexType step            = -1;
+    const size_t steps_per_block;
+    IndexType step = -1;
     bool is_recomputing_block = false;
+
     StateForThread(const QMCDriverInput& qmci,
-                   const DMCDriverInput& dmci,
                    DriftModifierBase& drift_mod,
                    SFNBranch& branch_eng,
-                   MCPopulation& pop)
-        : qmcdrv_input(qmci), dmcdrv_input(dmci), drift_modifier(drift_mod), population(pop), branch_engine(branch_eng)
+                   MCPopulation& pop,
+                   const size_t steps_per_block)
+        : qmcdrv_input(qmci),
+          drift_modifier(drift_mod),
+          population(pop),
+          branch_engine(branch_eng),
+          steps_per_block(steps_per_block)
     {}
   };