Skip to content

Commit

Permalink
Merge pull request #4224 from ye-luo/batched-driver-samples
Browse files Browse the repository at this point in the history
Enable samples input tag in batched drivers
  • Loading branch information
prckent committed Apr 4, 2024
2 parents 8966880 + 733099a commit 5921824
Show file tree
Hide file tree
Showing 15 changed files with 263 additions and 109 deletions.
2 changes: 1 addition & 1 deletion CMake/TestCxx17Library.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ if(NOT CXX17_LIBRARY_OKAY)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
message(
"Compiler detected is <Clang> namely clang++ or a vendor variant (icpx, amdclang++, armclang++).\n If not using libcxx, ensure a GCC toolchain version equal or greater "
"than 9.0 gets picked up. Check with '<Clang> -v'. Or use the --gcc-toolchain compiler option "
"than 9.0 gets picked up. Check with '<Clang> -v'. Or use the --gcc-install-dir (--gcc-toolchain deprecated) compiler option "
"(added to both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS) to point to a newer GCC installation."
)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
Expand Down
95 changes: 51 additions & 44 deletions docs/methods.rst
Original file line number Diff line number Diff line change
Expand Up @@ -306,39 +306,39 @@ Batched ``vmc`` driver (experimental)

parameters:

+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| **Name** | **Datatype** | **Values** | **Default** | **Description** |
+================================+==============+=========================+=============+=================================================+
| ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``crowds`` | integer | :math:`> 0` | dep. | Number of desynchronized dwalker crowds |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``steps`` | integer | :math:`\geq 0` | 1 | Number of steps per block |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``substeps`` | integer | :math:`\geq 0` | 1 | Number of substeps per step |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``usedrift`` | text | yes,no | yes | Use the algorithm with drift |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``samples`` (not ready) | integer | :math:`\geq 0` | 0 | Number of walker samples for in this VMC run |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``crowd_serialize_walkers`` | integer | yes, no | no | Force use of single walker APIs (for testing) |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``debug_checks`` | text | see additional info | dep. | Turn on/off additional recompute and checks |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
| ``measure_imbalance`` | text | yes,no | no | Measure load imbalance at the end of each block |
+--------------------------------+--------------+-------------------------+-------------+-------------------------------------------------+
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| **Name** | **Datatype** | **Values** | **Default** | **Description** |
+================================+==============+=========================+=============+======================================================+
| ``total_walkers`` | integer | :math:`> 0` | 1 | Total number of walkers over all MPI ranks |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``walkers_per_rank`` | integer | :math:`> 0` | 1 | Number of walkers per MPI rank |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``crowds`` | integer | :math:`> 0` | dep. | Number of desynchronized walker crowds |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``blocks`` | integer | :math:`\geq 0` | 1 | Number of blocks |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``steps`` | integer | :math:`\geq 0` | dep. | Number of steps per block |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``warmupsteps`` | integer | :math:`\geq 0` | 0 | Number of steps for warming up |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``substeps`` | integer | :math:`\geq 0` | 1 | Number of substeps per step |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``usedrift`` | text | yes,no | yes | Use the algorithm with drift |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``timestep`` | real | :math:`> 0` | 0.1 | Time step for each electron move |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``samples`` | integer | :math:`\geq 0` | 0 | Total number of walker samples for this VMC run |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``blocks_between_recompute`` | integer | :math:`\geq 0` | dep. | Wavefunction recompute frequency |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``crowd_serialize_walkers`` | text | yes, no | no | Force use of single walker APIs (for testing) |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``debug_checks`` | text | see additional info | dep. | Turn on/off additional recompute and checks |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``spin_mass`` | real | :math:`\geq 0` | 1.0 | Effective mass for spin sampling |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+
| ``measure_imbalance`` | text | yes,no | no | Measure load imbalance at the end of each block |
+--------------------------------+--------------+-------------------------+-------------+------------------------------------------------------+


Additional information:
Expand All @@ -356,22 +356,24 @@ Additional information:

If neither ``total_walkers`` nor ``walkers_per_rank`` is provided and there are no walker configurations carried over, ``walkers_per_rank`` is set equal to ``crowds``.

- ``total_walkers`` Total number of walkers summed over all MPI ranks, or equivalently the total number of walkers in the DMC
- ``total_walkers`` Total number of walkers summed over all MPI ranks, or equivalently the total number of walkers in the QMC
calculation. If not provided, it is computed as ``walkers_per_rank`` times the number of MPI ranks. If both ``total_walkers``
and ``walkers_per_rank`` are provided, which is not recommended, ``total_walkers`` must be consistently set equal to
``walkers_per_rank`` times the number of MPI ranks.

- ``blocks`` This parameter is universal for all the QMC methods. The MC processes are divided into a number of
``blocks``, each containing a number of steps. At the end of each block, the statistics accumulated in the block are dumped into files,
e.g., ``scalar.dat``. Typically, each block should have a sufficient number of steps that the I/O at the end of each block is negligible
``blocks``, each containing an equal number of steps. At the end of each block, the statistics accumulated in the block are dumped into files,
e.g., ``scalar.dat``. Typically, blocks should have a sufficient number of steps that the I/O at the end of each block is negligible
compared with the computational cost. Each block should not take so long that monitoring its progress is difficult. There should be a
sufficient number of ``blocks`` to perform statistical analysis.

- ``warmupsteps`` - ``warmupsteps`` are used only for
initial equilibration and do not count against the requested step or block count.
Property measurements are not performed during warm-up steps.

- ``steps`` - ``steps`` are the number of energy and other property measurements to perform per block.
- ``steps`` - ``steps`` are the number of energy and other property measurements to perform per block. If ``samples`` is provided
in the input file but ``steps`` is not, the value of ``steps`` is chosen based on ``samples`` (see below). If neither ``samples`` nor
``steps`` is provided, ``steps`` is set to one.

- ``substeps`` For each substep, an attempt is made to move each of the electrons once only by either particle-by-particle or an
all-electron move. Because the local energy is evaluated only at
Expand All @@ -391,13 +393,18 @@ Additional information:
acceptance ratio should be close to 50% for an efficient
simulation.

- ``samples`` (not ready)

- ``blocks_between_recompute`` Recompute the accuracy critical determinant part of the wavefunction from scratch: =1 by
default when using mixed precision. =10 by default when not using mixed precision. 0 can be set for no recomputation
and higher performance, but numerical errors will accumulate over time. Recomputing introduces a performance penalty
dependent on system size, but protects against the accumulation of numerical error, particularly in the inverses of
the Slater determinants. These have a cubic-scaling cost to recompute.
- ``samples`` The intended total number of samples that will be made in the QMC section. This is primarily intended for VMC
wavefunction optimization. The implementation always obtains at least the requested number but may obtain slightly more samples
than requested so as to map efficiently onto the MPI tasks and OpenMP threads. If ``samples`` and ``steps`` are both
provided, ``samples`` must be equal to or smaller than the product of ``total_walkers``, ``steps`` and ``blocks``. If ``samples`` is
provided but ``steps`` is not, ``steps`` is automatically set to the smallest integer that makes ``samples`` equal to or smaller
than the product of ``total_walkers``, ``steps`` and ``blocks``.

- ``blocks_between_recompute`` Recompute the accuracy-critical determinant part of the wavefunction from scratch: =1 by default when
using mixed precision. =10 by default when not using mixed precision. 0 can be set for no recomputation and higher performance,
but numerical errors will accumulate over time. Recomputing the determinants introduces a performance penalty dependent on system
size, but protects against the accumulation of numerical error, particularly in the inverses of the Slater determinants. These
have a cubic-scaling cost to recompute.

- ``debug_checks`` valid values are 'no', 'all', 'checkGL_after_load', 'checkGL_after_moves', 'checkGL_after_tmove'. If the build type is `debug`, the default value is 'all'. Otherwise, the default value is 'no'.

Expand Down
21 changes: 12 additions & 9 deletions src/QMCDrivers/DMC/DMCBatched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,10 +354,9 @@ void DMCBatched::runDMCStep(int crowd_id,
auto& rng = context_for_steps[crowd_id]->get_random_gen();
crowd.setRNGForHamiltonian(rng);

const int max_steps = sft.qmcdrv_input.get_max_steps();
const IndexType step = sft.step;
// Are we entering the last step of a block to recompute at?
const bool recompute_this_step = (sft.is_recomputing_block && (step + 1) == max_steps);
const bool recompute_this_step = (sft.is_recomputing_block && (step + 1) == sft.steps_per_block);
const bool accumulate_this_step = true;
const bool spin_move = sft.population.get_golden_electrons().isSpinor();
if (spin_move)
Expand All @@ -378,6 +377,10 @@ void DMCBatched::process(xmlNodePtr node)
qmcdriver_input_.get_walkers_per_rank(), dmcdriver_input_.get_reserve(),
qmcdriver_input_.get_num_crowds());

steps_per_block_ =
determineStepsPerBlock(awc.global_walkers, qmcdriver_input_.get_requested_samples(),
qmcdriver_input_.get_requested_steps(), qmcdriver_input_.get_max_blocks());

Base::initializeQMC(awc);
}
catch (const UniformCommunicateError& ue)
Expand Down Expand Up @@ -413,7 +416,7 @@ void DMCBatched::process(xmlNodePtr node)

o << " Persistent walkers are killed after " << dmcdriver_input_.get_max_age() << " MC sweeps\n";
o << " BranchInterval = " << dmcdriver_input_.get_branch_interval() << "\n";
o << " Steps per block = " << qmcdriver_input_.get_max_steps() << "\n";
o << " Steps per block = " << steps_per_block_ << "\n";
o << " Number of blocks = " << qmcdriver_input_.get_max_blocks() << "\n";
app_log() << o.str() << std::endl;

Expand All @@ -426,7 +429,7 @@ bool DMCBatched::run()
IndexType num_blocks = qmcdriver_input_.get_max_blocks();

estimator_manager_->startDriverRun();
StateForThread dmc_state(qmcdriver_input_, dmcdriver_input_, *drift_modifier_, *branch_engine_, population_);
StateForThread dmc_state(qmcdriver_input_, *drift_modifier_, *branch_engine_, population_, steps_per_block_);

LoopTimer<> dmc_loop;
RunTimeControl<> runtimeControl(run_time_manager, project_data_.getMaxCPUSeconds(), project_data_.getTitle(),
Expand Down Expand Up @@ -458,19 +461,19 @@ bool DMCBatched::run()
{
{
ScopeGuard<LoopTimer<>> dmc_local_timer(dmc_loop);
estimator_manager_->startBlock(qmcdriver_input_.get_max_steps());
estimator_manager_->startBlock(steps_per_block_);

dmc_state.recalculate_properties_period = (qmc_driver_mode_[QMC_UPDATE_MODE])
? qmcdriver_input_.get_recalculate_properties_period()
: (qmcdriver_input_.get_max_blocks() + 1) * qmcdriver_input_.get_max_steps();
: (qmcdriver_input_.get_max_blocks() + 1) * steps_per_block_;
dmc_state.is_recomputing_block = qmcdriver_input_.get_blocks_between_recompute()
? (1 + block) % qmcdriver_input_.get_blocks_between_recompute() == 0
: false;

for (UPtr<Crowd>& crowd : crowds_)
crowd->startBlock(qmcdriver_input_.get_max_steps());
crowd->startBlock(steps_per_block_);

for (int step = 0; step < qmcdriver_input_.get_max_steps(); ++step)
for (int step = 0; step < steps_per_block_; ++step)
{
ScopedTimer local_timer(timers_.run_steps_timer);

Expand All @@ -484,7 +487,7 @@ bool DMCBatched::run()
std::ref(crowds_));

{
const int iter = block * qmcdriver_input_.get_max_steps() + step;
const int iter = block * steps_per_block_ + step;
walker_controller_->branch(iter, population_, iter == 0);
branch_engine_->updateParamAfterPopControl(walker_controller_->get_ensemble_property(),
population_.get_golden_electrons().getTotalNum());
Expand Down
15 changes: 10 additions & 5 deletions src/QMCDrivers/DMC/DMCBatched.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,24 @@ class DMCBatched : public QMCDriverNew
struct StateForThread
{
const QMCDriverInput& qmcdrv_input;
const DMCDriverInput& dmcdrv_input;
const DriftModifierBase& drift_modifier;
const MCPopulation& population;
SFNBranch& branch_engine;
IndexType recalculate_properties_period;
IndexType step = -1;
const size_t steps_per_block;
IndexType step = -1;
bool is_recomputing_block = false;

StateForThread(const QMCDriverInput& qmci,
const DMCDriverInput& dmci,
DriftModifierBase& drift_mod,
SFNBranch& branch_eng,
MCPopulation& pop)
: qmcdrv_input(qmci), dmcdrv_input(dmci), drift_modifier(drift_mod), population(pop), branch_engine(branch_eng)
MCPopulation& pop,
const size_t steps_per_block)
: qmcdrv_input(qmci),
drift_modifier(drift_mod),
population(pop),
branch_engine(branch_eng),
steps_per_block(steps_per_block)
{}
};

Expand Down

0 comments on commit 5921824

Please sign in to comment.