diff --git a/libs/core/coroutines/include/hpx/coroutines/thread_enums.hpp b/libs/core/coroutines/include/hpx/coroutines/thread_enums.hpp
index 7abf67d5c3d6..3e8ec091b69d 100644
--- a/libs/core/coroutines/include/hpx/coroutines/thread_enums.hpp
+++ b/libs/core/coroutines/include/hpx/coroutines/thread_enums.hpp
@@ -206,7 +206,7 @@ namespace hpx::threads {
         /// local thread number associated with this hint. Local thread numbers
         /// are indexed from zero. It is up to the scheduler to decide how to
         /// interpret thread numbers that are larger than the number of threads
-        /// available to the scheduler. Typically thread numbers will wrap
+        /// available to the scheduler. Typically, thread numbers will wrap
         /// around when too large.
         thread = 1,
 
@@ -214,7 +214,7 @@ namespace hpx::threads {
         /// NUMA domain associated with this hint. NUMA domains are indexed from
         /// zero. It is up to the scheduler to decide how to interpret NUMA
         /// domain indices that are larger than the number of available NUMA
-        /// domains to the scheduler. Typically indices will wrap around when
+        /// domains to the scheduler. Typically, indices will wrap around when
         /// too large.
         numa = 2,
     };
@@ -295,7 +295,7 @@ namespace hpx::threads {
     }
 
     ///////////////////////////////////////////////////////////////////////////
-    /// \enum thread_placement_hint
+    /// \enum thread_execution_hint
     ///
     /// The type of hint given to the scheduler related running a thread as a
     /// child directly in the context of the parent thread
diff --git a/libs/core/executors/include/hpx/executors/detail/index_queue_spawning.hpp b/libs/core/executors/include/hpx/executors/detail/index_queue_spawning.hpp
index c514f8abb227..a1d2d158fce7 100644
--- a/libs/core/executors/include/hpx/executors/detail/index_queue_spawning.hpp
+++ b/libs/core/executors/include/hpx/executors/detail/index_queue_spawning.hpp
@@ -65,7 +65,8 @@ namespace hpx::parallel::execution::detail {
         template <typename F, typename Ts>
         void do_work_chunk(F&& f, Ts&& ts, std::uint32_t const index) const
         {
-#if HPX_HAVE_ITTNOTIFY != 0 && !defined(HPX_HAVE_APEX)
+#if defined(HPX_HAVE_ITTNOTIFY) && HPX_HAVE_ITTNOTIFY != 0 &&                  \
+    !defined(HPX_HAVE_APEX)
             static hpx::util::itt::event notify_event(
                 "set_value_loop_visitor_static::do_work_chunk(chunking)");
 
@@ -150,7 +151,7 @@ namespace hpx::parallel::execution::detail {
         // Finish the work for one worker thread. If this is not the last worker
         // thread to finish, it will only decrement the counter. If it is the
         // last thread it will call set_exception if there is an exception.
-        // Otherwise it will call set_value on the shared state.
+        // Otherwise, it will call set_value on the shared state.
         void finish() const
         {
             if (--(state->tasks_remaining.data_) == 0)
@@ -438,17 +439,21 @@ namespace hpx::parallel::execution::detail {
 
             // Initialize the queues for all worker threads so that worker
             // threads can start stealing immediately when they start.
-            for (std::uint32_t worker_thread = 0; worker_thread != num_threads;
-                 ++worker_thread)
+            if (hint.placement_mode() == placement::breadth_first ||
+                hint.placement_mode() == placement::breadth_first_reverse)
             {
-                if (hint.placement_mode() == placement::breadth_first ||
-                    hint.placement_mode() == placement::breadth_first_reverse)
+                for (std::uint32_t worker_thread = 0;
+                     worker_thread != num_threads; ++worker_thread)
                 {
                     init_queue_breadth_first(worker_thread, num_chunks);
                 }
-                else
+            }
+            else
+            {
+                // the default for this executor is depth-first placement
+                for (std::uint32_t worker_thread = 0;
+                     worker_thread != num_threads; ++worker_thread)
                 {
-                    // the default for this executor is depth-first placement
                     init_queue_depth_first(worker_thread, num_chunks);
                 }
             }
@@ -546,8 +551,8 @@ namespace hpx::parallel::execution::detail {
             auto launch_data = generate_launch_data();
             std::size_t const size = launch_data.size();
 
-            // Do straight spawning if hierarchical spawning was disabled or we
-            // have less chunks than our threshold.
+            // Do straight spawning if hierarchical spawning was disabled or if
+            // we have less chunks than our threshold.
             if (hierarchical_threshold == 0 || hierarchical_threshold >= size)
             {
                 for (std::size_t i = 0; i != size; ++i)
@@ -558,36 +563,50 @@ namespace hpx::parallel::execution::detail {
                 return;
             }
 
-            auto task = [desc, pool, launch_data](auto b, auto e) {
-                for (std::size_t i = b; i != e - 1; ++i)
+            auto task = [desc, pool, launch_data = HPX_MOVE(launch_data)](
+                            auto b, auto e) mutable {
+                HPX_ASSERT(b != e);
+                for (std::size_t i = b + 1; i != e; ++i)
                 {
                     auto state = launch_data[i].func.state;
                     state->template do_work_task<false>(desc, pool,
-                        launch_data[i].bind_to_core, launch_data[i].func);
+                        launch_data[i].bind_to_core,
+                        HPX_MOVE(launch_data[i].func));
                 }
 
-                // directly execute last task
-                auto state = launch_data[e - 1].func.state;
+                // directly execute first task
+                auto state = launch_data[b].func.state;
                 state->template do_work_task<true>(
-                    desc, pool, false, launch_data[e - 1].func);
+                    desc, pool, false, HPX_MOVE(launch_data[b].func));
             };
 
             // run task on small stack
             auto post_policy = hpx::execution::experimental::with_stacksize(
                 policy, threads::thread_stacksize::small_);
+            auto post_policy_hint =
+                hpx::execution::experimental::get_hint(post_policy);
+            post_policy_hint.mode =
+                hpx::threads::thread_schedule_hint_mode::thread;
+
             std::size_t start = 0;
-            while (true)
+            while (start < size)
             {
+                // place the helper thread on the first core of the thread block
+                post_policy_hint.hint =
+                    first_thread + static_cast<std::uint16_t>(start);
+                auto core_policy = hpx::execution::experimental::with_hint(
+                    post_policy, post_policy_hint);
+
                 auto const stop = start + hierarchical_threshold;
                 if (stop > size)
                 {
                     hpx::detail::post_policy_dispatch<Launch>::call(
-                        post_policy, desc, pool, HPX_MOVE(task), start, size);
+                        core_policy, desc, pool, HPX_MOVE(task), start, size);
                     break;
                 }
 
                 hpx::detail::post_policy_dispatch<Launch>::call(
-                    post_policy, desc, pool, task, start, stop);
+                    core_policy, desc, pool, task, start, stop);
                 start = stop;
             }
         }
diff --git a/libs/core/executors/include/hpx/executors/thread_pool_scheduler_bulk.hpp b/libs/core/executors/include/hpx/executors/thread_pool_scheduler_bulk.hpp
index 63bc1c0efbf0..ad3effaf822d 100644
--- a/libs/core/executors/include/hpx/executors/thread_pool_scheduler_bulk.hpp
+++ b/libs/core/executors/include/hpx/executors/thread_pool_scheduler_bulk.hpp
@@ -131,7 +131,8 @@ namespace hpx::execution::experimental::detail {
         template <typename Ts>
         void do_work_chunk(Ts& ts, std::uint32_t const index) const
         {
-#if HPX_HAVE_ITTNOTIFY != 0 && !defined(HPX_HAVE_APEX)
+#if defined(HPX_HAVE_ITTNOTIFY) && HPX_HAVE_ITTNOTIFY != 0 &&                  \
+    !defined(HPX_HAVE_APEX)
             static hpx::util::itt::event notify_event(
                 "set_value_loop_visitor_static::do_work_chunk(chunking)");
 
@@ -145,7 +146,7 @@ namespace hpx::execution::experimental::detail {
                 (std::min)(i_begin + task_f->chunk_size, task_f->size);
 
             auto it = std::next(hpx::util::begin(op_state->shape), i_begin);
-            for (std::uint32_t i = i_begin; i != i_end; (void) ++it, ++i)
+            for (std::size_t i = i_begin; i != i_end; (void) ++it, ++i)
             {
                 bulk_scheduler_invoke_helper(
                     index_pack_type{}, op_state->f, *it, ts);
@@ -274,7 +275,7 @@ namespace hpx::execution::experimental::detail {
         // Finish the work for one worker thread. If this is not the last worker
         // thread to finish, it will only decrement the counter. If it is the
         // last thread it will call set_error if there is an exception.
-        // Otherwise it will call set_value on the connected receiver.
+        // Otherwise, it will call set_value on the connected receiver.
         void finish() const
         {
             if (--(op_state->tasks_remaining.data_) == 0)