Add template parameter to ParallelFor and launch specifying block size (AMReX-Codes#2947)

By default, amrex::ParallelFor launches AMREX_GPU_MAX_THREADS threads per
block. We can now explicitly specify the block size with
`ParallelFor<BLOCK_SIZE>(...)`, where BLOCK_SIZE should be a multiple of the
warp size (e.g., 64, 128, etc.).  A similar change has also been made to
`launch`.

The changes are backward compatible.
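
For illustration (`bx` and `fab` below are hypothetical names, not part of this commit), a kernel that previously used the default block size could be launched with 128 threads per block like this:

    // Sketch only: 3D loop over Box `bx`, writing into Array4 `fab`,
    // with an explicit block size of 128 threads.
    amrex::ParallelFor<128>(bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            fab(i,j,k) = 0.0;
        });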
WeiqunZhang committed Sep 15, 2022
1 parent 2cdb9df commit 6a5a056
Showing 6 changed files with 1,115 additions and 102 deletions.
11 changes: 10 additions & 1 deletion Docs/sphinx_documentation/source/GPU.rst
@@ -1001,7 +1001,7 @@ launch function.

``amrex::ParallelFor()`` expands into different variations of a quadruply-nested
:cpp:`for` loop depending on the dimensionality and whether it is being implemented on CPU or GPU.
The best way to understand this macro is to take a look at the 4D :cpp:`amrex::ParallelFor`
The best way to understand this function is to take a look at the 4D :cpp:`amrex::ParallelFor`
that is implemented when ``USE_CUDA=FALSE``. A simplified version is reproduced here:

.. highlight:: c++
@@ -1103,6 +1103,15 @@ bounds, a :cpp:`long` or :cpp:`int` number of elements is passed to bound the si
passing the number of elements to work on and indexing the pointer to the starting
element: :cpp:`p[idx + 15]`.

GPU block size
--------------

By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
with a default value of 256. Users can also explicitly specify the number
of threads per block with :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
``MY_BLOCK_SIZE`` should be a multiple of the warp size (e.g., 128). This
allows users to tune the performance of individual kernels.
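
As an illustrative sketch (``n`` and ``p`` are hypothetical here, and this
assumes the 1D overload takes the same template parameter), a loop over ``n``
elements of a device pointer ``p`` could be tuned to 128 threads per block::

    amrex::ParallelFor<128>(n,
        [=] AMREX_GPU_DEVICE (int idx) noexcept
        {
            p[idx] = 0.0;
        });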

Launching general kernels
-------------------------
30 changes: 26 additions & 4 deletions Src/Base/AMReX_GpuLaunch.H
@@ -30,11 +30,11 @@
#define AMREX_GPU_Z_STRIDE 1

#ifdef AMREX_USE_CUDA
# define AMREX_LAUNCH_KERNEL(blocks, threads, sharedMem, stream, ... ) \
amrex::launch_global<AMREX_GPU_MAX_THREADS><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__);
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
#elif defined(AMREX_USE_HIP)
# define AMREX_LAUNCH_KERNEL(blocks, threads, sharedMem, stream, ... ) \
hipLaunchKernelGGL(launch_global<AMREX_GPU_MAX_THREADS>, blocks, threads, sharedMem, stream, __VA_ARGS__);
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
#endif


@@ -151,6 +151,28 @@ namespace Gpu {
    dim3 numThreads;
    std::size_t sharedMem = 0;
};

template <int MT>
ExecutionConfig
makeExecutionConfig (Long N) noexcept
{
    // One block per MT work items, rounded up; MT threads per block.
    ExecutionConfig ec(dim3{}, dim3{});
    ec.numBlocks.x = (std::max(N,Long(1)) + MT - 1) / MT;
    ec.numThreads.x = MT;
    AMREX_ASSERT(MT % Gpu::Device::warp_size == 0);
    return ec;
}

template <int MT>
ExecutionConfig
makeExecutionConfig (const Box& box) noexcept
{
    // Same as above, but the work size is the number of cells in the Box.
    ExecutionConfig ec(dim3{}, dim3{});
    ec.numBlocks.x = (std::max(box.numPts(),Long(1)) + MT - 1) / MT;
    ec.numThreads.x = MT;
    AMREX_ASSERT(MT % Gpu::Device::warp_size == 0);
    return ec;
}
#endif

}
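
A simplified sketch (not taken from this diff; `launch_1d` is a hypothetical helper) of how the new `MT` parameter can flow from a wrapper through `makeExecutionConfig` into `AMREX_LAUNCH_KERNEL`:

    template <int MT, typename F>
    void launch_1d (amrex::Long n, F const& f)
    {
        // Pick grid/block sizes for n work items with MT threads per block.
        const auto ec = amrex::Gpu::makeExecutionConfig<MT>(n);
        AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, amrex::Gpu::gpuStream(),
        [=] AMREX_GPU_DEVICE () noexcept
        {
            // Grid-stride loop so any block count covers all n items.
            for (amrex::Long i = amrex::Long(blockDim.x)*blockIdx.x + threadIdx.x,
                             stride = amrex::Long(blockDim.x)*gridDim.x;
                 i < n; i += stride)
            {
                f(i);
            }
        });
    }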
