From 3d03cc50930b5bb3b59cee4c14404a08199421f2 Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Wed, 16 Jul 2025 21:02:33 +0000 Subject: [PATCH] SWDEV-543365 - Add API changes to hipModuleOccupancyMaxPotentialBlockSize --- .../include/hip/amd_detail/hip_api_trace.hpp | 8 +++--- hipamd/src/hip_api_trace.cpp | 8 +++--- hipamd/src/hip_module.cpp | 2 +- hipamd/src/hip_platform.cpp | 27 ++++++++++++------- hipamd/src/hip_platform.hpp | 3 ++- hipamd/src/hip_table_interface.cpp | 11 ++++---- 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index b06c08873d..676785e282 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -671,12 +671,14 @@ typedef hipError_t (*t_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor)( typedef hipError_t (*t_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags)( int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); typedef hipError_t (*t_hipModuleOccupancyMaxPotentialBlockSize)(int* gridSize, int* blockSize, - hipFunction_t f, + hipFunction_t f, + hipOccupancyB2DSize_t + blkSizeToDynSMemSize, size_t dynSharedMemPerBlk, int blockSizeLimit); typedef hipError_t (*t_hipModuleOccupancyMaxPotentialBlockSizeWithFlags)( - int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit, - unsigned int flags); + int* gridSize, int* blockSize, hipFunction_t f, hipOccupancyB2DSize_t blkSizeToDynSMemSize, + size_t dynSharedMemPerBlk, int blockSizeLimit, unsigned int flags); typedef hipError_t (*t_hipModuleUnload)(hipModule_t module); typedef hipError_t (*t_hipOccupancyMaxActiveBlocksPerMultiprocessor)(int* numBlocks, const void* f, int blockSize, diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 05a0909416..9c6267e0f2 100644 --- a/hipamd/src/hip_api_trace.cpp +++ 
b/hipamd/src/hip_api_trace.cpp @@ -553,11 +553,13 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hi hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags); hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f, + hipOccupancyB2DSize_t blkSizeToDynSMemSize, size_t dynSharedMemPerBlk, int blockSizeLimit); hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - hipFunction_t f, - size_t dynSharedMemPerBlk, - int blockSizeLimit, unsigned int flags); + hipFunction_t f, hipOccupancyB2DSize_t + blkSizeToDynSMemSize, size_t + dynSharedMemPerBlk, int blockSizeLimit, + unsigned int flags); hipError_t hipModuleUnload(hipModule_t module); hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk); diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index e778d474d6..888100013c 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -319,7 +319,7 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& l int block_size = launch_params.local_.product(); hipError_t err = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, *device, f, block_size, - launch_params.sharedMemBytes_, true); + launch_params.sharedMemBytes_, true, nullptr); if (err != hipSuccess) { return err; } diff --git a/hipamd/src/hip_platform.cpp b/hipamd/src/hip_platform.cpp index 7d91f209cd..3ea58ef25d 100644 --- a/hipamd/src/hip_platform.cpp +++ b/hipamd/src/hip_platform.cpp @@ -349,7 +349,8 @@ hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memor namespace hip_impl { hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( int* maxBlocksPerCU, int* numBlocksPerGrid, int* 
bestBlockSize, const amd::Device& device, - hipFunction_t func, int inputBlockSize, size_t dynamicSMemSize, bool bCalcPotentialBlkSz) { + hipFunction_t func, int inputBlockSize, size_t dynamicSMemSize, bool bCalcPotentialBlkSz, + hipOccupancyB2DSize_t blockSizeToDynamicSMemSize) { hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func); const amd::Kernel& kernel = *function->kernel(); @@ -420,7 +421,11 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( const int alu_limited_threads = alu_occupancy * wrkGrpInfo->wavefrontSize_; int lds_occupancy_wgs = INT_MAX; - const size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSize; + size_t dynamicSMemSizeFinal = dynamicSMemSize; + if (blockSizeToDynamicSMemSize != nullptr) { + dynamicSMemSizeFinal = (*blockSizeToDynamicSMemSize)(inputBlockSize); + } + const size_t total_used_lds = wrkGrpInfo->usedLDSSize_ + dynamicSMemSizeFinal; if (total_used_lds != 0) { lds_occupancy_wgs = static_cast<int>(device.info().localMemSize_ / total_used_lds); } @@ -475,7 +480,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, cons int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSizeLimit, - dynSharedMemPerBlk, true); + dynSharedMemPerBlk, true, nullptr); if (ret == hipSuccess) { *blockSize = best_block_size; *gridSize = max_blocks_per_grid; @@ -484,6 +489,7 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, cons } hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, hipFunction_t f, + hipOccupancyB2DSize_t blockSizeToDynamicSMemSize, size_t dynSharedMemPerBlk, int blockSizeLimit) { HIP_INIT_API(hipModuleOccupancyMaxPotentialBlockSize, f, dynSharedMemPerBlk, blockSizeLimit); if ((gridSize == nullptr) || (blockSize == nullptr) || (f == nullptr)) { @@ -495,7 +501,7 @@ hipError_t 
hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, - dynSharedMemPerBlk, true); + dynSharedMemPerBlk, true, blockSizeToDynamicSMemSize); if (ret == hipSuccess) { *blockSize = best_block_size; *gridSize = max_blocks_per_grid; @@ -504,7 +510,8 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize } hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, - hipFunction_t f, + hipFunction_t f, hipOccupancyB2DSize_t + blockSizeToDynamicSMemSize, size_t dynSharedMemPerBlk, int blockSizeLimit, unsigned int flags) { @@ -522,7 +529,7 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSizeLimit, - dynSharedMemPerBlk, true); + dynSharedMemPerBlk, true, blockSizeToDynamicSMemSize); if (ret == hipSuccess) { *blockSize = best_block_size; *gridSize = max_blocks_per_grid; @@ -545,7 +552,7 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hi int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, - false); + false, nullptr); *numBlocks = num_blocks; HIP_RETURN(ret); } @@ -567,7 +574,7 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, blockSize, dynSharedMemPerBlk, - false); + false, nullptr); *numBlocks = num_blocks; HIP_RETURN(ret); } @@ -592,7 +599,7 @@ hipError_t 
hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const vo int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, - false); + false, nullptr); *numBlocks = num_blocks; HIP_RETURN(ret); } @@ -622,7 +629,7 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, int best_block_size = 0; hipError_t ret = hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks, &max_blocks_per_grid, &best_block_size, device, func, blockSize, dynamicSMemSize, - false); + false, nullptr); *numBlocks = num_blocks; HIP_RETURN(ret); } diff --git a/hipamd/src/hip_platform.hpp b/hipamd/src/hip_platform.hpp index aeeb193618..42e1980531 100644 --- a/hipamd/src/hip_platform.hpp +++ b/hipamd/src/hip_platform.hpp @@ -28,7 +28,8 @@ namespace hip_impl { hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( int* maxBlocksPerCU, int* numBlocksPerGrid, int* bestBlockSize, const amd::Device& device, - hipFunction_t func, int inputBlockSize, size_t dynamicSMemSize, bool bCalcPotentialBlkSz); + hipFunction_t func, int inputBlockSize, size_t dynamicSMemSize, bool bCalcPotentialBlkSz, + hipOccupancyB2DSize_t blockSizeToDynamicSMemSize); } // namespace hip_impl // Unique file descriptor class diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp index 5e86b86edd..ea490e5e07 100644 --- a/hipamd/src/hip_table_interface.cpp +++ b/hipamd/src/hip_table_interface.cpp @@ -1298,17 +1298,18 @@ extern "C" hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlag numBlocks, f, blockSize, dynSharedMemPerBlk, flags); } extern "C" hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, - hipFunction_t f, + hipFunction_t f, hipOccupancyB2DSize_t + blkSizeToDynSMemSize, size_t dynSharedMemPerBlk, int blockSizeLimit) { return 
hip::GetHipDispatchTable()->hipModuleOccupancyMaxPotentialBlockSize_fn( - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); + gridSize, blockSize, f, blkSizeToDynSMemSize, dynSharedMemPerBlk, blockSizeLimit); } extern "C" hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags( - int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit, - unsigned int flags) { + int* gridSize, int* blockSize, hipFunction_t f, hipOccupancyB2DSize_t blkSizeToDynSMemSize, + size_t dynSharedMemPerBlk, int blockSizeLimit, unsigned int flags) { return hip::GetHipDispatchTable()->hipModuleOccupancyMaxPotentialBlockSizeWithFlags_fn( - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit, flags); + gridSize, blockSize, f, blkSizeToDynSMemSize, dynSharedMemPerBlk, blockSizeLimit, flags); } hipError_t hipModuleUnload(hipModule_t module) { return hip::GetHipDispatchTable()->hipModuleUnload_fn(module);