diff --git a/library/include/rocwmma/internal/coop_load.hpp b/library/include/rocwmma/internal/coop_load.hpp index 8a12a5a9..11bb445d 100644 --- a/library/include/rocwmma/internal/coop_load.hpp +++ b/library/include/rocwmma/internal/coop_load.hpp @@ -63,10 +63,7 @@ namespace rocwmma // Outer loop = index 0, // Inner loop = index N-1 - template + template ROCWMMA_DEVICE static inline auto unroll_right(Iterator& out, DataT const* dataPtr, uint32_t ldm, @@ -93,7 +90,6 @@ namespace rocwmma // Recurse to the next nested layer else { -#pragma unroll for(int i = 0; i < strideCount; i++) { unroll_right(out, dataPtr, ldm, strideSpace, strides2d); diff --git a/library/include/rocwmma/internal/coop_store.hpp b/library/include/rocwmma/internal/coop_store.hpp index 0dd6e1d9..9f0f22b7 100644 --- a/library/include/rocwmma/internal/coop_store.hpp +++ b/library/include/rocwmma/internal/coop_store.hpp @@ -64,10 +64,7 @@ namespace rocwmma // Outer loop = index 0, // Inner loop = index N-1 - template + template ROCWMMA_DEVICE static inline auto unroll_right(DataT* dataPtr, Iterator& in, uint32_t ldm, @@ -94,7 +91,6 @@ namespace rocwmma // Recurse to the next nested layer else { -#pragma unroll for(int i = 0; i < strideCount; i++) { unroll_right(dataPtr, in, ldm, strideCounts, strides2d); diff --git a/test/dlrm/lds_mapping_util.hpp b/test/dlrm/lds_mapping_util.hpp index 1016d7eb..f5c8abbb 100644 --- a/test/dlrm/lds_mapping_util.hpp +++ b/test/dlrm/lds_mapping_util.hpp @@ -239,13 +239,6 @@ namespace rocwmma auto waveIndex = get<1>(waveCoord); auto waveCount = get<1>(workgroupDim); - constexpr auto splitCount = std::min((uint32_t)IOTraits::IOCount, - (uint32_t)IOTraits::IOCount); - - static_assert(((uint32_t)IOTraits::IOCount % splitCount == 0) - && ((uint32_t)IOTraits::IOCount % splitCount == 0), - "splitCount is not common divisor of GlobalRead and LocalWrite IOCounts"); - for(int32_t i = 0; i < BlocksX; ++i) { // Issue global read @@ -255,16 +248,14 @@ namespace rocwmma baseA + GlobalAOffsets::dataOffset(make_coord2d(BlockM * i, 0), lda), lda, waveIndex, - waveCount, - splitCount); + waveCount); // Issue local store store_matrix_coop_sync(baseLds + baseOffsetA() + waveOffsetA() + blockOffsetA(i), reinterpret_cast(fetchA), ld(), waveIndex, - waveCount, - splitCount); + waveCount); } } @@ -275,16 +266,10 @@ namespace rocwmma // we need to ensure that splitCounts are the same on both sides of // global fetch and local writes - Otherwise the waves don't have the // same data responsibility. - auto workgroupDim = GlobalBOffsets::workgroupDim(); - auto waveCoord = GlobalBOffsets::waveCoord(); - auto waveIndex = get<0>(waveCoord); - auto waveCount = get<0>(workgroupDim); - constexpr auto splitCount = std::min((uint32_t)IOTraits::IOCount, - (uint32_t)IOTraits::IOCount); - - static_assert(((uint32_t)IOTraits::IOCount % splitCount == 0) - && ((uint32_t)IOTraits::IOCount % splitCount == 0), - "splitCount is not common divisor of GlobalRead and LocalWrite IOCounts"); + auto workgroupDim = GlobalBOffsets::workgroupDim(); + auto waveCoord = GlobalBOffsets::waveCoord(); + auto waveIndex = get<0>(waveCoord); + auto waveCount = get<0>(workgroupDim); for(int32_t i = 0; i < BlocksY; ++i) { @@ -295,16 +280,14 @@ namespace rocwmma baseB + GlobalBOffsets::dataOffset(make_coord2d(0, BlockN * i), ldb), ldb, waveIndex, - waveCount, - splitCount); + waveCount); // Issue local store store_matrix_coop_sync(baseLds + baseOffsetB() + waveOffsetB() + blockOffsetB(i), reinterpret_cast(fetchB), ld(), waveIndex, - waveCount, - splitCount); + waveCount); } } @@ -364,7 +347,6 @@ namespace rocwmma __device__ static inline void prefetchLocalA(FragA* fragsA, DataT const* baseLds) { -#pragma unroll for(int i = 0; i < BlocksX; i++) { prefetchLocalA(fragsA[i], baseLds, i); @@ -373,7 +355,6 @@ namespace rocwmma __device__ static inline void prefetchLocalB(FragB* fragsB, DataT const* baseLds) { -#pragma unroll for(int i = 0; i < BlocksY; i++) { prefetchLocalB(fragsB[i], baseLds, i);