Performance fix for 5.1 and updated changelog(#391)

* Getrf tuning3 (tuning for npvt and batch cases) (#383) * update changelog
ROCm · Mar 16, 2022 · 0e9e2c9 · 0e9e2c9
1 parent 55761c3
commit 0e9e2c9
Show file tree

Hide file tree

Showing 8 changed files with 980 additions and 187 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,13 +2,17 @@
 
 Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](https://rocsolver.readthedocs.io/en/latest/).
 
-## (Unreleased) rocSOLVER
+## rocSOLVER 3.17.0 for ROCm 5.1.0
+### Optimized
+- Optimized non-pivoting and batch cases of the LU factorization
+
 ### Fixed
 - Fixed missing synchronization in SYTRF with `rocblas_fill_lower` that could potentially
   result in incorrect pivot values.
 - Fixed multi-level logging output to file with the `ROCSOLVER_LOG_PATH`,
   `ROCSOLVER_LOG_TRACE_PATH`, `ROCSOLVER_LOG_BENCH_PATH` and `ROCSOLVER_LOG_PROFILE_PATH`
   environment variables.
+- Fixed performance regression in the batched LU factorization of tiny matrices
 
 
 ## rocSOLVER 3.16.0 for ROCm 5.0.0

diff --git a/clients/gtest/memory_model_gtest.cpp b/clients/gtest/memory_model_gtest.cpp
@@ -56,13 +56,13 @@ class checkin_misc_MEMORY_MODEL : public ::testing::Test
 
     const rocblas_int m = 1500;
     const rocblas_int n = 1500;
-    const rocblas_int m_small = 750;
-    const rocblas_int n_small = 750;
+    const rocblas_int m_small = 65;
+    const rocblas_int n_small = 65;
     const rocblas_int lda = m;
     const rocblas_stride stA = lda * n;
     const rocblas_stride stP = n;
     const rocblas_int bc = 8;
-    const rocblas_int bc_small = 8;
+    const rocblas_int bc_small = 5;
 };
 
 /*************************************/

diff --git a/library/src/include/ideal_sizes.hpp b/library/src/include/ideal_sizes.hpp
@@ -184,22 +184,37 @@
 
 /**************************** getf2/getfr *************************************
 *******************************************************************************/
-#define GETF2_MAX_COLS 64 //always <= wavefront size
-#define GETF2_MAX_THDS 64
+#define GETF2_SPKER_MAX_M 1024 //always <= 1024
+#define GETF2_SPKER_MAX_N 256 //always <= 256
+#define GETF2_SSKER_MAX_M 512 //always <= 512 and <= GETF2_SPKER_MAX_M
+#define GETF2_SSKER_MAX_N 64 //always <= wavefront and <= GETF2_SPKER_MAX_N
 #define GETF2_OPTIM_NGRP \
     16, 15, 8, 8, 8, 8, 8, 8, 6, 6, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-#define GETRF_NUM_INTERVALS 4
-#define GETRF_INTERVALS 64, 512, 1536, 4096
-#define GETRF_BLKSIZES 0, 1, 32, 128, 384
-#define GETRF_BATCH_NUM_INTERVALS 3
-#define GETRF_BATCH_INTERVALS 52, 148, 1376
-#define GETRF_BATCH_BLKSIZES 0, 16, 32, 288
-#define GETRF_NPVT_NUM_INTERVALS 2
-#define GETRF_NPVT_INTERVALS 65, 1536
-#define GETRF_NPVT_BLKSIZES 0, 32, 256
-#define GETRF_NPVT_BATCH_NUM_INTERVALS 3
-#define GETRF_NPVT_BATCH_INTERVALS 33, 148, 1216
-#define GETRF_NPVT_BATCH_BLKSIZES 0, 16, 32, 256
+#define GETRF_NUM_INTERVALS_REAL 4
+#define GETRF_INTERVALS_REAL 64, 512, 1856, 2944
+#define GETRF_BLKSIZES_REAL 0, 1, 32, 256, 512
+#define GETRF_BATCH_NUM_INTERVALS_REAL 9
+#define GETRF_BATCH_INTERVALS_REAL 40, 42, 46, 49, 52, 58, 112, 800, 1024
+#define GETRF_BATCH_BLKSIZES_REAL 0, 32, 0, 16, 0, 32, 1, 32, 64, 160
+#define GETRF_NPVT_NUM_INTERVALS_REAL 2
+#define GETRF_NPVT_INTERVALS_REAL 64, 512
+#define GETRF_NPVT_BLKSIZES_REAL 0, -1, 512
+#define GETRF_NPVT_BATCH_NUM_INTERVALS_REAL 6
+#define GETRF_NPVT_BATCH_INTERVALS_REAL 40, 168, 448, 512, 896, 1408
+#define GETRF_NPVT_BATCH_BLKSIZES_REAL 0, -24, -32, -64, 32, 96, 512
+
+#define GETRF_NUM_INTERVALS_COMPLEX 4
+#define GETRF_INTERVALS_COMPLEX 64, 512, 1024, 2944
+#define GETRF_BLKSIZES_COMPLEX 0, 1, 32, 96, 512
+#define GETRF_BATCH_NUM_INTERVALS_COMPLEX 10
+#define GETRF_BATCH_INTERVALS_COMPLEX 23, 28, 30, 32, 40, 48, 56, 64, 768, 1024
+#define GETRF_BATCH_BLKSIZES_COMPLEX 0, 16, 0, 1, 24, 16, 24, 16, 48, 64, 160
+#define GETRF_NPVT_NUM_INTERVALS_COMPLEX 2
+#define GETRF_NPVT_INTERVALS_COMPLEX 64, 512
+#define GETRF_NPVT_BLKSIZES_COMPLEX 0, -1, 512
+#define GETRF_NPVT_BATCH_NUM_INTERVALS_COMPLEX 5
+#define GETRF_NPVT_BATCH_INTERVALS_COMPLEX 20, 32, 42, 512, 1408
+#define GETRF_NPVT_BATCH_BLKSIZES_COMPLEX 0, -16, -32, -48, 64, 128
 
 /****************************** getri *****************************************
 *******************************************************************************/

diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp
@@ -1,5 +1,5 @@
 /* ************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc.
+ * Copyright (c) 2019-2022 Advanced Micro Devices, Inc.
  * ************************************************************************ */
 
 #pragma once
@@ -882,7 +882,7 @@ ROCSOLVER_KERNEL void gemm_kernel(const rocblas_int m,
 }
 
 /** Optimized kernel that solves a simple triangular system B <- Ax=B
-    with A unit matrix. A and B are sub blocks of the same matrix MM with
+    with A unit lower triangular matrix. A and B are sub blocks of the same matrix MM with
     leading dimension ldim and stride. A and B are
     located in MM by their respective shifts.
 
@@ -892,13 +892,13 @@ ROCSOLVER_KERNEL void gemm_kernel(const rocblas_int m,
     Size of shared memory per group should be:
     lmemsize = hipBlockDim_y * sizeof(T); **/
 template <typename T, typename U>
-ROCSOLVER_KERNEL void trsm2_kernel(const rocblas_int m,
-                                   const rocblas_int n,
-                                   U MM,
-                                   const rocblas_int shiftA,
-                                   const rocblas_int shiftB,
-                                   const rocblas_int ldim,
-                                   const rocblas_stride stride)
+ROCSOLVER_KERNEL void trsm2_lower_kernel(const rocblas_int m,
+                                         const rocblas_int n,
+                                         U MM,
+                                         const rocblas_int shiftA,
+                                         const rocblas_int shiftB,
+                                         const rocblas_int ldim,
+                                         const rocblas_stride stride)
 {
     int id = hipBlockIdx_z;
     int i = hipThreadIdx_x;
@@ -934,3 +934,66 @@ ROCSOLVER_KERNEL void trsm2_kernel(const rocblas_int m,
         B[i + j * ldim] = c;
     }
 }
+
+/** Optimized kernel that solves a simple triangular system B <- xA=B
+    with A non-unit upper triangular matrix. A and B are sub blocks of the same matrix MM with
+    leading dimension ldim and stride. A and B are
+    located in MM by their respective shifts.
+
+    Call this kernel with 'batch_count' groups in z, and enough
+    groups in x to cover all the 'm' right-hand-sides (rows of B).
+    There should be only one group in y with hipBlockDim_y = n.
+    Size of shared memory per group should be:
+    lmemsize = hipBlockDim_x * sizeof(T); **/
+template <typename T, typename U>
+ROCSOLVER_KERNEL void trsm2_upper_kernel(const rocblas_int m,
+                                         const rocblas_int n,
+                                         U MM,
+                                         const rocblas_int shiftA,
+                                         const rocblas_int shiftB,
+                                         const rocblas_int ldim,
+                                         const rocblas_stride stride)
+{
+    int id = hipBlockIdx_z;
+    int j = hipThreadIdx_y;
+    int tx = hipThreadIdx_x;
+    int bdx = hipBlockDim_x;
+    int i = hipBlockIdx_x * bdx + tx;
+
+    // batch instance
+    T* A = load_ptr_batch(MM, id, shiftA, stride);
+    T* B = load_ptr_batch(MM, id, shiftB, stride);
+
+    // shared mem setup
+    extern __shared__ double lmem[];
+    T* b = reinterpret_cast<T*>(lmem);
+    T c, d;
+
+    if(i < m)
+    {
+        // read data
+        c = B[i + j * ldim];
+
+        // solve for right-hand sides
+        for(int k = 0; k < n - 1; ++k)
+        {
+            __syncthreads();
+            if(j == k)
+            {
+                d = A[j + j * ldim];
+                c = d != 0 ? c / d : c;
+                b[tx] = c;
+            }
+            __syncthreads();
+            c -= (j > k) ? A[k + j * ldim] * b[tx] : 0;
+        }
+        if(j == n - 1)
+        {
+            d = A[j + j * ldim];
+            c = d != 0 ? c / d : c;
+        }
+
+        // move results back to global
+        B[i + j * ldim] = c;
+    }
+}