diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 0db93da921..6f6672099d 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, @@ -245,7 +245,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { + printf("helllo\n"); LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index b637e6db5a..4922b9b52b 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#if 0 LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); +#else + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); +#endif + for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index a761dee4c4..fcea0ae892 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#if 0 +#if 1 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index c190176314..1b0db5f8c5 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -58,13 +58,34 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { @@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + + i--; + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a1 -= 2; #endif i --; - } while (i > 0); } + + //Loop Ending + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 1105aee82d..8a8a89bd15 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; @@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif @@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (n <= 0) return 0; j = (n >> 1); + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { do { piv = ipiv; @@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); + i = ((rows) >> 1); - if (i > 0) { - do { + // Loop pipeline + i--; + + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index e08d49667b..86ee949c4f 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { do { @@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG i = ((k2 - k1) >> 1); - if (i > 0) { - do { + i--; //Loop pipeline + //Main Loop + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a7 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = ((rows) >> 1); + i--; + + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index a4d4bce991..e3a05dbccb 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -60,9 +60,9 @@ #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, - FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *a9, *a11, *a13, *a15; @@ -79,13 +79,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 3); if (j > 0) { do { @@ -129,50 +151,51 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + i = (rows >> 1); + i--; + //Loop pipeline + //Main Loop + while (i > 0) { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - B9 = *b9; - B10 = *b10; - B11 = *b11; - B12 = *b12; - B13 = *b13; - B14 = *b14; - B15 = *b15; - B16 = *b16; + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; - - A9 = *a9; - A10 = *a10; - A11 = *a11; - A12 = *a12; - A13 = *a13; - A14 = *a14; - A15 = *a15; - A16 = *a16; - - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; if (b1 == a1) { if (b2 == a1) { @@ -371,51 +394,316 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; - b9 = b1 + 4 * lda; - b10 = b2 + 4 * lda; - b11 = b1 + 5 * lda; - b12 = b2 + 5 * lda; - b13 = b1 + 6 * lda; - b14 = b2 + 6 * lda; - b15 = b1 + 7 * lda; - b16 = b2 + 7 * lda; + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; - a9 += 2; - a11 += 2; - a13 += 2; - a15 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; - a9 -= 2; - a11 -= 2; - a13 -= 2; - a15 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + b9 = b1 + 4 * lda; + b11 = b1 + 5 * lda; + b13 = b1 + 6 * lda; + b15 = b1 + 7 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -487,187 +775,327 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; + i = (rows >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; *a3 = A4; - *a4 = A3; + *a4 = B3; + *b3 = A3; *a5 = A6; - *a6 = A5; + *a6 = B5; + *b5 = A5; *a7 = A8; - *a8 = A7; + *a8 = B7; + *b7 = A7; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - *a4 = B4; - *b4 = A4; - *a6 = B6; - *b6 = A6; - *a8 = B8; - *b8 = A8; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; *a2 = A1; - *a3 = A4; + *b1 = A2; + *a3 = B3; *a4 = A3; - *a5 = A6; + *b3 = A4; + *a5 = B5; *a6 = A5; - *a7 = A8; + *b5 = A6; + *a7 = B7; *a8 = A7; + *b7 = A8; } else { - *a1 = A2; + *a1 = B1; *a2 = B2; - *b2 = A1; - *a3 = A4; + *b1 = A1; + *b2 = A2; + *a3 = B3; *a4 = B4; - *b4 = A3; - *a5 = A6; + *b3 = A3; + *b4 = A4; + *a5 = B5; *a6 = B6; - *b6 = A5; - *a7 = A8; - *a8 = B8; - *b8 = A7; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - *a5 = A6; - *a6 = B5; - *b5 = A5; - *a7 = A8; - *a8 = B7; - *b7 = A7; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; *b5 = A5; + *b6 = A6; *a7 = B7; + *a8 = B8; *b7 = A7; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - *a5 = B5; - *a6 = A5; - *b5 = A6; - *a7 = B7; - *a8 = A7; - *b7 = A8; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - *a5 = B5; - *a6 = B6; - *b5 = A5; - *b6 = A6; - *a7 = B7; - *a8 = B8; - *b7 = A7; - *b8 = A8; - } - } + *b8 = A8; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; #endif - i --; - } while (i > 0); + i --; + } + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + i = (rows & 1); - i = ((k2 - k1) & 1); - - if (i > 0) { - A1 = *a1; - B1 = *b1; - A3 = *a3; - B3 = *b3; - A5 = *a5; - B5 = *b5; - A7 = *a7; - B7 = *b7; + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } - a += 4 * lda; + a += 4 * lda; } if (n & 2) { @@ -692,109 +1120,194 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; + i = ((rows) >> 1); + i--; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; *a3 = A4; - *a4 = A3; + *a4 = B3; + *b3 = A3; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - *a4 = B4; - *b4 = A4; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; - *a3 = A4; - *a4 = B3; + *a3 = B3; *b3 = A3; } else - if (b2 == a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { *a1 = B1; + *a2 = B2; *b1 = A1; + *b2 = A2; *a3 = B3; + *a4 = B4; *b3 = A3; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - } - } + *b4 = A4; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; + a1 += 2; + a3 += 2; #else - a1 -= 2; - a3 -= 2; + a1 -= 2; + a3 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -825,78 +1338,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index 3dd653baf4..7a62dd9b86 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { @@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b2 = a + ip2; i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i --; + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); + } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index a877ef66bd..0fa6858596 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 1); if (j > 0) { @@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a1 + 0 + lda); @@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -384,12 +557,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 4dc5598953..c63a8e2e0f 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -76,6 +76,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { @@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a7 -= 4; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -435,161 +717,303 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b3 = b1 + lda; b4 = b2 + lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *(a1 + 0); - A2 = *(a1 + 1); - A3 = *(a2 + 0); - A4 = *(a2 + 1); + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); - A5 = *(a3 + 0); - A6 = *(a3 + 1); - A7 = *(a4 + 0); - A8 = *(a4 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); - B1 = *(b1 + 0); - B2 = *(b1 + 1); - B3 = *(b2 + 0); - B4 = *(b2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); - B5 = *(b3 + 0); - B6 = *(b3 + 1); - B7 = *(b4 + 0); - B8 = *(b4 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); - ip1 = *piv * 2; - piv += incx; - ip2 = *piv * 2; - piv += incx; + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; } else - if (b2 != a2) { - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - } else { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A1; - *(b2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A5; - *(b4 + 1) = A6; - } - } - } else { - if (b2 == a1) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B1; - *(a2 + 1) = B2; + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B5; - *(a4 + 1) = B6; + *(a3 + 0) = B5; + *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else - if (b2 == a2) { + if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else - if (b2 == b1) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(b1 + 0) = A3; - *(b1 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - *(b3 + 0) = A7; - *(b3 + 1) = A8; - } else { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + lda; - b4 = b2 + lda; + b3 = b1 + lda; + b4 = b2 + lda; #ifndef MINUS - a1 += 4; - a3 += 4; + a1 += 4; + a3 += 4; #else - a1 -= 4; - a3 -= 4; + a1 -= 4; + a3 -= 4; #endif - i --; - } while (i > 0); + i --; } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + + b1 = a + ip1; + b3 = b1 + lda; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile index 105ec4027e..2e9db40522 100644 --- a/lapack/laswp/x86/Makefile +++ b/lapack/laswp/x86/Makefile @@ -17,11 +17,11 @@ ZLASWP = ../generic/zlaswp_k_1.c endif ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile index ba07dcf4f8..17fb1f9617 100644 --- a/lapack/laswp/x86_64/Makefile +++ b/lapack/laswp/x86_64/Makefile @@ -22,11 +22,11 @@ ZLASWP = ../generic/zlaswp_k_1.c endif ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 index 79c74aad2c..ff4954b090 100644 --- a/patch.for_lapack-3.4.1 +++ b/patch.for_lapack-3.4.1 @@ -191,7 +191,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ + slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ + slarrv.$(SUFFIX) slartv.$(SUFFIX) \ -+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ + slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ + sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ + sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ @@ -345,7 +345,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ + clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ + clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ -+ claswp.$(SUFFIX) clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ + clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ + cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ + cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ @@ -484,7 +484,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ + dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ + dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ -+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ + dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ + dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ + dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ @@ -643,7 +643,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ + zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ + zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ -+ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ + zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ + zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ + zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \